def __init__(self, sentence):
    """Translate *sentence* to English, tokenize it, and build a parse tree.

    Sets self.status to 1 when parsing fails; callers must check it
    before using self.tree / self.rel.
    """
    en_parser = StanfordParser(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar')
    self.status = 0
    self.trans = googletrans.Translator()
    # Keep a whitespace-free copy of the original sentence.
    self.sentence = sentence.strip("\n").replace(" ", "")
    en_trans = self.trans.translate(sentence).text
    en_trans = sg.tokenize(en_trans)
    try:
        tree = list(en_parser.parse(en_trans))
        self.tree = tree[0]
        self.rel = []
    except Exception:
        # BUG FIX: was a bare `except:` which would also swallow
        # KeyboardInterrupt/SystemExit.
        self.status = 1
def get_sentence_embeddings(sentences, ngram='bigrams', model='concat_wiki_twitter'):
    """Return a numpy matrix of embeddings for one of the published models.

    Handles tokenization, so it can be given raw sentences.

    Arguments:
        - sentences: a list of raw sentences
          ['Once upon a time', 'This is another sentence.', ...]
        - ngram: 'unigrams' or 'bigrams' (NOTE: currently ignored — the
          MODEL_PG_3KBOOKS_BIGRAMS model is always used)
        - model: only 'wiki' is implemented; any other value fails.
    """
    wiki_embeddings = None
    # Twitter embeddings were never wired up in this variant; the names are
    # kept so the structure matches the sibling sentence_embeddings() helper.
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == 'concat_wiki_twitter':
        # BUG FIX: previously fell through to
        # np.concatenate((wiki_embeddings, None), axis=1), which raised a
        # cryptic TypeError after doing all the tokenization work.
        raise NotImplementedError(
            "twitter embeddings are not computed; only model='wiki' is supported")
    if model == "wiki":
        tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
        # Join all sentences so the slow JVM tokenizer runs once, then split.
        s = ' <delimiter> '.join(sentences)
        tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tokenized_sentences_SNLP = tokenized_sentences_SNLP[0].split(' <delimiter> ')
        assert (len(tokenized_sentences_SNLP) == len(sentences))
        wiki_embeddings = get_embeddings_for_preprocessed_sentences(
            tokenized_sentences_SNLP, MODEL_PG_3KBOOKS_BIGRAMS, FASTTEXT_EXEC_PATH)
        return wiki_embeddings
    sys.exit(-1)
def tokenize(content):
    """Break text-based content into tokens in the style of the PTB corpus."""
    jar = os.path.abspath(
        'summarize/stanford-postagger/stanford-postagger.jar')
    tokenizer = StanfordTokenizer(path_to_jar=jar)
    # Lowercase before tokenizing, as the downstream models expect.
    return tokenizer.tokenize(content.lower())
def stanfordNERInit():
    """Configure the Stanford NER environment and build the NLP helpers.

    Returns a (sentence_detector, ner_tagger, tokenizer) tuple.
    """
    # BUG FIX: the classpath entries are Windows paths but were joined with
    # ':' — the POSIX separator, which the JVM ignores on Windows, so only
    # the first jar was ever found.  os.pathsep is ';' on Windows.
    os.environ['CLASSPATH'] = os.pathsep.join([
        'C:/users/home/stanford-ner/stanford-ner-2017-06-09/stanford-ner.jar',
        'C:/users/home/stanford-ner/stanford-ner-2017-06-09/lib/*',
        'C:/users/home/stanford-postagger-full-2017-06-09/stanford-postagger.jar',
    ])
    os.environ['STANFORD_MODELS'] = 'C:/users/home/stanford-ner/stanford-ner-2017-06-09/classifiers/'
    sent_detection = nltk.data.load('tokenizers/punkt/english.pickle')
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tokenizer = StanfordTokenizer()
    return sent_detection, st, tokenizer
def __init__(self):
    """Initialise sentence-terminator tokens and the Stanford tokenizer."""
    self.dm_single_close_quote = u'\u2019'  # unicode right single quote
    self.dm_double_close_quote = u'\u201d'  # unicode right double quote
    # Acceptable ways to end a sentence.
    self.END_TOKENS = ['.', '!', '?', '...', "'", "`", '"',
                       self.dm_single_close_quote,
                       self.dm_double_close_quote, ")"]
    # tokenizeNLs keeps newline boundaries visible in the token stream.
    self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                       options={"tokenizeNLs": True})
def sentence_embeddings(wikiuni, snlpjar, fasttext, sentences,
                        ngram='unigrams', model='concat_wiki_twitter'):
    """
    Generate embeddings from a list of sentences.

    Parameters:
    -----------
    wikiuni: string
        Path to the Wikipedia unigram embeddings model
    snlpjar: string
        Path to the JAR file of the Stanford tokenizer/tagger
    fasttext: string
        Path to the executable of FastText
    sentences: list
        List containing raw sentences
        e.g., ['Once upon a time', 'This is another sentence.', ...]
    ngram: string (unigram|bigram)
        ngram used in Wikipedia embeddings; only 'unigrams' is implemented
    model: string (wiki|twitter|concat_wiki_twitter)
        only 'wiki' returns embeddings — every other value reaches
        sys.exit(-1) below (the twitter branches are commented out)
    """
    wiki_embeddings = None
    twitter_embbedings = None
    tokenized_sentences_NLTK_tweets = None
    tokenized_sentences_SNLP = None
    if model == "wiki" or model == 'concat_wiki_twitter':
        tknzr = StanfordTokenizer(snlpjar, encoding='utf-8')
        # Join all sentences with a delimiter so the slow JVM-backed
        # tokenizer is invoked only once, then split them back apart.
        s = ' <delimiter> '.join(sentences)
        tkn_sentences_SNLP = tokenize_sentences(tknzr, [s])
        tkn_sentences_SNLP = tkn_sentences_SNLP[0].split(' <delimiter> ')
        assert(len(tkn_sentences_SNLP) == len(sentences))
        if ngram == 'unigrams':
            wiki_embeddings = sent2embeddings(tkn_sentences_SNLP,
                                              wikiuni, fasttext)
        # We are not using Twitter or Bigrams so far
        # else:
        #     wiki_embeddings = sent2embeddings(tkn_sentences_SNLP, \
        #         MODEL_WIKI_BIGRAMS, FASTTEXT_EXEC_PATH)
        # if model == "twitter" or model == 'concat_wiki_twitter':
        #     tknzr = TweetTokenizer()
        #     tkn_sentences_NLTK_tweets = tokenize_sentences(tknzr, sentences)
        #     if ngram == 'unigrams':
        #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets, \
        #             MODEL_TWITTER_UNIGRAMS, FASTTEXT_EXEC_PATH)
        #     else:
        #         twitter_embbedings = sent2embeddings(tkn_sentences_NLTK_tweets, \
        #             MODEL_TWITTER_BIGRAMS, FASTTEXT_EXEC_PATH)
    #
    if model == "wiki":
        return wiki_embeddings
    #elif model == "twitter":
    #    return twitter_embbedings
    #elif model == "concat_wiki_twitter":
    #    return np.concatenate((wiki_embeddings, twitter_embbedings), axis=1)
    # NOTE(review): reached for every model other than 'wiki', including the
    # default 'concat_wiki_twitter' — confirm this is intended.
    sys.exit(-1)
def tokenize_q(qa, phase):
    """Tokenize every question in *qa* in place (first 14 tokens, lowercased).

    Writes a checkpoint JSON every 50000 questions and a final file at the end.

    qa: list of dicts, each carrying a 'question' key
    phase: dataset split name used in the output filenames
    """
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(qa):
        row['question_toked'] = MyTokenizer.tokenize(
            row['question'].lower())[:14]
        if i % 50000 == 0:
            # BUG FIX: files were opened via json.dump(qa, open(...)) and
            # never closed; use `with` so the handles are released.
            with open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w') as f:
                json.dump(qa, f)
        if i == qas - 1:
            with open('vqa_' + phase + '_toked.json', 'w') as f:
                json.dump(qa, f)
def tokenize_stopwords_stemmer(texts):
    # Use this function when tokenizing with the Stanford tokenizer; it is
    # not needed for a plain (NLTK) tokenizer.
    Str_texts = texts[0]
    #tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    # path_to_jar locates the Stanford jar; the r-prefix keeps sequences
    # like '\t' in Windows paths from being interpreted as escapes.
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    java_path = 'C:/Program Files/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # input must be a string
    #print(texts_tokenized)
    # Drop tokens that start with special characters.
    p1 = r'[-@<#$%^&*].+'
    # re.compile() turns the pattern string into a Pattern instance, which is
    # then matched against each token.
    pa1 = re.compile(p1)
    texts_filtered0 = [
        document for document in texts_tokenized
        if not document in pa1.findall(document)
    ]
    # r'.+[-_\/].+' (instead of r'.+[-_\./].+') keeps dots between digits, so
    # version-like tokens such as 3.1.2 survive intact.
    p2 = r'.+[-_\/].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            # split() slices the token on the separator and returns the
            # resulting list of parts.
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)
    # Drop empty strings, quote artifacts ('' and ``), and '--'.
    texts_filtered = [
        document for document in texts_filtered
        if document != '' and document != "''" and document != "``"
    ]
    #stopwords
    english_stopwords = stopwords.words('english')  # stop-word list
    # Remove stop words.
    texts_filtered_stopwords = [
        document for document in texts_filtered
        if not document in english_stopwords]
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&',
                            '!', '*', '@', '#', '$', '%', '\n', '<', '>', '/',
                            '\"', '\'', '{', '}', '!', '~', '`', '$', '^',
                            '/*', '*/', '/**', '**/', '**', '-', '_', '+',
                            '=', r'-?-', r'@?']  # punctuation tokens
    # Remove punctuation tokens.
    texts_filtered = [
        document for document in texts_filtered_stopwords
        if not document in english_punctuations]
    #print texts_filtered
    temp = texts_filtered[:]
    # Remove any token that contains 'comment'.
    for i in temp:
        if 'comment' in i:
            texts_filtered.remove(i)
    #print(texts_filtered)
    #texts_filtered=[re.sub(r'^[1-9]\d*$'.format(punctuation), '', x) for x in texts_filtered]  # ^[1-9]\d*$ would filter out integers
    porter = nltk.PorterStemmer()  # stemming algorithm
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # list of stems
    return texts_Stemmered  # returns a list
def __init__(self):
    """Initialise sentence-end tokens, summary sentence markers, and a
    newline-preserving Stanford tokenizer."""
    self.dm_single_close_quote = u'\u2019'  # unicode right single quote
    self.dm_double_close_quote = u'\u201d'  # unicode right double quote
    # Acceptable ways to end a sentence.
    self.END_TOKENS = ['.', '!', '?', '...', "'", "`", '"',
                       self.dm_single_close_quote,
                       self.dm_double_close_quote, ")"]
    # We use these to separate the summary sentences in the .bin datafiles.
    self.SENTENCE_START = '<s>'
    self.SENTENCE_END = '</s>'
    # tokenizeNLs keeps newline boundaries visible in the token stream.
    self.tokenizer = StanfordTokenizer('stanford-postagger.jar',
                                       options={"tokenizeNLs": True})
def __init__(self):
    """Point the JVM classpath at the bundled jars and load the Stanford
    tokenizer plus the organisation NER tagger."""
    # TO DO: update both resource paths for the Docker image.
    os.environ['CLASSPATH'] = resource_filename(__name__, 'tokenizers/')
    tokenizer_jar = resource_filename(
        __name__, 'tokenizers/stanford-ner-3.6.0.jar')
    classifier = resource_filename(
        __name__, 'classifiers/ner-orgs_2016-03-28_all.ser.gz')
    self.STANFORD_TOKENIZER = StanfordTokenizer(tokenizer_jar)
    self.SMO_tagger = StanfordNERTagger(classifier)
def spans(txt):
    """Yield (token, start, end) character spans for *txt* as produced by
    the Stanford tokenizer (with Americanization enabled)."""
    tokenizer = StanfordTokenizer(
        'C:/Users/dongfangxu9/PycharmProjects/pos_tagger/stanford-postagger.jar',
        options={"americanize": True},
        java_options='-mx1000m')
    cursor = 0
    for tok in tokenizer.tokenize(txt):
        # Locate each token in the original text, scanning left to right.
        start = txt.find(tok, cursor)
        yield tok, start, start + len(tok)
        cursor = start + len(tok)
def __init__(self, sentence):
    """Translate *sentence* to English, tokenize it, and keep the first
    constituency parse tree."""
    corenlp_jar = '../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1.jar'
    en_parser = StanfordParser(
        path_to_jar=corenlp_jar,
        path_to_models_jar='../stanford-corenlp-full-2018-02-27/stanford-corenlp-3.9.1-models.jar',
    )
    sg = StanfordTokenizer(path_to_jar=corenlp_jar)
    self.trans = googletrans.Translator()
    self.sentence = sentence
    tokens = sg.tokenize(self.trans.translate(sentence).text)
    parses = list(en_parser.parse(tokens))
    self.tree = parses[0]
    self.rel = []
def __init__(self,
             singleword_spells,
             multiword_spells,
             tokenize_by="text",
             #tokenize_by="sentence",
             punkt_tokenizer='tokenizers/punkt/english.pickle',
             path_stanford_jar="/home/david/Descargas/stanford-corenlp-3.8.0.jar"):
    """Store the spell lexicons and build the sentence/word tokenizers."""
    self.singleword_spells = singleword_spells
    self.multiword_spells = multiword_spells
    # Multi-word spells joined with '_' so they behave as single tokens.
    self.multiword_spells_joint = ["_".join(spell.split())
                                   for spell in multiword_spells]
    self.tokenize_by = tokenize_by
    self.toktok = StanfordTokenizer(path_to_jar=path_stanford_jar)
    self.sent_detector = nltk.data.load(punkt_tokenizer)
def get_sentence_embeddings(sentences, train, d):
    """Return a numpy matrix of embeddings for the wiki-unigrams model.

    Handles tokenization, so it can be given raw sentences.

    Arguments:
        - sentences: a list of raw sentences
          ['Once upon a time', 'This is another sentence.', ...]
        - train, d: forwarded to get_embeddings_for_preprocessed_sentences
    """
    tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    # Join with a delimiter so the slow JVM tokenizer is invoked only once.
    s = ' <delimiter> '.join(sentences)
    tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])[0]
    # BUG FIX: this split-back step was commented out, leaving one joined
    # string instead of per-sentence strings, so the assert below could
    # never hold (the sibling embedding helpers in this file all split).
    tokenized_sentences_SNLP = tokenized_sentences_SNLP.split(' <delimiter> ')
    assert (len(tokenized_sentences_SNLP) == len(sentences))
    wiki_embeddings = get_embeddings_for_preprocessed_sentences(
        tokenized_sentences_SNLP, MODEL_WIKI_UNIGRAMS, FASTTEXT_EXEC_PATH,
        train, d)
    return wiki_embeddings
def __init__(self, paths_json):
    """Load every resource the sentiment/negation pipeline needs.

    paths_json: dict of configuration paths (valence dictionary, negtool
    negation-scope file, meaning-specific distribution dictionary, ...).
    """
    set_environment_paths(paths_json)
    # Per-document accumulators, filled in by later processing steps.
    self.sentence_sequences = []
    self.valence_sequences = []
    self.sentence_trees = []
    self.valence_trees = []
    self.CompleteWordIndices = []
    # Model / method selection, configured after construction.
    self.model = ""
    self.models2run = []
    self.neg_scope_method = ""
    self.neg_res_method = ""
    self.sent_comp_method = ""
    # Word -> valence score mapping.
    valence_dict_path = paths_json["VALENCE_DICT"]
    with open(valence_dict_path) as json_file:
        self.VALENCE_DICT = json.loads(json_file.read())
    # Pre-computed negtool negation scopes; the handle stays open and is
    # consumed line by line as sentences are processed.
    negtool_negscopes_path = paths_json["NEGTOOL_NEGSCOPE"]
    self.negtool_neg_scopes_file = open(negtool_negscopes_path, "r")
    self.negtool_neg_scopes_file_current_line = 0
    self.use_negtool = False
    meaning_spec_distribution_dict_path = paths_json[
        "MEANING_SPEC_DISTRIBUTION_DICT_PATH"]
    with open(meaning_spec_distribution_dict_path) as json_file:
        self.distribution_dict = json.loads(json_file.read())
    #window neg scope
    # Number of tokens a negation cue affects in the window-based method.
    self.window_size = 4
    self.review_id = 0
    self.sentence_id = 0  #for negtool purposes
    #constants
    # Clitic contractions treated as separate tokens.
    self.contractions = ["n't", "'m", "'ll", "'d", "'s", "'ve", "'re"]
    #parser and tokenizer initialization
    # self.PARSER = StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
    self.TOKENIZER = StanfordTokenizer()
    #using server
    # NOTE(review): assumes a CoreNLP server is already running locally.
    self.CORENLP = StanfordCoreNLP('http://localhost:9000')
def __init__(self, data, path, all_names):
    """Eagerly extract every feature family for *data*.

    data: the texts features are extracted from
    path: base directory holding the Stanford POS tagger models and jars
    all_names: name list consumed by the name-feature extractor
    """
    self.data = data
    self.path = path
    self.english_postagger = StanfordPOSTagger(
        path + 'models/english-left3words-distsim.tagger',
        path + 'lib/stanford-postagger-3.4.1.jar',
        java_options='-Xmx2g')  # 2 GB heap for the tagger JVM
    self.english_tokenizer = StanfordTokenizer(
        path + 'lib/stanford-postagger-3.4.1.jar', 'utf-8')
    self.all_names = all_names
    # POS tags, names, word 1/2-grams, char 1/2/3-grams, line breaks, and
    # website features, all computed up front.
    self.pos = self.extract_POS()
    self.nms = self.extract_names()
    self.wg1 = self.extract_wordgrams(1)
    self.wg2 = self.extract_wordgrams(2)
    self.cg1 = self.extract_chargrams(1)
    self.cg2 = self.extract_chargrams(2)
    self.cg3 = self.extract_chargrams(3)
    self.bl = self.extract_breaklines()
    self.ws = self.extract_websites()
def Tok_handler(self, sentence, parser):
    """Tokenize *sentence* with the requested backend.

    parser: 'spacy' (en_core_web_sm pipeline) or 'nltk' (Stanford tokenizer
    with jars/models expected under ./StanfordNLP/).
    Returns a list of token strings.
    """
    if parser == "spacy":
        try:
            import spacy, en_core_web_sm
        except ImportError:
            print("Can't import spacy")
        nlp = en_core_web_sm.load()
        doc = nlp(sentence)
        return [str(token) for token in doc]
    elif parser == "nltk":
        try:
            import nltk
            from nltk.tokenize.stanford import StanfordTokenizer
            os.environ["CLASSPATH"] = "./StanfordNLP/jars"
            os.environ["STANFORD_MODELS"] = "./StanfordNLP/models"
        except ImportError:
            # BUG FIX: the message previously said "Can't import spacy"
            # (copy-paste from the branch above).
            print("Can't import nltk")
        tokenizer = StanfordTokenizer()
        return tokenizer.tokenize(sentence)
def __init__(self, sentence):
    """Translate *sentence* to English, tokenize it, and keep the first
    constituency parse tree produced by the Stanford parser."""
    en_parser = StanfordParser(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
        path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
        model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz')
    sg = StanfordTokenizer(
        path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar')
    self.trans = googletrans.Translator()
    self.sentence = sentence
    # BUG FIX: googletrans' Translated object exposes the translation as the
    # `.text` attribute; `.get_text()` does not exist and raised
    # AttributeError.  (The sibling constructors in this file use `.text`.)
    result1 = sg.tokenize(self.trans.translate(sentence).text)
    tree = list(en_parser.parse(result1))
    self.tree = tree[0]
    self.rel = []
def tokenize(corenlp, review, span=False):
    """Split *review* into sentences of tokens using CoreNLP's ssplit.

    A token consisting solely of '!'/'?' characters, or a plain '.', ends
    the current sentence.  Returns a list of token lists.
    (The `span` parameter is currently unused but kept for compatibility.)
    """
    r_dict = corenlp._request('ssplit', review)
    # BUG FIX: removed leftover debug code — a second, throwaway
    # StanfordTokenizer().tokenize(review) pass (it spawns a JVM per call)
    # and two print() statements that spammed stdout.
    tokens = [
        token['word'] for s in r_dict['sentences'] for token in s['tokens']
    ]
    sentences = []
    current_sentence = []
    for token in tokens:
        if (not bool(re.compile(r'[^\!\?]').search(token))
                or token == "."):  # only ! or ?
            current_sentence.append(token)
            sentences.append(current_sentence)
            current_sentence = []
        else:
            current_sentence.append(token)
    #return [" ".join(sentence[:-1])+sentence[-1] for sentence in sentences]
    return sentences  # return tokenized sentences
import emoji
from nltk.tokenize.stanford import StanfordTokenizer

# Tokenize a small sample text with the Stanford tokenizer.
s = "Good muffins :-X cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
tokens = StanfordTokenizer().tokenize(s)
print(tokens)

# Demonstrate set union: elements of all three sets, deduplicated.
a = {'a', 'b', 'c'}
b = {'b', 'c', 'd'}
c = {'a', 'd', 'e'}
d = set().union(a, b, c)
print(d)
# NOTE(review): fragment of a larger script — `csvfile`, `fout`,
# `title_list`, `count` and `start` are defined outside this excerpt, and it
# mixes Python 2 print statements with print() calls; confirm the target
# Python version.
reader_obj = csv.reader(csvfile)
for row in reader_obj:
    # row[0] -> id
    # row[1] -> title
    # row[2] -> content
    # row[3] -> tags
    #soup = BeautifulSoup(row[2])
    #if soup.code != None:
    #    codecount += 1
    title_list += " " + row[1]
    # Report progress roughly every 10 seconds.
    check = int(time.time() - start)
    if check >= 10:
        print count, time.time() - start, "seconds"
        start = time.time()
    if not count % 10000:
        # Flush the accumulated titles through the Stanford tokenizer every
        # 10000 rows to bound memory use.
        word_list = StanfordTokenizer(path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"\
            ).tokenize(title_list)
        fout.write(' '.join(word_list).encode('utf-8') + "\n")
        title_list = ""
        print(count)
    #word_list = nltk.word_tokenize(row[1].encode('utf-8'))
    #if not codecount%10000:
    #    print codecount
    count += 1
    #print(soup.get_text())
#pdb.set_trace()
# Tokenize and write whatever titles remain after the loop.
word_list = StanfordTokenizer(path_to_jar="/Users/apple/Downloads//stanford-postagger-2015-01-29/stanford-postagger-2015-01-30/stanford-postagger.jar"\
    ).tokenize(title_list)
fout.write(' '.join(word_list).encode('utf-8') + "\n")
# NOTE(review): reset to [] here but to "" inside the loop — a list would
# break `title_list += " " + row[1]` if processing continued; likely a
# latent inconsistency to confirm.
title_list = []
print(count)
print("Tokenizing all requests.") tweet_tokenizer = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True) tokenized_datasets_original_tweet = [[ tweet_tokenizer.tokenize(request) for request in dataset ] for dataset in datasets] print("Retokenizing with Stanford tokenizer. This may take a long time.") path_pos = "/playpen/home/tongn/stanford-postagger-full-2017-06-09/" jar_pos = "stanford-postagger.jar" tokenizer = StanfordTokenizer(path_pos + jar_pos) tokenizer = StanfordTokenizer(tagger_path) # tokenized_datasets_original = [ # [tokenizer.tokenize(' '.join(request).strip()) # for request in dataset] # for dataset in tokenized_datasets_original_tweet] tokenized_datasets_original = tokenized_datasets_original_tweet """ Convert all tokens to lowercase """ tokenized_datasets = [[[token.lower() for token in request] for request in dataset] for dataset in tokenized_datasets_original] """ Build the whole vocabulary
#sqlite3 connection dbname = '/home/aahu/Dropbox/ryancompton.net/assets/praw_drugs/drugs.db' conn = sqlalchemy.create_engine('sqlite+pysqlite:///' + dbname, module=sqlite3.dbapi2) def load_subreddit(tablename, conn): df = pd.read_sql(tablename, conn) return df # <codecell> from nltk.tokenize.stanford import StanfordTokenizer stanfordTokenizer = StanfordTokenizer( path_to_jar= '/home/aahu/Downloads/stanford-corenlp-full-2015-01-30/stanford-corenlp-3.5.1.jar' ) def my_tokenize(text): return nltk.word_tokenize(text) #return nltk.wordpunct_tokenize(text) #return stanfordTokenizer.tokenize(text) #return nltk.tokenize.TreebankWordTokenizer().tokenize(text) def build_tfidf_transformer(docs=[], tokenizer=my_tokenize, max_doc_count=2000, vocab_limit=10000): """
# unique_normalized_tokens = set(tokens) # # wnl = nltk.WordNetLemmatizer() # vocabulary = [wnl.lemmatize(t) for t in unique_normalized_tokens] # print("Aantal vocabulary: ", len(vocabulary)) # from nltk.parse import stanford from nltk.parse.stanford import StanfordParser from nltk.tokenize.stanford import StanfordTokenizer parser = StanfordParser( model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz") tokenizer = StanfordTokenizer( "/home/spijkervet/stanford/stanford-postagger-full/stanford-postagger-3.9.1.jar" ) # Stanford parser uses punctuation! # processed_sentences = [s.translate(s.maketrans('','', string.punctuation)).lower() for s in sentences] processed_sentences = [s.lower() for s in sentences] pickle_name = "hp_trees_parser.pickle" if not os.path.isfile(pickle_name): hp_trees = [] for s in tqdm(processed_sentences): # tree = list(parser.raw_parse(s)) tree = parser.raw_parse(s) # for node in tree: # print(node) hp_trees.append(tree)
def __init__(self):
    """Build the UTF-8 Stanford tokenizer this wrapper delegates to."""
    tokenizer = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
    self.tk = tokenizer
def __init__(self, jar_path):
    """Wrap a StanfordTokenizer built from the jar at *jar_path*."""
    stanford_tokenizer = StanfordTokenizer(jar_path)
    self.tokenizer = stanford_tokenizer
def run(self, data):
    """Tokenize the contents of every corpus in *data* in place.

    Stores the token list on corpus.tokenized_contents and returns *data*.
    """
    # Hoisted out of the loop: constructing a StanfordTokenizer per corpus
    # was pure overhead (each instance drives its own JVM invocation); one
    # instance produces identical results.
    tokenizer = StanfordTokenizer()
    for corpus in data:
        corpus.tokenized_contents = tokenizer.tokenize(corpus.contents)
    return data
# NOTE(review): fragment — `lis`, `emails`, `emails_len`, `model`,
# `tokenize_sentences`, `SNLP_TAGGER_JAR` and `f` are defined outside this
# excerpt.
gold = []
for each in lis:
    item = each.split("\t")
    gold.append(item[1])  # column 1 holds the gold label
    # Everything from column 2 onwards is the email body.
    item_email = ' '.join(e for e in item[2:])
    item_email = item_email.replace('</br>', ' ').replace(':', ' ')  #.split()
    #print(item_email)
    emails.append(item_email)
    emails_len.append(len(item_email))
# NOTE(review): despite the name, this is the maximum CHARACTER length of an
# email (len of a string), not a word count — confirm intent.
max_words_email = max(emails_len)
total_email = len(emails)
embedding_size = 700
all_mail_vec = []
tknzr = StanfordTokenizer(SNLP_TAGGER_JAR, encoding='utf-8')
# Join/split on a delimiter so the JVM-backed tokenizer runs only once.
s = ' <delimiter> '.join(emails)
tokenized_sentences_SNLP = tokenize_sentences(tknzr, [s])
emails = tokenized_sentences_SNLP[0].split(' <delimiter> ')
embs_email = model.embed_sentences(emails)
#each_mail_vec = np.zeros((max_words_email, embedding_size))
#print(vector.shape)
f.close()
f = open('emails_dataset/emailExplanations_Dec23.sorted.txt', 'r')
# Concept buckets for the explanation labels.
concepts = {'REMINDER': [], 'HUMOR': [], 'EVENT': [], 'EMPLOYEE': [],
            'MEETING': [], 'POLICY': [], 'CONTACT': []}
def stanford_tokenize(s):
    """Tokenize *s* with a freshly constructed StanfordTokenizer."""
    tokenizer = StanfordTokenizer()
    return tokenizer.tokenize(s)
# get next where clause where = wheres[:andid] # parse current where clause sql['conds'].append(parse_where(where)) # striped processed where clause out of wheres wheres = wheres[andid + 3:] # parse last where clause sql['conds'].append(parse_where(wheres)) return sql if __name__ == '__main__': stanford = StanfordTokenizer() with open('pairs_ronny{}.csv'.format(teststr), 'r+') as f: lines_pairs = f.readlines()[1:] with open('table_ronny.csv', 'r') as f: lines_table = f.readlines() # parse table entry_table = parse_table(lines_table, stanford) with open('dummy_tok.tables.jsonl', 'w') as f: json.dump(entry_table, f) # parse all pairs f = open('dummy_tok{}.jsonl'.format(teststr), 'w+') for line in lines_pairs: entry = dict(phase=2)