def segment_en(texts, flag_keep_number=False):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        if flag_keep_number:
            words = tk.tokenize(text)
        else:
            words = map(replace_number, tk.tokenize(text))
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
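# A minimal usage sketch for segment_en (an illustration, not from the original project).
# It assumes replace_number is defined elsewhere in the module and that the Stanford
# tokenizer jar is reachable (e.g. via the CLASSPATH environment variable), since
# StanfordTokenizer() above is constructed without an explicit path_to_jar.
# sentences = ["Good muffins cost $3.88 in New York.", "Please buy me two of them."]
# segmented = segment_en(sentences)                         # numbers replaced
# segmented_raw = segment_en(sentences, flag_keep_number=True)
# print(segmented_raw[sentences[0]])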
def readwordarr(isTokenize=True):
    posWords = []
    negWords = []
    stopwords = getstopword()
    if isTokenize:
        tokenizer = StanfordTokenizer()
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            arr = tokenizer.tokenize(sentences.read())
            for line in arr:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    else:
        with open(negfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                negWords.append(list(wordset))
        with open(posfilepath, 'r', encoding='utf-8') as sentences:
            lines = sentences.readlines()
            for line in lines:
                linearr = line.split()
                wordset = set()
                for word in linearr:
                    if word in stopwords:
                        continue
                    wordset.add(word)
                posWords.append(list(wordset))
    return posWords, negWords
class WordSegment(object):

    def __init__(self, user_dict=None):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]
        self.enTokenizer = StanfordTokenizer(
            path_to_jar=prefix + conf_tokenizer["path_to_jar"]
        )
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"]
        )
        # TODO: add the user-defined dictionary here

    def get_tokens(self, text):
        tokens = self.enTokenizer.tokenize(text)
        return self.en_tagger.tag(tokens)

    def get_new_words(self, text):
        pass
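# A minimal usage sketch for WordSegment (hypothetical, not from the original project).
# It assumes conf.load("stanford_corenlp") resolves "prefix", "tokenizer" and "postagger"
# to a local CoreNLP/POS-tagger installation, as the constructor above expects.
# ws = WordSegment()
# pairs = ws.get_tokens("Stanford tokenizers need a Java runtime on the path.")
# print(pairs)   # a list of (token, POS tag) tuples, e.g. ('Stanford', 'NNP'), ...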
def tokenize_stopwords_stemmer(texts):
    # texts: a list holding the input string
    # Use this block for Stanford tokenization; skip it when using a plain tokenizer.
    # tokenize
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    texts_filtered0 = [
        document for document in texts_tokenized
        if not document in pa1.findall(document)
    ]
    p2 = r'.+[-_\./].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)
    texts_filtered = [
        document for document in texts_filtered
        if document != '' and document != "''" and document != "``"
    ]
    # stopwords
    english_stopwords = stopwords.words('english')  # stop word list
    texts_filtered_stopwords = [
        document for document in texts_filtered
        if not document in english_stopwords
    ]
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<',
        '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
        '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?'
    ]  # punctuation list
    texts_filtered = [
        document for document in texts_filtered_stopwords
        if not document in english_punctuations
    ]
    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered]  # a list
    return texts_Stemmered  # returns a list
def segment(texts):
    tk = StanfordTokenizer()
    results = {}
    for text in texts:
        words = tk.tokenize(text)
        segmented = ' '.join(words).lower()
        results[text] = segmented
    return results
class POSTagger:
    """POSTagger creates a POS tagger for the German language.
    Different taggers are available to use."""

    STAN = "stanford-hgc-tagger"
    SFT = "stanford-fast-tagger"
    TT = "tree-tagger"
    SPACY = "spacy-tagger"

    # paths to Stanford tagger modules
    __path_to_jar = "C:/Users/din_m/MA/Stanford Tagger/stanford-postagger.jar"
    __model_file_name = "C:/Users/din_m/MA/Stanford Tagger/models/"

    def __init__(self, tagger):
        """Initialize a new POS tagger. Takes the tagger parameter as an argument
        to define the kind of tagger."""
        self.__tokenizer = StanfordTokenizer(path_to_jar=POSTagger.__path_to_jar)
        if tagger == POSTagger.STAN:
            self.tagger_name = POSTagger.STAN
            self.__tagger = StanfordPOSTagger(
                path_to_jar=POSTagger.__path_to_jar,
                model_filename=POSTagger.__model_file_name + "german-hgc.tagger")
        elif tagger == POSTagger.SFT:
            self.tagger_name = POSTagger.SFT
            self.__tagger = StanfordPOSTagger(
                path_to_jar=POSTagger.__path_to_jar,
                model_filename=POSTagger.__model_file_name + "german-fast.tagger")
        elif tagger == POSTagger.TT:
            self.tagger_name = POSTagger.TT
            self.__tagger = treetaggerwrapper.TreeTagger(TAGLANG='de')
        # SpaCy takes really long to initialize (about 5-7 minutes), but performs well and fast afterwards
        elif tagger == POSTagger.SPACY:
            self.tagger_name = POSTagger.SPACY
            self.__tagger = spacy.load('de')
        else:
            raise Exception("Wrong tagger parameter.")

    def tag(self, text):
        """POS tag tokenized text."""
        if self.tagger_name == POSTagger.SFT or self.tagger_name == POSTagger.STAN:
            tokens = self.__tokenizer.tokenize(text)
            return self.__tagger.tag(tokens)
        elif self.tagger_name == POSTagger.TT:
            tags = self.__tagger.tag_text(text)
            tuple_list = []
            tag_list = treetaggerwrapper.make_tags(tags)
            for item in tag_list:
                tuple_list.append((item[0], item[1]))
            return tuple_list
        elif self.tagger_name == POSTagger.SPACY:
            tags = self.__tagger(text)
            tuple_list = []
            for word in tags:
                tuple_list.append((word.orth_, word.tag_))
            return tuple_list
        else:
            pass

#tagger = POSTagger("spacy-tagger")
#doc = tagger.tag(u"Bei mir zu Hause denken sie bestimmt, daß ich noch krank sei.")
#print(tagger.tag("Ich werde morgen in die Schule gehen."))
#print(tagger.tag("Hat Aglaja den Brief etwa der Alten gezeigt?«"))
def tokenize_q(qa, phase):
    qas = len(qa)
    MyTokenizer = StanfordTokenizer()
    for i, row in enumerate(tqdm(qa)):
        row['question_toked'] = MyTokenizer.tokenize(row['question'].lower())[:14]
        if i % 50000 == 0:
            json.dump(qa, open('vqa_' + phase + '_toked_' + str(i) + '.json', 'w'))
        if i == qas - 1:
            json.dump(qa, open('vqa_' + phase + '_toked.json', 'w'))
def Tokenize_stopwords_stemmer(texts):
    # print time()
    # Use this block for Stanford tokenization; skip it when using a plain tokenizer.
    # tokenize
    Str_texts = texts[0]
    print os.getcwd()
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(path_to_jar=r"stanford-parser.jar")
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string
    # print time()
    p2 = r'.+[-_\./"].+'
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_tokenized:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
        else:
            texts_filtered.append(document)
    # print time()
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    p3 = r'.+">'
    pa3 = re.compile(p3)
    english_stopwords = stopwords.words('english')  # stop word list
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
        '$', '%', '\n', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '<',
        '>', '/', '\"', '\'', '{', '}', '!', '~', '`', '$', '^', '/*', '*/',
        '/**', '**/', '**', '-', '_', '+', '=', r'-?-', r'@?'
    ]  # punctuation list
    texts_filtered0 = []
    for document in texts_filtered:
        if (document in pa1.findall(document) or document in pa3.findall(document)
                or document == '' or document == "''" or document == "``"
                or document in english_stopwords
                or document in english_punctuations):
            pass
        else:
            texts_filtered0.append(document)
    # print time()
    porter = nltk.PorterStemmer()
    texts_Stemmered = [porter.stem(t) for t in texts_filtered0]  # a list
    # print time()
    return texts_Stemmered  # returns a list
def tokenize_stopwords_stemmer(texts):
    Str_texts = texts[0]
    # tokenizer = StanfordTokenizer(path_to_jar=r"/Users/apple/Documents/tools/stanford-parser-full-2015-04-20/stanford-parser.jar")
    tokenizer = StanfordTokenizer(
        path_to_jar=r"C:\Users\zw\Desktop\stanford-parser.jar")
    # path_to_jar locates the jar; the r prefix prevents escape processing,
    # so a '\t' in the path stays literal instead of being turned into a tab.
    java_path = 'E:soft/Java/jdk1.8.0_121/bin/java.exe'
    os.environ['JAVAHOME'] = java_path
    texts_tokenized = tokenizer.tokenize(Str_texts)  # the input must be a string; tokenize it
    # print(texts_tokenized)
    p1 = r'[-@<#$%^&*].+'
    pa1 = re.compile(p1)
    # re.compile() turns the regex string into a Pattern object, which is then
    # used to process the text and produce Match results.
    texts_filtered0 = [document for document in texts_tokenized
                       if not document in pa1.findall(document)]
    p2 = r'.+[-_\/].+'
    # changed from r'.+[-_\./].+' to r'.+[-_\/].+' so dots between digits survive,
    # e.g. version strings such as 3.1.2
    pa2 = re.compile(p2)
    texts_filtered = []
    for document in texts_filtered0:
        if document in pa2.findall(document):
            if document.find('_') > -1:
                # split(): cut the string at the separator and return the pieces as a list
                texts_filtered = texts_filtered + document.split('_')
            elif document.find('-') > -1:
                texts_filtered = texts_filtered + document.split('-')
            elif document.find('.') > -1:
                texts_filtered = texts_filtered + document.split('.')
            elif document.find('/') > -1:
                texts_filtered = texts_filtered + document.split('/')
        else:
            texts_filtered.append(document)
    texts_filtered = [document for document in texts_filtered
                      if document != '' and document != "''" and document != "``"]
    # filter out empty strings, quotes and backticks
    # # stopwords
    # english_stopwords = stopwords.words('english')  # stop word list
    # texts_filtered_stopwords = [document for document in texts_filtered
    #                             if not document in english_stopwords]  # filter out stop words
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!',
                            '*', '@', '#', '$', '%', '\n', '||', '<', '>', '/',
                            '\"', '\'', '{', '}', '!', '~', '`', '0', '$', '^',
                            '/*', '*/', '/**', '**/', '**', '-', '_', '__', '|',
                            '+', '=', r'-?-', r'@?']  # punctuation list
    texts_filtered = [document for document in texts_filtered
                      if not document in english_punctuations]  # filter out punctuation
    return texts_filtered
def stanford_tokenizer(str):
    tokenizer = StanfordTokenizer(
        path_to_jar='D:/software/stanford-parser-full-3.7/stanford-parser-3.7.0-models.jar')
    # sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
    return tokenizer.tokenize(str)

# if __name__ == '__main__':
#     sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
#     result = stanford_tokenizer(sent)
#     print(result)

# st = StanfordPOSTagger('english-bidirectional-distsim.tagger')
# from nltk.tokenize import StanfordTokenizer
# s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
# StanfordTokenizer().tokenize(s)
# s = "The colour of the wall is blue."
# StanfordTokenizer(options={"americanize": True}).tokenize(s)
def tokenize(text_list, clean_html=False, tokenizer="twitter",
             remove_reps=True, spell_correct=True):
    if tokenizer == "stanford":
        tokenizer_obj = StanfordTokenizer()
    elif tokenizer == "twitter":
        tokenizer_obj = TweetTokenizer()
    else:
        tokenizer_obj = StringTokenizer()
    token_list = []
    for text in text_list:
        if clean_html:
            text = BeautifulSoup(text).get_text()
        if remove_reps:
            text = re.sub(r'(.)\1{2,}', r'\1\1', text)
        tokens = tokenizer_obj.tokenize(text)
        if spell_correct:
            tokens = [spell(t) for t in tokens]
        token_list.append(tokens)
    return token_list
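# A minimal usage sketch for tokenize() (hypothetical inputs, not from the original project).
# It assumes the helpers used above are imported elsewhere: TweetTokenizer/StringTokenizer
# (e.g. from nltk.tokenize), BeautifulSoup (bs4) and an autocorrect-style spell() function.
# docs = ["<p>Sooo goooood!</p>", "Stanford tokenizers need Java installed."]
# print(tokenize(docs, clean_html=True, tokenizer="twitter", spell_correct=False))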
tags = semcor.tagged_sents(tag='sem')
n = 0
correct = 0
base = 0
total = 0
for sent in corp:
    sentence = ' '.join(sent)
    print sentence
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    for term in tags[n]:
        if (len(term) == 1 and isinstance(term[0], basestring)
                and isinstance(term, Tree) and len(wordnet.synsets(term[0])) > 1):
            if isinstance(term.label(), unicode):
                continue
            syn = term.label().synset()
            word = term[0]
            sense_standard = syn
            print word
            for pair in parsed[0].triples():
                if pair[0][0] == word:
                    pos = pair[0][1]
                if pair[2][0] == word:
from nltk.tag.stanford import StanfordNERTagger, StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from wordsegment import load, segment

CUR_DIRECTORY = '/home/wmq/Desktop/DeepText/StanfordNLP'
SEGMENT_PATH = CUR_DIRECTORY + '/stanford-segmenter-3.8.0.jar'
NER_MODEL_PATH = CUR_DIRECTORY + '/english.all.3class.distsim.crf.ser.gz'
NER_JAR_PATH = CUR_DIRECTORY + '/stanford-ner.jar'
POS_MODEL_PATH = CUR_DIRECTORY + '/english-left3words-distsim.tagger'
POS_JAR_PATH = CUR_DIRECTORY + '/stanford-postagger.jar'

ner_tagger = StanfordNERTagger(NER_MODEL_PATH, NER_JAR_PATH, java_options='')
pos_tagger = StanfordPOSTagger(POS_MODEL_PATH, POS_JAR_PATH, java_options='')
tokenizer = StanfordTokenizer(SEGMENT_PATH)
load()

s = "@user nah pretty sure it's jackson's great jokes"
ws = tokenizer.tokenize(s)
print(' '.join(ws))

# print (' '.join(segment('#happythankgiving')))
# s = 'i got to to go formal with my best friend @ phi mu at jsu'.split()
# ner_sent = ner_tagger.tag(s)
# pos_sent = pos_tagger.tag(s)
# print (ner_sent)
# print (pos_sent)
from __future__ import absolute_import
if __name__ == '__main__':  # very important
    # res = request([["excellent"],["poor"]])
    poshit = 1510000000032
    neghit = 771000000037
    print(poshit)
    print(neghit)
    stopword = ["-LSB-", "-RSB-", "-LRB-", "-RRB-"]
    tokenizer = StanfordTokenizer()
    filename = "F:/course/sentimentcode/rt-polarity.neg"
    file_object = codecs.open(filename, 'r', 'utf-8')
    allres = []
    try:
        all_the_text = file_object.read()
        arr = tokenizer.tokenize(all_the_text)
        la = len(arr)
        correct = 0
        for line in arr:
            ax = line.split()
            wordarr = []
            for word in ax:
                if word in stopword:
                    continue
                wordarr.append(word)
            list = nltk.pos_tag(wordarr)
            result = getPhraseByPos(list)
            if len(result) == 0:
                continue
            allres.append(result)
        posres = f(allres, 1)
t = line.split("<e1>") text.append(t[0]) e1start = len(t[0]) t = t[1].split("</e1>") e1 = t[0] text.append(t[0]) e1end = len(t[0]) + e1start t = t[1].split("<e2>") text.append(t[0]) e2start = len(t[0]) + e1end t = t[1].split("</e2>") text.append(t[0]) e2 = t[0] e2end = len(t[0]) + e2start text.append(t[1]) text = " ".join(tokenizer.tokenize("".join(text))) txtfile.write(text) txtfile.write("\n") offset = 0 err = False while e1 != text[e1start + offset:e1end + offset]: offset += 1 if e1end + offset > len(text): break if e1end + offset > len(text): offset = 0 e1 = " ".join(tokenizer.tokenize(e1)) e1end = e1start + len(e1) while e1 != text[e1start + offset:e1end + offset]: offset += 1 if e1end + offset > len(text):
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    from nltk.tokenize import StanfordTokenizer
    workingdirectory = os.getcwd()
    segmenter = StanfordSegmenter(
        path_to_jar=os.path.join(workingdirectory, 'stanford-segmenter.jar'),
        path_to_slf4j=os.path.join(workingdirectory, 'slf4j-api.jar'),
        path_to_sihan_corpora_dict=os.path.join(workingdirectory, 'data'),
        path_to_model=os.path.join(workingdirectory, 'data', 'pku.gz'),
        path_to_dict=os.path.join(workingdirectory, 'data', 'dict-chris6.ser.gz'))
    tokenizer = StanfordTokenizer(
        path_to_jar=os.path.join(workingdirectory, 'stanford-parser.jar'))
    n = 1
    for line in open(sourcefile):
        token = segmenter.segment(line)
        words = tokenizer.tokenize(token)
        with open('%s%s.txt' % (prefix, n), "w", encoding='utf-8') as resultfile:
            resultwrite = csv.writer(resultfile)
            for word in words:
                resultwrite.writerow([word])
        n = n + 1
    print('Done')
elif (tokeniser == 'stanfordctb'):
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter
    from nltk.tokenize import StanfordTokenizer
    workingdirectory = os.getcwd()
    segmenter = StanfordSegmenter(
        path_to_jar=os.path.join(workingdirectory, 'stanford-segmenter.jar'),
        path_to_slf4j=os.path.join(workingdirectory, 'slf4j-api.jar'),
        path_to_sihan_corpora_dict=os.path.join(workingdirectory, 'data'),
# model_path=u"edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" # ) # # mylist = list(eng_parser.parse(sentence.split())) # print(len(mylist)) # print("句法分析结果", mylist) # 依存句法分析 # 对于依存关系的标签说明:http://universaldependencies.org/u/dep/all.html#al-u-dep/det eng_dependency_parser = StanfordDependencyParser( path_to_jar=r"D:\stanford-parser-full-2016-10-31\stanford-parser.jar", path_to_models_jar= r"D:\stanford-parser-full-2016-10-31\stanford-parser-3.7.0-models.jar", model_path=u'edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz') outputs = ' '.join(tokenizer.tokenize("Dole was defeated by Clinton")) print(outputs) result = list(eng_dependency_parser.parse(outputs.split())) for each in result[0].triples(): print(each) # if each[1]=='dobj': # # print(each) # print(each[0][0]) # print(each[2][0]) # print("依存句法分析结果:") # for row in result[0].triples(): # print(row) # print(result[0]) # 中文分词 # 还要研究一下,一下代码报错
path_to_slf4j="/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/slf4j-api.jar", path_to_sihan_corpora_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data", path_to_model="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/pku.gz", path_to_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/dict-chris6.ser.gz" ) str="我在我在博客园开了一个博客。" print (segmenter.segment(str)) #英文分词 from nltk.tokenize import StanfordTokenizer tokenizer=StanfordTokenizer(path_to_jar=r"/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/stanford-parser.jar") sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." print (tokenizer.tokenize(sent)) #中文命名实体识别 from nltk.tag import StanfordNERTagger chi_tagger=StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/chinese.misc.distsim.crf.ser.gz' ,path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar') print (chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。\r\n'.split())) #英文命名实体识别 from nltk.tag import StanfordNERTagger eng_tagger=StanfordNERTagger(model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz' ,path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar') print (eng_tagger.tag('Rami Eid is studying at Stony Brook University in NY'.split()))
    'interest_3': 2,
    'interest_4': 3,
    'interest_5': 4,
    'interest_6': 5
}
bayes = [[], [], [], [], [], []]
count = [0, 0, 0, 0, 0, 0]
n = 0
for instance in senseval.instances('interest.pos')[0:1599]:
    count[sense[instance.senses[0]]] += 1
    sentence = ' '.join(w for (w, p) in instance.context)
    parsed = list(parser.parse(tokenizer.tokenize(sentence)))
    for triple in parsed[0].triples():
        related = 0
        if triple[0][0] in interest:
            word = triple[2][0]
            related = 1
        if triple[2][0] in interest:
            word = triple[0][0]
            related = 1
        if related == 1:
            exist = 0
            for item in bayes[sense[instance.senses[0]]]:
                if item[0] == word:
                    item[1] += 1
                    exist = 1
            if exist == 0:
aparser = argparse.ArgumentParser(
    description="Run CoreNLP tokenizer on a TSV definition file")
aparser.add_argument('input_filepath', type=str, help='input file path')
aparser.add_argument('output_filepath', type=str, help='output file path')
aparser.add_argument('corenlp_postagger_path', type=str,
                     help="path to stanford-postagger.jar")
opt = aparser.parse_args()

tokenizer = StanfordTokenizer(path_to_jar=opt.corenlp_postagger_path,
                              options={"ptb3Escaping": "false",
                                       "tokenizePerLine": "true",
                                       "tokenizeNLs": "true"})
entries = []
definitions = []
with open(opt.input_filepath) as ifp:
    for line in ifp:
        parts = line.strip().split('\t')
        entries.append(parts[:-1])
        definitions.append(parts[-1])
def_str = "\n".join(definitions)
tokens = tokenizer.tokenize(def_str)
def_str = " ".join(tokens)
definitions = def_str.split("*NL*")
with open(opt.output_filepath, 'w') as ofp:
    for entry, definition in zip(entries, definitions):
        ofp.write("{}\t{}\n".format('\t'.join(entry), definition.strip()))
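# A possible invocation of the script above (hypothetical file names, not from the original
# project), assuming it is saved as tokenize_definitions.py and Java is on the PATH:
#   python tokenize_definitions.py definitions.tsv definitions.tok.tsv /path/to/stanford-postagger.jar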
def test_tokenizer():
    tokenizer = StanfordTokenizer()
    sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
    print tokenizer.tokenize(sent)
train_question_2 = []
train_labels = []
map_index_vec = dict()
map_word_index = dict()
tokenizer = StanfordTokenizer(options={"ptb3Escaping": True})
words = set()
for col in ['question1', 'question2']:
    sentences = []
    print('Processing column: %s' % col)
    for i, sentence in enumerate(train_data[col]):
        if i % 10000 == 0:
            print('Sentence: %d' % i)
        split = tokenizer.tokenize(sentence)
        new_sentence = []
        for word in split:
            word = word.encode('utf-8').strip()
            word = word.lower()
            if word in glove_model:
                if word not in words:
                    words.add(word)
                new_sentence.append(word)
            else:
                if 'unk' not in words:
                    words.add('unk')
                new_sentence.append('unk')
        sentences.append(" ".join(new_sentence))
import nltk
from nltk.tag import StanfordPOSTagger
from nltk.tokenize import StanfordTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()
tokenizer = StanfordTokenizer()
eng_tagger = StanfordPOSTagger('english-bidirectional-distsim.tagger')

text = "Dan's parents were overweight.,Dan was overweight as well.,The doctors told his parents it was unhealthy.,His parents understood and decided to make a change.,They got themselves and Dan on a diet.".split(',')
for sen in text:
    token_list = tokenizer.tokenize(sen[:-1])
    tagged_sen = eng_tagger.tag(token_list)
    new_sen = []
    for (word, tag) in tagged_sen:
        # print word, tag
        if tag[0] == 'V':
            lemma_word = wordnet_lemmatizer.lemmatize(word, pos='v')
        else:
            lemma_word = wordnet_lemmatizer.lemmatize(word)
        stem_word = snowball_stemmer.stem(lemma_word)
        new_sen.append(stem_word)
    print " ".join(new_sen)
def en_standseg(sent):
    tokenizer = StanfordTokenizer(
        path_to_jar=r"E:\tools\stanfordNLTK\jar\stanford-parser.jar")
    print(tokenizer.tokenize(sent))
class KeywordExtractor(object):

    def __init__(self, **kwargs):
        self.conf_io = conf.load("io")
        self.conf_corenlp = conf.load("stanford_corenlp")
        self.conf_embedding = conf.load("embedding")
        conf_segmenter = self.conf_corenlp["segmenter"]
        conf_tokenizer = self.conf_corenlp["tokenizer"]
        conf_postagger = self.conf_corenlp["postagger"]
        prefix = self.conf_corenlp["prefix"]
        self.segmenter = StanfordSegmenter(
            path_to_jar=prefix + conf_segmenter["path_to_jar"],
            path_to_sihan_corpora_dict=prefix + conf_segmenter["path_to_sihan_corpora_dict"],
            path_to_model=prefix + conf_segmenter["path_to_model"],
            path_to_dict=prefix + conf_segmenter["path_to_dict"],
            path_to_slf4j=prefix + conf_segmenter["path_to_slf4j"],
            encoding=conf_segmenter["encoding"])
        self.enTokenizer = StanfordTokenizer(
            path_to_jar=prefix + conf_tokenizer["path_to_jar"])
        self.zh_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_zh"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.en_tagger = StanfordPOSTagger(
            prefix + conf_postagger["tagger_en"],
            path_to_jar=prefix + conf_postagger["path_to_jar"])
        self.frequency = defaultdict(int)
        pynlpir.open()
        pynlpir.nlpir.ImportUserDict(conf.load("pynlpir")["user_dict"], Overwrite=False)
        try:
            self.excluded_docs = kwargs["excluded_docs"]
        except:
            self.excluded_docs = [""]
        # experimental features
        self.f_token_indexes = prefix + conf.load("pynlpir")["user_dict"]

    def _getDocs(self, num_docs, **kwargs):
        # from pymongo import MongoClient
        # dbClient = MongoClient("mongodb://127.0.0.1:27017")
        # db = dbClient["nick"]
        # collection = db["tickets"]
        # cursor = collection.find({}).limit(num_docs)
        # return enumerate(cursor)
        separated = kwargs["separated"]
        i = 0
        if separated is True:
            samples_dir = conf.load('samples')['dir']
            docs = [os.path.join(samples_dir, x) for x in os.listdir(samples_dir)]
            while i < num_docs:
                with open(docs[i]) as f:
                    try:
                        yield i, json.load(f)
                        i += 1
                    except:
                        i += 1
                        # TODO: may cause a problem with the generator index
                        num_docs += 1
            return
        else:
            samples_loc = conf.load('samples')['single']
            docs = json.loads(open(samples_loc).read())
            while i < num_docs:
                try:
                    yield i, docs[i]
                    i += 1
                except:
                    i += 1
                    # TODO: may cause a problem with the generator index
                    num_docs += 1
            return
        # try:
        #     from pymongo import MongoClient
        #     dbClient = MongoClient("mongodb://127.0.0.1:27017")
        #     db = dbClient["nick"]
        #     collection = db["tickets"]
        #     cursor = collection.find({}).limit(num_docs)
        #     return enumerate(cursor)
        # except ImportError:
        #     i = 0
        #     docs = [os.path.join(conf.load("samples"), x) for x in os.listdir(conf.load("samples"))]
        #     while i < num_docs:
        #         with open(docs[i]) as f:
        #             i += 1
        #             yield json.load(f)
        #     return

    def saveToDoclist(self, num_docs, **kwargs):
        file_docs = open(
            self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
            'num_docs-' + str(num_docs) + '.doclist', 'wb')
        separated = kwargs["separated"]
        docs = self._getDocs(num_docs, separated=separated)
        for ind, i in docs:
            try:
                text = (i["title"].replace("\n", " ").replace("\r", " ") + " " +
                        i["body"].replace('\n', ' ').replace("\r", " "))
            except Exception as e:
                print e
                continue
            else:
                file_docs.write("%s\n" % (text.encode("utf-8")))
        file_docs.close()

    def calculateTokens(self, line, **kwargs):
        line = """{}""".format(line)
        doc = [excluded_doc for excluded_doc in self.excluded_docs
               if excluded_doc not in line.split('\n')[0]]
        if doc == []:
            print "skipped:", line.split('\n')[0]
            return ''
        doc_len_lower_bound = int(kwargs["doc_len_lower_bound"])
        doc_len_upper_bound = int(kwargs["doc_len_upper_bound"])
        if (len(line.decode('utf-8')) <= doc_len_lower_bound
                or len(line.decode('utf-8')) >= doc_len_upper_bound):
            return ''
        allowed_list = ["noun", "intransitive verb", "noun-verb", "adjective"]
        # if you want to try the Stanford CoreNLP tokenizer in other languages...
        _en_tokens = [token.lower() for token in pre.GetEnTokens()(line)]
        with Timer('stanford_seg') as t:
            _en_tokens_tokenized = self.enTokenizer.tokenize(' '.join(_en_tokens))
        en_tokens = [token for token in _en_tokens_tokenized
                     if token.lower() not in pre.FilterKeywords().getStopwordsAsJSON()["en"]]
        en_tokens = [token for token in en_tokens
                     if token.lower() not in pre.FilterKeywords().getCustomStopwordsAsList()]
        en_tokens = list(set(en_tokens))
        # now we have English tokens...
        tokens_in_each_doc = []
        with Timer('stanford_tag') as t:
            tags = self.en_tagger.tag(en_tokens)
        for word, tag in tags:
            if tag in ["NN", "FW", "VBD", "NNS", "VBP"]:
                tokens_in_each_doc.append(word)
        # _token_list = [i[0] for i in pynlpir.get_key_words(line.decode("utf-8"), weighted=True)] + en_tokens
        if str(kwargs["method"]) == "keyword":
            _token_list = [i[0] for i in
                           pynlpir.get_key_words(line.decode("utf-8"), weighted=True)]
        elif str(kwargs["method"]) == "normal":
            # for i in pynlpir.segment(line.decode("utf-8"), pos_names='child'):
            #     print i[0], i[1]
            if "2G" in line.decode("utf-8"):
                # hot fix for a bug
                line = line.replace("2G", "")
                _token_list = [i[0] for i in
                               pynlpir.segment(line.decode("utf-8"), pos_names='child')
                               if i[1] in allowed_list]
            else:
                _token_list = [i[0] for i in
                               pynlpir.segment(line.decode("utf-8"), pos_names='child')
                               if i[1] in allowed_list]
        __token_list = [token for token in _token_list
                        if token not in pre.FilterKeywords().getStopwordsAsJSON()["zh"]]
        token_list = [token for token in __token_list
                      if token.lower() not in pre.FilterKeywords().getStopwordsAsJSON()["en"]
                      and token.lower() not in pre.FilterKeywords().getCustomStopwordsAsList()]
        zh_tokens = [token for token in token_list if token not in _en_tokens]
        token_list = zh_tokens + tokens_in_each_doc
        # count frequencies so tokens that appear only a few times can be removed later
        for token in token_list:
            self.frequency[token.lower()] += 1
        tokens = ','.join(token_list)
        print "Done tokenizing text: ", tokens
        return tokens

    def getKeywordsAndSave(self, *args, **kwargs):
        import pickle
        freq_lower_bound = int(kwargs["freq_lower_bound"])
        token_len_lower_bound = int(kwargs["token_len_lower_bound"])
        doc_len_lower_bound = int(kwargs["doc_len_lower_bound"])
        doc_len_upper_bound = int(kwargs["doc_len_upper_bound"])
        if str(kwargs["method"]) == "keyword":
            file_keywords = open(
                self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
                str(kwargs["target_name"]) + '.fine.keywords', 'w')
        elif str(kwargs["method"]) == "normal":
            file_keywords = open(
                self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
                str(kwargs["target_name"]) + '.keywords', 'w')
        tokens = []
        token_indexes = {}
        if bool(kwargs["static_file"]) is True:
            source_name = (self.conf_io["prefix"] +
                           self.conf_io["output_data_directory"] +
                           str(kwargs["source_name"]))
            with open(source_name, 'r') as f:
                _ind = 0
                for ind, line in enumerate(f):
                    try:
                        with Timer('calculateTokens') as t:
                            tokens.append(self.calculateTokens(
                                line,
                                method=str(kwargs["method"]),
                                doc_len_lower_bound=doc_len_lower_bound,
                                doc_len_upper_bound=doc_len_upper_bound))
                        # [experimental feature]
                        # this is to be used with LDA
                        # to show which raw doc is associated with each topic
                        token_indexes[ind] = _ind
                        _ind += 1
                    except Exception as e:
                        if e is KeyboardInterrupt:
                            break
                        print e
                        print "error with ", line
                        continue
                    else:
                        pass
                for line in tokens:
                    if line is not None:
                        filtered_tokens = [
                            token for token in line.split(',')
                            if self.frequency[token.lower()] > freq_lower_bound
                            and len(token) > token_len_lower_bound]
                        filtered_tokens = ','.join(filtered_tokens)
                        file_keywords.write('%s\n' % (filtered_tokens.encode('utf-8')))
                        file_keywords.flush()
            f.close()
            # experimental
            json.dump(token_indexes,
                      open(self.f_token_indexes + "token_indexes.pickle", "w"),
                      ensure_ascii=True)
        else:
            doc_list = args[0]
            for ind, line in enumerate(list(doc_list)):
                try:
                    tokens.append(self.calculateTokens(
                        line,
                        method=str(kwargs["method"]),
                        doc_len_lower_bound=doc_len_lower_bound,
                        doc_len_upper_bound=doc_len_upper_bound))
                except Exception as e:
                    if e is KeyboardInterrupt:
                        break
                    print e
                    print "error with ", line
                    continue
                else:
                    pass
            for line in tokens:
                if line is not None:
                    filtered_tokens = [
                        token for token in line.split(',')
                        if self.frequency[token.lower()] > freq_lower_bound
                        and len(token) > token_len_lower_bound]
                    filtered_tokens = ','.join(filtered_tokens)
                    file_keywords.write('%s\n' % (filtered_tokens.encode('utf-8')))
                    file_keywords.flush()
        file_keywords.close()
        pynlpir.close()
        return True

    def _loadTopicModel(self, **kwargs):
        try:
            str(kwargs["method"])
        except:
            print "You must specify a topic modeling method! Only tfidf is supported now."
        else:
            self.method = str(kwargs["method"])
        if self.method != 'tfidf':
            print "Error. We will use method=tfidf in the following analysis."
            self.method = 'tfidf'
        self.conf_tfidf = self.conf_embedding[self.method]
        _corpora = corpora.MmCorpus(self.conf_embedding["prefix"] +
                                    self.conf_tfidf["corpus_save_to"] + '.mm')
        self.dictionary = corpora.Dictionary.load(self.conf_embedding["prefix"] +
                                                  self.conf_tfidf["dict_save_to"] + '.dict')
        _model = models.TfidfModel.load(self.conf_embedding["prefix"] +
                                        self.conf_tfidf["model_save_to"] + '.tfidf.model')
        return _model, _corpora

    def refineKeywords(self, **kwargs):
        # TODO: should a TF-IDF threshold be set?
        top_k = int(kwargs["top_k"])
        file_keywords = open(
            self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
            str(kwargs["target_name"]) + '.filtered.keywords', 'w')
        _model, _corpora = self._loadTopicModel(method='tfidf')
        for corpus in _corpora:
            # take the top-k TF-IDF weighted tokens within each document
            corpus = _model[corpus]
            sorted_corpus_per_doc = [
                token for token in sorted(corpus, key=lambda x: -x[1])[:top_k]]
            tokens = [self.dictionary.id2token[_token[0]]
                      for _token in sorted_corpus_per_doc]
            tokens = ','.join(tokens)
            file_keywords.write('%s\n' % (tokens.encode('utf-8')))
        file_keywords.close()
        return True