import pandas as pd


def splitAllWord(typeOfDataset="dev"):
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter

    segmenter = StanfordSegmenter()
    segmenter.default_config('zh')
    maxCount = 2000000
    pathOfDev = "dataset/task1/%s.tsv" % typeOfDataset
    dfOfDev = pd.read_csv(pathOfDev, delimiter="\t")
    pathOfNewDev = "%s_split.tsv" % typeOfDataset
    count = 0
    with open(pathOfNewDev, "w", encoding='utf-8') as fw:
        for row in dfOfDev.iterrows():
            if count >= maxCount:
                break
            if count % 100 == 0:
                print("[%s]count = %s" % (typeOfDataset, count))
            label = row[1]['label']
            fw.write(str(label))
            fw.write("\t")
            sentence = row[1]['text_a']
            segmentOfSentence = segmenter.segment(sentence)
            for word in segmentOfSentence.split():
                fw.write(word)
                fw.write(" ")
            fw.write("\n")
            count += 1
import os

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def segment(labels, reviews):
    segmented = []
    print('Creating BOW')
    # seg = StanfordSegmenter('../../datasets/data-hauyi/stanford-segmenter-2018-10-16')
    os.environ["STANFORD_SEGMENTER"] = '../datasets/data-hauyi/stanford-segmenter-2018-10-16'
    seg = StanfordSegmenter(
        '../datasets/data-hauyi/stanford-segmenter-2018-10-16/stanford-segmenter-3.9.2.jar'
    )
    seg.default_config('zh')
    count = 0
    file_out = open('reviews.txt', 'a+')
    for i in range(len(reviews)):
        # print(i)
        s = seg.segment(reviews[i])
        l = labels[i]
        # print(s)
        line = str(l) + ' ' + s
        file_out.write(line)
        segmented.append(s)
        # print('Tokenize: ')
        # print(seg.tokenize(s))
        count = count + 1
        # if count > 5:
        #     break
    print('Count: ', count)
    return segmented
def segmenter(sentence):
    r"""
    Stanford Word Segmenter for Chinese.
    Split a Chinese sentence into a sequence of words.

    Args:
        sentence: a Chinese sentence

    Returns:
        A list of words decoded in utf-8

    Example:
        sentence = "广东外语外贸大学是一所具有鲜明国际化特色的广东省属重点大学,是华南地区国际化人才培养和外国语言文化、对外经济贸易、国际战略研究的重要基地。"

        [u'\u5e7f\u4e1c', u'\u5916\u8bed', u'\u5916\u8d38', u'\u5927\u5b66', u'\u662f',
         u'\u4e00', u'\u6240', u'\u5177\u6709', u'\u9c9c\u660e', u'\u56fd\u9645\u5316',
         u'\u7279\u8272', u'\u7684', u'\u5e7f\u4e1c', u'\u7701\u5c5e', u'\u91cd\u70b9',
         u'\u5927\u5b66', u'\uff0c', u'\u662f', u'\u534e\u5357', u'\u5730\u533a',
         u'\u56fd\u9645\u5316', u'\u4eba\u624d', u'\u57f9\u517b', u'\u548c',
         u'\u5916\u56fd', u'\u8bed\u8a00', u'\u6587\u5316', u'\u3001', u'\u5bf9\u5916',
         u'\u7ecf\u6d4e', u'\u8d38\u6613', u'\u3001', u'\u56fd\u9645', u'\u6218\u7565',
         u'\u7814\u7a76', u'\u7684', u'\u91cd\u8981', u'\u57fa\u5730', u'\u3002']
    """
    from nltk.tokenize.stanford_segmenter import StanfordSegmenter

    # initialize the Stanford Chinese word segmenter
    segmenter = StanfordSegmenter(
        path_to_jar='D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/stanford-segmenter-3.4.1.jar',
        path_to_sihan_corpora_dict='D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data',
        path_to_model='D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data/pku.gz',
        path_to_dict='D:/python/nltk-3.1/nltk/chin/stanford-segmenter-2014-08-27/data/dict-chris6.ser.gz'
    )  # load the Chinese segmentation model
    sent = segmenter.segment(sentence)  # segment
    return sent.split()
import os
import re

from nltk.tokenize.stanford_segmenter import StanfordSegmenter

# stanford_corenlp_path is defined elsewhere in the original module.


def segment_sentences(sentence_list):
    segmenter = StanfordSegmenter(
        java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_jar=os.path.join(stanford_corenlp_path,
                                 'stanford-segmenter-2018-02-27',
                                 'stanford-segmenter-3.9.1.jar'),
        path_to_slf4j=os.path.join(stanford_corenlp_path, 'slf4j-api-1.7.25.jar'),
        path_to_sihan_corpora_dict=os.path.join(stanford_corenlp_path,
                                                'stanford-segmenter-2018-02-27',
                                                'data'),
        path_to_model=os.path.join(stanford_corenlp_path,
                                   'stanford-segmenter-2018-02-27', 'data',
                                   'pku.gz'),
        path_to_dict=os.path.join(stanford_corenlp_path,
                                  'stanford-segmenter-2018-02-27', 'data',
                                  'dict-chris6.ser.gz'),
        sihan_post_processing='true')
    result = segmenter.segment_sents(sentence_list)
    result = result.strip()
    segmented_list = re.split(os.linesep, result)
    if len(segmented_list[-1]) == 0:
        segmented_list = segmented_list[:-1]
    if len(segmented_list) != len(sentence_list):
        for i in range(len(segmented_list)):
            ss = ''.join(segmented_list[i].split())
            if ss != sentence_list[i]:
                print(i, '|', segmented_list[i], '|', sentence_list[i])
                # break
    print(len(segmented_list), len(sentence_list))
    assert len(segmented_list) == len(sentence_list)
    return segmented_list
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def ch_standseg(mystr):
    segmenter = StanfordSegmenter(
        path_to_jar=r"E:\tools\stanfordNLTK\jar\stanford-segmenter.jar",
        path_to_slf4j=r"E:\tools\stanfordNLTK\jar\slf4j-api.jar",
        path_to_sihan_corpora_dict=r"E:\tools\stanfordNLTK\jar\data",
        path_to_model=r"E:\tools\stanfordNLTK\jar\data\pku.gz",
        path_to_dict=r"E:\tools\stanfordNLTK\jar\data\dict-chris6.ser.gz")
    result = segmenter.segment(mystr)
    print(result)
def __init__(self):
    print('stanford segmenter init...')
    # stanford_corenlp_path = r'/media/mcislab/sdb1/home/mcislab/zwt/stanford_core_nlp'
    stanford_corenlp_path = r"D:\Desktop\stanford corenlp"
    self.segmenter = StanfordSegmenter(
        java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_jar=os.path.join(stanford_corenlp_path,
                                 'stanford-segmenter-2018-02-27',
                                 'stanford-segmenter-3.9.1.jar'),
        path_to_slf4j=os.path.join(stanford_corenlp_path, 'slf4j-api-1.7.25.jar'),
        path_to_sihan_corpora_dict=os.path.join(stanford_corenlp_path,
                                                'stanford-segmenter-2018-02-27',
                                                'data'),
        path_to_model=os.path.join(stanford_corenlp_path,
                                   'stanford-segmenter-2018-02-27', 'data', 'pku.gz'),
        path_to_dict=os.path.join(stanford_corenlp_path,
                                  'stanford-segmenter-2018-02-27', 'data',
                                  'dict-chris6.ser.gz'),
        sihan_post_processing='true'
    )
def segment():
    """
    split a Chinese sentence into words
    :return:
    """
    segmenter = StanfordSegmenter(
        java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
        path_to_jar=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\stanford-segmenter-3.9.1.jar",
        path_to_slf4j=r"D:\Desktop\stanford corenlp\slf4j-api-1.7.25.jar",
        path_to_sihan_corpora_dict=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data",
        path_to_model=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data\pku.gz",
        path_to_dict=r"D:\Desktop\stanford corenlp\stanford-segmenter-2018-02-27\data\dict-chris6.ser.gz",
        sihan_post_processing='true'
    )  # path to jar files should be changed
    # result = segmenter.segment(s)
    result = segmenter.segment_sents(
        ["一个人在切西红柿", "这个把手该换了", "别把手放在我的肩膀上", "他正在量和服尺寸"])
    print(result)
import os
import re

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


class StanfordTokenizer:
    """
    class for segmenting Chinese sentences
    uses stanford segmenter 3.9.1
    """

    def __init__(self):
        stanford_corenlp_path = r'/media/mcislab/sdb1/home/mcislab/zwt/stanford_core_nlp'
        self.segmenter = StanfordSegmenter(
            java_class=r"edu.stanford.nlp.ie.crf.CRFClassifier",
            path_to_jar=os.path.join(stanford_corenlp_path,
                                     'stanford-segmenter-2018-02-27',
                                     'stanford-segmenter-3.9.1.jar'),
            path_to_slf4j=os.path.join(stanford_corenlp_path, 'slf4j-api-1.7.25.jar'),
            path_to_sihan_corpora_dict=os.path.join(stanford_corenlp_path,
                                                    'stanford-segmenter-2018-02-27',
                                                    'data'),
            path_to_model=os.path.join(stanford_corenlp_path,
                                       'stanford-segmenter-2018-02-27', 'data',
                                       'pku.gz'),
            path_to_dict=os.path.join(stanford_corenlp_path,
                                      'stanford-segmenter-2018-02-27', 'data',
                                      'dict-chris6.ser.gz'),
            sihan_post_processing='true')

    def segment_sents(self, sentences):
        result = self.segmenter.segment_sents(sentences)
        result = result.strip()
        segmented_list = re.split(os.linesep, result)
        if len(segmented_list[-1]) == 0:
            segmented_list = segmented_list[:-1]
        print(len(segmented_list), len(sentences))
        assert len(segmented_list) == len(sentences)
        return segmented_list

    def tokenize(self, captions_for_images):
        image_id_list = []
        caption_list = []
        for (image_id, captions) in captions_for_images.items():
            for caption in captions:
                caption_list.append(caption['caption'])
                image_id_list.append(image_id)
        segmented_caption_list = self.segment_sents(caption_list)
        assert len(image_id_list) == len(caption_list) and len(
            caption_list) == len(segmented_caption_list)
        tokenized_captions_for_images = {}
        for i in range(len(image_id_list)):
            image_id = image_id_list[i]
            if image_id not in tokenized_captions_for_images:
                tokenized_captions_for_images[image_id] = []
            tokenized_captions_for_images[image_id].append(
                segmented_caption_list[i])
        return tokenized_captions_for_images
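# A minimal usage sketch for the StanfordTokenizer wrapper above (not from the
# original source). The captions dict layout mirrors the structure the tokenize()
# method expects; the image ids and caption strings are made-up placeholders.
if __name__ == '__main__':
    tokenizer = StanfordTokenizer()
    captions_for_images = {
        1: [{'caption': '一个人在切西红柿'}],
        2: [{'caption': '这个把手该换了'}],
    }
    tokenized = tokenizer.tokenize(captions_for_images)
    for image_id, captions in tokenized.items():
        print(image_id, captions)  # image id -> list of space-separated token strings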
import os

import pandas as pd
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def stanford_seg(path, stage='train'):
    df_dir = os.path.join(path, '{}.csv'.format(stage))
    data = pd.read_csv(df_dir, error_bad_lines=False, dtype=object)
    data = data.dropna(axis=0, how='any').reset_index(drop=True)
    data_dir = '/home/trueto/stanford_segmenter/'
    seg = StanfordSegmenter(path_to_jar=data_dir + 'stanford-segmenter.jar',
                            java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
                            path_to_sihan_corpora_dict=data_dir + "data",
                            path_to_model=data_dir + 'data/pku.gz',
                            path_to_dict=data_dir + "data/dict-chris6.ser.gz")
    columns = data.columns
    for column in columns:
        if column in ['question1', 'question2']:
            column_file = os.path.join(path, 'cut', '{}_{}.txt'.format(stage, column))
            data[column].to_csv(column_file, index=False)
            cut_file = os.path.join(path, 'cut', '{}_{}_cut.txt'.format(stage, column))
            with open(cut_file, 'w') as f:
                f.write(seg.segment_file(column_file))
def get_stanford_segmenter():
    if not os.path.isdir(STANFORD_SEGMENTER_DIR):
        download_stanford_segmenter()
    global STANFORD_SEGMENTER
    if not STANFORD_SEGMENTER:
        STANFORD_SEGMENTER = StanfordSegmenter(
            path_to_jar=STANFORD_SEGMENTER_JAR,
            path_to_sihan_corpora_dict=STANFORD_SIHAN_CORPORA_DICT,
            path_to_model=STANFORD_MODEL,
            path_to_dict=STANFORD_DICT,
            verbose=True
        )
    return STANFORD_SEGMENTER
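# Hypothetical usage of the cached getter above (not in the original source).
# It assumes the module-level STANFORD_* constants and the
# download_stanford_segmenter() helper are defined; repeated calls reuse the
# single cached StanfordSegmenter instance.
segmenter = get_stanford_segmenter()
print(segmenter.segment(u'这是斯坦福中文分词器测试'))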
import os

import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def Segmenter(segmenter_folder_name='', segmenter_jarname='', segmenter_folder='',
              segmenter_jarpath='', segmenter_corpora='', segmenter_model='',
              segmenter_dictpath='', segmenter_slfpath=''):
    ###
    default_segmenter_folder_name = 'stanford-segmenter-2017-06-09'
    if len(segmenter_folder_name) == 0:
        segmenter_folder_name = default_segmenter_folder_name
    ###
    default_segmenter_jarname = 'stanford-segmenter-3.8.0.jar'
    if len(segmenter_jarname) == 0:
        segmenter_jarname = default_segmenter_jarname
    ###
    default_segmenter_folder = os.path.join(os.path.expanduser('~'),
                                            'Stanford NLP', segmenter_folder_name)
    if len(segmenter_folder) == 0:
        segmenter_folder = default_segmenter_folder
    ###
    default_segmenter_jarpath = os.path.join(segmenter_folder, segmenter_jarname)
    if len(segmenter_jarpath) == 0:
        segmenter_jarpath = default_segmenter_jarpath
    ###
    default_segmenter_corpora = os.path.join(segmenter_folder, 'data')
    if len(segmenter_corpora) == 0:
        segmenter_corpora = default_segmenter_corpora
    ###
    default_segmenter_model = os.path.join(segmenter_folder, 'data', 'pku.gz')
    if len(segmenter_model) == 0:
        segmenter_model = default_segmenter_model
    ###
    default_segmenter_dictpath = os.path.join(segmenter_folder, 'data',
                                              'dict-chris6.ser.gz')
    if len(segmenter_dictpath) == 0:
        segmenter_dictpath = default_segmenter_dictpath
    ###
    default_segmenter_slfpath = os.path.join(segmenter_folder, 'slf4j-api.jar')
    if len(segmenter_slfpath) == 0:
        segmenter_slfpath = default_segmenter_slfpath
    ###
    nltk.internals.config_java("")
    ######
    segmenter = StanfordSegmenter(java_class="edu.stanford.nlp.ie.crf.CRFClassifier",
                                  path_to_jar=segmenter_jarpath,
                                  path_to_sihan_corpora_dict=segmenter_corpora,
                                  path_to_model=segmenter_model,
                                  path_to_dict=segmenter_dictpath,
                                  path_to_slf4j=segmenter_slfpath)
    # segmenter.default_config('zh')
    ######
    return segmenter
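# Hedged usage sketch for the Segmenter() factory above (not from the original
# source): with no arguments it falls back to the defaults under
# ~/Stanford NLP/stanford-segmenter-2017-06-09, so the call only works if that
# directory layout exists locally.
seg = Segmenter()
print(seg.segment(u'这是斯坦福中文分词器测试'))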
import os
import re

from nltk.tokenize.stanford_segmenter import StanfordSegmenter

# stanford_tokenizer_home, TOK_PATTERN, JOIN_PATTERN and XmlUtils are defined
# elsewhere in the original module.


class TMStanfordTokenizer():
    models = {'ZH': 'ctb.gz', 'AR': 'arabic-segmenter-atb+bn+arztrain.ser.gz'}
    dics = {'ZH': 'dict-chris6.ser.gz', 'AR': ''}

    def __init__(self, language):
        self.language = language
        model = self.models.get(language)
        dic = self.dics.get(language)
        if not model:
            raise Exception(
                "Unsupported language for tokenizer: {}".format(language))
        # Initialize Stanford Tokenizer
        self.tm_tokenize = StanfordSegmenter(
            path_to_jar=os.path.join(stanford_tokenizer_home,
                                     'stanford-segmenter-3.6.0.jar'),
            path_to_model=os.path.join(stanford_tokenizer_home, 'data', model),
            path_to_dict=os.path.join(stanford_tokenizer_home, 'data', dic),
            path_to_sihan_corpora_dict=os.path.join(stanford_tokenizer_home, 'data'),
            path_to_slf4j=os.path.join(stanford_tokenizer_home, 'slf4j-api.jar'))

    # Input: String
    # Output: 这 是 斯坦福 中文 分词 器 测试
    def process(self, sentences):
        text = self.tm_tokenize.segment(sentences).strip('\n')
        if re.search(TOK_PATTERN, text):  # Check if the text has tags
            text = XmlUtils.join_tags(text, JOIN_PATTERN)
        return text

    def tokenize_sent(self, text):
        if self.language == 'ZH':
            return [s + '。' for s in text.split('。') if s]  # split Chinese text by sentence
            # self.tm_tokenize.segment_sents(text)
        return [text]
import re
import string
from os import path

from nltk.tokenize.stanford_segmenter import StanfordSegmenter


class SegmentWorker:
    def __init__(self):
        file_path = path.realpath(__file__)
        dir_path = path.dirname(file_path)
        self.path_to_jar = path.join(dir_path, 'stanford-segmenter-3.9.2.jar')
        self.path_to_model = path.join(dir_path, 'data/ctb.gz')  # pku.gz
        self.path_to_dict = path.join(dir_path, 'data/dict-chris6.ser.gz')
        self.path_to_sihan_corpora_dict = path.join(dir_path, 'data/')
        self.seg = StanfordSegmenter(
            path_to_jar=self.path_to_jar,
            java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
            path_to_model=self.path_to_model,
            path_to_dict=self.path_to_dict,
            path_to_sihan_corpora_dict=self.path_to_sihan_corpora_dict)

    def seg_file(self, file_to_segment):
        """Segment a file and return the result string."""
        seg_result = self.seg.segment_file(file_to_segment)
        translator = str.maketrans('', '', string.digits)
        seg_result = seg_result.translate(translator)
        seg_result = re.sub('[\\\\.!/_,$%^*(+\\"\']+|[+—!,:;。?、~@#¥%…&*()]+',
                            '', seg_result)
        # print(seg_result)
        return seg_result

    def seg_file2list(self, file_to_segment):
        """Segment a text file and return an array of tokens."""
        seg_result = self.seg_file(file_to_segment)
        # print(seg_result)
        return seg_result.split()

    def seg_file2file(self, origin_file, dest_file):
        """Segment a text file and write the resulting tokens to another file."""
        seg_result = self.seg_file(origin_file)
        seg_result = re.sub('\\s+', ' ', seg_result)
        # print(seg_result)
        with open(dest_file, 'w', encoding='UTF-8') as f:
            f.write(seg_result)
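# Hedged usage sketch for SegmentWorker above (not part of the original source);
# 'corpus_zh.txt' and 'corpus_zh_seg.txt' are placeholder file names, and the
# segmenter jar and data directory are assumed to sit next to this script.
worker = SegmentWorker()
tokens = worker.seg_file2list('corpus_zh.txt')               # list of segmented tokens
worker.seg_file2file('corpus_zh.txt', 'corpus_zh_seg.txt')   # write segmented text to a file
print(tokens[:20])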
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tokenize.stanford import CoreNLPTokenizer
# from nltk.tokenize.stanford import

path = "D:/www/data/nlpsoftware/stanford-segmenter"
segmenter = StanfordSegmenter(
    path_to_jar=path + "/stanford-segmenter.jar",
    path_to_sihan_corpora_dict=path + "/data",
    path_to_model=path + "/data/pku.gz",
    path_to_dict=path + "/data/dict-chris6.ser.gz",
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier')

# sentence = u"这是斯坦福中文分词器测试"
sentence = u"工信处女干事每月经过 下属 科室都要亲口交代24口交换机等技术性器件的安装工作"
segmenter.tokenize_sents(u"工信处")
result = segmenter.segment(sentence)
result2 = segmenter.segment_file(
    "D:/www/data/nlpdata/icwb2-data/testing/pku_test.utf8")
clean_content = "D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content.txt"
# clean_content_out = "D:\\www\\data\\Weibo Data\\Weibo Data\\nlp/clean_content_out.txt"
# result3 = segmenter.segment_file(clean_content)
print(type(result2))
# with open(clean_content_out, 'wb+') as f:
#     f.writelines([(s + "\r\n").encode('utf8') for s in clean_content_out])
print(result2)
# outfile = open("D:/www/data/nlpsoftware/outfile.txt", 'w')
# outfile.write(result)
# outfile.close()
#
from nltk.tokenize.stanford_segmenter import StanfordSegmenter


def segment(sentence):
    path = '/media/razor/Files/Python27/nltk_data/stanford-segmenter-2015-12-09/'
    segmenter = StanfordSegmenter(path + 'stanford-segmenter-3.6.0.jar',
                                  path + 'slf4j-api.jar',
                                  path + 'data/pku.gz',
                                  path + 'data/dict-chris6.ser.gz')
    # the original always segmented the fixed string u'我爱北京天安门' and ignored the argument
    return segmenter.segment(sentence)
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

seg = StanfordSegmenter(path_to_slf4j='/YOUR_PATH/slf4j-api.jar')
seg = StanfordSegmenter()
seg.default_config('zh')
sent = u'这是斯坦福中文分词器测试'
print(seg.segment(sent))
import jieba
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordPOSTagger, StanfordNERTagger
from nltk.parse.stanford import StanfordDependencyParser

# text_pair (the TextPair container) and the DEBUG flag are defined elsewhere
# in the original project.


class NLPCore:
    """
    nlp processing including Stanford Word Segmenter, Stanford POS Tagger,
    Stanford Named Entity Recognizer and Stanford Parser
    """

    def __init__(self):
        self.root_path = '../Models/stanfordNLP/'
        # word segmenter
        self.segmenter = StanfordSegmenter(
            path_to_jar=self.root_path + "stanford-segmenter.jar",
            path_to_slf4j=self.root_path + "log4j-over-slf4j.jar",
            path_to_sihan_corpora_dict=self.root_path + "segmenter/",
            path_to_model=self.root_path + "segmenter/pku.gz",
            path_to_dict=self.root_path + "segmenter/dict-chris6.ser.gz")
        # pos tagger
        self.posTagger = StanfordPOSTagger(
            self.root_path + 'pos-tagger/chinese-distsim.tagger',
            path_to_jar=self.root_path + "stanford-postagger.jar")
        # named entity recognizer
        self.nerTagger = StanfordNERTagger(
            self.root_path + 'ner/chinese.misc.distsim.crf.ser.gz',
            path_to_jar=self.root_path + 'stanford-ner.jar')
        self.parser = StanfordDependencyParser(
            model_path=self.root_path + 'lexparser/chinesePCFG.ser.gz',
            path_to_jar=self.root_path + 'stanford-parser.jar',
            path_to_models_jar=self.root_path + 'stanford-parser-3.7.0-models.jar',
            encoding='gbk')

    def split_sent_stanford(self, textPair):
        """
        Stanford Word Segmenter, input should be raw text
        :return: also TextPair with raw string of results
        """
        t1 = self.segmenter.segment(textPair.t1)
        t2 = self.segmenter.segment(textPair.t2)  # the original segmented t1 twice here
        if DEBUG:
            print(t1, t2)
        return text_pair.TextPair(t1, t2, textPair.label)

    def split_sents_stanford(self, textPairs):
        """
        Stanford Word Segmenter, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1 for textPair in textPairs]
        sents2 = [textPair.t2 for textPair in textPairs]
        split1 = self.segmenter.segment_sents(sents1).split('\n')
        split2 = self.segmenter.segment_sents(sents2).split('\n')
        rlist = []
        for i in range(len(textPairs)):
            rlist.append(
                text_pair.TextPair(split1[i], split2[i], textPairs[i].label))
            if DEBUG:
                print(split1[i], split2[i])
        return rlist

    def split_sent_jieba(self, textPair):
        jieba.setLogLevel('INFO')
        ger1 = jieba.cut(textPair.t1)
        ger2 = jieba.cut(textPair.t2)
        t1 = ' '.join(ger1)
        t2 = ' '.join(ger2)
        return text_pair.TextPair(t1, t2, textPair.label)

    def pos_tag(self, textPair):
        """
        Stanford POS Tagger, input should be split
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()
        t1_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t1_s)])
        t2_tag = ' '.join([ele[1] for ele in self.posTagger.tag(t2_s)])
        if DEBUG:
            print(t1_tag, t2_tag)
        return text_pair.TextPair(t1_tag, t2_tag, textPair.label)

    def pos_tag_pairs(self, textPairs):
        """
        Stanford POS Tagger, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]
        tag1 = self.posTagger.tag_sents(sents1)
        tag2 = self.posTagger.tag_sents(sents2)
        rlist = []
        for i in range(len(tag1)):
            t1_tag = ' '.join([ele[1] for ele in tag1[i]])
            t2_tag = ' '.join([ele[1] for ele in tag2[i]])
            rlist.append(text_pair.TextPair(t1_tag, t2_tag, textPairs[i].label))
            if DEBUG:
                print(t1_tag, t2_tag)
        return rlist

    def ner_tag(self, textPair):
        """
        Stanford Named Entity Recognizer, input should be split
        :return: also TextPair with raw string of results
        """
        t1_s = textPair.t1.split()
        t2_s = textPair.t2.split()
        t1_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t1_s)])
        t2_ner = ' '.join(
            [ele[0] + '#' + ele[1] for ele in self.nerTagger.tag(t2_s)])
        if DEBUG:
            print(t1_ner, t2_ner)
        return text_pair.TextPair(t1_ner, t2_ner, textPair.label)

    def ner_tag_pairs(self, textPairs):
        """
        Stanford Named Entity Recognizer, input should be list of sents
        :return: also TextPair with raw string of results
        """
        sents1 = [textPair.t1.split() for textPair in textPairs]
        sents2 = [textPair.t2.split() for textPair in textPairs]
        tag1 = self.nerTagger.tag_sents(sents1)
        tag2 = self.nerTagger.tag_sents(sents2)
        rlist = []
        for i in range(len(tag1)):
            t1_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag1[i]])
            t2_ner = ' '.join([ele[0] + '#' + ele[1] for ele in tag2[i]])
            rlist.append(text_pair.TextPair(t1_ner, t2_ner, textPairs[i].label))
            if DEBUG:
                print(t1_ner, t2_ner)
        return rlist

    def depen_parse(self, textPair):
        """
        Stanford Dependency Parser, input should be split
        :return: also TextPair with raw string of results
        """
        print([p.tree() for p in self.parser.raw_parse(textPair.t1)])
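# Hedged usage sketch for NLPCore above (not from the original source). It
# assumes the external text_pair.TextPair(t1, t2, label) container and the
# model files under ../Models/stanfordNLP/ are available locally.
nlp = NLPCore()
pair = text_pair.TextPair("这是斯坦福中文分词器测试", "这是一个测试", 1)
pair = nlp.split_sent_stanford(pair)   # raw text -> space-separated tokens
pair = nlp.pos_tag(pair)               # tokens -> POS tag sequence
print(pair.t1, pair.t2)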
# fragment: test, firstNP_t and test_str are defined earlier in the original script
# find_entity_t = test.find_entity()
# find_VP_t = test.firstVP()
# test.drawTree()
test.show(firstNP_t)
# test.show(find_entity_t)
# test.show(find_VP_t)
# # test.show(find_entity_t)
# test.show(firstMinNP_t)
result = test.find_realtionship(firstNP_t)
print(result)
test.drawTree()
# # # print(test.rel)
# test.show(test.find_realtionship())

# comparison experiment
chi_parser = StanfordParser(
    path_to_jar='../stanford-parser-full-2018-02-27/stanford-parser.jar',
    path_to_models_jar='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models.jar',
    model_path='../stanford-parser-full-2018-02-27/stanford-parser-3.9.1-models/edu/stanford/nlp/models/lexparser/chinesePCFG.ser.gz')
data_dir = '../stanford-segmenter-2018-02-27/'
segmenter = StanfordSegmenter(
    path_to_jar=data_dir + "stanford-segmenter-3.9.1.jar",
    path_to_sihan_corpora_dict=data_dir + "/data",
    path_to_model=data_dir + "/data/pku.gz",
    path_to_dict=data_dir + "/data/dict-chris6.ser.gz",
    java_class='edu.stanford.nlp.ie.crf.CRFClassifier',
)
result = segmenter.segment(test_str)
result_ls = result.split()
ch_tree = list(chi_parser.parse(result_ls))[0]
ch_tree.draw()
# print(result)
# encoding: utf-8
import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordNERTagger

segmenter = StanfordSegmenter(
    # jar the segmenter depends on
    path_to_jar=r"/home/jiangix/document/stanford-segmenter/stanford-segmenter.jar",
    path_to_slf4j=r"/home/jiangix/document/slf4j/slf4j-api.jar",
    # segmentation data directory
    path_to_sihan_corpora_dict=r"/home/jiangix/document/stanford-segmenter/data",
    # model based on the PKU People's Daily corpus from the 2005 bakeoff
    path_to_model=r"/home/jiangix/document/stanford-segmenter/data/pku.gz",
    path_to_dict=r"/home/jiangix/document/stanford-segmenter/data/dict-chris6.ser.gz")

segmenter.default_config('zh')
result = segmenter.segment(u'我喜欢学习编程')

chi_tagger = StanfordNERTagger(
    model_filename=r"/home/jiangix/document/stanford-chinese-corenlp-models/chinese.misc.distsim.crf.ser.gz",
    path_to_jar=r"/home/jiangix/document/stanford-ner/stanford-ner.jar")

for word, tag in chi_tagger.tag(result.split()):
    print(word, tag)
#coding:utf-8
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

# Chinese word segmentation
segmenter = StanfordSegmenter(
    path_to_jar="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/stanford-segmenter-3.5.2.jar",
    path_to_slf4j="/home/hsiao/Develops/nlp/stanford-corenlp-full-2016-10-31/slf4j-api.jar",
    path_to_sihan_corpora_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data",
    path_to_model="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/pku.gz",
    path_to_dict="/home/hsiao/Develops/nlp/stanford-segmenter-2015-04-20/data/dict-chris6.ser.gz"
)
str = "我在我在博客园开了一个博客。"
print(segmenter.segment(str))

# English tokenization
from nltk.tokenize import StanfordTokenizer
tokenizer = StanfordTokenizer(
    path_to_jar=r"/home/hsiao/Develops/nlp/stanford-parser-full-2016-10-31/stanford-parser.jar")
sent = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks."
print(tokenizer.tokenize(sent))

# Chinese named entity recognition
from nltk.tag import StanfordNERTagger
chi_tagger = StanfordNERTagger(
    model_filename=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/classifiers/chinese.misc.distsim.crf.ser.gz',
    path_to_jar=r'/home/hsiao/Develops/nlp/stanford-ner-2016-10-31/stanford-ner.jar')
print(chi_tagger.tag('四川省 成都 信息 工程 大学 我 在 博客 园 开 了 一个 博客 , 我 的 博客 名叫 伏 草 惟 存 , 写 了 一些 自然语言 处理 的 文章 。\r\n'.split()))
# from nltk.tokenize.stanford_segmenter import StanfordSegmenter
# segmenter = StanfordSegmenter(path_to_jar="stanford-segmenter-3.4.1.jar",
#                               path_to_sihan_corpora_dict="./data",
#                               path_to_model="./data/pku.gz",
#                               path_to_dict="./data/dict-chris6.ser.gz")
# sentence = u"这是斯坦福中文分词器测试"
# segmenter.segment(sentence)
# # u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
# segmenter.segment_file("test.simp.utf8")
# # u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...

## download NLTK data
# import nltk
# nltk.download()

from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    path_to_jar="stanford-segmenter-3.6.0.jar",
    path_to_slf4j="slf4j-api.jar",
    path_to_sihan_corpora_dict="./data",
    path_to_model="./data/pku.gz",
    path_to_dict="./data/dict-chris6.ser.gz")
sentence = u"这是斯坦福中文分词器测试"
segmenter.segment(sentence)
# >>> u'\u8fd9 \u662f \u65af\u5766\u798f \u4e2d\u6587 \u5206\u8bcd\u5668 \u6d4b\u8bd5\n'
segmenter.segment_file("test.simp.utf8")
# >>> u'\u9762\u5bf9 \u65b0 \u4e16\u7eaa \uff0c \u4e16\u754c \u5404\u56fd ...

# English test
# import nltk
# text = 'i am a good boy.you are a bad girl'
# sens = nltk.sent_tokenize(text)
# print(sens)
# words = []
# for sent in sens:
#coding:UTF-8
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

segmenter = StanfordSegmenter(
    path_to_jar=r"D:\StanfordNLP\stanford-segmenter\stanford-segmenter-3.6.0.jar",
    path_to_slf4j=r"D:\StanfordNLP\stanford-segmenter\slf4j-api.jar",
    path_to_sihan_corpora_dict=r"D:\StanfordNLP\stanford-segmenter\data",
    path_to_model=r"D:\StanfordNLP\stanford-segmenter\data\pku.gz",
    path_to_dict=r"D:\StanfordNLP\stanford-segmenter\data\dict-chris6.ser.gz")

str = u"我在博客园开了一个博客,我的博客名叫伏草惟存,写了一些自然语言处理的文章。"
result = segmenter.segment(str)
print result
model_en_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'helpers', "englishPCFG.ser.gz")
model_de_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'helpers', "germanPCFG.ser.gz")
jar_model_de_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'helpers/stanford-corenlp-full-2016-10-31')
model_cn_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'helpers', "chinesePCFG.ser.gz")
model_ch_lex_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'helpers', "chineseFactored.ser.gz")

segmenter = StanfordSegmenter(
    path_to_jar=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             'helpers', "stanford-segmenter.jar"),
    path_to_sihan_corpora_dict=os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'helpers',
        "stanford-segmenter-2015-12-09/data"),
    path_to_model=os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'helpers',
                               "stanford-segmenter-2015-12-09/data/pku.gz"),
    path_to_dict=os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'helpers',
        "stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz"))

# parser_en = StanfordParser(model_path=model_en_path)
# parser_de = StanfordParser(model_path=model_de_path)
# parser_cn = StanfordParser(model_path=model_cn_path)
# parser_ch_lex = StanfordParser(model_path=model_ch_lex_path)
dep_parser_en = StanfordDependencyParser(model_path=model_en_path)
dep_parser_de = StanfordDependencyParser(model_path=model_de_path)
dep_parser_cn = StanfordDependencyParser(model_path=model_cn_path)
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from nltk.tokenize.stanford_segmenter import StanfordSegmenter

seg = StanfordSegmenter(
    path_to_jar=r'D:\temp\stanford\stanford-segmenter.jar',
    path_to_slf4j=r'D:\temp\stanford\slf4j-api.jar',
    path_to_sihan_corpora_dict=r'D:\temp\stanford\data',
    path_to_model=r'D:\temp\stanford\data\ctb.gz',
    path_to_dict=r'D:\temp\stanford\data\dict-chris6.ser.gz',
)

# sentence = "这是斯坦福中文分词器测试"
sentence = "操你大爷,狗日的"
res = seg.segment(sentence)
print res

# import jieba
# ss = jieba.cut(sentence)
# print ' '.join(list(ss)), type(ss)
zh = open(zh_dir, 'r')
len_p = number_file(en_dir)
corpus = []
for x in range(0, len_p):
    corpus.append([en.readline().strip(), zh.readline().strip()])

###############################
### tokenize chinese ####
print '########Starting tokenization###########\n'
##
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

pre_path = '/home/db32555/MM/stanford-segmenter/'
segmenter = StanfordSegmenter(
    path_to_jar=pre_path + 'stanford-segmenter-3.4.1.jar',
    path_to_sihan_corpora_dict=pre_path + './data',
    path_to_model=pre_path + './data/pku.gz',
    path_to_dict=pre_path + './data/dict-chris6.ser.gz')
## setup_end ##
from nltk import word_tokenize
## setup_end_eng ##

for node in corpus:
    index = corpus.index(node)
    chinese = node[1]
    chinese = unicode(chinese, 'utf-8')
    tmp_segmented = segmenter.segment(chinese)
    tmp_segmented = tmp_segmented.split(" ")
    # del corpus[index][1]
    corpus[index].append(tmp_segmented)
    print tmp_segmented  ## this is chinese
    english = node[0]
import random, sys
# import nltk
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

jar = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/stanford-segmenter-3.8.0.jar'
api_jar = '/Users/sinansmac/Public/StanfordNLP/stanford-parser-full-2017-06-09/slf4j-api.jar'
# model = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/data/dict-chris6.ser.gz'

seg = StanfordSegmenter(path_to_jar=jar, path_to_slf4j=api_jar)
seg.default_config('zh')
# sent = u'这是斯坦福中文分词器测试'
# print(seg.segment(sent))


class MarkovZh:
    def __init__(self):
        self.suffix_map = {}
        self.prefix = ()

    def process_file(self, filename, order=1):
        fp = open(filename)
        # self.skip_gutenberg_header(fp)
        for line in fp:
            for word in line.rstrip().split():
                self.process_word(word, order)

    # def skip_gutenberg_header(self, fp):
    #     for line in fp:
    #         if line.startswith('*END*THE SMALL PRINT!'):
#coding:utf8
import os
import sys
import logging

from nltk.tokenize.stanford_segmenter import StanfordSegmenter
from nltk.tag import StanfordPOSTagger

java_path = "C:\\Program Files\\Java\\jdk1.8.0_73\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path

segmenter = StanfordSegmenter(
    path_to_jar="E:\\lib\\stanford-segmenter-2017-06-09\\stanford-segmenter-3.8.0.jar",
    path_to_slf4j="E:\\lib\\stanford-segmenter-2017-06-09\\slf4j-api.jar",
    path_to_sihan_corpora_dict="E:\\lib\\stanford-segmenter-2017-06-09\\data",
    path_to_model="E:\\lib\\stanford-segmenter-2017-06-09\\data\\pku.gz",
    path_to_dict="E:\\lib\\stanford-segmenter-2017-06-09\\data\\dict-chris6.ser.gz")

postagger = StanfordPOSTagger(
    path_to_jar="E:\\lib\\stanford-postagger-full-2017-06-09\\stanford-postagger.jar",
    model_filename='E:\\lib\\stanford-postagger-full-2017-06-09\\models\\chinese-distsim.tagger',
)


def pos_to_sequence(sent, segmenter=segmenter, postagger=postagger):
    seg_sent = segmenter.segment(sent)
print "{}: {}".format(key, value)


if __name__ == '__main__':
    examples = []
    with open(pjoin(args.data_dir, args.data_file), 'rb') as f:
        for line in f:
            examples.append(line)

    if args.corpus == "gigaword_ch" and not args.char:
        print "segmenting each example for Chinese, could take a while"
        from nltk.tokenize.stanford_segmenter import StanfordSegmenter
        seg = StanfordSegmenter(path_to_slf4j=path_to_slf4j, path_to_jar=path_to_jar)
        seg.default_config('zh')

    # ==== Filtering =====
    data_dist = {}
    filtered_examples = {}
    number_of_filtered_examples = 0

    for i, ex in enumerate(examples):
        s1, s2, label = ex[:-1].split('\t')
        if args.corpus == 'gigaword_ch':
            s1 = s1.replace(' .', '。')  # parser appended normal period
            s2 = s2.replace(' .', '。')
        if args.char and args.corpus == "gigaword_ch":
            # we presplit into chars
import os, time

from nltk.tokenize.stanford_segmenter import StanfordSegmenter

start = time.time()

jar = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/stanford-segmenter.jar'
api_jar = '/Users/sinansmac/Public/StanfordNLP/stanford-parser-full-2017-06-09/slf4j-api.jar'
# dict = '/Users/sinansmac/Public/StanfordNLP/stanford-segmenter-2017-06-09/data/dict-chris6.ser.gz'

seg = StanfordSegmenter(path_to_jar=jar, path_to_slf4j=api_jar)
seg.default_config('zh')
# sent = u'这是斯坦福中文分词器测试'
# print(seg.segment(sent))

fp = "chinese.txt"
tokenstr = seg.segment_file(fp)
token_ls = list(tokenstr)
print(len(token_ls), '\n', tokenstr, '\n', token_ls)

# with open('chinese_tokens.txt', 'a') as writef:
#     for line in token_ls:
#         writef.write(line.rstrip().split())
#         print(tokens, '\n', type(tokens))  # class 'str'

end = time.time()
print("process time:", round(end - start))
    # (inside the getopt option-parsing loop of the original script)
    elif opt in ("-d", "--idir"):
        inputdir = arg
    elif opt in ("-t"):
        data_type = arg
    elif opt in ("-o", "--ofile"):
        outputfile = arg

if inputdir == '':
    print 'test.py -t <datatype> -d <inputdir> -o <outputfile>'
    sys.exit(2)
if outputfile == '':
    outputfile = 'vocab.out'

#########################
segmenter = StanfordSegmenter(
    path_to_jar="../stanford-segmenter-2015-12-09/stanford-segmenter-3.6.0.jar",
    path_to_slf4j="../stanford-segmenter-2015-12-09/slf4j-api.jar",
    path_to_sihan_corpora_dict="../stanford-segmenter-2015-12-09/data",
    path_to_model="../stanford-segmenter-2015-12-09/data/pku.gz",
    path_to_dict="../stanford-segmenter-2015-12-09/data/dict-chris6.ser.gz")

vocabSet = set([])
build_time = 0.
total = count_em(inputdir)

for dirPath, dirNames, fileNames in os.walk(inputdir):
    if len(fileNames) > 0:
        sumContain = ''
        for f in fileNames:
            try:
                if data_type == 'CIRB010':
                    root = ET.parse(dirPath + '/' + f).getroot()
                    date = root[0][1].text.strip()
                    title = root[0][2].text.strip()
                    text = ''
                    for p in root[0][3]: