def preprocess(filename):
    f_save = open('data/char_test.txt', 'w', encoding='utf-8')
    pynlpir.open()
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            lst = line.rstrip().split(' ')
            for item in lst:
                c, t = item.split('/')
                if t == 'o':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        f_save.write(x + ' ' + 'O' + '\n')
                elif t == 'ns':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-LOC' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-LOC' + '\n')
                elif t == 'nt':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-ORG' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-ORG' + '\n')
                elif t == 'nr':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-PER' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-PER' + '\n')
            f_save.write('\n')
    f_save.close()
def test_segment(self):
    """Tests that the segment() function works as expected."""
    s = "我们都是美国人。"
    seg_s = pynlpir.segment(s, pos_tagging=False)
    pos_seg_s = pynlpir.segment(s, pos_tagging=True, pos_names="child")
    npos_seg_s = pynlpir.segment(s, pos_tagging=True, pos_names=None)
    ppos_seg_s = pynlpir.segment(s, pos_tagging=True, pos_names="all")
    expected_seg_s = ["我们", "都", "是", "美国", "人", "。"]
    expected_pos_seg_s = [
        ("我们", "personal pronoun"),
        ("都", "adverb"),
        ("是", "verb 是"),
        ("美国", "transcribed toponym"),
        ("人", "noun"),
        ("。", "period"),
    ]
    expected_npos_seg_s = [
        ("我们", "rr"),
        ("都", "d"),
        ("是", "vshi"),
        ("美国", "nsf"),
        ("人", "n"),
        ("。", "wj"),
    ]
    expected_ppos_seg_s = [
        ("我们", "pronoun:personal pronoun"),
        ("都", "adverb"),
        ("是", "verb:verb 是"),
        ("美国", "noun:toponym:transcribed toponym"),
        ("人", "noun"),
        ("。", "punctuation mark:period"),
    ]
    self.assertEqual(expected_seg_s, seg_s)
    self.assertEqual(expected_pos_seg_s, pos_seg_s)
    self.assertEqual(expected_npos_seg_s, npos_seg_s)
    self.assertEqual(expected_ppos_seg_s, ppos_seg_s)
def build_list_and_idmap(train_dict, context_dict, path=None):
    question_list = []
    question_idmap = {}
    context_list = []
    context_idmap = {}
    with tqdm(total=len(train_dict)) as pbar:
        pbar.set_description('build list_and_idmap of train_dict')
        for index, item in enumerate(train_dict.items()):
            _id, _item = item
            question = _item['question']
            question_list.append(pynlpir.segment(question, pos_tagging=False))
            question_idmap[_id] = str(index)
            question_idmap[str(index)] = _id
            pbar.update(1)
    if path and os.path.exists(path.get('context_idmap')) and os.path.exists(path.get('context_list')):
        context_list = load_pkl_data(path.get('context_list'))
        context_idmap = load_pkl_data(path.get('context_idmap'))
    else:
        with tqdm(total=len(context_dict)) as pbar:
            pbar.set_description('build list_and_idmap of context_dict')
            for index, item in enumerate(context_dict.items()):
                _id, doc = item
                context_list.append(pynlpir.segment(doc["text"], pos_tagging=False))
                context_idmap[_id] = str(index)
                context_idmap[str(index)] = _id
                pbar.update(1)
        save_pkl_data(context_list, path.get('context_list'))
        save_pkl_data(context_idmap, path.get('context_idmap'))
    return question_list, question_idmap, context_list, context_idmap
def segment(path='F:/Data/Chinese/chinese.json', json_path='F:/Data/Chinese/chinese_token.json'):
    """
    NLPIR segmentation + POS-based cleaning + dropping items whose question or answers are empty.
    :param path: path of the source data
    :param json_path: path where the result is saved
    :return:
    """
    # Start the segmenter
    pynlpir.open()
    # Keep only the text, segment it, and filter by part of speech.
    # Words with the following POS tags are kept; the tags themselves are dropped.
    # See https://github.com/tsroten/pynlpir/blob/master/pynlpir/pos_map.py for tag meanings.
    word_filter = {
        'noun', 'time word', 'locative word', 'noun of locality', 'verb',
        'adjective', 'distinguishing word', 'status word', 'numeral'
    }
    # Drop items whose segmentation is known to fail
    question_id_filter = {294118450, 300106271, 291834409}
    # Read, process and write line by line to keep memory usage low
    count = 0
    with open(path, 'r') as f_in, open(json_path, 'w') as f_out:
        for line in f_in:
            q = json.loads(line)
            if q['question_id'] in question_id_filter:
                continue
            # Strip embedded newlines, then lowercase
            if '\n' in q['question']:
                print 'question:'
                print q['question']
                q['question'] = q['question'].replace('\n', ' ')
            q['question'] = [
                w[0] for w in pynlpir.segment(q['question'].lower())
                if w[1] in word_filter and w[0] != u''
            ]
            for a in q['answers']:
                # Strip embedded newlines
                if '\n' in a['answer']:
                    print 'answer:'
                    print a['answer']
                    a['answer'] = a['answer'].replace('\n', ' ')
                a['answer'] = [
                    w[0] for w in pynlpir.segment(a['answer'].lower())
                    if w[1] in word_filter and w[0] != u''
                ]
            # Drop empty answers
            q['answers'] = [a for a in q['answers'] if len(a['answer']) > 0]
            count = count + 1
            if count % 1000 == 0:
                print count
            # Drop items whose question or answer list is empty
            if len(q['question']) > 0 and len(q['answers']) > 0:
                f_out.write(json.dumps(q))
                f_out.write('\n')
    pynlpir.close()
def test_segment_space(self):
    """Tests that the fix for issue #2 works."""
    s = '这个句子有 空格。'
    seg_s = pynlpir.segment(s, pos_tagging=False)
    pos_seg_s = pynlpir.segment(s)
    expected_seg_s = ['这个', '句子', '有', ' ', '空格', '。']
    expected_pos_seg_s = [('这个', 'pronoun'), ('句子', 'noun'), ('有', 'verb'),
                          (' ', None), ('空格', 'noun'), ('。', 'punctuation mark')]
    self.assertEqual(expected_seg_s, seg_s)
    self.assertEqual(expected_pos_seg_s, pos_seg_s)
def parse_name(row):
    temp_index = False
    try:
        segments = pynlpir.segment(row, pos_names='all')
    except Exception as e:
        return temp_index
    for segment in segments:
        if segment[1] == u'noun:personal name':
            temp_index = True
    return temp_index
def words_cixing(question, pos=1):
    # pos=1: include POS tags; otherwise do not tag
    pynlpir.open()
    if pos:
        pos1 = ['{}/{}'.format(k, v)
                for k, v in pynlpir.segment(question, pos_names=None, pos_tagging=pos)]
    else:
        pos0 = pynlpir.segment(question)
    pynlpir.close()
    if pos:
        return pos1
    else:
        return pos0
def parse_skills(self, line):
    skills = []
    pairs = []
    match_list = ['prow', 'eng']
    w_tlist = pseg.cut(line)
    temp = []
    lasttag = 'prow'
    for word, tag in w_tlist:
        if tag in match_list:
            if lasttag in match_list:
                temp.append((word, tag))
            else:
                pairs.append(temp)
                temp = [(word, tag)]
        lasttag = tag
    for line in pairs:
        des = ""
        for index, pair in enumerate(line):
            if pair[1] == 'prow' and (index == 0 or index == len(line) - 1) \
                    and pynlpir.segment(pair[0])[0][1] != 'noun':
                continue
            des += pair[0]
        if len(des) != 0:
            if des not in skills:
                wash_text = jieba.analyse.extract_tags(des, withWeight=True)
                if sum([pair[1] for pair in wash_text]) > 8.0:
                    skills.append(des)
    return skills
def refine_corpus(path):
    if os.path.isdir(path):
        files = [os.path.join(path, file) for file in os.listdir(path)]
    else:
        files = [path, ]
    titles = []
    corpus = []
    ambiguity = re.compile("\{.*?\}")
    redundancy = re.compile("\(.*?\)|\d")
    for file in files:
        with open(file) as f:
            poem_set = json.load(f)
        for poem in poem_set:
            paragraphs = "".join(poem["paragraphs"])
            if ambiguity.search(paragraphs):
                continue
            else:
                paragraphs = redundancy.sub("", paragraphs)
                titles.append(poem["title"])
                token_poem = " ".join(pynlpir.segment(paragraphs, pos_tagging=False))
                corpus.append(token_poem)
        del poem_set
    return titles, corpus
def mysegment(filename2w, filename2seg, srting2strp):
    dataMat = []
    labelMat = []
    fr = open(filename2w)
    fl = open(filename2seg, 'w')
    arrayOLines = fr.readlines()
    length = len(arrayOLines)
    for j in range(length):
        lineArr = arrayOLines[j].strip().split(';')
        if len(lineArr) < 3:
            pass
        else:
            fl.write(str(j))
            fl.write(";")
            fl.write(str(lineArr[1]))
            fl.write(";")
            fl.write(str(lineArr[2]))
            fl.write(";")
            seg = pynlpir.segment(lineArr[1], pos_tagging=False)
            for item in seg:
                if str(item) in srting2strp:
                    pass
                else:
                    fl.write(str(item))
                    fl.write(",")
            fl.write(";\n")
    fl.close()
    pynlpir.close()
def train(self):
    # df_table = {"valid": {"science": 35, "physics": 34, "robot": 57}, "invalid": {"fat": 30, "large": 34, "cheap": 55}}
    # The number of articles containing "science", "physics" or "robot"
    # prior_table = {"valid": 183, "invalid": 244}
    pynlpir.open()
    prior_table = {ele: 0 for ele in self.category_list}
    posterior_table = {ele: dict() for ele in self.category_list}
    i = 0
    for sample in self.training_set_material:
        buffer = sample.split("\t")
        text = buffer[0]
        seg_words = pynlpir.segment(text, pos_tagging=False)
        words_set = set(seg_words)
        try:
            label = buffer[1]
        except:
            print("Line " + str(i) + " in training set corrupted")
            continue
        prior_table[label] += 1
        for word in words_set:  # all words in the text
            if word in posterior_table[label].keys():
                posterior_table[label][word] += 1  # posterior count +1 when this word already exists in posterior
            else:
                posterior_table[label][word] = 1  # posterior count set to 1 when this word does not exist in posterior yet
        i += 1
    return prior_table, posterior_table
def get_tokenised_parts(self):
    pynlpir.open()
    for s in self.sentences:
        sen_parts = re.split('[?!.,。,?!]', s)
        for sen_part in sen_parts:
            tokens = pynlpir.segment(sen_part)
            yield tokens
def get_result(self, paragraph):
    self.paragraph = paragraph
    self.segments = pynlpir.segment(self.paragraph, pos_names='all', pos_tagging=False)
    self.key_words = pynlpir.get_key_words(self.paragraph, weighted=False, max_words=20)
    self.new_sentence_wordlist = [0] * len(self.key_words)
    key_words = pynlpir.get_key_words(self.paragraph, max_words=20, weighted=True)
    self.key_weight = [item[1] for item in key_words]
    sentence_dict = self.cal_text_simliarity()
    keys = list(sentence_dict.keys())
    val = list(sentence_dict.values())
    temp = sorted(list(map(val.index, heapq.nlargest(self.maxSumarySize, val))))
    for i in temp[:2]:
        if keys[i] != self.sentence()[0]:
            self.result.append(keys[i])
    self.result.insert(0, self.sentence()[0])
    if len(",".join(self.result)) < self.length:
        self.result.append(keys[temp[2]])
    return ",".join(self.result)
def filter_q_by_word_freq(dfs_q_filters, train=None):
    question_f = []
    with tqdm(total=len(train)) as pbar:
        pbar.set_description('filter question by q_dfs')
        for item in train:
            q_cut = pynlpir.segment(item.get('question'), pos_tagging=False)
            question = [word for word in q_cut if word in dfs_q_filters]
            if len(question) == 0:
                isBad = True
            else:
                isBad = False
            q = {
                'question': question,
                'isBad': isBad,
                'docid': item['docid'],
                'id': item['id'],
            }
            question_f.append(q)
            pbar.update(1)
    if False:  # disabled sanity check for empty questions
        q_empty = []
        for item in question_f:
            if len(item['question']) == 0:
                q_empty.append(item)
        if len(q_empty) != 0:
            raise Exception(
                'Empty in q after filtering, have {} empty q'.format(len(q_empty)))
    return question_f
def Participle(input_File):
    Total_data = ''
    # e.g. input_File = 'D:\文本数据挖掘\搜狗实验室内容分类数据集\C000024' -> C000024_Participle
    for root, dirs, files in os.walk(input_File):
        for filespath in files:
            print(os.path.join(root, filespath))
            p = re.compile(r'/')
            Folder = p.split(os.path.join(root, filespath))
            # print(Folder[-2])
            # File_name = Folder[-2] + '\\' + Folder[-1]
            File_body = readfile(os.path.join(root, filespath))
            File_Participle = pynlpir.segment(File_body, pos_tagging=False)
            # Remove stop words
            File_Participle_delstopwords = ''
            for word in File_Participle:
                # Strip whitespace on both sides
                word = word.strip()
                if word not in stopwords:
                    if word >= u'\u4e00' and word <= u'\u9fa5':  # keep Chinese characters only
                        # Separate the tokens with spaces
                        File_Participle_delstopwords = File_Participle_delstopwords + ' ' + word
            # Save the segmented text as UTF-8 so word2vec can read and use it later (important)
            File_Participle_delstopwords = File_Participle_delstopwords[1:len(File_Participle_delstopwords)]
            if not (os.path.exists('文本分词结果' + '/' + Folder[-2])):
                os.mkdir('文本分词结果' + '/' + Folder[-2])
            savefile('文本分词结果' + '/' + Folder[-2] + '/' + Folder[-1],
                     File_Participle_delstopwords.encode("utf-8"))
            Total_data = Total_data + File_Participle_delstopwords + '\n'
    savefile('文本' + '/' + Folder[-2] + '_train.txt', Total_data.encode("utf-8"))
def segment(self, sentence):
    # Word segmentation
    pynlpir.open(license_code=")VhTW_9s02tDm")
    list = pynlpir.segment(sentence)
    wordList = []
    for res in list:
        wordList.append(res[0])
    return wordList
def find_raw_entity(text, entity_dict):
    raw_entities = []
    segments = pynlpir.segment(text, pos_names='all')
    for segment in segments:
        try:
            if (segment[1] == 'noun:other proper noun') | (segment[1] == 'noun:organization/group name'):
                raw_entities.append(segment[0])
            elif segment[1].startswith('noun:personal name'):
                raw_entities.append(segment[0])
            elif segment[1].startswith('noun:toponym'):
                raw_entities.append(segment[0])
            elif (segment[1] == 'noun') and len(segment[0]) > 1:
                # added by hand; somewhat subjective
                raw_entities.append(segment[0])
        except:
            continue
    entity_list = list(set(raw_entities))
    entity_id_list = []
    for entity in entity_list:
        try:
            entity_id_list.append(entity_dict[entity])
        except:
            entity_id_list.append(entity)
    return entity_id_list
def main(input_file, output_file):
    pynlpir.open()
    fw = open(output_file, 'w+', encoding='utf-8')
    pos2id = get_pos_map()
    data = read_corpus(input_file)
    for _sent, _tags in data:
        sent = ''.join(_sent)
        result = pynlpir.segment(sent, pos_tagging=True, pos_names='parent', pos_english=True)
        # print(result)
        i = 0
        for _word, _speech in result:
            for j in range(len(_word)):
                char = _word[j]
                speech = ''
                if _speech is None or _speech not in reserve_pos_list:
                    speech = 'O'
                else:
                    speech = '-'.join(_speech.split(' '))
                    if j == 0:
                        speech = 'B-' + speech
                    else:
                        speech = 'I-' + speech
                if i >= len(_tags):
                    print(i, len(_sent), _sent)
                fw.write(char + ' ' + _tags[i] + ' ' + speech + '\n')
                i += 1
        fw.write('\n')
    fw.close()
    pynlpir.close()
def get_train_with_doc(train, bm25_model, context_idmap, k=5):
    save_path = '../data/rank/train_with_doc_top{}_2.pkl'.format(k)
    if os.path.exists(save_path):
        train_with_doc = load_pkl_data(save_path)
    else:
        train_with_doc = []
        with tqdm(total=len(train)) as pbar:
            pbar.set_description('build train with doc in top-{}'.format(k))
            for item in train:
                question = item['question']
                qid = item['qid']
                q_cut = pynlpir.segment(question, pos_tagging=False)
                bm25_score = bm25_model.get_scores(q_cut)
                bm25_score = [[context_idmap[str(index)], score]
                              for index, score in enumerate(bm25_score)]
                bm25_score.sort(key=op.itemgetter(1), reverse=True)
                best_text_id = [item[0] for item in bm25_score[:k]]
                # if item['docid'] in best_doc_id:
                #     answer = item['answer']
                train_sample = {
                    'qid': qid,
                    'question': question,
                    'text_ids': best_text_id,
                    'answer': item['answer'],
                    'answer_span': item['answer_span'],
                    "docid": item['docid']
                }
                train_with_doc.append(train_sample)
                pbar.update(1)
        save_pkl_data(train_with_doc, save_path)
    return train_with_doc
def init(self, filename=TRAINSETFILE, IsTraining=True, IsSegment=True):
    with open(filename, encoding='GB18030') as file:
        filereader = csv.reader(file, dialect='excel-tab', quoting=csv.QUOTE_NONE)
        if not IsSegment:
            for item in filereader:
                self.userlist.append(item)
        else:
            pynlpir.open()
            if IsTraining:
                infoflag = 4
            else:
                infoflag = 1
            # count_test = 0
            for userquery in filereader:
                userdict = {}
                self.userinfo.append(userquery[:infoflag])
                for item in userquery[infoflag:]:
                    for word in pynlpir.segment(item, pos_tagging=False):
                        if word not in self.dict.keys():
                            self.dict[word] = 0
                        if word in userdict.keys():
                            userdict[word] += 1
                        else:
                            userdict[word] = 1
                self.userlist.append(userdict)
                # count_test += 1
                # if count_test > 100:
                #     break
            pynlpir.close()
    self.IsTraining = IsTraining
    self.IsSegment = IsSegment
    self.IsDF = False
def stati_pos(content, lang='zh'):
    """
    :param lang: zh, jp
    Count parts of speech and return a dict; see the related documentation for the specific tags.
    todo: return the set of special words
    """
    pos_count = {}
    if lang == 'zh':
        items = pynlpir.segment(content, pos_english=False)
        for item in items:
            if item[1] is None:
                continue
            pos = item[1]
            pos_count[pos] = pos_count.get(pos, 0) + 1
    elif lang == 'jp':
        res = mecab.parse(content)
        for item in res.split('\n'):
            if item == 'EOS':
                break
            pos = item.split('\t')[3]
            pos_count[pos] = pos_count.get(pos, 0) + 1
    # can be extended for en...
    return pos_count
def separateWordFromFile(fileName):
    pynlpir.open()
    file = open(fileName, 'r')
    lines = file.readlines()
    i = 0
    allSegmentResult = []
    # print type(s)
    label = []
    for line in lines:
        i = i + 1
        textsegment = line
        if textsegment == "\n":
            print "skip"
            continue
        # note:
        '''
        Converting GBK to UTF-8 goes through Unicode:
        gbk --> unicode --> utf-8, i.e. two steps:
        1. gbk --> unicode    Python: your_string.decode("gbk")
        2. unicode --> utf-8  Python: your_string.decode("gbk").encode("utf-8")
        '''
        segmentResult = pynlpir.segment(textsegment, pos_tagging=True)
        newSegmentResult = removePunctuation(segmentResult)
        allSegmentResult.append(newSegmentResult)
    print len(allSegmentResult)
    file.close()
    pynlpir.close()
    # print label
    return allSegmentResult
def splitFile(docName, encodingType):
    '''
    default encoding of docName: encodingType
    function: segment the Chinese text of docName and return it
    '''
    # everything is kept in memory -- ok? maybe write to files instead
    f = file(docName, 'r')
    pynlpir.open(encoding='utf-8')
    contest = []
    line = f.readline()
    cou = 0
    while line:
        line = line.strip()
        cou += 1
        try:
            line = line.decode(encodingType)
            if line.find(testChar) != -1:  # delete the file header
                line = f.readline()
                continue
            temp = pynlpir.segment(line, pos_tagging=False)
            contest += temp
            line = f.readline()
        except:
            line = f.readline()
            # print '.'
            # print "err %s, %d" % (docName, cou)
    f.close()
    pynlpir.close()
    return contest
def fenci(content):
    dict = {}
    # pr.open()
    # dicConf = GetDicConfig()
    # FilePath = dicConf['Testfilepath']
    # DicNews = GetDictFromJsonFile(FilePath)
    # content = DicNews['content']
    pr.open()
    segs = pr.segment(content, pos_english=False, pos_names='child')
    AllList = []
    NamedList = []
    OtherList = []
    for w, c in segs:
        if len(w) < 2:
            continue
        else:
            AllList.append(w)
            if c == '地名' or c == '人名':  # toponym or personal name
                NamedList.append(w)
            else:
                OtherList.append(w)
    # print("NameList=", NamedList)
    # print('OtherList=', OtherList)
    # print('Alllist=', AllList)
    dict.update({'NameList': NamedList})
    dict.update({'OtherList': OtherList})
    dict.update({'AllList': AllList})
    pr.close()
    return dict
def wordSegmenter(sentence='', pathOfStopWords=''):
    """
    Segment the given sentence and remove stop words.
    :param sentence: the input sentence
    :param pathOfStopWords: path to the stop-word list
    :return: a space-separated string of the remaining tokens
    """
    # Start the segmenter
    pynlpir.open()
    # Segment
    seg_list = []
    for seg in pynlpir.segment(sentence):
        seg_list.append(seg[0])
    # Remove stop words
    resultWords = []
    if pathOfStopWords == '':
        # Fall back to the default stop-word list if none is given
        pathOfStopWords = path.join(ROOT, STOP_WORDS)
    f_stop = open(pathOfStopWords, 'rt', encoding='utf-8')
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()
    f_stop_words = f_stop_text.split("\n")
    for seg in seg_list:
        seg = seg.strip()
        if re.match(r'[a-zA-Z0-9]+', seg):
            # Drop English words and digits
            continue
        if len(seg) > 0 and (seg not in f_stop_words):
            resultWords.append(seg)
    return " ".join(resultWords)
def process_text(text):
    lowered = text.lower()
    tokens = pynlpir.segment(lowered, pos_names='child')
    filtered = [word[0] for word in tokens if filter(word)]
    return filtered
def cut(data_list):
    """
    Word segmentation
    """
    pynlpir.open()
    data_list = [(pynlpir.segment(x)) for x in data_list]
    pynlpir.close()
    return data_list
def cut_sentence(sentence):
    try:
        words = pynlpir.segment(sentence, pos_tagging=False)
    except:
        words = jieba.cut(str(sentence))
    words = [word for word in words if word not in stopwords]
    return "#".join(words)
def get_question_corpus(train):
    question_corpus = []
    for item in train:
        cut_words = pynlpir.segment(item.get('question'), pos_tagging=False)
        if 'delete stop word':  # placeholder: stop-word removal not implemented
            pass
        question_corpus.append(cut_words)
    return question_corpus
def sentence_seg(sentence):
    """
    Segment a sentence.
    :param sentence: the sentence to segment
    :return: the segmentation result, [(word1, POS1), (word2, POS2), ...]
    """
    pos_seg_list = pynlpir.segment(sentence + ',')
    return pos_seg_list[:-1]
def segment(self):
    """
    fni: str; input file name with path
    fno: str; output file name with path
    lang: str; language code
    pos: bool; POS tags included
    n: int; no. of lines processed
    """
    import copy
    from PyQt5.QtWidgets import QApplication
    from opencc import OpenCC
    openCC = OpenCC('t2s')  # convert from Traditional-to-Simplified
    pynlpir.open(encoding="utf-8")
    print("Finished initializing ITCLAS/NLPIR")
    count = lineCount(self.fni)
    fit = open(self.fni, "r", encoding="UTF-8")
    fot = open(self.fno, "w", encoding="UTF-8", newline="\n")
    sep = " "  # separator of Chinese tokens (space by default)
    n = 0
    for linet in fit:
        n += 1
        if (linet.strip() == ''):  # empty string
            fot.write("\n")
            continue
        lines = openCC.convert(linet.strip())
        # segment with optional POS-tagging
        lines_seg = pynlpir.segment(lines, pos_tagging=True, pos_names=None)
        # The following segments the zht text according to the
        # segmentation patterns obtained from NLPIR above
        tokens = []    # initialize list to hold 'words' of segmented zht line
        pos_tags = []  # initialize list to hold pos tags of segmented words
        while len(lines_seg) > 0:      # loop until nothing is left in lines_seg
            t, p = lines_seg.pop(0)    # remove leftmost zhs token and save to variable t
            m = len(t)                 # no. of characters in token
            tokens.append(linet[0:m])  # add corresponding zht token to tokens[]
            pos_tags.append(p)
            linet = linet[m:]          # delete token from zht line (from beginning of string)
        # fot.write(sep.join(tokens) + "\n")  # write zht-seg output
        tok_pos = ["{}_{}".format(x, y) for x, y in zip(tokens, pos_tags)]  # list of tok_pos pairs
        fot.write(sep.join(tok_pos) + "\n")
        # if (n == 1): break
        if n % 50 == 0:
            self.window.ui.progressBar.setValue(round(100 * n / self.fi_linecount, 0))
            self.window.ui.progressBar.repaint()
            QApplication.processEvents()
    self.window.ui.progressBar.setValue(100)
    self.window.ui.progressBar.repaint()
    fit.close()
    fot.close()
    pynlpir.close()
    self.numLineProcessed = n
    return n
def prep_word_dict():
    CURRENT_W = None
    with open(IN_FILE) as fin:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            if line[:4] == '[DDv':
                CURRENT_W = line[5: line.index(']')]
                term_to_id(CURRENT_W)
                continue
            if CURRENT_W and line[0] == '【' and ('=】' in line):
                line_x = line[line.index('】') + 1:]
                line_x = line_x.split()
                if line_x:
                    for item in line_x:
                        term_to_id(item)
                continue
    LINE_NUM = 0
    with open(YL_FILE) as fin, open(YLP_FILE, 'w') as fout:
        while True:
            try:
                line = fin.readline()
            except:
                print("READ ERROR:%d" % (LINE_NUM))
                continue
            if not line:
                print("PROCESS DONE!")
                break
            LINE_NUM += 1
            if not (LINE_NUM % 5000):
                print('C:%d' % (LINE_NUM))
            if len(line) > 30:
                continue
            seg_list = pynlpir.segment(line, pos_tagging=False)
            for i in range(len(seg_list)):
                if is_zhs(seg_list[i]):
                    term_to_id(seg_list[i])
                elif len(seg_list[i]) == 1 and is_punct(seg_list[i]):
                    seg_list[i] = PUNCING
                else:
                    seg_list[i] = PADDING
            fout.write(' '.join(seg_list) + '\n')
    term_to_id(PADDING)
    # term_to_id(PUNCING)
    print('SEN DONE!')
def test_double_slash(self):
    """Tests for issue #7 -- double slashes raises exception."""
    s = '转发微博 //@张明明:霸气全露'
    seg_s = pynlpir.segment(s)
    expected_seg_s = [('转发', 'verb'), ('微', 'adjective'), ('博', 'adjective'),
                      (' ', None), ('//', 'string'), ('@张明明', 'noun'),
                      (':', 'punctuation mark'), ('霸气', 'noun'),
                      ('全', 'adverb'), ('露', 'verb')]
    self.assertEqual(expected_seg_s, seg_s)
def get_feat(self, text):
    tokens = []
    filtered = []
    segs = pynlpir.segment(text, pos_tagging=False)
    for seg in segs:
        if self.validtoken(seg):
            tokens.append(seg)
        else:
            filtered.append(seg)
    return tokens, filtered
def word_to_class(word):
    c = "null"
    c_list = []
    try:
        c_list = pynlpir.segment(word)
    except Exception as e:
        print(e)
        return c
    if len(c_list) >= 1 and c_list[0][1] != None:
        c = c_list[-1][1]
    return c
def query2words(self, query):
    words = []
    segs = query.split(' ')
    for s in segs:
        s = s.strip()  # need regularization
        if s in self.vocab:
            words.append(s)  # in word2vec vocab
        else:
            pynlpir.open()
            # words.extend(pynlpir.get_key_words(query, max_words=3))
            word_segs = pynlpir.segment(query, pos_tagging=False)
            for word in word_segs:
                if word not in self.stop_list:
                    words.append(word)
    print(words)
    return words
def _part_document(self):
    pynlpir.open()
    docs = {}
    for dirname, dirnames, filenames in os.walk('dependence/new_data'):
        for filename in filenames:
            path = os.path.join(dirname, filename)
            text = ''
            with io.open(path, 'r', encoding='utf-8') as f:
                text = f.readline()
            words = pynlpir.segment(text, pos_tagging=False)
            clean_words = [w for w in words if w not in self.stop_list and len(w) > 1]
            index = filename[:6]
            docs[index] = clean_words
    dictionary = corpora.Dictionary(docs.values())
    corporas = {index: dictionary.doc2bow(docs[index]) for index in docs}
    return docs, dictionary, corporas
def __call__(self, value, positions=False, chars=False, keeporiginal=False,
             removestops=True, start_pos=0, start_char=0, mode='', **kwargs):
    assert isinstance(value, text_type), "%r is not unicode" % value
    t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
    nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE)
    pynlpir.open()
    pynlpir.open(encoding='utf-8')
    seglist = pynlpir.segment(value,)
    for w in seglist:
        t.original = t.text = w
        t.boost = 1.0
        if positions:
            t.pos = start_pos + value.find(w)
        if chars:
            t.startchar = start_char + value.find(w)
            t.endchar = start_char + value.find(w) + len(w)
        yield t  # yield one token per segmented word via the generator
def predict(self, text):
    # words = [word1, word2, word3, ...]
    pynlpir.open()
    seg_words = pynlpir.segment(text, pos_tagging=False)
    words_set = set(seg_words)
    result = dict()
    for category in self.category_list:
        prob = self.comp_prop(category, words_set)
        result[category] = prob
    """
    buffer = [result[my_key] for my_key in result.keys()]
    score_sum = sum(buffer)
    # result = {my_key: result[my_key]/score_sum for my_key in result.keys()}
    """
    buffer = list(result.items())
    buffer.sort(key=lambda x: x[1], reverse=True)
    top_category = buffer[0][0]
    return top_category
def dispatch_me(str_test):
    print("测试语句:%s" % (str_test))
    line_p = hanzi_prep.split_into_sentences(str_test)
    lines = []
    for line_i in line_p:
        lines.extend(line_i)
    str_i = ''.join(lines)
    if USE_SEGMENT == "JIEBA":
        print("==JIEBA分词==")
        jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        print("==NLPIR分词==")
        jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    else:
        print("ERROR:未知分词系统!")
        return None
    print("分词结果:%s" % (repr(jieba_i)))
    jieba_i = jieba_i.split()
    jieba_len = len(jieba_i)
    result_collect = []
    for i in range(0, jieba_len):
        if i > 0:
            head = jieba_i[i - 1]
        else:
            head = None
        if i < jieba_len - 1:
            tail = jieba_i[i + 1]
        else:
            tail = None
        ret = calc_list_pro(jieba_i[i], head, tail)
        if ret:
            ret_pro = find_max_dict(ret)
            if ret_pro:
                print("词汇:[[%s]], 最大概率义项:%s, 概率:%f" % (jieba_i[i], ret_pro[0], ret_pro[1]))
                print("DEBUG:::" + repr(ret))
                result_collect.append((jieba_i[i], ret_pro[0], ret_pro[1]))
        else:
            print("无计算结果")
    return result_collect
def document2sentences(self, document):
    pynlpir.open()
    words = pynlpir.segment(document, pos_tagging=False)
    sign = ['。', ';', '.', ';']
    pause_position = []
    for i in range(len(words)):
        if words[i] in sign:
            pause_position.append(i)
    setences = []
    if len(pause_position) == 0:
        clean_d = [s.strip() for s in words if s not in self.stop_list]
        setences.append(' '.join(clean_d) + '\n')
    else:
        for i in range(len(pause_position)):
            setence = []
            if i == 0:
                setence = words[:pause_position[i]]
            elif i == len(pause_position) - 1 and i != 0:
                break
            else:
                setence = words[pause_position[i]:pause_position[i + 1]]
            clean_s = [s.strip() for s in setence if s not in self.stop_list]
            setences.append(' '.join(clean_s) + '\n')
    return setences
def read_lexical_datas(file, compose_func=None):
    pynlpir.open()
    f = open(file, 'r', encoding='utf-8')
    tokens_list = [pynlpir.segment(line.rstrip('\n').replace('幺', '一'), pos_tagging=False)
                   for line in f]
    if compose_func is None:
        word_idx = {}
        for tokens in tokens_list:
            for token in tokens:
                if token not in word_idx:
                    word_idx[token] = len(word_idx)
        array = numpy.zeros([len(tokens_list), len(word_idx)])
        for i, tokens in enumerate(tokens_list):
            for token in tokens:
                array[i][word_idx[token]] = 1.0
    else:
        print('reading word vectors')
        word_vecs = word_vec.read_word_vec(r'../data/vectors_cbow')
        print('reading complete')
        array = numpy.asarray([compose_func(tokens, word_vecs) for tokens in tokens_list])
    return array
def main():
    py.open()
    a = sys.argv[1]
    result = py.segment(a)
    res_str = []
    for r in result:
        if len(r[0]) == 2 and (r[1] == "noun" or r[1] == "verb" or r[1] == "adjective"):
            f_result = fsame.find(r[0])
            ff_result = fsame.ffind(r[0])
            if f_result == r[0] or ff_result == r[0]:
                res_str.append(r[0])
            else:
                if random.randint(0, 1) == 0:
                    res_str.append(f_result)
                else:
                    res_str.append(ff_result)
        else:
            res_str.append(r[0])
    print "".join(res_str)
    py.close()
def extract_news_kws(hot_news):
    pynlpir.open()
    s = hot_news
    kw_list = pynlpir.segment(s, pos_tagging=True, pos_names=None)
    kws = ""
    for kw in kw_list:
        pos = kw[0]
        tagging = kw[1]
        try:
            if tagging:  # test if tagging is none, which means the pos is a space character
                tagging_first = tagging[0]
            else:
                tagging_first = ""
        except:
            tagging_first = ""
        if tagging_first == "n" and len(pos) > 1:
            if pos != "quot":
                kws = kws + pos + u" "
    kws = kws.strip(u" ")
    return kws
def tokenize(file):
    words = []
    pynlpir.open()
    directory = '\\resources\\original files\\htl_del_4000\\'
    posWords = codecs.open(directory + file + 'Words.txt', 'w+', 'utf-8')
    with codecs.open(directory + file + '.txt', 'r', 'utf-8') as posFile:
        for s in posFile.readlines():
            # print posFile.readline()
            a = pynlpir.segment(s, pos_tagging=False)
            # print a
            for i in range(len(a)):
                # print a[i]
                if i != (len(a) - 1):
                    # print 'i=' + str(i)
                    # print 'a=' + str(len(a))
                    posWords.write(a[i] + ' ')
                else:
                    posWords.write(a[i] + '\r')
            # for i in a:
            #     posWords.write(i + ';')
            # posWords.write('\0')
    posWords.close()
def part_sentence(stop_list):
    pynlpir.open()
    for dirname, dirnames, filenames in os.walk('dependence/ch_corporas/wiki/lost'):
        for filename in filenames:
            lines = []
            read_path = os.path.join(dirname, filename)
            rf = open(read_path, 'rb')
            print(filename)
            for line in rf:
                # detector.feed(byte)
                encoding = chardet.detect(line)['encoding']
                if encoding == None:
                    encoding = 'utf-8'
                new_line = line.decode(encoding, 'ignore')
                words = pynlpir.segment(new_line, pos_tagging=False)
                clean_words = [w.strip() for w in words if w not in stop_list]
                str_line = ' '.join(clean_words)
                if str_line:
                    lines.append(str_line + '\n')
            rf.close()
            write_path = os.path.join('dependence/ch_corporas/wiki_clean', filename)
            wf = open(write_path, 'w')
            wf.writelines(lines)
            wf.close()
def __init__(self, content, norm="l1_norm"):
    self.norm = norm
    pynlpir.open()
    words = pynlpir.segment(content, pos_tagging=True, pos_names=None)
    kws = ""
    for word in words:
        pos = word[0]
        tagging = word[1]
        try:
            if tagging:  # test if tagging is none, which means the pos is a space character
                tagging_first = tagging[0]
            else:
                tagging_first = ""
        except:
            tagging_first = ""
        if tagging_first == "n" and len(pos) > 1:
            if pos != "quot":
                kws = kws + pos + u" "
    result = kws.split(" ")
    self.PoS = result
def test_issue_52(self):
    """
    Tests for issue #52 -- segment(pos_names='all') fails for certain
    texts input.
    """
    # it seems '甲' returns 'Mg', which is not listed in the POS_MAP.
    # thus in this case 'None' needs to be returned for '甲'.
    s = u'其中,新增了甲卡西酮、曲马多、安钠咖等12种新类型毒品的定罪量刑数量标准,' \
        u'并下调了在我国危害较为严重的毒品氯胺酮的定罪量刑数量标准。'
    segments = pynlpir.segment(s=s, pos_tagging=True, pos_names='all')
    expected_segments = [
        (u'其中', 'pronoun:demonstrative pronoun'),
        (u',', 'punctuation mark:comma'),
        (u'新增', 'verb'),
        (u'了', 'particle:particle 了/喽'),
        (u'甲', 'numeral'),
        (u'卡', 'noun'),
        (u'西', 'distinguishing word'),
        (u'酮', 'noun'),
        (u'、', 'punctuation mark:enumeration comma'),
        (u'曲马多', 'noun:personal name:transcribed personal name'),
        (u'、', 'punctuation mark:enumeration comma'),
        (u'安', 'noun:personal name:Chinese surname'),
        (u'钠', 'noun'),
        (u'咖', 'noun'),
        (u'等', 'particle:particle 等/等等/云云'),
        (u'12', 'numeral'),
        (u'种', 'classifier'),
        (u'新', 'adjective'),
        (u'类型', 'noun'),
        (u'毒品', 'noun'),
        (u'的', 'particle:particle 的/底'),
        (u'定罪', 'verb:noun-verb'),
        (u'量刑', 'verb:noun-verb'),
        (u'数量', 'noun'),
        (u'标准', 'noun'),
        (u',', 'punctuation mark:comma'),
        (u'并', 'conjunction:coordinating conjunction'),
        (u'下调', 'verb'),
        (u'了', 'particle:particle 了/喽'),
        (u'在', 'preposition'),
        (u'我国', 'noun'),
        (u'危害', 'verb:noun-verb'),
        (u'较为', 'adverb'),
        (u'严重', 'adjective'),
        (u'的', 'particle:particle 的/底'),
        (u'毒品', 'noun'),
        (u'氯', 'noun'),
        (u'胺', 'noun'),
        (u'酮', 'noun'),
        (u'的', 'particle:particle 的/底'),
        (u'定罪', 'verb:noun-verb'),
        (u'量刑', 'verb:noun-verb'),
        (u'数量', 'noun'),
        (u'标准', 'noun'),
        (u'。', 'punctuation mark:period'),
    ]
    self.assertEqual(segments, expected_segments)
writer = csv.writer(of, delimiter=",", quotechar='|', quoting=csv.QUOTE_MINIMAL)
errors = 0
unbounderrors = 0
for f in files:
    infile = "./../" + f
    with open(infile, 'rb') as csvfile:
        count = 0
        reader = csv.reader(csvfile, delimiter=",")
        for row in reader:
            if row[10] != "":
                mid = row[0]
                message = row[6]
                censored = 1
                try:
                    segmented = pynlpir.segment(message)
                except UnicodeDecodeError:
                    errors += 1
                    continue
                except UnboundLocalError:
                    unbounderrors += 1
                    print "what??"
                    continue
                except:
                    print "core dump...?"
                    continue
                mString = ""
                for segment in segmented:
                    mString += segment[0]
                    mString += " "
def __init__(self, file):
    with open(file, 'r', encoding="utf-8") as f:
        content = f.read()
    pynlpir.open()
    result = pynlpir.segment(content, pos_tagging=False)
    self.PoS = result
if not os.path.exists(FILE_NAME_JIEBA):
    with open(FILE_NAME_PREP) as fin:
        with open(FILE_NAME_JIEBA, "w") as fout:
            for line in fin:
                i = i + 1
                if not i % 1000:
                    print("C:%d" % (i))
                line_p = hanzi_prep.split_into_sentences_e(line)
                for line_i in line_p:
                    # join the characters, then separate the segmented tokens with spaces
                    str_i = "".join(line_i)
                    str_j = ""
                    if USE_SEGMENT == "JIEBA":
                        str_j = " ".join(jieba.cut(str_i, cut_all=False))
                    elif USE_SEGMENT == "ICTCLAS":
                        str_j = " ".join(pynlpir.segment(str_i, pos_tagging=False))
                    else:
                        print("ERROR:未知分词系统!")
                    fout.write(str_j + "\n")
if USE_SEGMENT == "ICTCLAS":
    print("END:ICTCLAS分词系统")
    pynlpir.close()
elif USE_SEGMENT == "JIEBA":
    print("END:JIEBA分词系统")
else:
    print("END:未知分词系统")

# Compute N-gram word frequency information
#!/usr/bin/env python
# coding=utf-8
import pynlpir
from pynlpir import nlpir

pynlpir.open()
s = '北京邮电大学是一所美丽的学校'
print(pynlpir.segment(s))
nlpir.Exit()
import pynlpir
import MySQLdb
# import sys
# reload(sys)
# sys.setdefaultencoding('utf-8')

conn = MySQLdb.connect(
    host='localhost',
    port=3306,
    user='******',
    passwd='1111',
    db='sinadb',
    charset='utf8'
)
cur = conn.cursor()
pynlpir.open()
res = cur.execute("SELECT weibotext FROM hot_1_user_weibo WHERE UNIX_TIMESTAMP(time) > UNIX_TIMESTAMP('2016-4-3 12:00:00')")
info = cur.fetchmany(res)
nlpir_results = pynlpir.segment(info[0][0])
for nlpir_result in nlpir_results:
    print nlpir_result[0], nlpir_result[1]

# pynlpir.open()
# s = u':【东北衰败宣告了国企城市的破产 】东北衰落的原因很简单,那就是经济被国企吸干了。东北是全球苏联式经济的最佳典范。苏联计划经济已经垮台了,东北国企还在苟延残喘……'
# print s
# for x in pynlpir.segment(s):
#     print x[0], x[1]
def correct_me(str_test, enhance=True):
    print("")
    print("测试语句:%s" % (str_test))
    line_p = hanzi_prep.split_into_sentences(str_test)
    lines = []
    for line_i in line_p:
        lines.extend(line_i)
    str_i = ''.join(lines)
    if USE_SEGMENT == "JIEBA":
        print("==JIEBA分词==")
        jieba_i = ' '.join(jieba.cut(str_i, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        print("==NLPIR分词==")
        jieba_i = ' '.join(pynlpir.segment(str_i, pos_tagging=False))
    else:
        print("未知分词类型!")
        jieba_i = []
    print("分词结果:%s" % (repr(jieba_i)))
    jieba_i = jieba_i.split()
    jieba_len = len(jieba_i)
    if jieba_len < 3:
        print("词数太小,放弃纠错!")
        return
    jieba_key = []
    jieba_pro = []
    for i in range(1, jieba_len):  # ignore patterns at the very beginning and end
        tmp_str = jieba_i[i - 1] + jieba_i[i]
        pro = JIEBA_HZ.get(tmp_str)
        jieba_key.append(tmp_str)
        if pro:
            jieba_pro.append(pro)
        else:
            jieba_pro.append(0)
    print("分词表:" + repr(jieba_key))
    print("概率表:" + repr(jieba_pro))
    jieba_pro_t = []
    for i in range(0, jieba_len - 2):
        jieba_pro_t.append(jieba_pro[i] + jieba_pro[i + 1])
    min_index = jieba_pro_t.index(min(jieba_pro_t)) + 1
    print("可疑位置:[%d]->%s" % (min_index, jieba_i[min_index]))
    to_do = []
    g_check_a = None
    g_check_e = None
    # the correction position cannot be the first or the last token
    to_do.append(jieba_i[min_index - 1])
    to_do.append(jieba_i[min_index])
    to_do.append(jieba_i[min_index + 1])
    if min_index - 2 >= 0:
        g_check_a = jieba_i[min_index - 2]
    if min_index + 2 < jieba_len:
        g_check_e = jieba_i[min_index + 2]
    print("需要处理:" + repr(to_do))
    print("辅助检测:%s,%s" % (g_check_a, g_check_e))
    # hold the final results
    p_res_stage1 = {}
    p_res_stage2 = {}
    p_res_stage3 = {}
    if enhance:
        # STAGE 1: assume the segmentation is correct
        p_res_st1 = sub_correct_me_ext(to_do[0], to_do[1], to_do[2], 1)
        # STAGE 2: assume the first and second tokens are merged
        p_res_st2 = sub_correct_me_ext(g_check_a, to_do[0] + to_do[1], to_do[2], 2)
        # STAGE 3: assume the second and third tokens are merged
        p_res_st3 = sub_correct_me_ext(to_do[0], to_do[1] + to_do[2], g_check_e, 3)
    else:
        # STAGE 1: assume the segmentation is correct
        p_res_st1 = sub_correct_me(to_do[0], to_do[1], to_do[2], 1)
        # STAGE 2: assume the first and second tokens are merged
        p_res_st2 = sub_correct_me(g_check_a, to_do[0] + to_do[1], to_do[2], 2)
        # STAGE 3: assume the second and third tokens are merged
        p_res_st3 = sub_correct_me(to_do[0], to_do[1] + to_do[2], g_check_e, 3)
    # print the correction result
    cor_ret = correct_result(to_do, [p_res_st1, p_res_st2, p_res_st3], True)
    if not cor_ret:
        final_words = ['NONE']
    else:
        if cor_ret['type'] == 1:
            final_words = jieba_i[0:min_index - 1] + [to_do[0], cor_ret['item'], to_do[2]] + jieba_i[min_index + 2:jieba_len]
        elif cor_ret['type'] == 2:
            final_words = jieba_i[0:min_index - 1] + [cor_ret['item'], to_do[2]] + jieba_i[min_index + 2:jieba_len]
        elif cor_ret['type'] == 3:
            final_words = jieba_i[0:min_index - 1] + [to_do[0], cor_ret['item']] + jieba_i[min_index + 2:jieba_len]
        else:
            final_words = ['NONE']
    return ''.join(final_words)