def word_segment(text):
    pynlpir.open()
    # pynlpir.segment returns (word, pos) pairs when pos_tagging is on (the default)
    segments = pynlpir.segment(text, pos_tagging=True)
    segment_result = []
    pos_result = []
    for segment in segments:
        segment_result.append(segment[0])
        pos_result.append(segment[1])
    pynlpir.close()
    return segment_result, pos_result
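# Usage sketch for word_segment above (hedged: the sample sentence and the
# printed tags are illustrative, and NLPIR must be installed with a valid license):
def demo_word_segment():
    words, tags = word_segment('欢迎使用NLPIR汉语分词系统')
    for w, t in zip(words, tags):
        print(w, t)  # e.g. 欢迎 verb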
def get_key_words(text):
    pynlpir.open()
    result = []
    keywords = pynlpir.get_key_words(text, weighted=True)
    for keyword, weight in keywords:
        result.append(keyword)
    pynlpir.close()  # close even when no keywords were found
    return result
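# A variant sketch of the same wrapper (hypothetical name) using try/finally,
# so pynlpir.close() runs even if keyword extraction raises:
def get_key_words_safe(text):
    pynlpir.open()
    try:
        return [keyword for keyword, weight in pynlpir.get_key_words(text, weighted=True)]
    finally:
        pynlpir.close()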
def nlpir_keywords(text, n):
    pynlpir.open()
    # print 'Keyword test:\n'
    key_words = list(pynlpir.get_key_words(text, n, weighted=False))
    # for key_word in key_words:
    #     print key_word[0], '\t', key_word[1]
    pynlpir.close()
    print key_words
    return key_words
def words_cixing(question, pos=1):  # pos=1: tag parts of speech; otherwise do not tag
    pynlpir.open()
    if pos:
        pos1 = ['{}/{}'.format(k, v) for k, v in pynlpir.segment(question, pos_names=None, pos_tagging=pos)]
    else:
        # pos_tagging=False so this branch really returns untagged tokens
        pos0 = pynlpir.segment(question, pos_tagging=False)
    pynlpir.close()
    if pos:
        return pos1
    else:
        return pos0
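# Usage sketch for words_cixing (the input sentence and the shapes shown are
# illustrative; actual tags come from NLPIR's raw tag set since pos_names=None):
def demo_words_cixing():
    print(words_cixing('今天天气不错'))          # tagged, e.g. ['今天/t', '天气/n', ...]
    print(words_cixing('今天天气不错', pos=0))   # untagged, e.g. ['今天', '天气', ...]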
def test_license_auto_update(self):
    """Tests that the auto-update of the license works."""
    try:
        # switch old one to the new one
        os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                  os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)))
        os.rename(os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)),
                  os.path.join(DATA_DIR, LICENSE_NAME))
        pynlpir.open()
        pynlpir.close()
    finally:
        # switch back the license
        os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                  os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)))
        os.rename(os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)),
                  os.path.join(DATA_DIR, LICENSE_NAME))
def test_license_expire(self):
    """Tests that a RuntimeError is raised if the license is invalid."""
    temp_dir = tempfile.mkdtemp()
    temp_data_dir = os.path.join(temp_dir, 'Data')
    shutil.copytree(DATA_DIR, temp_data_dir)
    shutil.copy(LICENSE_FILE, temp_data_dir)
    self.assertRaises(RuntimeError, pynlpir.open, temp_dir)
    temp_license_file = os.path.join(temp_data_dir, LICENSE_NAME)
    os.remove(temp_license_file)
    self.assertRaises(RuntimeError, pynlpir.open, temp_dir)
    shutil.rmtree(temp_dir)
    pynlpir.close()
def main():
    py.open()
    a = sys.argv[1]
    result = py.segment(a)
    res_str = []
    for r in result:
        if len(r[0]) == 2 and (r[1] == "noun" or r[1] == "verb" or r[1] == "adjective"):
            f_result = fsame.find(r[0])
            ff_result = fsame.ffind(r[0])
            if f_result == r[0] or ff_result == r[0]:
                res_str.append(r[0])
            else:
                if random.randint(0, 1) == 0:
                    res_str.append(f_result)
                else:
                    res_str.append(ff_result)
        else:
            res_str.append(r[0])
    print "".join(res_str)
    py.close()
def post(self, request):
    obj_id = request.POST['obj_id']
    school = MySchool.objects.get(id=int(obj_id))
    feeds = []
    # weibo
    # App Key:802677147
    # App Secret:f75be23800d779cc9dbbf6b467b7ff61
    # Redirect url: https://api.weibo.com/oauth2/default.html
    # code: 4ccb7879bf204466b80e02c106d09727
    # read baidu
    params = {'keyword': school.name}
    # send a 3rd party service request
    baidu_consumer.delay(params)
    # read saved feeds
    feeds = MyBaiduStream.objects.filter(school=school).order_by('-last_updated')[:100]
    content = loader.get_template(self.template_name)
    tieba_html = content.render(Context({
        'obj': school,
        'feeds': feeds,
    }))
    # hot topics
    pynlpir.open()  # must have this line!
    topics = feeds[:50]
    content = loader.get_template(self.newsticker_template_name)
    newsticker_html = content.render(Context({
        'objs': topics,
        'keywords': pynlpir.get_key_words(''.join([f.name + f.description for f in feeds]),
                                          max_words=50, weighted=True)
    }))
    pynlpir.close()
    return HttpResponse(json.dumps({'bd_html': tieba_html, 'news_html': newsticker_html}),
                        content_type='application/javascript')
def import_userdict(file_dir):
    pynlpir.open()
    nlpir.import_userdict(file_dir)
    pynlpir.close()
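# Sketch using the raw ctypes binding directly, as the partition() functions
# later in this section do (assumption: a plain-text user dictionary file;
# ImportUserDict expects bytes):
def import_userdict_raw(file_dir):
    pynlpir.open()
    pynlpir.nlpir.ImportUserDict(file_dir.encode('utf-8'))
    pynlpir.close()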
def Participle(list_datas, filename_stopwords):  # word segmentation
    time_start = time.time()
    print("正在分词...")
    list_garbagesT.clear()
    list_words_stop = GetListWords(filename_stopwords)
    pynlpir.open()
    for data in list_datas:
        # segments = pynlpir.segment(data.content, pos_names='all', pos_english=False)
        # file_nlp.write('\n')
        # for segment in segments:
        #     file_nlp.write("[ %s : %s ]" % (segment[0], segment[1]))
        # file_nlp.write('\n')
        if len(data.content) < 8:
            data.error = "内容过短"  # content too short
            list_garbagesT.append(data)
            continue
        list_words = pynlpir.get_key_words(data.content, max_words=70)
        if len(list_words) == 0:
            data.error = "没有分词结果"  # no segmentation result
            list_garbagesT.append(data)
            continue
        # print("开始停词")  # apply the stop-word list
        for word in list_words:
            if word in list_words_stop:
                # print("停了个词" + word)  # stopped a word
                continue
            if word == '':
                data.error = "包含空白分词"  # contains an empty token
                list_garbagesT.append(data)
                break
            # count the term frequency
            contentT = data.content
            count = 0
            while contentT.find(word) > -1:
                contentT = contentT.replace(word, '', 1)
                count += 1
            if count == 0:
                data.error = "分词不属于原文"  # token not found in the original text
                list_garbagesT.append(data)
                break
            # save the frequency counts
            data.dict_words_tfidf[word] = count
        if len(data.dict_words_tfidf) == 0:
            data.error = "词频统计结果为空"  # frequency counts are empty
            list_garbagesT.append(data)
            continue
    # remove garbage records
    for data in list_garbagesT:
        list_datas.remove(data)
        list_garbages.append(data)
    list_garbagesT.clear()
    pynlpir.close()
    time_end = time.time()
    print("用时 : %.2f s" % (time_end - time_start))
    return list_datas
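# Note on the frequency loop above: removing one occurrence at a time counts
# non-overlapping matches, so str.count is a simpler near-equivalent (the two
# can differ only when a deletion creates a new match). A minimal sketch:
def count_occurrences(content, word):
    return content.count(word)  # non-overlapping count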
def segment(self):
    """
    fni:  str; input file name with path
    fno:  str; output file name with path
    lang: str; language code
    pos:  bool; POS tags included
    n:    int; no. of lines processed
    """
    import copy
    from PyQt5.QtWidgets import QApplication
    from opencc import OpenCC

    openCC = OpenCC('t2s')  # convert from Traditional-to-Simplified
    pynlpir.open(encoding="utf-8")
    print("Finished initializing ICTCLAS/NLPIR")
    count = lineCount(self.fni)
    fit = open(self.fni, "r", encoding="UTF-8")
    fot = open(self.fno, "w", encoding="UTF-8", newline="\n")
    sep = " "  # separator of Chinese tokens (space by default)
    n = 0
    for linet in fit:
        n += 1
        if linet.strip() == '':  # empty string
            fot.write("\n")
            continue
        lines = openCC.convert(linet.strip())
        # segment with optional POS-tagging
        lines_seg = pynlpir.segment(lines, pos_tagging=True, pos_names=None)
        # The following segments the zht text according to the
        # segmentation patterns obtained from NLPIR above
        tokens = []    # list to hold 'words' of the segmented zht line
        pos_tags = []  # list to hold POS tags of the segmented words
        while len(lines_seg) > 0:  # loop until nothing is left in lines_seg
            t, p = lines_seg.pop(0)  # remove leftmost zhs token and save it
            m = len(t)  # no. of characters in token
            tokens.append(linet[0:m])  # add corresponding zht token to tokens[]
            pos_tags.append(p)
            linet = linet[m:]  # delete token from zht line (from beginning of string)
        # fot.write(sep.join(tokens) + "\n")  # write zht-seg output
        tok_pos = ["{}".format(x) for x, y in zip(tokens, pos_tags)]  # list of tok_pos pairs
        fot.write(sep.join(tok_pos) + "\n")
        # if (n == 1): break
        if n % 50 == 0:
            self.window.ui.progressBar.setValue(round(100 * n / self.fi_linecount, 0))
            self.window.ui.progressBar.repaint()
            QApplication.processEvents()
    self.window.ui.progressBar.setValue(100)
    self.window.ui.progressBar.repaint()
    fit.close()
    fot.close()
    pynlpir.close()
    self.numLineProcessed = n
    return n
def close():
    return pynlpir.close()
def partition(input_path, output_path):
    """
    Segment the text files under input_path and store the results in output_path.
    :param input_path: directory of the input text files
    :param output_path: directory for the segmentation results
    :return: the number of words with encoding errors
    """
    f3 = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8')
    f3_name = f3.name
    stop_set = []
    f_stop_list = open('C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/extract_main_word/stop_list.txt',
                       'r', encoding='utf-8')
    for line in f_stop_list:
        stop_set.append(line.split()[0])
    stop_set = set(stop_set)
    os.chdir(input_path)
    f_lst = os.listdir(os.getcwd())
    cnt1 = 0
    nlpir = pynlpir.nlpir
    pynlpir.open()
    nlpir.ImportUserDict(b'C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/new_bigdic.txt')
    for item in f_lst:
        ans_lst = []
        f = open(item, 'r', encoding='utf-8')
        s = bytes(f.read(), encoding='utf-8')
        f.close()
        size = ctypes.c_int()
        result = nlpir.ParagraphProcessA(s, ctypes.byref(size), True)
        result_t_vector = ctypes.cast(result, ctypes.POINTER(nlpir.ResultT))
        words = []
        for i in range(0, size.value):
            r = result_t_vector[i]
            word = s[r.start:r.start + r.length]
            words.append((word, r.sPOS))
        f2 = open(output_path + item, 'w', encoding='utf-8')
        for word, pos in words:
            # try:
            if word.decode('utf-8') not in stop_set:
                if pos.decode('utf-8') > b'z'.decode('utf-8') or pos.decode('utf-8').upper() == pos.decode('utf-8') and pos.decode('utf-8') != '':
                    ans_lst.append((pos.decode('utf-8'), word.decode('utf-8')))
                f2.write(word.decode('utf-8') + ' ' + pos.decode('utf-8') + '\n')
                f3.write(word.decode('utf-8') + ' ' + pos.decode('utf-8') + '\n')
            # except:
            #     cnt1 += 1
            # else:
            #     f2.write(word.decode('utf-8') + '\n')
        keys = pynlpir.get_key_words(s, max_words=10, weighted=False)
        ans_set = list(set(ans_lst))
        frequency = [0 for k in range(len(ans_set))]
        for k in range(len(ans_set)):
            for item in ans_lst:
                if item == ans_set[k]:
                    frequency[k] += 1
        f2.write('\n\nMy tags: ')
        type_lst = []
        for item in ans_set:  # ans_set: ('COMPANY_OF_INDUSTRY_56', '兴业银行')
            if item[0] not in type_lst:
                type_lst.append(item[0])
        type_lst.sort()
        ans_s = ''
        for k in range(len(type_lst)):
            ans_s += str(type_lst[k]) + ': '
            for l in range(len(ans_set)):
                if ans_set[l][0] == type_lst[k]:
                    # a helper is called here to express the stock-fund relationship
                    ans_s += stock2fund(ans_set, frequency, l)
                    # ans_s += ' (' + str(ans_set[l][1]) + ': ' + str(frequency[l]) + ')'
            ans_s += '\n'
        f2.write(ans_s)
        f2.write('\n\nkeyword: ')
        # count the frequency of the keywords returned by the segmenter
        keys_f = [0 for l in range(len(keys))]
        common_last_name = [
            '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周', '徐', '孙', '马', '朱', '胡',
            '郭', '何', '高', '林', '郑', '谢', '罗', '梁', '宋', '唐', '许', '韩', '冯', '邓', '曹',
            '彭', '曾', '蕭', '田', '董', '袁', '潘', '于', '蒋', '蔡', '余', '杜', '叶', '程', '苏',
            '魏', '吕', '丁', '任', '沈', '姚', '卢', '姜', '崔', '钟', '谭', '陆', '汪', '范', '金',
            '石', '廖', '贾', '夏', '韦', '付', '方', '白', '邹', '孟', '熊', '秦', '邱', '江', '尹',
            '薛', '闫', '段', '雷', '侯', '龙', '史', '陶', '黎', '贺', '顾', '毛', '郝', '龚', '邵',
            '万', '钱', '严', '覃', '武', '戴', '莫', '孔', '向', '汤'
        ]
        ans3 = ''
        f3.seek(0)
        for line in f3:
            if len(line.split()) == 2:
                name = line.split()[0]
                pos = line.split()[1]
                for l in range(len(keys)):
                    if name == keys[l]:
                        keys_f[l] += 1
                if name[0] in common_last_name and name not in ['万元', '周一', '周二', '周三', '周四', '周五', '周六', '周日', '周天'] and len(name) in [2, 3] and pos == 'nr':
                    ans3 += ' ' + name
        ans2 = ''
        for l in range(len(keys)):
            ans2 += str(keys[l]) + ': ' + str(keys_f[l]) + ' '
        f2.write(ans2)
        f2.write('\n\nRelated person: ' + ans3)
        f2.close()
    pynlpir.close()
    return cnt1
def segment_tagging(sentence):
    pynlpir.open()  # Initializes the NLPIR API
    sentence_segment_tag = pynlpir.segment(sentence)  # get the segmentation and tagging result
    pynlpir.close()  # Exits the NLPIR and frees allocated memory.
    return sentence_segment_tag  # return results
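# Usage sketch (illustrative; with pynlpir.segment's defaults, pos_names='parent'
# and pos_english=True, tags are English parent-category names):
def demo_segment_tagging():
    for word, tag in segment_tagging('我爱北京'):
        print(word, tag)  # e.g. 我 pronoun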
def _convert_examples_to_features(self, examples, doc_stride, is_train=True, topRate=0.5):
    """Loads a data file into a list of `InputBatch`s."""
    unique_id = 1000000000
    features = []
    pynlpir.open()
    with tqdm(total=len(examples), desc="convert examples to features add bm25:") as pbar:
        for example_id, example in enumerate(examples):
            qid = example.qid
            question = example.qusetion  # note: attribute spelling comes from the Example class
            docids = example.docids
            contexts = example.contexts
            answer = None
            answer_span = None
            features_temp = []
            sub_doc = []
            for context_index, (docid, context) in enumerate(zip(docids, contexts)):
                if is_train:
                    answer_span = example.answer_span
                question_tokens = self.tokenizer.tokenize(question) if len(question) > 0 else []
                if len(question_tokens) > self.max_query_length:  # cut at tail
                    question_tokens = question_tokens[0:self.max_query_length]
                token_to_orig_index = []
                orig_to_token_index = []
                context_tokens = []
                for (i, word) in enumerate(context):
                    orig_to_token_index.append(len(context_tokens))
                    sub_tokens = self.tokenizer.tokenize(word)
                    for sub_token in sub_tokens:
                        token_to_orig_index.append(i)
                        context_tokens.append(sub_token)
                token_start_position = None
                token_end_position = None
                if is_train:
                    token_start_position = orig_to_token_index[answer_span[0]]
                    if answer_span[1] < len(context) - 1:
                        token_end_position = orig_to_token_index[answer_span[1] + 1] - 1
                    else:
                        token_end_position = len(context_tokens) - 1
                    (token_start_position, token_end_position) = self._improve_answer_span(
                        context_tokens, token_start_position, token_end_position, example.answer)
                # We can have documents that are longer than the maximum sequence length.
                # To deal with this we do a sliding window approach, where we take chunks
                # of up to our max length with a stride of `doc_stride`.
                max_context_length = self.max_seq_length - self.max_query_length - 3
                _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
                    "DocSpan", ["start", "length"])
                doc_spans = []
                start_offset = 0
                while start_offset < len(context_tokens):
                    length = len(context_tokens) - start_offset
                    if length > max_context_length:
                        length = max_context_length
                    doc_spans.append(_DocSpan(start=start_offset, length=length))
                    if start_offset + length == len(context_tokens):
                        break
                    start_offset += min(length, doc_stride)
                for (doc_span_index, doc_span) in enumerate(doc_spans):
                    token_to_orig_map = {}
                    token_is_max_context = {}
                    tokens = ["[CLS]"] + question_tokens + ["[SEP]"]
                    segment_ids = [0] * len(tokens)
                    for i in range(doc_span.length):
                        split_token_index = doc_span.start + i
                        token_to_orig_map[len(tokens)] = token_to_orig_index[split_token_index]
                        is_max_context = self._check_is_max_context(doc_spans, doc_span_index, split_token_index)
                        token_is_max_context[len(tokens)] = is_max_context
                        tokens.append(context_tokens[split_token_index])
                        segment_ids.append(1)
                    tokens.append("[SEP]")
                    segment_ids.append(1)
                    input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
                    input_mask = [1] * len(tokens)
                    padding = [0] * (self.max_seq_length - len(input_ids))
                    input_ids += padding
                    input_mask += padding
                    segment_ids += padding
                    assert len(input_ids) == self.max_seq_length
                    assert len(input_mask) == self.max_seq_length
                    assert len(segment_ids) == self.max_seq_length
                    start_position = None
                    end_position = None
                    if is_train:
                        # For training, if our document chunk does not contain an annotation
                        # we throw it out, since there is nothing to predict.
                        doc_start = doc_span.start
                        doc_end = doc_span.start + doc_span.length - 1
                        out_of_span = False
                        if not (token_start_position >= doc_start and token_end_position <= doc_end):
                            out_of_span = True
                        if not out_of_span:
                            doc_offset = len(question_tokens) + 2
                            start_position = token_start_position - doc_start + doc_offset
                            end_position = token_end_position - doc_start + doc_offset
                        else:
                            continue
                    features_temp.append(
                        InputFeatures(
                            unique_id=unique_id,
                            qid=qid,
                            context_index=context_index,
                            doc_span_index=doc_span_index,
                            tokens=tokens,
                            token_to_orig_map=token_to_orig_map,
                            token_is_max_context=token_is_max_context,
                            input_ids=input_ids,
                            input_mask=input_mask,
                            segment_ids=segment_ids,
                            docid=docid,
                            answer=answer,
                            start_idx=start_position,
                            end_idx=end_position))
                    if not is_train:
                        sub_doc.append(pynlpir.segment(''.join(tokens), pos_tagging=False))
                    unique_id += 1
            try:
                if not is_train:
                    bm25_model = BM25(sub_doc)
                    bm25_score = bm25_model.get_scores(pynlpir.segment(question, False))
                    rankindex = np.argsort(-np.array(bm25_score))
                    features.extend([features_temp[i] for i in rankindex[:int(len(rankindex) * topRate)]])
                else:
                    features.extend(features_temp)
            except:
                print('end')
            pbar.update(1)
    pynlpir.close()
    return features
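# Minimal standalone sketch of the doc_stride sliding window used above
# (illustrative numbers, not the exact values from this codebase):
import collections

def make_doc_spans(num_tokens, max_context_length, doc_stride):
    DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
    spans, start = [], 0
    while start < num_tokens:
        length = min(num_tokens - start, max_context_length)
        spans.append(DocSpan(start=start, length=length))
        if start + length == num_tokens:
            break
        start += min(length, doc_stride)
    return spans

# e.g. make_doc_spans(10, 4, 2) yields spans starting at 0, 2, 4, 6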
def getKeywordsAndSave(self, *args, **kwargs):
    import pickle
    freq_lower_bound = int(kwargs["freq_lower_bound"])
    token_len_lower_bound = int(kwargs["token_len_lower_bound"])
    doc_len_lower_bound = int(kwargs["doc_len_lower_bound"])
    doc_len_upper_bound = int(kwargs["doc_len_upper_bound"])
    if str(kwargs["method"]) == "keyword":
        file_keywords = open(self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
                             str(kwargs["target_name"]) + '.fine.keywords', 'w')
    elif str(kwargs["method"]) == "normal":
        file_keywords = open(self.conf_io["prefix"] + self.conf_io["output_data_directory"] +
                             str(kwargs["target_name"]) + '.keywords', 'w')
    tokens = []
    token_indexes = {}
    if bool(kwargs["static_file"]) is True:
        source_name = self.conf_io["prefix"] + self.conf_io["output_data_directory"] + str(kwargs["source_name"])
        with open(source_name, 'r') as f:
            _ind = 0
            for ind, line in enumerate(f):
                try:
                    with Timer('calculateTokens') as t:
                        tokens.append(self.calculateTokens(line,
                                                           method=str(kwargs["method"]),
                                                           doc_len_lower_bound=doc_len_lower_bound,
                                                           doc_len_upper_bound=doc_len_upper_bound))
                    # [experimental feature]
                    # this is to be used with LDA
                    # to show what raw doc is associated with each topic
                    token_indexes[ind] = _ind
                    _ind += 1
                except Exception as e:
                    if e is KeyboardInterrupt:
                        break
                    print e
                    print "error with ", line
                    continue
                else:
                    pass
        for line in tokens:
            if line is not None:
                filtered_tokens = [token for token in line.split(',')
                                   if self.frequency[token.lower()] > freq_lower_bound
                                   and len(token) > token_len_lower_bound]
                filtered_tokens = ','.join(filtered_tokens)
                file_keywords.write('%s\n' % (filtered_tokens.encode('utf-8')))
                file_keywords.flush()
        f.close()
        # experimental
        json.dump(token_indexes,
                  open(self.f_token_indexes + "token_indexes.pickle", "w"),
                  ensure_ascii=True)
    else:
        doc_list = args[0]
        for ind, line in enumerate(list(doc_list)):
            try:
                tokens.append(self.calculateTokens(line,
                                                   method=str(kwargs["method"]),
                                                   doc_len_lower_bound=doc_len_lower_bound,
                                                   doc_len_upper_bound=doc_len_upper_bound))
            except Exception as e:
                if e is KeyboardInterrupt:
                    break
                print e
                print "error with ", line
                continue
            else:
                pass
        for line in tokens:
            if line is not None:
                filtered_tokens = [token for token in line.split(',')
                                   if self.frequency[token.lower()] > freq_lower_bound
                                   and len(token) > token_len_lower_bound]
                filtered_tokens = ','.join(filtered_tokens)
                file_keywords.write('%s\n' % (filtered_tokens.encode('utf-8')))
                file_keywords.flush()
    file_keywords.close()
    pynlpir.close()
    return True
def segment_filter():
    """
    Segment the raw files and filter the results.
    :return:
    """
    # get the file list
    file_list = os.listdir(path)
    # file_list = ['caption_validation_annotations_20170910.json']
    res = []  # result list
    # start the segmenter
    pynlpir.open()
    for file_name in file_list:
        file_path = os.path.join(path, file_name)
        # open the file
        f = open(file_path, 'r')
        # load the JSON; each file holds a single line containing one JSON value
        j = json.loads(f.readline())
        # j = j[:100]
        # keep only the caption part
        j = [x['caption'] for x in j]
        # segment
        j = [[y.replace('\n', ' ') for y in x] for x in j]
        j = [[pynlpir.segment(y) for y in x] for x in j]
        # filter by POS: keep words with the categories below and drop the tags
        # (see https://github.com/tsroten/pynlpir/blob/master/pynlpir/pos_map.py for tag meanings)
        word_filter = ('noun', 'time word', 'locative word', 'noun of locality',
                       'verb', 'adjective', 'distinguishing word', 'status word',
                       'numeral', 'adverb')
        j = [[[z[0] for z in y if z[1] in word_filter] for y in x] for x in j]
        # drop sentences that became empty
        j = [[y for y in x if len(y) != 0] for x in j]
        # remove duplicates
        for x in range(len(j)):
            temp = []
            for i in range(len(j[x])):
                flag = True
                for k in range(i + 1, len(j[x])):
                    temp_set1 = set(j[x][i])
                    temp_set2 = set(j[x][k])
                    if len(temp_set1 | temp_set2) == len(temp_set1 & temp_set2):
                        flag = False
                        break
                if flag:
                    temp.append(j[x][i])
            j[x] = temp
        # append to the result set
        res[len(res):len(res)] = j
    pynlpir.close()
    print '分词完成'  # segmentation finished
    # save to a JSON file
    res = [{'caption': x} for x in res]
    json_obj = json.dumps(res)
    f = open(json_path, 'w')
    f.write(json_obj)
    f.close()
    print '存入json文件:%s' % json_path  # saved to JSON file
def tearDown(self):
    pynlpir.close()
def get_key_words():
    s = ''
    max_words = MAX_WORDS_DEFAULT
    max_hot_words = MAX_HOT_WORDS_DEFAULT
    update_hot_word = UPDATE_HOT_WORD_DEFAULT
    # get doc
    if request.method == 'POST':
        s = request.form.get('s', type=str, default='')
        # whether to update the hot_word table
        update_hot_word = request.form.get('update_hot_word', type=str, default=UPDATE_HOT_WORD_DEFAULT)
        try:
            max_words = request.form.get('max_words', type=str, default=MAX_WORDS_DEFAULT)
            if max_words != '':  # a max_words parameter is present (possibly the default '3')
                print('[POST] max_words yes')
                max_words = int(max_words.strip())
                print('\tmax_words =', max_words)
            else:
                max_words = MAX_WORDS_DEFAULT
                print('[POST] max_words no')
        except:
            # max_words could not be parsed; fall back to the default of 3
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = request.form.get('max_hot_words', type=str, default=MAX_HOT_WORDS_DEFAULT)
            if max_hot_words != '':
                max_hot_words = int(max_hot_words.strip())
            else:
                max_hot_words = MAX_HOT_WORDS_DEFAULT
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    elif request.method == 'GET':
        s = request.args.get('s')
        update_hot_word = request.args.get('update_hot_word')
        if update_hot_word != 'False':
            update_hot_word = 'True'
        try:
            max_words = int(request.args.get('max_words').strip())
        except:
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = int(request.args.get('max_hot_words').strip())
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    print('[PID]', os.getppid())
    # get key words
    if s == '':
        # the document is empty; nothing to analyze
        return 'null'
    else:
        # extract the keywords
        try:
            pynlpir.open()
            key_word_list = pynlpir.get_key_words(s, max_words=max_words, weighted=False)
        except:
            key_word_list = []
        else:
            pynlpir.close()
        if update_hot_word == 'True':
            # update the database on a new thread
            print('[update_hot_word] True')
            t = threading.Thread(target=db_helper.update_tables,
                                 args=(','.join(key_word_list), max_hot_words))
            t.setDaemon(True)
            t.start()
        else:
            print('[update_hot_word] False')
        return ','.join(key_word_list)
def textual(uid):
    import torch
    import pynlpir
    import random
    import numpy as np

    torch.set_num_threads(8)
    torch.manual_seed(1)
    random.seed(1)
    # opening embedding file
    print('opening embedding file')
    f = open('sgns_weibo.bigram-char', 'r', encoding='utf8')
    raw = f.readlines()
    f.close()
    # constructing word to index dictionary
    print('constructing word to index dictionary')
    word_to_ix = dict()
    iter = 0
    for line in raw:
        word_to_ix[(line.split())[0]] = iter
        iter = iter + 1
    for i in ['ttttt', 'ggggg', 'uuuuu', 'eeeee', 'ooooo', ' ']:
        word_to_ix[i] = iter
        iter += 1
    model_path = 'mr_best_model_minibatch_acc_7863.model'
    EMBEDDING_DIM = 300
    VOCAB_SIZE = 195203
    HIDDEN_DIM = 100  # TO BE TUNED
    LABEL_SIZE = 2
    BATCH_SIZE = 1    # TO BE TUNED
    EPOCH = 100       # TO BE TUNED
    DROPOUT = 0.5     # TO BE TUNED; multiplied by 0.95 every epoch
    NUM_LAYER = 2     # TO BE TUNED
    model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, LABEL_SIZE, BATCH_SIZE, DROPOUT, NUM_LAYER)
    model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
    path = './data/' + uid + '.txt'
    text_set = []
    pynlpir.open()
    f = open(path, 'r', encoding='utf8')
    raw = f.readlines()
    f.close()
    if len(raw) == 0:
        return [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]  # to be determined
    for line in raw:
        # print(line)
        if len(line) == 0:
            continue
        # tokenizer from the Chinese Academy of Sciences
        temp = pynlpir.segment(line, pos_tagging=False)  # pos_tagging=False disables POS tagging
        temp2 = [x for x in temp if x != ' ']  # remove redundant spaces
        sentence = []
        for word in temp2:
            if word in word_to_ix.keys():
                sentence.append(word_to_ix[word])
            else:
                sentence.append(word_to_ix['ooooo'])  # OOV
        text_set.append(sentence)
    pynlpir.close()
    scores = []
    model.eval()
    if len(text_set) == 0:
        return [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]
    for sent in text_set:
        if len(sent) == 0:
            continue
        model.hidden = model.init_hidden()  # detach from last example
        # print(sent)
        temp = model(torch.tensor(sent))
        temp = (temp[0][1]).item()
        scores.append(np.e ** temp)
    # print(scores)
    feature = []
    # mean, >0.8, >0.9, <0.1, <0.2, max, min, median, continuous>0.75, continuous<0.25
    feature.append(np.mean(scores))
    feature.append(more_than(scores, 0.8))
    feature.append(more_than(scores, 0.9))
    feature.append(less_than(scores, 0.1))
    feature.append(less_than(scores, 0.2))
    feature.append(max(scores))
    feature.append(min(scores))
    feature.append(np.median(scores))
    count = 0
    temp = 0
    for x in scores:
        if x > 0.75:
            temp += 1
        else:
            count = max(count, temp)
            temp = 0
    count = max(count, temp)
    feature.append(count)
    count = 0
    temp = 0
    for x in scores:
        if x < 0.25:
            temp += 1
        else:
            count = max(count, temp)
            temp = 0
    count = max(count, temp)
    feature.append(count)
    return feature
def createDocMapAndClickInfo(total_set_file, doc_set_file):
    user_map_r2v = {}
    user_map_v2r = {}
    doc_map_r2v = {}  # doc map (facilitates the calculation in PLSA)
    doc_map_v2r = {}  # not used
    user_set = set()  # total users
    doc_set = set()   # total documents
    user_click_count = {}
    doc_click_count = {}       # clicks on every document
    user_doc_click_count = {}  # clicks on a specific document by a specific user
    if os.path.isfile(doc_set_file):
        is_write_need_file = True
    else:
        is_write_need_file = False
    fp_total_set = open(total_set_file, 'r')
    if is_write_need_file == False:
        fp_doc_set = open(doc_set_file, 'w')
        fp_doc_map_r2v = open('../PLSA/data/doc_map_r2v.csv', 'w')
        fp_doc_map_v2r = open('../PLSA/data/doc_map_v2r.csv', 'w')
        fp_doc_click_count = open('../PLSA/data/doc_click_count.csv', 'w')
        fp_user_doc_click_count = open('../PLSA/data/user_doc_click_count.csv', 'w')
    cnt = 0
    cnt1 = 0
    pynlpir.open()
    for line in fp_total_set:
        word = line.split('\t')
        user_set.add(word[0])
        doc_set.add(word[1])
        doc_click_count.setdefault(word[1], 0)
        doc_click_count[word[1]] += 1
        user_click_count.setdefault(word[0], 0)
        user_click_count[word[0]] += 1
        user_doc_click_count.setdefault(word[0], {})
        if user_doc_click_count[word[0]].has_key(word[1]) == False:
            user_doc_click_count[word[0]][word[1]] = 0
        user_doc_click_count[word[0]][word[1]] += 1
        if user_map_r2v.has_key(word[0]) == False:
            user_map_r2v[word[0]] = cnt1
            user_map_v2r[cnt1] = word[0]
            cnt1 += 1
        if doc_map_r2v.has_key(word[1]) == False:
            doc_map_r2v[word[1]] = cnt
            doc_map_v2r[cnt] = word[1]
            cnt += 1
            if is_write_need_file == False:
                title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
                content_split_result = pynlpir.nlpir.ParagraphProcess(word[5], True)
                # make sure that the news id map is true
                fp_doc_set.write('%s\t%s\t%s' % (word[1], title_split_result, content_split_result))
    # doc_map = sorted(doc_map_r2v.items(), key=lambda d: d[1], reverse=False)
    if is_write_need_file == False:
        for d, dtag in doc_map_r2v.items():
            fp_doc_map_r2v.write('%s %d\n' % (d, dtag))
        for dtag, d in doc_map_v2r.items():
            fp_doc_map_v2r.write('%d %s\n' % (dtag, d))
        for d, dclicks in doc_click_count.items():
            fp_doc_click_count.write('%s %d\n' % (d, dclicks))
    user_clicks = 0
    for u, uitem in user_doc_click_count.items():
        for d in uitem.keys():
            if is_write_need_file == False:
                fp_user_doc_click_count.write('%s %s %d\n' % (u, d, uitem[d]))
            user_clicks += uitem[d]
    print 'user clicks = ', user_clicks
    pynlpir.close()
    if is_write_need_file == False:
        fp_doc_set.close()
    fp_total_set.close()
    print 'number of users:', len(user_set)
    print 'number of documents:', len(doc_set)
    print 'createDocMap end'
    # user_set (real_user_id), doc_set (real_news_id)
    # doc_map_r2v (real_news_id -> virtual_news_id)
    # doc_map_v2r (virtual_news_id -> real_news_id)
    # doc_click_count (real_news_id -> clicks)
    # user_doc_click_count (real_user_id, real_news_id -> clicks)
    return user_set, doc_set, user_map_r2v, user_map_v2r, doc_map_r2v, doc_map_v2r, user_click_count, doc_click_count, user_doc_click_count
def train():
    # opening embedding file
    print('opening embedding file')
    f = open(path + 'sgns.weibo.bigram-char', 'r', encoding='utf8')
    raw = f.readlines()
    f.close()
    # constructing word to index dictionary
    print('constructing word to index dictionary')
    word_to_ix = dict()
    iter = 0
    for line in raw:
        word_to_ix[(line.split())[0]] = iter
        iter = iter + 1
    for i in ['ttttt', 'ggggg', 'uuuuu', 'eeeee', 'ooooo', ' ']:
        word_to_ix[i] = iter
        iter += 1
    # loading the pre-trained embedding vectors
    print('loading the pre-trained embedding vectors')
    embed_vectors = []
    for line in raw:
        embed_vectors.append([float(j) for j in ((line.split())[1:])])
    for i in ['ttttt', 'ggggg', 'uuuuu', 'eeeee', 'ooooo']:
        embed_vectors.append((torch.zeros(300)).tolist())  # randn or zeros
    embed_vectors.append((torch.zeros(300)).tolist())  # for ' '
    global FINAL
    FINAL = len(embed_vectors) - 1
    # load the train, val, test data
    print('load the train, val, test data')
    df = pd.read_excel(path + 'final data.xlsx')
    # generate matrix with first col uid, second col botornot
    idbotmat = []
    for i in range(len(df)):
        temp = []
        temp.append(df['uid'][i])
        temp.append(df['botornot'][i])
        idbotmat.append(temp)
    # all text goes into text_set, where every element is [sentence, label]
    # and sentence is itself a list
    text_set = []
    pynlpir.open()  # open the segmenter
    for pair in idbotmat:
        f = open(path + 'data/' + str(pair[0]) + '.txt', 'r', encoding='utf8')
        # print(f)
        raw = f.readlines()
        f.close()
        if len(raw) == 0:
            continue
        for line in raw:
            # tokenizer from the Chinese Academy of Sciences
            temp = pynlpir.segment(line, pos_tagging=False)  # pos_tagging=False disables POS tagging
            temp2 = [x for x in temp if x != ' ']  # remove redundant spaces
            sentence = []
            for word in temp2:
                if word in word_to_ix.keys():
                    sentence.append(word_to_ix[word])
                else:
                    sentence.append(word_to_ix['ooooo'])  # OOV
            data = [sentence, pair[1]]
            text_set.append(data)
    pynlpir.close()
    print('Total sentences: ' + str(len(text_set)))
    random.shuffle(text_set)
    # determine the train/dev/test split
    train_ratio = 0.7
    dev_ratio = 0.1
    test_ratio = 0.2
    total_sample = len(text_set)
    train_set = text_set[:int(total_sample * train_ratio)]
    dev_set = text_set[int(total_sample * train_ratio):int(total_sample * (train_ratio + dev_ratio))]
    test_set = text_set[int(total_sample * (train_ratio + dev_ratio)):]
    print('declaring model')
    best_dev_acc = 0.0
    EMBEDDING_DIM = len(embed_vectors[0])
    VOCAB_SIZE = len(embed_vectors)
    HIDDEN_DIM = 100  # TO BE TUNED
    LABEL_SIZE = 2
    BATCH_SIZE = 64   # TO BE TUNED
    EPOCH = 100       # TO BE TUNED
    DROPOUT = 0.5     # TO BE TUNED; multiplied by 0.95 every epoch
    NUM_LAYER = 2     # TO BE TUNED
    # declare model
    model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, LABEL_SIZE, BATCH_SIZE, DROPOUT, NUM_LAYER)
    # load the word embedding
    embedtensor = torch.tensor(embed_vectors)
    # embedtensor.to(device)
    model.word_embeddings.weight.data = embedtensor
    # model.to(device)
    # how many parameters in the model
    print('Total params in the network: ' + str(count_parameters(model)))
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-2)  # TO BE TUNED
    # optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
    print('training starts now')
    torch.autograd.set_detect_anomaly(True)
    no_up = 0
    for i in range(EPOCH):
        print('epoch: %d start!' % i)
        random.shuffle(train_set)
        model.lstm.dropout = DROPOUT
        train_epoch(model, train_set, loss_function, optimizer, BATCH_SIZE, i)
        DROPOUT = DROPOUT * 0.95  # dropout scheduling
        print('now best dev acc:', best_dev_acc)
        dev_acc = evaluate(model, dev_set, loss_function, 'dev')
        test_acc = evaluate(model, test_set, loss_function, 'test')
        if dev_acc > best_dev_acc:
            best_dev_acc = dev_acc
            # !rm '/content/gdrive/My Drive/Colab Notebooks/best_models/mr_best_model_minibatch_acc_*.model'  # colab only
            os.system(cmd + ' ' + path + 'best_models/mr_best_model_minibatch_acc_*.model')
            print('New Best Dev!!!')
            torch.save(model.state_dict(),
                       path + 'best_models/mr_best_model_minibatch_acc_' + str(int(test_acc * 10000)) + '.model')
            no_up = 0
        else:
            no_up += 1
            if no_up >= 10:
                print("so what")
def nlpir_tokenizer(sentence):
    pynlpir.open()
    segs = pynlpir.segment(sentence, pos_tagging=False)
    pynlpir.close()
    return segs
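# Opening and closing NLPIR per sentence is expensive; a batch sketch that
# reuses one session (hypothetical helper, not in the original):
def nlpir_tokenize_many(sentences):
    pynlpir.open()
    try:
        return [pynlpir.segment(s, pos_tagging=False) for s in sentences]
    finally:
        pynlpir.close()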
def createDocMapAndClickInfo(total_set_file, doc_set_file):
    doc_map1 = {}  # doc map (facilitates the calculation in PLSA)
    doc_map2 = {}  # not used
    user_set = set()  # total users
    doc_set = set()   # total documents
    doc_click_count = {}       # clicks on every document
    user_doc_click_count = {}  # clicks on a specific document by a specific user
    if os.path.isfile(doc_set_file):
        is_write_need_file = True
    else:
        is_write_need_file = False
    fp_total_set = open(total_set_file, 'r')
    if is_write_need_file == False:
        fp_doc_set = open(doc_set_file, 'w')
        fp_doc_map1 = open('data//doc_map1.csv', 'w')
        fp_doc_map2 = open('data//doc_map2.csv', 'w')
        fp_doc_click_count = open('data//doc_click_count.csv', 'w')
        fp_user_doc_click_count = open('data//user_doc_click_count.csv', 'w')
    cnt = 0
    pynlpir.open()
    for line in fp_total_set:
        word = line.split('\t')
        user_set.add(word[0])
        doc_set.add(word[1])
        doc_click_count.setdefault(word[1], 0)
        doc_click_count[word[1]] += 1
        user_doc_click_count.setdefault(word[0], {})
        if user_doc_click_count[word[0]].has_key(word[1]) == False:
            user_doc_click_count[word[0]][word[1]] = 0
        user_doc_click_count[word[0]][word[1]] += 1
        if doc_map1.has_key(word[1]) == False:
            doc_map1[word[1]] = cnt
            doc_map2[cnt] = word[1]
            cnt += 1
            if is_write_need_file == False:
                # title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
                content_split_result = pynlpir.nlpir.ParagraphProcess(word[5], True)
                fp_doc_set.write('%s\t%s' % (word[1], content_split_result))  # , content_split_result))
    # doc_map = sorted(doc_map1.items(), key=lambda d: d[1], reverse=False)
    if is_write_need_file == False:
        for d, dtag in doc_map1.items():
            fp_doc_map1.write('%s %d\n' % (d, dtag))
        for dtag, d in doc_map2.items():
            fp_doc_map2.write('%d %s\n' % (dtag, d))
        for d, dclicks in doc_click_count.items():
            fp_doc_click_count.write('%s %d\n' % (d, dclicks))
    user_clicks = 0
    for u, uitem in user_doc_click_count.items():
        for d in uitem.keys():
            if is_write_need_file == False:
                fp_user_doc_click_count.write('%s %s %d\n' % (u, d, uitem[d]))
            user_clicks += uitem[d]
    print 'user clicks = ', user_clicks
    pynlpir.close()
    if is_write_need_file == False:
        fp_doc_set.close()
    fp_total_set.close()
    print 'number of users:', len(user_set)
    print 'number of documents:', len(doc_set)
    print 'createDocMap end'
    return user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count
def drive_end():
    pynlpir.close()
def pynlpir_segment(self, sentence):
    # pynlpir word segmentation
    pynlpir.open()
    sentence = pynlpir.segment(sentence, pos_tagging=False)
    pynlpir.close()
    return ' '.join(sentence)
def segment(sentence):
    pynlpir.open()  # Initializes the NLPIR API
    sentence_segment = pynlpir.segment(sentence, pos_tagging=False)
    # pynlpir.close()  # Exits the NLPIR and frees allocated memory.
    return sentence_segment
def close_pynlpir():
    global pynlpir
    pynlpir.close()
line_p = hanzi_prep.split_into_sentences_e(line)
for line_i in line_p:
    # join the characters, then separate the tokens with spaces
    str_i = "".join(line_i)
    str_j = ""
    if USE_SEGMENT == "JIEBA":
        str_j = " ".join(jieba.cut(str_i, cut_all=False))
    elif USE_SEGMENT == "ICTCLAS":
        str_j = " ".join(pynlpir.segment(str_i, pos_tagging=False))
    else:
        print("ERROR:未知分词系统!")  # unknown segmentation system
    fout.write(str_j + "\n")
if USE_SEGMENT == "ICTCLAS":
    print("END:ICTCLAS分词系统")  # ICTCLAS segmentation system
    pynlpir.close()
elif USE_SEGMENT == "JIEBA":
    print("END:JIEBA分词系统")  # JIEBA segmentation system
else:
    print("END:未知分词系统")  # unknown segmentation system
# compute the N-gram frequency information
# if not os.path.exists(FILE_NAME_UNIC_LM):
#     str_cmd = "ngram-count -text %s -order 2 -write %s" % (FILE_NAME_UNIC, FILE_NAME_UNIC_CNT)
#     print("正在执行:%s" % (str_cmd))  # executing command
#     os.system(str_cmd)
#     str_cmd = "ngram-count -read %s -order 2 -lm %s -gt1min 2 -gt1max 5 -gt2min 2 -gt2max 5 " % (FILE_NAME_UNIC_CNT, FILE_NAME_UNIC_LM)
#     print("正在执行:%s" % (str_cmd))  # executing command
#     os.system(str_cmd)
def close(self):
    pynlpir.close()
def tycl_replace(pat_name):
    global tycl_rep
    global stop_pos
    pynlpir.open()
    fp = file(os.path.join('./manual_pattern', pat_name), 'rb')
    fp_out = file(os.path.join('./extend_pattern', pat_name + '_tycl'), 'wb')
    for pat_line in fp:
        fp_out.write('=' * 50 + '\n')
        pat_line = (pat_line.strip()).decode('UTF-8')
        pat_line = pat_line.split()
        pat = pat_line[0]
        c_pat = pat
        for k, v in tycl_rep.iteritems():
            c_pat = c_pat.replace(k, v)
        seg_list = pynlpir.segment(pat, pos_tagging=False)
        seg_line = pynlpir.segment(pat, pos_tagging=True)
        c_seg_line = pynlpir.segment(c_pat, pos_tagging=True)
        for s_i in range(len(seg_line) - 1, -1, -1):
            seg_line[s_i] = list(seg_line[s_i])
            if seg_line[s_i][1] == None:
                seg_line[s_i][1] = u'None'
            if seg_line[s_i][0] == u' ':
                del seg_line[s_i]
            else:
                seg_line[s_i][1] = seg_line[s_i][1].replace(' ', '-')
        seg_line_str = ' '.join('/'.join(y) for y in seg_line)
        for s_i in range(len(c_seg_line) - 1, -1, -1):
            c_seg_line[s_i] = list(c_seg_line[s_i])
            if c_seg_line[s_i][1] == None:
                c_seg_line[s_i][1] = u'None'
            if c_seg_line[s_i][0] == u' ':
                del c_seg_line[s_i]
            else:
                c_seg_line[s_i][1] = c_seg_line[s_i][1].replace(' ', '-')
        c_seg_line_str = ' '.join('/'.join(y) for y in c_seg_line)
        fp_out.write(pat.encode('UTF-8') + '\n')
        fp_out.write(c_pat.encode('UTF-8') + '\n')
        fp_out.write('/'.join(y.encode('UTF-8') for y in seg_list))
        fp_out.write('\n')
        fp_out.write(seg_line_str.encode('UTF-8') + '\n')
        fp_out.write(c_seg_line_str.encode('UTF-8') + '\n')
        ss_i = 0  # the relative index of the word
        for s_i in range(len(seg_line)):
            left_sign = seg_line[s_i][0]
            right_sign = seg_line[s_i][1]
            if left_sign == "MED" or left_sign == "DIS" or left_sign == "SYM" or \
                    left_sign == "TRE" or left_sign == "<":
                continue
            elif right_sign == None or right_sign == "punctuation-mark" or \
                    right_sign == "numeral" or right_sign == "particle":
                ss_i += 1
                continue
            else:
                rep_word = seg_list[s_i]
                rep_ret = set()
                ids_list = sear_num_new(rep_word)
                if ids_list != None:
                    for id_i in ids_list:
                        one_part = sear_words(id_i)
                        assert one_part != None
                        if len(one_part) > 1:
                            # print rep_word, one_part
                            tmp_part = one_part[:]
                            tmp_part.remove(rep_word)
                            # TODO... the replace operations
                            for tmp_i in range(len(tmp_part) - 1, -1, -1):
                                rep_w = tmp_part[tmp_i]
                                new_str = seg_list[:s_i]
                                new_str.append(rep_w)
                                new_str.extend(seg_list[s_i + 1:])
                                new_str = ''.join(new_str)
                                c_new_str = new_str
                                for k, v in tycl_rep.iteritems():
                                    c_new_str = c_new_str.replace(k, v)
                                fp_out.write('*** ' + c_new_str.encode('UTF-8') + '\t')
                                new_seg = pynlpir.segment(new_str, pos_tagging=True)
                                c_new_seg = pynlpir.segment(c_new_str, pos_tagging=True)
                                # something to do
                                for n_i in range(len(c_new_seg) - 1, -1, -1):
                                    c_new_seg[n_i] = list(c_new_seg[n_i])
                                    if c_new_seg[n_i][1] == None:
                                        c_new_seg[n_i][1] = u'None'
                                    if c_new_seg[n_i][0] == u' ':
                                        del c_new_seg[n_i]
                                    else:
                                        c_new_seg[n_i][1] = c_new_seg[n_i][1].replace(' ', '-')
                                c_new_seg_str = ' '.join('/'.join(y) for y in c_new_seg)
                                fp_out.write(c_new_seg_str.encode('UTF-8') + '\n')
                                # something done
                                is_continue = False
                                if len(c_new_seg) == len(c_seg_line):
                                    if c_new_seg[ss_i][1] == c_seg_line[ss_i][1]:
                                        # only compare the POS of the word
                                        pass
                                    else:
                                        tmp_part.remove(rep_w)
                                        continue
                                    for front_i in range(ss_i - 1, -1, -1):
                                        if c_new_seg[front_i][0] == c_seg_line[front_i][0] and \
                                                c_new_seg[front_i][1] == c_seg_line[front_i][1]:
                                            pass
                                        else:
                                            tmp_part.remove(rep_w)
                                            is_continue = True
                                            break
                                    if is_continue == True:
                                        continue
                                    for back_i in range(ss_i + 1, len(c_seg_line)):
                                        if c_new_seg[back_i][0] == c_seg_line[back_i][0] and \
                                                c_new_seg[back_i][1] == c_seg_line[back_i][1]:
                                            pass
                                        else:
                                            tmp_part.remove(rep_w)
                                            break
                                else:
                                    tmp_part.remove(rep_w)
                                    continue
                            # DONE
                            for ti in tmp_part:
                                rep_ret.add(ti)
                fp_out.write('>>>>>> %s\n' % rep_word.encode('UTF-8'))
                # for ret_i in rep_ret:
                fp_out.write(' '.join(y.encode('UTF-8') for y in rep_ret))
                fp_out.write('\n')
                ss_i += 1
    '''
    for i in range(1, len(pat_line)):
        fp_out.write(' ' + pat_line[i].encode('UTF-8'))
    '''
    fp.close()
    fp_out.close()
    pynlpir.close()
def parse_words(s):
    # use pynlpir to extract a string's keywords and their weights
    pynlpir.open()
    key_words = pynlpir.get_key_words(s, weighted=True)
    pynlpir.close()
    return key_words
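# Usage sketch for parse_words (weighted=True yields (keyword, weight) pairs;
# the values shown are illustrative):
def demo_parse_words():
    for keyword, weight in parse_words('中文文本的关键词提取示例'):
        print(keyword, weight)  # e.g. 关键词 2.35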
def partition(self, input_path, output_path):
    """
    Segment the text files under input_path and store the results in output_path.
    :param input_path: directory of the input text files
    :param output_path: directory for the segmentation results
    :return: the number of words with encoding errors
    """
    f3 = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8')
    f3_name = f3.name
    stop_set = []
    f_stop_list = open(self.root_dir + '/nlp/stop_list.txt', 'r', encoding='utf-8')
    for line in f_stop_list:
        if line.split():
            stop_set.append(line.split()[0])
    stop_set = set(stop_set)
    os.chdir(input_path)
    f_lst = os.listdir(os.getcwd())
    cnt1 = 0
    nlpir = pynlpir.nlpir
    pynlpir.open()
    big_dic = self.root_dir + '/nlp/new_bigdic.txt'
    nlpir.ImportUserDict(big_dic.encode('utf-8'))
    for f_item in f_lst:
        try:
            ans_lst = []
            f = open(f_item, 'r', encoding='utf-8')
            s = bytes(f.read(), encoding='utf-8')
            f.close()
            size = ctypes.c_int()
            result = nlpir.ParagraphProcessA(s, ctypes.byref(size), True)
            result_t_vector = ctypes.cast(result, ctypes.POINTER(nlpir.ResultT))
            words = []
            for i in range(0, size.value):
                r = result_t_vector[i]
                word = s[r.start:r.start + r.length]
                words.append((word, r.sPOS))
            for word, pos in words:
                if word.decode('utf-8') not in stop_set:
                    if pos.decode('utf-8') > b'z'.decode('utf-8') or pos.decode('utf-8').upper() == pos.decode('utf-8') and pos.decode('utf-8') != '':
                        ans_lst.append((pos.decode('utf-8'), word.decode('utf-8')))
                    f3.write(word.decode('utf-8') + ' ' + pos.decode('utf-8') + '\n')
            keys = pynlpir.get_key_words(s, max_words=10, weighted=False)
            ans_set = list(set(ans_lst))
            frequency = [0 for k in range(len(ans_set))]
            for k in range(len(ans_set)):
                for item in ans_lst:
                    if item == ans_set[k]:
                        frequency[k] += 1
            type_lst = []
            for item in ans_set:  # ans_set: ('COMPANY_OF_INDUSTRY_56', '兴业银行')
                if item[0] not in type_lst:
                    type_lst.append(item[0])
            type_lst.sort()
            ans_s = ''
            main_character = self.select_main_character(f_item, ans_set, frequency)
            # print('return things', main_character)
            if main_character:
                f2 = open(output_path + f_item, 'w', encoding='utf-8')
                main_company = main_character[0]
                main_industry = main_character[1]
                if main_company:
                    ans_s += 'Main company: '
                    for _ in range(len(main_company)):
                        ans_s += str(main_company[_][0][1]) + '\t' + str(main_company[_][0][0])
                    ans_s += '\n'
                if main_industry:
                    ans_s += 'Main industry: '
                    for _ in range(len(main_industry)):
                        ans_s += str(main_industry[_][0][1]) + '\t' + str(main_industry[_][0][0])
                    ans_s += '\n'
                f2.write(ans_s)
                # if both are empty, the document has no topic and is discarded
                # count the frequency of the keywords returned by the segmenter
                keys_f = [0 for l in range(len(keys))]
                # find person names in the text while counting the keyword frequency
                common_last_name = [
                    '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周', '徐', '孙', '马', '朱', '胡',
                    '郭', '何', '高', '林', '郑', '谢', '罗', '梁', '宋', '唐', '许', '韩', '冯', '邓', '曹',
                    '彭', '曾', '蕭', '田', '董', '袁', '潘', '于', '蒋', '蔡', '余', '杜', '叶', '程', '苏',
                    '魏', '吕', '丁', '任', '沈', '姚', '卢', '姜', '崔', '钟', '谭', '陆', '汪', '范', '金',
                    '石', '廖', '贾', '夏', '韦', '付', '方', '白', '邹', '孟', '熊', '秦', '邱', '江', '尹',
                    '薛', '闫', '段', '雷', '侯', '龙', '史', '陶', '黎', '贺', '顾', '毛', '郝', '龚', '邵',
                    '万', '钱', '严', '覃', '武', '戴', '莫', '孔', '向', '汤'
                ]
                ans3 = ''
                f3.seek(0)
                for line in f3:
                    if len(line.split()) == 2:
                        name = line.split()[0]
                        pos = line.split()[1]
                        for l in range(len(keys)):
                            if name == keys[l]:
                                keys_f[l] += 1
                        # if name[0] in common_last_name and name not in ['万元', '周一', '周二', '周三', '周四', '周五', '周六', '周日', '周天'] and len(name) in [2, 3] and pos == 'nr':
                        #     ans3 += ' ' + name
                ans2 = 'Key words: '
                for l in range(len(keys)):
                    ans2 += str(keys[l]) + ': ' + str(keys_f[l]) + ' '
                f2.write(ans2)
                # f2.write('\n\nRelated person: ' + ans3)
                f2.close()
            else:
                continue
        except Exception as e:
            print('Exception in partition_main_character', e)
    pynlpir.close()
    return cnt1
def close(self):
    """Close and release the NLP resources."""
    pynlpir.close()
    self.postagger.release()
    self.recognizer.release()
    self.parser.release()
def __init__(self, filename=TRAINSETFILE, IsTraining=True, IsSegment=True):
    # Distinguishes training set from test set, and whether to segment.
    # The only difference between them is that training rows start with
    # four user-attribute fields, while test rows start with one.
    # If segmenting, the useful information held after loading is:
    #   the user info list,
    #   the per-user term-frequency list,
    #   and the overall dictionary.
    self.userlist = []
    self.userinfo = []
    self.dict = Counter({})
    self.IsTraining = IsTraining
    self.IsSegment = IsSegment
    self.IsDF = False
    with open(filename, encoding='GB18030') as file:
        filereader = csv.reader(file, dialect='excel-tab', quoting=csv.QUOTE_NONE)
        if not IsSegment:
            for item in filereader:
                self.userlist.append(item)
        else:
            pynlpir.open()
            if IsTraining:
                infoflag = 4
            else:
                infoflag = 1
            # count_test = 0
            for userquery in filereader:
                begin = datetime.now()
                userdict = {}
                userdictflag = {}  # when computing DF, flags whether the word was already counted for this document
                self.userinfo.append(userquery[:infoflag])
                for item in userquery[infoflag:]:
                    # # Counter-based version, but the extra loops made it slower: 0.194 s on the first record
                    # userdict = Counter(pynlpir.segment(item))
                    # userdict = Counter({word[0]: value for word, value in userdict.items() if word[1] in wordset})
                    # self.dict += Counter({word: 1 for word in userdict})
                    # # Counter with the loop order changed, still slower: 0.161 s on the first record
                    # userdict = [word[0] for word in pynlpir.segment(item) if word[1] in wordset]
                    # userdict = Counter(userdict)
                    # self.dict += Counter({word: 1 for word in userdict})
                    # the original plain loop count: 0.149 s on the first record
                    for word in pynlpir.segment(item):
                        if word[1] in wordset:
                            word = word[0]
                            if word in userdict.keys():
                                userdict[word] += 1
                                userdictflag[word] = False
                            else:
                                userdict[word] = 1
                                userdictflag[word] = True
                            if word not in self.dict.keys():
                                self.dict[word] = 0
                            if userdictflag[word]:
                                self.dict[word] += 1
                self.userlist.append(userdict)
                end = datetime.now()
                print(end - begin)
                # count_test += 1
                # if count_test > 100:
                #     break
            pynlpir.close()
            self.IsDF = True
def __del__(self):
    pynlpir.close()
def corpus_segment(corpus_path, seg_path):
    """
    corpus_path is the path of the unsegmented corpus.
    seg_path is the path where the segmented corpus is stored.
    """
    max_seg = 80000
    max_train_seg = 70000
    pynlpir.open()  # start the segmentation system
    catelist = os.listdir(corpus_path)  # all subdirectories under corpus_path
    # e.g. py_data01/留学/ : each subdirectory name is a category name
    # process every file under each category directory
    finish = ['产经', '法治', '房产', '教育', '金融', '军事', '能源', '台湾', '文化', '证券']
    for category in catelist:  # category is the class name, e.g. 军事 (military)
        if category not in finish:
            i = 0
            flag = 0
            # class_path = corpus_path + category + "/"  # category subdirectory, e.g. train_corpus/art/
            class_path = os.path.join(corpus_path, category)
            # seg_dir = seg_path + category + "/"  # output directory, e.g. train_corpus_seg/art/
            seg_dir = os.path.join(seg_path, category)
            seg_test_dir = os.path.join("D:\\Py_Learn\\textclassify_work\\results\\test_corpus_seg", category)
            # print("wh-1", seg_dir)
            if not os.path.exists(seg_dir):  # create the output directory if it does not exist
                os.makedirs(seg_dir)
            if not os.path.exists(seg_test_dir):
                os.makedirs(seg_test_dir)
            years = os.listdir(class_path)  # all texts of one category in the raw corpus
            for year in years:  # walk every file under the category directory
                if flag == 1:
                    break
                yearname = os.path.join(class_path, year)  # year path, e.g. train_corpus/art/21.txt
                months = os.listdir(yearname)
                for month in months:
                    if flag == 1:
                        break
                    print("cleaning:" + category + "month" + month)
                    month_path = os.path.join(yearname, month)
                    raw_path = os.listdir(month_path)
                    for document in raw_path:
                        i += 1
                        if i > max_seg:
                            flag = 1
                            break
                        fullname = os.path.join(month_path, document)
                        content = readfile(fullname)  # read the file content
                        # content still holds every raw character (extra spaces, blank
                        # lines, carriage returns); strip that noise so only text
                        # separated by punctuation remains
                        # content = content.replace('\n', '')  # drop newlines
                        # content = content.replace(' ', '')   # drop blank lines and extra spaces
                        try:
                            # the defaults pos_names='parent' plus pos_english=True
                            # yield English category names such as 'noun'
                            con_segx = pynlpir.segment(content, pos_english=True)
                        except UnicodeDecodeError:
                            print(category + " " + document + " UnicodeDecodeError_wh")
                            continue  # skip files that fail to decode; con_segx would be unbound
                        content_seg = [element[0] for element in con_segx if element[1] == 'noun']
                        # content_seg = jieba.cut(content)  # segment the file content
                        if i <= max_train_seg:
                            savefile(seg_dir + "\\" + document, " ".join(content_seg))  # save to the segmented-corpus directory
                        else:
                            savefile(seg_test_dir + "\\" + document, " ".join(content_seg))
    pynlpir.close()
    print("中文语料分词结束!!!")  # Chinese corpus segmentation finished
# -*- coding: utf-8 -*-
"""
Created on Mon May 6 19:00:50 2019

@author: 92111
"""
import pynlpir

pynlpir.open()
with open("test.txt", "r", encoding='utf-8') as f1:
    text = f1.read()
seg_list = pynlpir.segment(text, pos_tagging=False)  # plain tokens, no POS tags
f2 = open("result.txt", "a", encoding='utf-8')
for word in seg_list:
    f2.write(word + " ")
f2.close()
pynlpir.close()
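# For large inputs, a line-by-line variant avoids holding the whole file in
# memory (a sketch under the same file names, shown as an alternative to the
# read-all approach above):
pynlpir.open()
with open("test.txt", "r", encoding='utf-8') as f1, \
        open("result.txt", "a", encoding='utf-8') as f2:
    for line in f1:
        f2.write(" ".join(pynlpir.segment(line, pos_tagging=False)) + " ")
pynlpir.close()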