def get_result(self, paragraph):
    self.paragraph = paragraph
    self.segments = pynlpir.segment(self.paragraph, pos_names='all', pos_tagging=False)
    self.key_words = pynlpir.get_key_words(self.paragraph, weighted=False, max_words=20)
    self.new_sentence_wordlist = [0] * len(self.key_words)
    key_words = pynlpir.get_key_words(self.paragraph, max_words=20, weighted=True)
    self.key_weight = [item[1] for item in key_words]
    sentence_dict = self.cal_text_simliarity()
    keys = list(sentence_dict.keys())
    val = list(sentence_dict.values())
    temp = sorted(list(map(val.index, heapq.nlargest(self.maxSumarySize, val))))
    for i in temp[:2]:
        if keys[i] != self.sentence()[0]:
            self.result.append(keys[i])
    self.result.insert(0, self.sentence()[0])
    if len(",".join(self.result)) < self.length:
        self.result.append(keys[temp[2]])
    return ",".join(self.result)
def test_get_key_words(self):
    """Tests that the get_key_words() function works as expected."""
    s = '我们都是美国人。'
    key_words = pynlpir.get_key_words(s)
    weighted_key_words = pynlpir.get_key_words(s, weighted=True)
    expected_key_words = ['美国']
    expected_weighted_key_words = [('美国', 0.01)]
    self.assertEqual(expected_key_words, key_words)
    self.assertEqual(expected_weighted_key_words, weighted_key_words)
def test_get_key_words(self):
    """Tests that the get_key_words() function works as expected."""
    s = "我们都是美国人。"
    key_words = pynlpir.get_key_words(s)
    weighted_key_words = pynlpir.get_key_words(s, weighted=True)
    expected_key_words = ["美国"]
    expected_weighted_key_words = [("美国", 2.2)]
    self.assertEqual(expected_key_words, key_words)
    self.assertEqual(expected_weighted_key_words, weighted_key_words)
def load_doc_list():
    pynlpir.open()
    doc_list = os.listdir(SOURCE_DOC_DIR_PATH)
    segment_list = []
    for doc in doc_list:
        fr = codecs.open(SOURCE_DOC_DIR_PATH + doc, 'r', 'utf-8')
        line_list = fr.read()
        fr.close()
        '''
        line_list = line_list.split(NEW_LINE)
        line_list.pop()
        # seg_str = ''
        for i in range(len(line_list)):
            segment = pynlpir.segment(line_list[i], pos_tagging=False)
            seg_str = ''
            for seg in segment:
                seg_str += seg + ' '
            line_list[i] = seg_str.strip()
        # segment_list.append(' '.join(line_list))
        temp_str = ' '.join(line_list)
        '''
        key_word_list = pynlpir.get_key_words(line_list, max_words=10, weighted=True)
        for key_word in key_word_list:
            print(key_word[0], '\t', key_word[1])
    pynlpir.close()
    exit(0)
def GetKeyWorld(filePath):
    # implemented with PyNLPIR's get_key_words
    # filePath = '/home/yuanzhu/Desktop/NewsData/20190603/20190603419.json'
    try:
        pr.open()
        # filePath = '/home/yuanzhu/Desktop/NewsData/20190501/20190501181.json'
        dicNews = GetDictFromJsonFile(filePath)
        content = dicNews['content']
        # segs = pr.segment(content)
        # for seg in segs:
        #     print(seg)
        tupkeywords = pr.get_key_words(content, weighted=True)  # extract keywords with TF-IDF (seems to work reasonably well)
        keywords = []
        for i, w in enumerate(tupkeywords):
            keywords.append(w[0])
            if i == 9:  # keep at most the top 10 keywords
                break
    except Exception as e:
        strLogErr = 'Get {} keyworld error :{}'.format(filePath, e)
        print(strLogErr)
        return None
    print("FilePath=", filePath)
    print('获取热点:', keywords)
    return keywords
def __init__(self, paragraph, maxSumarySize=2):
    self.paragraph = paragraph
    self.maxSumarySize = maxSumarySize
    self.segments = pynlpir.segment(paragraph, pos_names='all', pos_tagging=False)
    self.key_words = pynlpir.get_key_words(paragraph, weighted=False, max_words=20)
    self.new_sentence_wordlist = [0] * len(self.key_words)
    key_words = pynlpir.get_key_words(paragraph, max_words=20, weighted=True)
    self.key_weight = [item[1] for item in key_words]
    self.sentence_simlarity = {}
    self.result = []
def get_key_words():
    s = ''
    max_words = MAX_WORDS_DEFAULT
    max_hot_words = MAX_HOT_WORDS_DEFAULT
    update_hot_word = UPDATE_HOT_WORD_DEFAULT
    # get doc
    if request.method == 'POST':
        s = request.form.get('s', type=str, default='')
        update_hot_word = request.form.get('update_hot_word', type=str, default=UPDATE_HOT_WORD_DEFAULT)  # whether to update the hot_word table
        try:
            max_words = request.form.get('max_words', type=str, default=MAX_WORDS_DEFAULT)
            if max_words != '':  # max_words was supplied (may be the default '3')
                print('[POST] max_words yes')
                max_words = int(max_words.strip())
                print('\tmax_words =', max_words)
            else:
                max_words = MAX_WORDS_DEFAULT
                print('[POST] max_words no')
        except:  # failed to parse max_words; fall back to the default of 3
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = request.form.get('max_hot_words', type=str, default=MAX_HOT_WORDS_DEFAULT)
            if max_hot_words != '':
                max_hot_words = int(max_hot_words.strip())
            else:
                max_hot_words = MAX_HOT_WORDS_DEFAULT
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    elif request.method == 'GET':
        s = request.args.get('s')
        update_hot_word = request.args.get('update_hot_word')
        if update_hot_word != 'False':
            update_hot_word = 'True'
        try:
            max_words = int(request.args.get('max_words').strip())
        except:
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = int(request.args.get('max_hot_words').strip())
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    # get key words
    if s == '':  # empty document, nothing to analyse
        return 'null'
    else:  # extract key words
        pynlpir.open()
        key_word_list = pynlpir.get_key_words(s, max_words=max_words, weighted=False)
        # temp_str = ''
        for i in range(len(key_word_list)):
            key_word_list[i] = key_word_list[i]
        pynlpir.close()
        if update_hot_word == 'True':  # update the database in a new thread
            print('[update_hot_word] True')
            t = threading.Thread(target=db_helper.update_tables,
                                 args=(','.join(key_word_list), max_hot_words))
            t.setDaemon(True)
            t.start()
        else:
            print('[update_hot_word] False')
        return ','.join(key_word_list)
def readF(path, n=0):
    for file in os.listdir(path):
        f = open(path + file, 'r')
        s = f.read()
        x = pynlpir.get_key_words(s, weighted=True)
        dic = {}
        for i in x:
            dic[i[0]] = i[1]
        vct.append(dic)
def allword_by_pynlpir(inputfile, word_dict, max_words=1000):
    weighted_word_list = pynlpir.get_key_words(inputfile, weighted=True, max_words=max_words)
    for word, weight in weighted_word_list:
        try:
            word_dict.setdefault(word, 0)
            word_dict[word] += weight
        except Exception as e:
            print(e)
def work2():
    # NLPIR part-of-speech analysis and keyword extraction
    pynlpir.open()
    s = '因为明天是周三,所以我要有数据结构课,然而这课好难。'
    segments = pynlpir.segment(s, pos_names='all', pos_english=False)  # full POS analysis
    for segment in segments:
        print(segment[0], '\t', segment[1])
    key_words = pynlpir.get_key_words(s, weighted=True)  # keyword extraction
    for key_word in key_words:
        print(key_word[0], '\t', key_word[1])
    pynlpir.close()
def get_key_words(text):
    pynlpir.open()
    result = []
    keywords = pynlpir.get_key_words(text, weighted=True)
    if len(keywords) == 0:
        pynlpir.close()  # close before the early return so the NLPIR handle is released
        return result
    for i in range(len(keywords)):
        keyword = keywords[i][0]
        result.append(keyword)
    pynlpir.close()
    return result
def nlpir_keywords(text, n):
    pynlpir.open()
    # print '关键词测试:\n'
    key_words = list(pynlpir.get_key_words(text, n, weighted=False))
    # for key_word in key_words:
    #     print key_word[0], '\t', key_word[1]
    pynlpir.close()
    print key_words
    return key_words
def find_keyword(text, keyword_dict):
    keyword_list = []
    keyword_pair_list = pynlpir.get_key_words(text, weighted=True)
    for keyword_pair in keyword_pair_list:
        keyword_list.append(keyword_pair[0])
    keyword_id_list = []
    for keyword in keyword_list:
        try:
            keyword_id_list.append(keyword_dict[keyword])
        except:
            keyword_id_list.append(keyword)
    return keyword_id_list
def allclass_by_pynlpir(inputfile, word_dict, max_words=1000):
    weighted_word_list = pynlpir.get_key_words(inputfile, weighted=True, max_words=max_words)
    for word, weight in weighted_word_list:
        try:
            word_class = word_to_class(word)
            k = word + "\t" + word_class
            word_dict.setdefault(k, 0)
            word_dict[k] += weight
        except Exception as e:
            print(e)
def cut_pos_nlpir(doc, topK=20):
    # s = filter_tags(doc)
    soup = BeautifulSoup(doc, 'lxml')
    s = soup.get_text()
    try:
        s = ''.join(s.split())
        ws = pynlpir.get_key_words(s, topK)
        return ' '.join(ws).encode('utf-8')
    except:
        print 'error: ' + s
        traceback.print_exc()
        raise
def word_by_pynlpir(inputfile, word_dict, max_words=1000):
    weighted_word_list = pynlpir.get_key_words(inputfile, weighted=True, max_words=max_words)
    for word, weight in weighted_word_list:
        try:
            word_class = word_to_class(word)
            if word_class in ['time word', 'numeral', 'adverb', 'verb',
                              'locative word', 'distinguishing word']:
                continue
            if len(word) < 2:
                continue
            word_dict.setdefault(word, 0)
            word_dict[word] += weight
        except Exception as e:
            print("exception ", e)
def st_WordCloud():
    # Generate a word cloud for the novel "The Three-Body Problem" (三体)
    in_text = codecs.open('data/st.txt', 'r', encoding='UTF-8').read()
    pynlpir.open()
    nlpir.AddUserWord(c_char_p("三体".encode()))
    nlpir.AddUserWord(c_char_p("罗辑".encode()))
    key_words = pynlpir.get_key_words(in_text, max_words=300, weighted=True)
    # stop words
    stopwords = pd.read_csv("data/stop_words.txt", index_col=False, quoting=3,
                            sep="\n", names=['stopword'], encoding='utf-8')
    words = [word for word, weight in key_words]
    keywords_df = pd.DataFrame({'keywords': words})
    # remove stop words
    keywords_df = keywords_df[~keywords_df.keywords.isin(stopwords.stopword.tolist())]
    word_freq = []
    for word in keywords_df.keywords.tolist():
        for w, k in key_words:
            if word == w:
                word_freq.append((word, k))
    pynlpir.close()
    print(word_freq)
    font = r'C:\Windows\Fonts\msyh.ttc'  # a font must be specified, otherwise WordCloud raises an error
    # color_mask = imread("resource/ge.jpg")  # read the background image
    color_mask = imread("resource/timg.jpg")  # read the background image
    wcloud = WordCloud(
        font_path=font,
        # background colour
        background_color="white",
        # word-cloud mask shape
        mask=color_mask,
        # maximum number of words
        max_words=2000,
        # maximum font size
        max_font_size=80)
    wcloud.generate_from_frequencies(dict(word_freq))
    # display the image
    plt.imshow(wcloud)
    plt.axis("off")
    plt.show()
    wcloud.to_file("data/wcimage/三体词云_2.png")
def main():
    pynlpir.open()
    # pynlpir.nlpir.AddUserWord(c_char_p("手机壳".encode()))
    # pynlpir.nlpir.AddUserWord(c_char_p("炫亮".encode()))
    # text = '弗洛米iPhone7/7plus手机壳/保护套苹果7plus超薄全包硅胶透明电镀软壳5.5英寸炫亮黑☆炫亮电镀'
    # text = "“赶考”五年成绩非凡 全面从严治党永远在路上"
    text = codecs.open('data/new.txt', 'r', encoding='UTF-8').read()
    r_out = pynlpir.segment(text, pos_english=False)
    key_words = pynlpir.get_key_words(text, weighted=True)
    pynlpir.close()
    print(key_words)
    for x in r_out:
        print(x)
def qe(row):
    question = row["question"]
    html = getHtmlbyQuestion(question)
    if html is None:
        return 0  # test
    properties = getProperties(html)
    discription = getDiscription(html)
    keywords = pynlpir.get_key_words(question, weighted=False)  # True
    weightedDict1 = matchKeyWords(keywords, discription)
    weightedDict2 = proExpansion(keywords, properties)
    dictMerged = weightedDict1.copy()
    dictMerged.update(weightedDict2)  # merged dict of all {expanded keyword: weight} pairs
    answer = row["answer"]
    return calScore(question, answer, dictMerged)
def getKeyWords(string, words=10, way=1):
    keywords = []
    if way == 1:
        pynlpir.open()
        str = string.encode('utf-8')
        wordslist = pynlpir.get_key_words(str, words, False)
        for each in wordslist:
            # print(each)
            keywords.append(each)
    if way == 2:
        textrank = analyse.textrank
        wordslist = textrank(string)
        for each in wordslist[0:words]:
            # print(each)
            keywords.append(each)
    return keywords
def ie():
    """
    Information-entropy weights.
    :return:
    """
    # fetch the queries
    db_tool = tool.DBtool()
    queries = db_tool.select_queries()
    segmented_queries = db_tool.select_segmented_queries()
    # compute the weights
    pynlpir.open()
    temp = {}
    for query in queries:
        temp[query['query_id']] = pynlpir.get_key_words(query['query'], weighted=True, max_words=100)
    pynlpir.close()
    # build the weight string
    for query in segmented_queries:
        weight = temp[query['query_id']]
        dic = {}
        min_w = 2.0
        for w in weight:
            dic[w[0].encode('utf-8')] = w[1]
            if w[1] < min_w:  # fixed: compare against min_w, not the builtin min
                min_w = w[1]
        words = query['segmented_query'].strip().split(' ')
        weight = []
        for word in words:
            if word in dic:
                weight.append(dic[word])
            else:
                weight.append(min_w / 2)
        s = sum(weight)
        weight = ['%.5f' % (x / s) for x in weight]
        # update the database
        db_tool.update_weight(query['query_id'], ' '.join(weight), 'weight_ie')
    db_tool.close()
def post(self, request):
    obj_id = request.POST['obj_id']
    school = MySchool.objects.get(id=int(obj_id))
    feeds = []
    # weibo
    # App Key:802677147
    # App Secret:f75be23800d779cc9dbbf6b467b7ff61
    # Redirect url: https://api.weibo.com/oauth2/default.html
    # code: 4ccb7879bf204466b80e02c106d09727
    # read baidu
    params = {'keyword': school.name}
    # send a 3rd party service request
    baidu_consumer.delay(params)
    # read saved feeds
    feeds = MyBaiduStream.objects.filter(school=school).order_by('-last_updated')[:100]
    content = loader.get_template(self.template_name)
    tieba_html = content.render(Context({
        'obj': school,
        'feeds': feeds,
    }))
    # hot topics
    pynlpir.open()  # must have this line!
    topics = feeds[:50]
    content = loader.get_template(self.newsticker_template_name)
    newsticker_html = content.render(Context({
        'objs': topics,
        'keywords': pynlpir.get_key_words(''.join([f.name + f.description for f in feeds]),
                                          max_words=50, weighted=True)
    }))
    pynlpir.close()
    # newsticker_html = ''
    return HttpResponse(json.dumps({'bd_html': tieba_html, 'news_html': newsticker_html}),
                        content_type='application/javascript')
def myseg_get_keywords(filename2w, filename2seg, limitlist):
    dataMat = []
    labelMat = []
    fr = open(filename2w)
    fl = open(filename2seg, 'w')
    limits = open(limitlist)
    arrayLimits = limits.readlines()
    lengthLimits = len(arrayLimits)
    arrayOLines = fr.readlines()
    length = len(arrayOLines)
    for j in range(length):
        flag = 1
        lineArr = arrayOLines[j].strip().split(';')
        for li in range(lengthLimits):
            limitsArr = arrayLimits[li].strip().split(';')
            if str(lineArr[1]) == str(limitsArr[1]):
                flag = 0
        if flag == 0:
            pass
        else:
            if len(lineArr) < 3:
                pass
            else:
                seg = pynlpir.get_key_words(lineArr[1], weighted=True)
                fl.write(str(j))
                fl.write(";")
                fl.write(str(lineArr[1]))
                fl.write(";")
                fl.write(str(lineArr[2]))
                fl.write(";")
                for item in seg:
                    fl.write(str(item[0]))
                    fl.write(":")
                    fl.write(str(item[1]))
                    fl.write(",")
                fl.write(";\n")
    fl.close()
    pynlpir.close()
def pynlpir():
    import pynlpir
    # read corpus
    pos_text = configReader(section='path', option='pos_text')
    fileList = getFileList(pos_text)
    corpus = [dataReader(pos_text + f, 'r+') for f in fileList]
    # pynlpir open
    pynlpir.open()
    for i in range(len(corpus)):
        fileName = ext_path + os.path.splitext(fileList[i])[0] + '_ext.txt'
        print "Writing pynlpir extraction: %s" % fileName
        try:
            keys = pynlpir.get_key_words(corpus[i], weighted=True)
        except:
            RaiseErr('extract', fileName)
        # output
        data = ["%-20s %-10.8f\n" % (key[0], key[1]) for key in keys]
        try:
            dataWriter(data, fileName, 'w')
        except:
            RaiseErr('rit_ext', fileName)
    pynlpir.close()
import pynlpir

pynlpir.open()
str = "聊天机器人到底该怎么做?"
segs = pynlpir.segment(str)
segments = pynlpir.segment(str, pos_tagging=True, pos_names="all", pos_english=False)
for seg in segs:
    pass
    # print(seg[0], '\t', seg[1])
key_words = pynlpir.get_key_words(str, weighted=True)
for key_word in key_words:
    pass
    # print(key_word[0], '\t', key_word[1])
for segment in segments:
    print(segment[0], "\t", segment[1])
k_ws = pynlpir.get_key_words(str, weighted=True)
for k_w in k_ws:
    print(k_w[0], '\t', k_w[1])
def test_issue_23(self):
    """Tests for issue #20 -- get key words with no count returned."""
    s = '我们很好,你呢'
    weighted_key_words = pynlpir.get_key_words(s, weighted=True)
    expected_weighted_key_words = [('我们', -1.00)]
    self.assertEqual(expected_weighted_key_words, weighted_key_words)
import pynlpir  # needed for pynlpir.open()/segment()/get_key_words()
from pynlpir import nlpir
import sys

# print sys.getdefaultencoding()
# reload(sys)
# sys.setdefaultencoding("utf-8")
pynlpir.open()
# pynlpir.open(encoding='utf-8')
s = 'NLPIR分词系统前身为2000年发布的ICTCLAS词法分析系统,从2009年开始,为了和以前工作进行大的区隔,并推广NLPIR自然语言处理与信息检索共' \
    '享平台,调整命名为NLPIR分词系统。'
print(s)
print(pynlpir.segment(s, pos_tagging=False))
print(pynlpir.get_key_words(s, weighted=True))
c = pynlpir.segment(s, pos_tagging=False)
c = str("|".join(c)).encode('utf-8')
print(c)

import jieba

tokens = jieba.cut(s, cut_all=False)
print('精确模式:')
print("|".join(tokens))
# -*- coding: utf-8 -*-
# @Time    : 2019/4/8 17:57
# @Author  : Mr.Robot
# @Site    :
# @File    : progress_segment.py
# @Software: PyCharm
import pynlpir

pynlpir.open()

if __name__ == "__main__":
    s = "服药便溏,大便呈糊状,大便日2次,胸闷胸痛,最近胸腔积液2次抽取,干咳少痰,难咯,口干欲饮,纳差,形瘦面灰,鼻准红赤,尿少色黄,有乙肝,肝硬化病史,"
    segment = pynlpir.segment(s, pos_tagging=False)
    print(segment)
    key_words = pynlpir.get_key_words(s)
    print(key_words)
    pynlpir.close()
# coding:utf-8
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

import pynlpir

pynlpir.open()
s = '聊天机器人到底该怎么做呢?'
segments = pynlpir.segment(s)
for segment in segments:
    print segment[0], '\t', segment[1]

# extracting keywords
key_words = pynlpir.get_key_words(s, weighted=True)
for key_word in key_words:
    print key_word[0], '\t', key_word[1]

# extracting all information
s = '海洋是如何形成的'
segments = pynlpir.segment(s, pos_names='all')
for segment in segments:
    print segment[0], '\t', segment[1]

pynlpir.close()
import pynlpir

if __name__ == '__main__':
    pynlpir.open()
    s = '我爱你中国'
    # segment_list = pynlpir.segment(s, pos_tagging=False)
    # for seg in segment_list:
    #     print(seg)
    key_word_list = pynlpir.get_key_words(s, max_words=10, weighted=True)
    for key_word in key_word_list:
        print(key_word[0], '\t', key_word[1])
    pynlpir.close()
def Participle(list_datas, filename_stopwords):
    # word segmentation
    time_start = time.time()
    print("正在分词...")
    list_garbagesT.clear()
    list_words_stop = GetListWords(filename_stopwords)
    pynlpir.open()
    for data in list_datas:
        # segments = pynlpir.segment(data.content, pos_names='all', pos_english=False)
        # file_nlp.write('\n')
        # for segment in segments:
        #     file_nlp.write("[ %s : %s ]" % (segment[0], segment[1]))
        #     file_nlp.write('\n')
        if len(data.content) < 8:
            data.error = "内容过短"
            list_garbagesT.append(data)
            continue
        list_words = pynlpir.get_key_words(data.content, max_words=70)
        if len(list_words) == 0:
            data.error = "没有分词结果"
            list_garbagesT.append(data)
            continue
        # print("开始停词")
        for word in list_words:
            if word in list_words_stop:
                # print("停了个词" + word)
                continue
            if word == '':
                data.error = "包含空白分词"
                list_garbagesT.append(data)
                break
            # count the term frequency
            contentT = data.content
            count = 0
            while contentT.find(word) > -1:
                contentT = contentT.replace(word, '', 1)
                count += 1
            if count == 0:
                data.error = "分词不属于原文"
                list_garbagesT.append(data)
                break
            # save the term-frequency result
            data.dict_words_tfidf[word] = count
        if len(data.dict_words_tfidf) == 0:
            data.error = "词频统计结果为空"
            list_garbagesT.append(data)
            continue
    # remove garbage records
    for data in list_garbagesT:
        list_datas.remove(data)
        list_garbages.append(data)
    list_garbagesT.clear()
    pynlpir.close()
    time_end = time.time()
    print("用时 : %.2f s" % (time_end - time_start))
    return list_datas
def partition(input_path, output_path):
    '''
    Segment the text files in input_path and write the results to output_path.
    :param input_path: path of the text files
    :param output_path: path for the segmentation results
    :return: count of words with encoding errors
    '''
    f3 = tempfile.NamedTemporaryFile(mode='w+t', encoding='utf-8')
    f3_name = f3.name
    stop_set = []
    f_stop_list = open(
        'C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/extract_main_word/stop_list.txt',
        'r', encoding='utf-8')
    for line in f_stop_list:
        stop_set.append(line.split()[0])
    stop_set = set(stop_set)
    os.chdir(input_path)
    f_lst = os.listdir(os.getcwd())
    cnt1 = 0
    nlpir = pynlpir.nlpir
    pynlpir.open()
    nlpir.ImportUserDict(
        b'C:/Users/i-zhanghaoran/Desktop/Extract_main_word&Sentiment_anaylsis/new_bigdic.txt')
    for item in f_lst:
        ans_lst = []
        f = open(item, 'r', encoding='utf-8')
        s = bytes(f.read(), encoding='utf-8')
        f.close()
        size = ctypes.c_int()
        result = nlpir.ParagraphProcessA(s, ctypes.byref(size), True)
        result_t_vector = ctypes.cast(result, ctypes.POINTER(nlpir.ResultT))
        words = []
        for i in range(0, size.value):
            r = result_t_vector[i]
            word = s[r.start:r.start + r.length]
            words.append((word, r.sPOS))
        f2 = open(output_path + item, 'w', encoding='utf-8')
        for word, pos in words:
            # try:
            if word.decode('utf-8') not in stop_set:
                if pos.decode('utf-8') > b'z'.decode('utf-8') or pos.decode('utf-8').upper() == pos.decode('utf-8') and pos.decode('utf-8') != '':
                    ans_lst.append((pos.decode('utf-8'), word.decode('utf-8')))
                    f2.write((word.decode('utf-8') + ' ' + pos.decode('utf-8') + '\n'))
                    f3.write((word.decode('utf-8') + ' ' + pos.decode('utf-8') + '\n'))
            # except:
            #     cnt1 += 1
            # else:
            #     f2.write(word.decode('utf-8') + '\n')
        keys = pynlpir.get_key_words(s, max_words=10, weighted=False)
        ans_set = list(set(ans_lst))
        feqrence = [0 for k in range(len(ans_set))]
        for k in range(len(ans_set)):
            for item in ans_lst:
                if item == ans_set[k]:
                    feqrence[k] += 1
        f2.write('\n\nMy tags: ')
        type_lst = []
        for item in ans_set:
            # ans_set: ('COMPANY_OF_INDUSTRY_56', '兴业银行')
            if item[0] not in type_lst:
                type_lst.append(item[0])
        type_lst.sort()
        ans_s = ''
        for k in range(len(type_lst)):
            ans_s += str(type_lst[k]) + ': '
            for l in range(len(ans_set)):
                if ans_set[l][0] == type_lst[k]:
                    # a helper function is called here to express the relation between stocks and funds
                    ans_s += stock2fund(ans_set, feqrence, l)
                    # ans_s += ' (' + str(ans_set[l][1]) + ': ' + str(feqrence[l]) + ')'
            ans_s += '\n'
        f2.write(ans_s)
        f2.write('\n\nkeyword: ')
        # count the frequency of the keywords returned by the segmenter
        keys_f = [0 for l in range(len(keys))]
        commen_last_name = [
            '王', '李', '张', '刘', '陈', '杨', '黄', '赵', '吴', '周', '徐', '孙', '马', '朱', '胡', '郭', '何',
            '高', '林', '郑', '谢', '罗', '梁', '宋', '唐', '许', '韩', '冯', '邓', '曹', '彭', '曾', '蕭', '田',
            '董', '袁', '潘', '于', '蒋', '蔡', '余', '杜', '叶', '程', '苏', '魏', '吕', '丁', '任', '沈', '姚',
            '卢', '姜', '崔', '钟', '谭', '陆', '汪', '范', '金', '石', '廖', '贾', '夏', '韦', '付', '方', '白',
            '邹', '孟', '熊', '秦', '邱', '江', '尹', '薛', '闫', '段', '雷', '侯', '龙', '史', '陶', '黎', '贺',
            '顾', '毛', '郝', '龚', '邵', '万', '钱', '严', '覃', '武', '戴', '莫', '孔', '向', '汤'
        ]
        ans3 = ''
        f3.seek(0)
        for line in f3:
            if len(line.split()) == 2:
                name = line.split()[0]
                pos = line.split()[1]
                for l in range(len(keys)):
                    if name == keys[l]:
                        keys_f[l] += 1
                if name[0] in commen_last_name and name not in [
                        '万元', '周一', '周二', '周三', '周四', '周五', '周六', '周日', '周天'
                ] and len(name) in [2, 3] and pos == 'nr':
                    ans3 += ' ' + name
        ans2 = ''
        for l in range(len(keys)):
            ans2 += str(keys[l]) + ': ' + str(keys_f[l]) + ' '
        f2.write(ans2)
        f2.write('\n\nRelated person: ' + ans3)
        f2.close()
    pynlpir.close()
    return cnt1
__author__ = 'gohper'
# -*- coding:utf-8 -*-
import pynlpir

pynlpir.open()

s = "聊天机器人到底该怎么做呢?"
segs = pynlpir.segment(s)
for seg in segs:
    print seg[0], '\t', seg[1]
print("_____")

s1 = "海洋是如何形成的"
segs = pynlpir.segment(s1, pos_names='all')
for seg in segs:
    print seg[0], '\t', seg[1]
print("_________")

key_words = pynlpir.get_key_words(s, weighted=True)
for key_word in key_words:
    print key_word[0], '\t', key_word[1]

pynlpir.close()
def extract_tags(s, topK=5, weighted=False):
    return pynlpir.get_key_words(s, topK, weighted)