def textRank_ppt(url, num_abs):
    """
    :param url: article URL
    :param num_abs: number of PPT slides to generate
    :return: None (saves the presentation as test.pptx)
    """
    title, texts = article_extract(url)
    tr4w = TextRank4Keyword()
    # In py2 `text` must be a utf8-encoded str or a unicode object;
    # in py3 it must be utf8-encoded bytes or a str object.
    tr4w.analyze(text=texts, lower=True, window=2)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=texts, lower=True, source='all_filters')

    print('关键词:')
    key_words = ""
    for item in tr4w.get_keywords(7, word_min_len=2):
        print(item.word, item.weight)
        key_words = key_words + item.word + "\n"

    # Build the presentation
    prs = Presentation()
    slide1, body_shape1 = ppt1.add_slide(prs=prs, slide_title=title, style_number=0)
    slide2, body_shape2 = ppt1.add_slide(prs=prs, style_number=1, slide_title="关键词", content="")
    ppt1.add_paragraph(body_shape2, text=key_words, size=20)

    i = 0
    # Extract images from the page and add them to the slides
    extract_image.pic_extract(url)
    print("句子:")
    for item in tr4s.get_key_sentences(num=(num_abs - 2) * 2):
        if i % 2 == 0:
            slide3, body_shape3 = ppt1.add_slide(prs=prs, style_number=1, slide_title="摘要", content="")
            try:
                ppt1.add_picture(slide2=slide3, pic_path="image1/image_" + str(i) + ".jpg")
            except Exception:
                print("no picture")
        i += 1
        # print(len(item.sentence), item.index)
        ppt1.add_paragraph(body_shape3, text=item.sentence, size=20)

    prs.save('test.pptx')
    print("ppt 已生成")
def get_abstract(data, a, b, c):
    """
    Generate an abstract file that Excel can open.
    :param data: data retrieved from MySQL
    :param a: start time
    :param b: end time
    :param c: topic ID
    :return: None
    """
    now_path = os.getcwd()
    path = now_path.replace('\\', '/')
    tr4s = TextRank4Sentence()
    print('当前文章的摘要:')
    results = []
    for i in range(len(data['CONTENT'])):
        # Keep only Chinese characters, digits and common Chinese punctuation;
        # without sentence delimiters TextRank cannot split sentences, so the
        # following punctuation is preserved: 。 ; , : “ ” ( ) 、 ? 《 》
        tmp = re.sub(
            "[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b0-9]",
            '', data['CONTENT'][i])
        tr4s.analyze(text=tmp, lower=True)
        result = ''
        for item in tr4s.get_key_sentences(num=3):
            result += item.sentence
        if len(result) != 0:
            results.append([
                data['UPTIME'][i], data['TITLE'][i], data['AUTHOR'][i], result
            ])
        # data['CONTENT'][i] = results
    column_name = ['更新时间', '标题/题目', '作者', '摘要']
    tmp_text = pd.DataFrame(columns=column_name, data=results)
    tmp_text.to_csv('./data/textrank/topic{}_{}-{}abstract.csv'.format(c, a, b),
                    encoding='utf_8_sig')
    print('>>>>>>>>>>>>>> 已经保存到csv等待计算或查看 >>>>>>>>>>>>>>>')
    # os.startfile(now_path + '/data/textrank/topic{}_{}-{}abstract.csv'.format(c, a, b))
    # opens the sheet, but it needs the full working path, e.g.
    # T:/AC/Python/PublicOpinionMonitor/data/textrank/topic{}_abstract.csv
    # Wait 10 seconds (Excel may still have the file open) before cleanup
    time.sleep(10)
    clear_all_var()
    return
def abstract(fileName, step):
    tr4s = TextRank4Sentence()
    with open(fileName, 'r') as f:
        text = f.readlines()
    articleLen = countArticleLen(text)
    text = [t.split('\n')[0].strip() for t in text]
    if not judge(text):
        # If judge() returns False, merge the text and summarize it piecewise
        text = ''.join(text)
        text = text.replace('……', ',')
        head, middles, last = splitPart(text, step)
        cate, result = mergeAbstract(tr4s, text, head, middles, last)
    else:
        cate, result = paragramAbstarct(tr4s, text)
    return cate, result, articleLen
def _36r_keyword_abstract(article, keywords_len, sentences_len):
    # Extract keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=1):
        keywords.append(item.word)
    # Extract the abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        abstract.append(item.sentence + '。')
    abstract = '\n'.join(abstract)
    return keywords, abstract
def get_key_sents(cont, num=3):
    '''
    Extract an abstract from the content; num=3 extracts three key sentences.
    :return: the extracted sentences joined with underscores
    '''
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=cont, lower=True, source='all_filters')
    key_sents = []
    summarys = []
    for item in tr4s.get_key_sentences(num=num):
        key_sents.append((item.index, item.weight, item.sentence))
    for s in key_sents:
        summarys.append(s[2])
    return '_'.join(summarys)
def text_rank_subtract(content: str, n=3):
    """
    Extract a summary from a Chinese string using the TextRank method.
    :param content: (str) article text
    :param n: number of summary sentences to extract
    :return: summary_sentences (list) the extracted summary sentences
    """
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=content, lower=True, source='all_filters')
    # Extract the summary sentences
    summary_sentences = [
        item.sentence for item in tr4s.get_key_sentences(num=n)
    ]
    return summary_sentences
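# A minimal usage sketch for text_rank_subtract above. The sample text and the
# __main__ guard are illustrative assumptions added here, not part of the
# original code; only the textrank4zh calls inside the function come from it.
if __name__ == '__main__':
    sample = ('这是第一句话。这是第二句话,内容稍微长一些。'
              '这是第三句话,用于演示摘要抽取。这是第四句话。')
    for sent in text_rank_subtract(sample, n=2):
        print(sent)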
def cal_key_sentence(self):
    # Collect the most important sentences
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=self.text, lower=True, source='all_filters')
    # self.sentence_num is the number of key sentences to generate;
    # item.index is the sentence's position in the text, item.weight its weight
    for item in tr4s.get_key_sentences(num=self.sentence_num):
        # Alternatively prefix the position: str(item.index) + ' ' + item.sentence
        self.import_sentence.append(item.sentence)
    print('self.import_sentence', self.import_sentence)
def __summary(self, post: dict):
    '''
    Summarize the post's filtered content.

    Args:
        post: The dict as a collection of all desired information in a post.
    '''
    print(post['link'], 'nlp and takes time...')
    trw = TextRank4Keyword()
    trw.analyze(post['content_filtered'], lower=True)
    post['keywords'] = [i.word for i in trw.get_keywords(3)]
    trs = TextRank4Sentence()
    trs.analyze(post['content_filtered'], lower=True)
    post['summary'] = [i.sentence for i in trs.get_key_sentences(2)]
def topic_paragraph(self):
    with open('./1.txt') as f:
        data = f.read()
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=data, lower=True, source="all_filters")
    abstract = []
    for item in tr4s.get_key_sentences(num=100):
        if len(item.sentence) < 300:
            abstract.append([item.index, item.sentence])
    # Keep the top-ranked sentence and restore original document order
    abstract = sorted(abstract[:1], key=lambda x: x[0])
    # x is [index, sentence]; number the sentences starting from 1
    abstract = [
        "(%i) %s \n" % (i, x[1]) for i, x in enumerate(abstract, 1)
    ]
    return abstract
def getAbstarct(text, sentencesNum=3, lists=False):
    """Extract an abstract from text, made of the sentencesNum highest-weighted
    sentences. Returns a single string by default, or a list of the sentences
    when lists=True."""
    if sentencesNum < 2:
        sentencesNum = 2
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    ss = []
    for item in tr4s.get_key_sentences(num=sentencesNum):
        # item carries the sentence's index, weight and content
        ss.append(item.sentence)
    if lists:
        return ss          # list of sentences
    return "。".join(ss)   # default: one string
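# A small illustrative check of getAbstarct's two return modes; the sample
# string below is an assumption made up for this example, not from the source.
demo = '今天天气很好。我们去公园散步。公园里有很多人在锻炼。晚上我们回家吃饭。'
print(getAbstarct(demo, sentencesNum=2))              # joined with '。'
print(getAbstarct(demo, sentencesNum=2, lists=True))  # list of sentences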
def process(self):
    for i in self.text:
        self.article += i.getText() + '\n'
    self.article = self.article.strip()
    keywords = []
    abstract = []
    # Keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=self.article, lower=True, window=2)
    for item in tr4w.get_keywords(4, word_min_len=1):
        keywords.append(item.word)
    # Abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=self.article, lower=True, source='all_filters')
    for item in tr4s.get_key_sentences(num=3):
        abstract.append(item.sentence)
    return keywords, abstract
def dataframe_keyword_extraction(data):
    mod = TextRank4Sentence()
    label_index = data.columns.get_loc('content')
    for i in range(len(data)):
        res = ""
        print("处理第{}条数据".format(i))
        content = data.iloc[i]['content']
        if len(content) < 300:
            continue
        mod.analyze(text=content, lower=False, source='all_filters')
        # Iterate over the key sentences (avoid shadowing the built-in str)
        for item in mod.get_key_sentences(num=3):
            if len(res) < 256:
                res += item.sentence
        print(res)
        data.iloc[i, label_index] = res
    return data
def parse_abstract_textrank(text):
    import sys
    try:
        # Python 2 only: force utf-8 as the default encoding
        reload(sys)
        sys.setdefaultencoding('utf-8')
    except Exception:
        pass
    from textrank4zh import TextRank4Keyword, TextRank4Sentence
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    res = ""
    # item.index is the sentence's position in the text, item.weight its weight
    for item in tr4s.get_key_sentences(num=3):
        res += item.sentence
    return res
def ks_plot(text, number):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text, lower=True, source='no_stop_words')
    data = pd.DataFrame(data=tr4s.key_sentences)
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(facecolor='w')
    plt.plot(data['weight'], 'ro-', lw=2, ms=5, alpha=0.7)
    plt.grid(True)  # the b= keyword is deprecated in newer matplotlib
    plt.xlabel(u'句子', fontsize=14)
    plt.ylabel(u'重要度', fontsize=14)
    plt.title(u'句子的重要度曲线', fontsize=18)
    plt.show()
    key_sentences = tr4s.get_key_sentences(num=number, sentence_min_len=4)
    for sentence in key_sentences:
        print(sentence['weight'], sentence['sentence'])
def cal_rouge_textRank(article_lst, summ_lst):
    rouge = Rouge()
    tr4s = TextRank4Sentence()
    rouges = np.zeros((3, 3))
    cnt = 0
    for article, summ in zip(article_lst, summ_lst):
        tr4s.analyze(text=article, lower=True, source='all_filters')
        keysentence_list = list()
        for item in tr4s.get_key_sentences(num=10):
            s = ''.join(item.sentence)
            s = re.sub(r"\d{4,}", '', s)
            keysentence_list.append(s)
        # Build a hypothesis summary of at most 60 characters
        hyps = ""
        for j, sentence in enumerate(keysentence_list):
            if j == 0 and len(sentence) > 60:
                hyps = sentence[:60]
                break
            if (len(hyps) + len(sentence)) <= 60:
                hyps += sentence
            else:
                break
        hyps = clean(hyps)
        summ = summ.strip()
        summ = clean(summ)
        summ_ids, hyps_ids = word_for_rouge(summ, hyps)
        rouge_score = rouge.get_scores(" ".join(hyps_ids)[:len(summ)],
                                       " ".join(summ_ids))
        rouge1 = rouge_score[0]["rouge-1"]
        rouge2 = rouge_score[0]["rouge-2"]
        rougel = rouge_score[0]["rouge-l"]
        rouges[0] += np.array(list(rouge1.values()))
        rouges[1] += np.array(list(rouge2.values()))
        rouges[2] += np.array(list(rougel.values()))
        cnt += 1
    rouges = rouges / cnt
    print("Rouge: Rouge-1 : F P R")
    print("Rouge: Rouge-2 : F P R")
    print("Rouge: Rouge-L : F P R")
    print(rouges)
def fit_transform(self, theta=0.5):
    datMat = self.loadData(self.data)
    word_segmentation = []
    for i in range(len(datMat)):
        word_segmentation.append(self.word_segment(datMat[i]))
    print("............................................................................................")
    print('文本已经分词完毕 !')
    # Get the vector-space representation of the documents
    corpus_tfidf = self.get_Tfidf_vector_representation(word_segmentation)
    # corpus_tfidf = self.get_Doc2vec_vector_representation(word_segmentation)
    dictTopic, clusterTopic = self.single_pass(corpus_tfidf, datMat, theta)
    print("............................................................................................")
    print("得到的主题数量有: {} 个 ...".format(len(dictTopic)))
    print("............................................................................................\n")
    # Sort topics by the number of clustered sentences to find the important clusters
    clusterTopic_list = sorted(clusterTopic.items(),
                               key=lambda x: len(x[1]),
                               reverse=True)
    for k in clusterTopic_list[:30]:
        cluster_title = '\n'.join(k[1])
        # Extract the topic keywords of each cluster
        word = TextRank4Keyword()
        word.analyze(''.join(self.word_segment(''.join(cluster_title))),
                     window=5, lower=True)
        w_list = word.get_keywords(num=10, word_min_len=2)
        # Extract the central sentences of each cluster
        sentence = TextRank4Sentence()
        sentence.analyze('\n'.join(k[1]), lower=True)
        s_list = sentence.get_key_sentences(num=3, sentence_min_len=5)[:30]
        print("【主题索引】:{} \n【主题声量】:{} \n【主题关键词】: {} \n【主题中心句】 :\n{}".format(
            k[0], len(k[1]),
            ','.join([i.word for i in w_list]),
            '\n'.join([i.sentence for i in s_list])))
        print('\n')
        print("【标题】:", '\n'.join([content[:20] for content in k[1]]))
        print("-------------------------------------------------------------------------")
def sina_keyword_abstract(article, keywords_len, sentences_len):
    # Extract keywords
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=1):
        keywords.append(item.word)
    # Extract the abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        # Skip "original title" / "editor in charge" / "source" boilerplate sentences
        if str(item.sentence).startswith('原标题') or \
                str(item.sentence).startswith('责任编辑') or \
                str(item.sentence).startswith('来源'):
            continue
        abstract.append(item.sentence + '。')
    abstract = '\n'.join(abstract)
    return keywords, abstract
def text_keyword_abstract(article, keywords_len, sentences_len):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=article, lower=True, window=2)
    keywords = []
    for item in tr4w.get_keywords(keywords_len, word_min_len=2):
        keywords.append(item.word)
    keywords = ' '.join(keywords)
    sentences = article.split('.')
    first_sentence = sentences[0]
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        if item.sentence != first_sentence:
            abstract.append(item.sentence + '.')
    abstract = '\n'.join(abstract)
    return keywords  # , abstract
def key_text(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    import_sentence = []
    # num is the number of key sentences to generate; item.index is the
    # sentence's position in the text, item.weight its weight
    for item in tr4s.get_key_sentences(num=3):
        import_sentence.append(item.sentence)
    # Core content: title + the top-3 key sentences of the body text.
    # (The original sliced each sentence with [2:] to strip an index prefix
    # that is no longer prepended, so the slicing is dropped here.)
    key_sentences = ''.join(import_sentence[:3])
    # Truncate overly long text
    if len(key_sentences) < 512:
        return key_sentences
    else:
        return key_sentences[:512]
def nlp(contents):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=''.join(contents), lower=True, window=2)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=''.join(contents), lower=True, source='all_filters')
    keyword = [item for item in tr4w.get_keywords(20, word_min_len=1)]
    keyphase = [
        item for item in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2)
    ]
    keysentence = [item for item in tr4s.get_key_sentences(num=3)]
    return keyword, keyphase, keysentence
def cctv_abstract(date):
    """Extract 10 sentences from that day's CCTV Xinwen Lianbo content as an abstract.

    :param date: str, the date, e.g. 20181222
    :return: str
    """
    news = ts_pro.cctv_news(date=date)
    contents = "".join(list(news['content']))
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=contents, lower=True, source='all_filters')
    abstract = []
    for i, item in enumerate(tr4s.get_key_sentences(num=10), 1):
        abstract.append("(%i) %s。\n" % (i, item.sentence))
    return "".join(abstract)
def get_event(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # Assumed to be fresh lists per call (they were undefined in the original)
    index_list = []
    sentence_list = []
    parse_list = []
    events = []
    # Keep the top half of the sentences (num must be an int),
    # skipping quoted speech containing '說'
    for item in tr4s.get_key_sentences(num=len(tr4s.sentences) // 2):
        if '說' not in item.sentence:
            index_list.append(item.index)
    index_list.sort()
    for i in index_list:
        sentence_list.append(tr4s.sentences[i])
    for i in index_list:
        s = tr4s.sentences[i]
        p = parse_tree(s)
        tmp_list = []
        for j in range(len(p)):
            if p[j][6] != 'N':
                tmp_list.append((p[j], s))
        parse_list.append(tmp_list)
    for parsed in parse_list:
        for s in parsed:
            index = get_index(s[0])
            index.sort()
            event = ''
            for i in index:
                start = end = None
                for j in range(i, len(s[0])):
                    if s[0][j] == ':':
                        start = j + 1
                    if s[0][j] == '|' or s[0][j] == ')':
                        end = j
                        break
                # Guard added: skip spans where the delimiters were not found
                if start is not None and end is not None and s[0][start:end] != '說':
                    event += s[0][start:end]
            if len(event) != 0:
                events.append(event)
    return events
def summary_candidate_fin(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # Keep only Chinese, English and digits (drops Korean/Japanese/German text,
    # emoji and other symbols); reference: https://zhuanlan.zhihu.com/p/84625185
    rule = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")
    tt = []
    # First pass: TextRank key sentences, each refined again with SnowNLP
    for i, item in enumerate(tr4s.get_key_sentences(num=3, sentence_min_len=80)):
        s = SnowNLP(item.sentence)
        secnd_sn = s.summary(3)
        for cont in secnd_sn:
            ttt = rule.sub(' ', str(cont))
            if len(ttt.split(' ')) < 3 and len(ttt) > 12:
                tt.append(ttt)
    # Second pass: SnowNLP summary of the full text, deduplicated via `word`
    s = SnowNLP(text)
    word = {}
    first_sn = s.summary(3)
    for cont in first_sn:
        ttt = rule.sub(' ', str(cont))
        if len(ttt.split(' ')) < 3 and len(ttt) > 12:
            if word.get(ttt) is None:
                word[ttt] = 1
                tt.append(ttt)
    if len(tt) == 0:
        print('無適合的標題')
        tt.append("無適合的標題")
    return tt
def get_summary(self, data, flag=0):
    text = "".join(data)
    if flag == 0:
        tr4w = TextRank4Keyword()
        tr4w.analyze(text=text, lower=True, window=2)
        # ret = tr4w.get_keywords()
        ret = tr4w.get_keyphrases(keywords_num=12, min_occur_num=0)
        if len(ret) > 0:
            return ret[0]
        else:
            return ""
    else:
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=text, lower=True)
        ret = tr4s.get_key_sentences(num=6, sentence_min_len=4)
        if len(ret) > 0:  # guard against an empty result before indexing
            return ret[0]['sentence']
        else:
            return ""
def process_introduce(self, introduce):
    # Strip whitespace and control characters from the introduction text
    introduce = introduce.replace('\n', '').replace('\r', '').replace(
        '\t', '').replace('\xa0', '').replace('\u3000', '').replace(' ', '')
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=introduce, lower=True, source='all_filters')
    total = len(tr4s.get_key_sentences())
    num_sentences = 3
    sentences = tr4s.get_key_sentences(num=num_sentences)
    # Restore the original document order of the selected sentences
    sentences = sorted(sentences, key=lambda x: x.index, reverse=False)
    news = []
    for i in range(min(total, num_sentences)):
        news.append(sentences[i].sentence)
    return ''.join(news)
def action_two():
    import pandas as pd
    from textrank4zh import TextRank4Keyword, TextRank4Sentence
    news = pd.read_table('textrank/news.txt', encoding='GB18030', header=None)
    strings = ''
    for index in range(news.shape[0]):
        strings += news.loc[index, 0]
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=strings, lower=True, window=3)
    print('关键词:')
    for item in tr4w.get_keywords(20, word_min_len=2):
        print(item.word, item.weight)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=strings, lower=True, source='all_filters')
    print('摘要:')
    # The three highest-weighted sentences
    for item in tr4s.get_key_sentences(num=3):
        print(item.weight, item.sentence)
def Keyword():
    text = codecs.open('/Users/liamtheron/Desktop/Deloiite/test.txt',
                       'r', encoding='utf-8').read()
    tr4w = TextRank4Keyword()
    tr4s = TextRank4Sentence()
    tr4w.analyze(text=text, lower=True, window=2)
    tr4s.analyze(text=text, lower=True)
    print('<关键词>:')
    for item in tr4w.get_keywords(20, word_min_len=1):
        print(item.word, item.weight)
    print()
    print('<关键短语>:')
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        print(phrase)
    print()
    print('<摘要>:')
    for item in tr4s.get_key_sentences(num=3):
        print(item.index, item.weight, item.sentence)
def textSummary(fileName, finalName):
    text = codecs.open(fileName, 'r', 'utf-8').read()
    tr4w = TextRank4Keyword()
    # In py2 `text` must be a utf8-encoded str or a unicode object;
    # in py3 it must be utf8-encoded bytes or a str object.
    tr4w.analyze(text=text, lower=True, window=2)  # keywords
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # Group the top 50 key sentences by their length
    sentences = {}
    for item in tr4s.get_key_sentences(num=50):
        if len(item.sentence) not in sentences:
            sentences[len(item.sentence)] = []
        sentences[len(item.sentence)].append(item.sentence)
    # Pick the first sentence from each of the three longest groups
    key_sentences = []
    for i in range(3):
        key = max(sentences.keys())
        key_sentences.append(sentences[key][0])
        sentences.pop(key)
    # Split the original text on CJK punctuation and keep, in document order,
    # the key sentences that contain one of the fragments
    text = re.split(
        pattern=r'[\u3000-\u301e\ufe10-\ufe19\ufe30-\ufe44\ufe50-\ufe6b\uff01-\uffee]',
        string=text)
    ks = []
    for te in text:
        for k in key_sentences:
            if str(te) in str(k) and str(k) not in ks:
                ks.append(k)
    line = ''
    for k in ks:
        line += k + " "
    with codecs.open(finalName + "output.txt", 'a', 'utf-8') as fw:
        fw.write(line)
def get_textrank4zh_summarization(contents):
    """
    Extract a summary of the text.
    :param contents: string
    :return: list of key-sentence items [{x}, {x}, ...]
    """
    # Return the top 5 key sentences by default
    topK = 5
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=contents, lower=True, source='all_filters')
    # Each item carries index (position in the text), weight and sentence
    result = tr4s.get_key_sentences(num=topK)
    return result
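# A hedged usage sketch for get_textrank4zh_summarization; the sample text below
# is an assumption made up for the example. Each returned item exposes index,
# weight and sentence, as the original function's comments describe.
doc = '人工智能正在改变各行各业。机器学习是其中的核心技术。深度学习推动了图像与语音识别的进步。'
for item in get_textrank4zh_summarization(doc):
    print('文本位置:{}, 权重:{}, 内容:{}'.format(item.index, item.weight, item.sentence))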
def extract_key_information(self, num_key_word=30, num_key_phrase=20, num_key_sentence=5):
    text = ''.join(self.article_list)
    # Create the keyword-extraction instance
    tr4w = TextRank4Keyword()
    # Analyze the text with a window size of 2, lowercasing English words
    tr4w.analyze(text=text, lower=True, window=2)
    with open(self.rule_reference_filename, "a") as f:
        # Write the top keywords, skipping stopwords and filtered words
        f.write('###########################关 键 词##################################' + '\n')
        for item in tr4w.get_keywords(num=num_key_word, word_min_len=1):
            if item.word in self.stopwords or item.word in self.filter_dictionary:
                continue
            f.write(item.word + '\t' + str(item.weight) + '\n')
    with open(self.rule_reference_filename, "a") as f:
        # Write the top key phrases
        f.write('##########################关 键 短 语##################################' + '\n')
        for phrase in tr4w.get_keyphrases(keywords_num=num_key_phrase, min_occur_num=2):
            f.write(phrase + '\n')
    # Create the sentence-extraction instance: lowercase English words,
    # apply POS filtering and remove stopwords
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    with open(self.rule_reference_filename, "a") as f:
        # Write the top key sentences
        f.write('###########################关 键 句##################################' + '\n')
        for item in tr4s.get_key_sentences(num=num_key_sentence):
            f.write(str(item.index) + str(item.weight) + str(item.sentence) + '\n')
        f.write('----------------' + '\n')