def get_dialog(text):
    """Extract the spoken dialog (text after '「') from *text*.

    A quote whose spoken part ends with ',' is treated as a continuation
    and merged with the following quote.

    :param text: raw story text
    :return: list of dialog strings
    """
    tr4s = TextRank4Sentence(delimiters=dialog_sentence_delimiters)
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # Keep only sentences that contain an opening quote mark.
    sentences = [s for s in tr4s.sentences if '「' in s]
    # The text after the first '「' is the spoken part.
    speak = [s.split('「')[1] for s in sentences]
    final_speak = []
    i = 0
    # A while loop lets a merged continuation actually be skipped.
    # (The original `i += 1` inside a for loop was a no-op, which
    # duplicated continuations, and range(len - 1) dropped the last quote;
    # `speak[i][-1]` also crashed on an empty spoken part.)
    while i < len(speak):
        if speak[i].endswith(',') and i + 1 < len(speak):
            final_speak.append(speak[i] + speak[i + 1])
            i += 2  # consumed two entries
        else:
            final_speak.append(speak[i])
            i += 1
    return final_speak
def summary_main(weibo_data):
    """Generate an automatic summary from a list of weibo posts.

    :param weibo_data: list of weibo texts, e.g. [weibo1, weibo2, ...]
    :return: list of up to 10 key sentences
    """
    word_result, word_weight = word_net(weibo_data, 5)
    text_list = text_net(word_result, word_weight, weibo_data)
    # Normalize every cleaned piece so it ends with '。', then join once.
    pieces = []
    for text in text_list:
        cleaned = re_cut(text)
        if not len(cleaned):
            continue
        pieces.append(cleaned if cleaned[-1] == '。' else cleaned + '。')
    text_str = ''.join(pieces)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text_str, lower=True, source='all_filters')
    return [item.sentence for item in tr4s.get_key_sentences(num=10)]
def keys(number):
    """Print and persist keywords, key phrases and a summary for article *number*.

    Reads the article via ``texts(number)`` and appends the extraction
    results plus the full article text to ``<number>.txt``.
    """
    key_words = '---关键词:\n'
    key_phrases = '---关键短语:\n'
    key_sentences = '---摘要:\n'
    txt = texts(number)
    tr4w = TextRank4Keyword()
    # py2: text must be a utf8-encoded str or unicode; py3: utf8 bytes or str.
    tr4w.analyze(text=txt, lower=True, window=2)
    print('关键词:')
    # Compute once and reuse instead of ranking the keywords twice.
    keywords = tr4w.get_keywords(20, word_min_len=1)
    print(keywords)
    for item in keywords:
        key_words = key_words + item.word + '\n'
        print(item.word, item.weight)
    print()
    print('关键短语:')
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        key_phrases = key_phrases + phrase + '\n'
        print(phrase)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=txt, lower=True, source='all_filters')
    print()
    print('摘要:')
    for item in tr4s.get_key_sentences(num=20):
        key_sentences = key_sentences + item.sentence + '\n'
        print(item.index, item.weight, item.sentence)
    # Context manager guarantees the file is closed even if write() fails.
    with open(str(number) + '.txt', 'a+', encoding='utf-8') as text:
        text.write(key_words + '\n' + key_phrases + '\n' + key_sentences +
                   '\n---全部文章:\n' + txt)
    print('全部文章:')
    print(txt)
def catch_label():
    """Summarize every "text"-level row of the global ``df_level``.

    :return: (summary, summary_index) — the best six sentences per
             TextRank and the dataframe indices they came from
    """
    print('in catch_label')
    global df_level
    # All rows whose level is "text".
    paragraph = df_level[df_level["level"] == "text"]
    sentence = paragraph["topic"].values
    index = paragraph.index
    # One sentence per line for the ranker.
    par = "".join(s + "\n" for s in sentence)
    print(sentence)
    print(index)
    summary = []
    summary_index = []
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=par, lower=True, source='all_filters')
    for item in tr4s.get_key_sentences(num=6):  # keep the best 6 sentences
        summary.append(item.sentence)
        # Map the chosen sentence back to its originating dataframe index.
        for pos, s in enumerate(sentence):
            if item.sentence == s:
                summary_index.append(index[pos])
                break
    print('catch_label end')
    return summary, summary_index
def __init__(self, path):
    """Initialize both TextRank extractors with a shared stop-word list.

    :param path: path to the stop-word file
    """
    self.tr4s = TextRank4Sentence(stop_words_file=path)
    self.tr4w = TextRank4Keyword(stop_words_file=path)
def save(self, *args, **kwargs):
    """Auto-generate an excerpt via TextRank before saving the post."""
    # Only build an excerpt when the author did not write one.
    if not self.excerpt:
        # Render the Markdown body to HTML, then strip all HTML tags
        # so TextRank sees plain text.
        md = markdown.Markdown(extensions=[
            'markdown.extensions.extra',
            'markdown.extensions.codehilite',
        ])
        plain = strip_tags(md.convert(self.body))
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=plain, lower=True, source='all_filters')
        # Take the single best sentence as the excerpt.
        # NOTE(review): assumes self.excerpt is '' (not None) here — verify field default.
        for item in tr4s.get_key_sentences(num=1):
            self.excerpt += (item.sentence + "。")
        self.excerpt = "摘要:" + self.excerpt
    if not self.id:
        self.created_time = timezone.now()
    self.modified_time = timezone.now()
    # Delegate the actual persistence to the parent model.
    super(Post, self).save(*args, **kwargs)
def get_summary_douhao(title, data, filters):
    """Build a short comma-joined summary of *data*.

    First tries to grow a summary from the best-ranked sentence by gluing
    on adjacent sentences (by original index); if that yields <= 12 chars,
    falls back to concatenating up to 4 short filtered sentences in
    document order.

    NOTE(review): uses ``xrange`` — Python 2 only code.
    :param title: unused in this function
    :param data: source text
    :param filters: passed to ``filter_sentences`` (defined elsewhere)
    :return: summary string (may be empty)
    """
    # make up new delimiters which includes the ","
    delimiters = [
        '?', '!', ',', ';', ',', '?', '!', '。', '……', '…', '-', '【', '】', '\n'
    ]
    tr4s = TextRank4Sentence(delimiters)
    tr4s.analyze(text=data, lower=True, source='all_filters')
    summary = ""
    # get the top 20 sentences
    items = tr4s.get_key_sentences(num=20, sentence_min_len=2)
    for i in xrange(len(items)):
        # Restart the candidate summary from the i-th ranked sentence.
        summary = ""
        length = 0
        index = 0
        preindex = 0
        if filter_sentences(items[i].sentence, filters) and len(
                items[i].sentence) != 0 and len(items[i].sentence) <= 30:
            summary = items[i].sentence
            # preindex/index track the leftmost/rightmost original sentence
            # index currently covered by the summary.
            preindex = items[i].index
            index = items[i].index
            length = len(items[i].sentence)
        else:
            # Ranked list is ordered; once a seed fails, stop trying.
            break
        for j in xrange(len(items) - i - 1):
            # Try to extend the summary with lower-ranked sentences that
            # are directly adjacent (by index) to what we already have.
            if filter_sentences(items[i + j + 1].sentence, filters) and len(
                    items[i + j + 1].sentence) != 0 and length + len(
                        items[i + j + 1].sentence) <= 30:
                if preindex - 1 == items[i + j + 1].index:
                    # Sentence immediately before the current span: prepend.
                    summary = items[i + j + 1].sentence + ',' + summary
                    preindex -= 1
                elif index + 1 == items[i + j + 1].index:
                    # Sentence immediately after the current span: append.
                    summary = summary + ',' + items[i + j + 1].sentence
                    index += 1
                if len(summary) >= 12:
                    break
        if len(summary) >= 12:
            break
    # make sure that length of summary larger than 12
    if len(summary) <= 12:
        # Fallback: rebuild from scratch out of short filtered sentences.
        summary = ""
        # sort the sentences by index
        # NOTE(review): item["index"] relies on the ranked items supporting
        # dict-style access as well as attribute access — confirm with the
        # textrank4zh version in use.
        items = sorted(items, key=lambda item: item["index"])
        count = 0
        # get upto max 4 sentences to makeup the summary
        for item in items:
            if filter_sentences(item.sentence, filters) and len(item.sentence) != 0 and len(
                    item.sentence) <= 25 and count < 4:
                summary += item.sentence + ','
                count += 1
            if len(summary) > 15 or count >= 4:
                break
    return summary
def __init__(self):
    """Load the LTP models (segmenter, POS tagger, parser) and a sentence ranker."""
    self.tr = TextRank4Sentence()
    self.parser = Parser()
    self.parser.load(parserPath)
    self.pos = Postagger()
    self.pos.load(posPath)
    self.seg = Segmentor()
    self.seg.load(cwsPath)
def zy():
    """Flask view: return the single most important sentence of the posted text.

    Expects a POST form field 'data' holding JSON with a 'value' key.
    Returns None (implicitly) for non-POST requests or empty results.
    """
    if request.method == "POST":
        payload = json.loads(request.form.get('data'))
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=str(payload['value']), lower=True, source='all_filters')
        top = tr4s.get_key_sentences(num=1)
        if top:
            return top[0].sentence
def get_summary(text, num=2,
                stop_words_file='C:/Users/weiqing.xwq/Desktop/TextRank4ZH-master/textrank4zh/stopwords.txt'):
    """提取摘要 — return the *num* most important sentences of *text*.

    :param text: source text
    :param num: number of summary sentences to return
    :param stop_words_file: stop-word list for TextRank; the original
        hard-coded absolute path is kept as the default for compatibility
    :return: list of summary sentences
    """
    tr4s = TextRank4Sentence(stop_words_file=stop_words_file)
    # `lower` is a boolean flag; the original passed 2 (truthy) by mistake.
    tr4s.analyze(text=text, lower=True, source='no_stop_words')
    return [item.sentence for item in tr4s.get_key_sentences(num)]
def summary_text_rank(rec):
    """Return the top TextRank sentence of rec['article'], or the article itself if none."""
    article = rec['article']
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    ranked = list(tr4s.get_key_sentences(sentence_min_len=1))
    return ranked[0]['sentence'] if ranked else article
def do_extract_summarize(content):
    """Return up to five key sentences of *content*, space-separated."""
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=content, lower=True, source='no_stop_words')
    ranked = tr4s.get_key_sentences(num=5, sentence_min_len=2)
    # Suffix each sentence with a space, then strip — matches the
    # original concatenate-then-strip behavior exactly.
    return ''.join(item['sentence'] + ' ' for item in ranked).strip()
def text_abstract(content_str):
    """Print the three highest-ranked sentences of *content_str* with index and weight."""
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=content_str, lower=True, source='all_filters')
    print()
    print('摘要:')
    for sent in tr4s.get_key_sentences(num=3):
        print(sent.index, sent.weight, sent.sentence)
def tr():
    """Preprocess the first ~1000 records of 'train_shuffle' for a QA model.

    For each JSON record: segments alternatives/passage/query/answer with
    jieba, compresses over-long passages to their best TextRank sentence,
    accumulates a vocabulary, fits a keras Tokenizer on it, and pickles
    everything to 'token.pick'.

    NOTE(review): the chained `.replace(' ', ...)` calls below look like
    they originally replaced punctuation / full-width spaces that were
    lost when this file was flattened — verify against the original source.
    """
    all_words = {'\n'}
    alternatives = []
    passages = []
    querys = []
    answers = []
    # NOTE(review): file handle is never closed; 'train_shuffle' is an
    # unvalidated pickle — only load trusted data.
    train = pickle.load(open('train_shuffle', 'rb'))
    n = 0
    for i in train:
        n += 1
        # Hard cap: only the first 1001 records are processed.
        if n > 1001:
            break
        i = json.loads(i)
        # Candidate answers are '|'-separated; segment each one.
        alternative = [
            ' '.join(jieba.cut(ii, cut_all=True, HMM=False))
            for ii in i.get('alternatives').split('|')
        ]
        alternatives.append(alternative)
        passage = ' '.join(
            jieba.cut(i.get('passage').replace(' ', ''),
                      cut_all=True,
                      HMM=False)).replace(' ', ' , ').replace(' ', ' 。 ')
        if len(passage.split(' ')) > 300:
            # Passage too long: keep only its single best TextRank
            # sentence, truncated to 300 chars, re-segmented.
            trs = TextRank4Sentence()
            trs.analyze(text=i.get('passage').replace(' ', ''),
                        lower=True,
                        source='all_filters')
            # NOTE: the comprehension variable `i` shadows the outer loop
            # variable; harmless in py3 (comprehensions have their own scope).
            passages.append(' '.join(
                jieba.cut('。'.join(
                    [i.sentence for i in trs.get_key_sentences(1)])[0:300],
                          cut_all=True,
                          HMM=False)).replace(' ', ' , ').replace(' ', ' 。 '))
        else:
            passages.append(passage)
        query = ' '.join(
            jieba.cut(i.get('query').replace(' ', ''),
                      cut_all=True,
                      HMM=False)).replace(' ', ' ').replace(' ', '')
        querys.append(query)
        answer = ' '.join(
            jieba.cut(i.get('answer').replace(' ', ''),
                      cut_all=True,
                      HMM=False)).replace(' ', ' ').replace(' ', '')
        answers.append(answer)
        # Grow the vocabulary with all tokens seen so far.
        for ii in alternative:
            ii = set(ii.split(' '))
            all_words |= ii
        all_words |= set(passage.split(' ')) | set(query.split(' '))
    token = text.Tokenizer()
    token.fit_on_texts(all_words)
    with open('token.pick', 'wb') as f:
        pickle.dump([token, alternatives, passages, querys, answers], f)
def abstract_extraction(articles):
    """Return the single best TextRank sentence for each article in *articles*."""
    ranker = TextRank4Sentence()
    abstract_sentence = []
    # Reuse one analyzer instance across articles, as the original did.
    for article in articles:
        ranker.analyze(text=article)
        abstract_sentence.append(ranker.get_key_sentences(num=1)[0]['sentence'])
    return abstract_sentence
def tencent_keyword_abstract(article, sentences_len):
    """Build a newline-joined abstract of *article*.

    :param article: source text
    :param sentences_len: number of sentences to keep (each suffixed '。')
    :return: abstract string
    """
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    parts = [item.sentence + '。'
             for item in tr4s.get_key_sentences(num=sentences_len)]
    return '\n'.join(parts)
def ldaSentence():
    """Flask view: return the top-3 key sentences of the 'content' query arg.

    NOTE: returns the repr of the ranked-item list (original behavior).
    """
    doc_sen = request.args.get("content")
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=doc_sen, lower=True, source='all_filters')
    return str(tr4s.get_key_sentences(num=3))
def getkeysent(self, text, num=2):
    """Return the *num* highest-ranked summary sentences of *text*."""
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    return [item.sentence for item in tr4s.get_key_sentences(num)]
def get_key_sentence(index, text, result):
    """Append (index, top-3 abstract sentences) to *result* and return it."""
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    abstract = [item.sentence for item in tr4s.get_key_sentences(num=3)]
    # Matches the original shape: a one-element list holding the tuple.
    result.append([(index, abstract)])
    return result
def TextRankSentence(input):
    """Return the five highest-ranked sentences of input['content'].

    NOTE: the parameter name shadows the builtin `input`; kept for callers.
    """
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=input['content'], lower=True, source='all_filters')
    return tr4s.get_key_sentences(num=5)
def nlp(contents):
    """Run keyword, key-phrase and key-sentence extraction on the joined contents.

    :param contents: iterable of text fragments
    :return: (keywords, keyphrases, keysentences)
    """
    joined = ''.join(i for i in contents)
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=joined, lower=True, window=2)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=joined, lower=True, source='all_filters')
    keyword = list(tr4w.get_keywords(20, word_min_len=1))
    keyphase = list(tr4w.get_keyphrases(keywords_num=20, min_occur_num=2))
    keysentence = list(tr4s.get_key_sentences(num=3))
    return keyword, keyphase, keysentence
def get_abstract(texts: list) -> str:
    """Pick the single best sentence (TextRank) from the given text lines.

    :param texts: list, 原文本 (text lines)
    :return: str, 文本摘要 (the abstract sentence)
    """
    tr4s = TextRank4Sentence(delimiters='\n')
    tr4s.analyze(text='\n'.join(texts), lower=True, source='all_filters')
    top = tr4s.get_key_sentences(num=1)
    return top[0]['sentence']
def get_key_sentences(text, num=1):
    """Return *num* TextRank summary sentences of *text*, newline-joined.

    :param text: source text
    :param num: number of summary sentences
    :return: newline-joined abstract string
    """
    tr4s = TextRank4Sentence(delimiters='\n')
    tr4s.analyze(text=text, lower=True, source='all_filters')
    sentences = [item.sentence for item in tr4s.get_key_sentences(num=num)]
    return '\n'.join(sentences)
def input(path='./test/doc/01.txt'):
    """Read a UTF-8 text file and return an analyzed TextRank4Sentence.

    NOTE: shadows the builtin `input`; the name is kept for existing callers.

    :param path: path to the document
    :return: TextRank4Sentence instance, already analyzed
    """
    from textrank4zh import TextRank4Sentence
    # Context manager closes the handle instead of leaking it.
    with codecs.open(path, 'r', 'utf-8') as f:
        text = f.read()
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    return tr4s
def get_chinese_summary(text):
    """Return the three highest-ranked summary sentences of *text*."""
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    return [item.sentence for item in tr4s.get_key_sentences(num=3)]
def input(path):
    """Read a GBK-encoded file and return an analyzed TextRank4Sentence.

    NOTE: shadows the builtin `input`; the name is kept for existing callers.

    :param path: path to the document (GBK encoded)
    :return: TextRank4Sentence instance, already analyzed
    """
    import codecs
    # Context manager closes the handle instead of leaking it.
    with codecs.open(path, 'r', 'gbk') as f:
        text = f.read()
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    return tr4s
def summarize(self, content: str, title: str = None, proportion=0.3):
    """Summarize *content*, keeping roughly *proportion* of its sentences.

    :param content: body text
    :param title: optional title, joined with '。' before the content so
        it participates in ranking
    :param proportion: fraction of sentences to keep
    :return: list of summary sentences (each suffixed '。'), in document order
    """
    import math
    tr4s = TextRank4Sentence()
    # `is not None` instead of `!= None`; same branch semantics.
    text = "。".join([title, content]) if title is not None else content
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # The original passed a float as `num`. ceil() passes a proper int and
    # is behavior-identical, since the library compares an integer count
    # against num (count >= x  <=>  count >= ceil(x) for integer count).
    num = math.ceil(len(tr4s.sentences) * proportion)
    summarySentences = tr4s.get_key_sentences(num=num)
    summarySentences.sort(key=lambda item: item.index)
    return [sen.sentence + "。" for sen in summarySentences]
def getAbs(text):
    """Return the single best summary sentence of *text* ('' if none)."""
    tr4w = TextRank4Keyword()
    # py2: text must be a utf8-encoded str or unicode; py3: utf8 bytes or str.
    tr4w.analyze(text=text, lower=True, window=2)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    top = tr4s.get_key_sentences(num=1)
    return top[0].sentence if top else ''
def text_abstract(text):
    """Return an index-ordered abstract: a quarter of the sentences, capped at 5."""
    tr4s = TextRank4Sentence()
    tr4s.analyze(text)
    # A quarter of the sentence count, but never more than five.
    sentences_num = min(len(tr4s.sentences) // 4, 5)
    abstract_sentences = tr4s.get_key_sentences(num=sentences_num)
    abstract_sentences.sort(key=lambda x: x['index'])
    return abstract_sentences
def keysentences_extraction(text):
    """Extract the 3 most important sentences (min length 6) from *text*.

    analyze() arguments:
      text   -- the document, as a string
      lower  -- whether to lowercase English text (default False)
      source -- which token stream drives sentence similarity:
                'no_filter', 'no_stop_words' or 'all_filters' (default)
    """
    tr4s = TextRank4Sentence()
    tr4s.analyze(text, lower=True, source='all_filters')
    # The top `num` sentences of length >= sentence_min_len form the summary.
    return tr4s.get_key_sentences(num=3, sentence_min_len=6)