Example #1
def get_dialog(text):
    # Split on dialog-aware delimiters so quoted speech stays intact.
    tr4s = TextRank4Sentence(delimiters=dialog_sentence_delimiters)
    tr4s.analyze(text=text, lower=True, source='all_filters')

    # Keep only sentences that contain an opening quote bracket.
    sentences = [s for s in tr4s.sentences if '「' in s]

    # Take the text after the opening bracket as the spoken part.
    speak = [s.split('「')[1] for s in sentences]

    # Merge a sentence that ends with a comma into the following one.
    # A while loop is used so the merged sentence can be skipped;
    # reassigning the index inside a `for i in range(...)` has no effect.
    final_speak = []
    i = 0
    while i < len(speak):
        if speak[i][-1] == ',' and i + 1 < len(speak):
            final_speak.append(speak[i] + speak[i + 1])
            i += 2
        else:
            final_speak.append(speak[i])
            i += 1
    return final_speak


#text = codecs.open('小紅帽.txt', 'r', 'utf-8').read()
#print(get_dialog(text))
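The snippet never defines dialog_sentence_delimiters. A minimal sketch of a plausible definition, assuming the library's default sentence delimiters plus the closing quote bracket 」 so each quoted utterance ends its own sentence; the variable name comes from the snippet, but these contents are an assumption:

# Assumed contents; not part of the original snippet.
dialog_sentence_delimiters = ['?', '!', ';', '?', '!', '。', ';', '……', '…', '\n', '」']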
Example #2
def summary_main(weibo_data):  # main function for automatic summary generation
    '''
        Input:
        a list of weibo posts: [weibo1, weibo2, ...]
    '''

    word_result, word_weight = word_net(weibo_data, 5)

    text_list = text_net(word_result, word_weight, weibo_data)

    # Join the cleaned sentences into one string, ensuring each ends with '。'.
    text_str = ''
    for text in text_list:
        re_t = re_cut(text)
        if not len(re_t):
            continue
        if re_t[-1] != '。':
            text_str = text_str + re_t + '。'
        else:
            text_str = text_str + re_t
    # print(text_str)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text_str, lower=True, source='all_filters')

    result = []
    for item in tr4s.get_key_sentences(num=10):
        result.append(item.sentence)

    return result
Example #3
def keys(number):
    key_words = '---关键词:\n'
    key_phrases = '---关键短语:\n'
    key_sentences = '---摘要:\n'
    txt = texts(number)
    # text = codecs.open(str(number) + '.txt', 'a+', 'utf-8').read()
    tr4w = TextRank4Keyword()
    tr4w.analyze(
        text=txt, lower=True, window=2
    )  # In py2, text must be a utf-8 encoded str or a unicode object; in py3, a utf-8 encoded bytes or a str object
    print('关键词:')
    keywords = tr4w.get_keywords(20, word_min_len=1)  # compute once, reuse below
    print(keywords)
    for item in keywords:
        key_words = key_words + item.word + '\n'
        print(item.word, item.weight)
    print()
    print('关键短语:')
    for phrase in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2):
        key_phrases = key_phrases + phrase + '\n'
        print(phrase)
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=txt, lower=True, source='all_filters')
    print()
    print('摘要:')
    for item in tr4s.get_key_sentences(num=20):
        key_sentences = key_sentences + item.sentence + '\n'
        print(item.index, item.weight, item.sentence)
    text = open(str(number) + '.txt', 'a+', encoding='utf-8')
    text.write(key_words + '\n' + key_phrases + '\n' + key_sentences +
               '\n---全部文章:\n' + txt)  # .encode("gbk", "ignore")
    text.close()
    print('全部文章:')
    print(txt)
Example #4
def catch_label():
    print('in catch_label')
    global df_level
    paragraph = df_level[df_level["level"] == "text"]  # gather all rows whose level is "text" into a new DataFrame
    par = ""
    summary = []
    summary_index = []
    sentence = paragraph["topic"].values  # extract the sentences
    index = paragraph.index
    for i in range(len(sentence)):
        par += sentence[i] + "\n"
    print(sentence)
    print(index)

    # text summarization
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=par, lower=True, source='all_filters')

    for i in tr4s.get_key_sentences(num=6):  # num=6 returns the 6 best sentences
        summary.append(i.sentence)

        # map each summary sentence back to its original DataFrame index
        for j in range(len(sentence)):
            if i.sentence == sentence[j]:
                summary_index.append(index[j])
                break
    print('catch_label end')
    return summary, summary_index
Example #5
    def __init__(self, path):
        """
        Initializer; loads the stop-word list.
        :param path: path to the stop-word file
        """
        self.tr4w = TextRank4Keyword(stop_words_file=path)
        self.tr4s = TextRank4Sentence(stop_words_file=path)
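The enclosing class is not shown. A minimal self-contained sketch of how such a wrapper might be used; the class name TextRankWrapper, the stop-word path, and the sample text are assumptions for illustration:

from textrank4zh import TextRank4Keyword, TextRank4Sentence

# Hypothetical wrapper class; only __init__ appears in the original example.
class TextRankWrapper:
    def __init__(self, path):
        self.tr4w = TextRank4Keyword(stop_words_file=path)
        self.tr4s = TextRank4Sentence(stop_words_file=path)

wrapper = TextRankWrapper('stopwords.txt')  # assumed path
wrapper.tr4s.analyze(text='这是第一句话。这是第二句话。', lower=True, source='all_filters')
print([item.sentence for item in wrapper.tr4s.get_key_sentences(num=1)])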
Example #6
    def save(self, *args, **kwargs):
        # If no excerpt was written by hand
        if not self.excerpt:
            # First instantiate a Markdown object to render the body text
            md = markdown.Markdown(extensions=[
                'markdown.extensions.extra',
                'markdown.extensions.codehilite',
            ])
            # Render the Markdown text to HTML, then strip_tags removes all
            # HTML tags; the old approach assigned the first 50 characters:
            # self.excerpt = strip_tags(md.convert(self.body))[:50]
            tr4s = TextRank4Sentence()
            # tr4w = TextRank4Keyword()
            tr4s.analyze(text=strip_tags(md.convert(self.body)), lower=True, source='all_filters')
            # tr4w.analyze(text=strip_tags(md.convert(self.body)), lower=True, window=2)
            for item in tr4s.get_key_sentences(num=1):
                self.excerpt += (item.sentence + "。")
            self.excerpt = "摘要:" + self.excerpt
        if not self.id:
            self.created_time = timezone.now()
        self.modified_time = timezone.now()

        # Call the parent class's save() to persist the data to the database
        super(Post, self).save(*args, **kwargs)
Example #7
def get_summary_douhao(title, data, filters):
    # make up new delimiters which includes the ","
    delimiters = [
        '?', '!', ',', ';', ',', '?', '!', '。', '……', '…', '-', '【', '】', '\n'
    ]
    tr4s = TextRank4Sentence(delimiters=delimiters)
    tr4s.analyze(text=data, lower=True, source='all_filters')

    summary = ""

    # get the top 20 sentences
    items = tr4s.get_key_sentences(num=20, sentence_min_len=2)

    for i in range(len(items)):
        summary = ""
        length = 0
        index = 0
        preindex = 0
        if filter_sentences(items[i].sentence, filters) and len(
                items[i].sentence) != 0 and len(items[i].sentence) <= 30:
            summary = items[i].sentence
            preindex = items[i].index
            index = items[i].index
            length = len(items[i].sentence)
        else:
            break
        for j in range(len(items) - i - 1):
            if filter_sentences(items[i + j + 1].sentence, filters) and len(
                    items[i + j + 1].sentence) != 0 and length + len(
                        items[i + j + 1].sentence) <= 30:
                if preindex - 1 == items[i + j + 1].index:
                    summary = items[i + j + 1].sentence + ',' + summary
                    preindex -= 1
                elif index + 1 == items[i + j + 1].index:
                    summary = summary + ',' + items[i + j + 1].sentence
                    index += 1

            if len(summary) >= 12:
                break

        if len(summary) >= 12:
            break

    # fall back when the summary is still no longer than 12 characters
    if len(summary) <= 12:
        summary = ""
        # sort the sentences by index
        items = sorted(items, key=lambda item: item["index"])
        count = 0
        # get upto max 4 sentences to makeup the summary
        for item in items:
            if filter_sentences(item.sentence,
                                filters) and len(item.sentence) != 0 and len(
                                    item.sentence) <= 25 and count < 4:
                summary += item.sentence + ','
                count += 1
                if len(summary) > 15 or count >= 4:
                    break
    return summary
Example #8
    def __init__(self):
        # Load the LTP word segmenter, POS tagger, and dependency parser;
        # the model paths are defined elsewhere.
        self.seg = Segmentor()
        self.seg.load(cwsPath)
        self.pos = Postagger()
        self.pos.load(posPath)
        self.parser = Parser()
        self.parser.load(parserPath)
        self.tr = TextRank4Sentence()
Example #9
File: pmain.py  Project: AstroChee/chrome-
def zy():
    if request.method == "POST":
        data = json.loads(request.form.get('data'))
        text = data['value']
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=str(text), lower=True, source='all_filters')
        for item in tr4s.get_key_sentences(num=1):
            return item.sentence
Example #10
def get_summary(text, num=2):
    """Extract a summary."""
    tr4s = TextRank4Sentence(
        stop_words_file=
        'C:/Users/weiqing.xwq/Desktop/TextRank4ZH-master/textrank4zh/stopwords.txt'
    )
    tr4s.analyze(text=text, lower=True, source='no_stop_words')
    return [item.sentence for item in tr4s.get_key_sentences(num)]
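A usage sketch, assuming the stop-word file at the hard-coded path above exists; the sample text is made up for illustration:

text = '今天天气很好。我们决定去公园散步。公园里有很多人在锻炼身体。'
print(get_summary(text, num=2))  # the two highest-ranked sentences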
Example #11
def summary_text_rank(rec):
    text = rec['article']
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    rst = list(tr4s.get_key_sentences(sentence_min_len=1))
    if len(rst) >= 1:
        return rst[0]['sentence']
    return text
Example #12
def do_extract_summarize(content):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=content, lower=True, source='no_stop_words')
    key_sentences = tr4s.get_key_sentences(num=5, sentence_min_len=2)
    key_stc = ""
    for sentences in key_sentences:
        key_stc = key_stc + sentences['sentence'] + " "
    return key_stc.strip()
Example #13
def text_abstract(content_str):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=content_str, lower=True, source='all_filters')

    print()
    print('摘要:')
    for item in tr4s.get_key_sentences(num=3):
        print(item.index, item.weight, item.sentence)
Example #14
def tr():
    all_words = {'\n'}
    alternatives = []
    passages = []
    querys = []
    answers = []
    train = pickle.load(open('train_shuffle', 'rb'))
    n = 0
    for i in train:
        n += 1
        if n > 1001:
            break
        i = json.loads(i)
        alternative = [
            ' '.join(jieba.cut(ii, cut_all=True, HMM=False))
            for ii in i.get('alternatives').split('|')
        ]
        alternatives.append(alternative)
        passage = ' '.join(
            jieba.cut(i.get('passage').replace(' ', ''),
                      cut_all=True,
                      HMM=False)).replace('   ', ' , ').replace('  ', ' 。 ')

        if len(passage.split(' ')) > 300:
            trs = TextRank4Sentence()
            trs.analyze(text=i.get('passage').replace(' ', ''),
                        lower=True,
                        source='all_filters')
            passages.append(' '.join(
                jieba.cut('。'.join(
                    [i.sentence for i in trs.get_key_sentences(1)])[0:300],
                          cut_all=True,
                          HMM=False)).replace('   ',
                                              ' , ').replace('  ', ' 。 '))
        else:
            passages.append(passage)

        query = ' '.join(
            jieba.cut(i.get('query').replace(' ', ''), cut_all=True,
                      HMM=False)).replace('   ', ' ').replace('  ', '')
        querys.append(query)

        answer = ' '.join(
            jieba.cut(i.get('answer').replace(' ', ''),
                      cut_all=True,
                      HMM=False)).replace('   ', ' ').replace('  ', '')
        answers.append(answer)

        for ii in alternative:
            ii = set(ii.split(' '))
            all_words |= ii
        all_words |= set(passage.split(' ')) | set(query.split(' '))

    token = text.Tokenizer()
    token.fit_on_texts(all_words)

    with open('token.pick', 'wb') as f:
        pickle.dump([token, alternatives, passages, querys, answers], f)
Example #15
def abstract_extraction(articles):
    abstract_sentence = []
    num_sentence = 1
    tr4s = TextRank4Sentence()
    for i in range(len(articles)):
        tr4s.analyze(text=articles[i])
        abstract_sentence.append(
            tr4s.get_key_sentences(num=num_sentence)[0]['sentence'])
    return abstract_sentence
Example #16
def tencent_keyword_abstract(article, sentences_len):
    # extract the abstract
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=article, lower=True, source='all_filters')
    abstract = []
    for item in tr4s.get_key_sentences(num=sentences_len):
        abstract.append(item.sentence + '。')
    abstract = '\n'.join(abstract)
    return abstract
Example #17
def ldaSentence():

    doc_sen = request.args.get("content")
    # return the key sentences
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=doc_sen, lower=True, source='all_filters')
    item_sentence = str(tr4s.get_key_sentences(num=3))

    return item_sentence
Example #18
    def getkeysent(self, text, num=2):
        '''Extract summary sentences from text; num sets the number of key sentences.
        '''
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=text, lower=True, source='all_filters')
        keysent = []
        for item in tr4s.get_key_sentences(num):
            keysent.append(item.sentence)
        return keysent
Example #19
def get_key_sentence(index, text, result):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # print( '摘要:' )
    abstract = []
    for item in tr4s.get_key_sentences(num=3):
        # print(item.index, item.weight, item.sentence)
        abstract.append(item.sentence)
    result.append([(index, abstract)])
    return result
Example #20
def TextRankSentence(input):

    text = input['content']
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    result = tr4s.get_key_sentences(num=5)
    # for item in tr4s.get_key_sentences(num=5):
    #     result['sentence'] = item.sentence
    #     print(item.index, item.weight, item.sentence)
    return result
Example #21
def nlp(contents):
    tr4w = TextRank4Keyword()
    tr4w.analyze(text=''.join(contents), lower=True, window=2)

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=''.join(contents), lower=True, source='all_filters')

    keyword = [item for item in tr4w.get_keywords(20, word_min_len=1)]
    keyphase = [item for item in tr4w.get_keyphrases(keywords_num=20, min_occur_num=2)]
    keysentence = [item for item in tr4s.get_key_sentences(num=3)]
    return keyword, keyphase, keysentence
Example #22
File: main.py  Project: jesHrz/tipdm
def get_abstract(texts: list) -> str:
    """
    Use the TextRank algorithm to get a text summary.
    :param texts: list, the original text
    :return: str, the text summary
    """
    text = '\n'.join(texts)
    tr4s = TextRank4Sentence(delimiters='\n')
    tr4s.analyze(text=text, lower=True, source='all_filters')
    abstract = tr4s.get_key_sentences(num=1)[0]['sentence']
    return abstract
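A usage sketch, assuming textrank4zh is imported as in the other examples; the input lines are made up for illustration. Because the only delimiter is '\n', each list element is treated as one sentence:

paragraphs = [
    'TextRank是一种基于图的排序算法。',
    '它通过句子之间的相似度构建图。',
    '权重最高的句子被选为摘要。',
]
print(get_abstract(paragraphs))  # prints the single highest-ranked line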
Example #23
def get_key_sentences(text, num=1):
    """
    Use the TextRank algorithm to get a text summary.
    :param text: string, the original text
    :param num: int, the number of summary sentences
    :return: string, the text summary
    """
    tr4s = TextRank4Sentence(delimiters='\n')
    tr4s.analyze(text=text, lower=True, source='all_filters')
    abstract = '\n'.join([item.sentence for item in tr4s.get_key_sentences(num=num)])
    return abstract
Example #24
def input(path='./test/doc/01.txt'):
    from textrank4zh import TextRank4Sentence

    text = codecs.open(path, 'r', 'utf-8').read()
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    # res = tr4s.get_key_sentences(num=3)
    # for item in res:
    #     print(item.index, item.weight, item.sentence)

    return tr4s
Example #25
def get_chinese_summary(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')

    result = []

    for item in tr4s.get_key_sentences(num=3):
        # print(item.index, item.weight, item.sentence)
        result.append(item.sentence)

    return result
Example #26
def input(path):
    import codecs


    text = codecs.open(path, 'r', 'gbk').read()
    # text = codecs.open(path, 'r', 'utf8').read()

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')

    return tr4s
Example #27
    def summarize(self, content: str, title: str = None, proportion=0.3):
        tr4s = TextRank4Sentence()
        if title is not None:
            text = "。".join([title, content])
        else:
            text = content
        tr4s.analyze(text=text, lower=True, source='all_filters')
        # num must be an integer: keep a fixed proportion of the sentences
        summarySentences = tr4s.get_key_sentences(
            num=int(len(tr4s.sentences) * proportion))
        summarySentences.sort(key=lambda item: item.index)
        return [sen.sentence + "。" for sen in summarySentences]
Example #28
def getAbs(text):
    tr4w = TextRank4Keyword()
    tr4w.analyze(
        text=text, lower=True, window=2
    )  # In py2, text must be a utf-8 encoded str or a unicode object; in py3, a utf-8 encoded bytes or a str object
    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text, lower=True, source='all_filters')
    absText = ''
    for item in tr4s.get_key_sentences(num=1):
        absText = item.sentence
    return absText
Example #29
def text_abstract(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text)
    # use a quarter of the sentences, capped at 5
    sentences_num = min(len(tr4s.sentences) // 4, 5)
    abstract_sentences = tr4s.get_key_sentences(num=sentences_num)
    abstract_sentences.sort(key=lambda x: x['index'])
    return abstract_sentences
Example #30
def keysentences_extraction(text):
    tr4s = TextRank4Sentence()
    tr4s.analyze(text, lower=True, source='all_filters')
    # text     -- the text to analyze, a string
    # lower    -- whether to lowercase English words; defaults to False
    # source   -- which word list (words_no_filter, words_no_stop_words,
    #             words_all_filters) is used to compute sentence similarity;
    #             defaults to 'all_filters', options: 'no_filter',
    #             'no_stop_words', 'all_filters'
    # sim_func -- the function used to compute similarity between sentences

    # Get the num most important sentences of length >= sentence_min_len for the summary
    keysentences = tr4s.get_key_sentences(num=3, sentence_min_len=6)
    return keysentences
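A usage sketch for this helper; the sample text is made up for illustration:

from textrank4zh import TextRank4Sentence

text = ('TextRank是一种基于图的排序算法。'
        '它最初用于关键词抽取,后来也被用于自动摘要。'
        '句子之间的相似度构成图的边权重。'
        '迭代计算收敛后,权重最高的句子即为摘要。')
for item in keysentences_extraction(text):
    print(item.index, item.weight, item.sentence)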