Example #1
File: lexrank.py Project: nakagami/summpy
def summarize(text, sent_limit=None, char_limit=None, imp_require=None):
    '''
    Args:
      text: text to be summarized (unicode string)
      sent_limit: maximum number of sentences
      char_limit: maximum number of characters
      imp_require: [0.0 - 1.0] select sentences until their cumulative
        importance exceeds imp_require of the total score

    Returns:
      list of extracted sentences
    '''
    sentences = tools.sent_splitter_ja(text)
    scores, sim_mat = lexrank(sentences)
    sum_scores = sum(scores.itervalues())
    acc_scores = 0.0
    indexes = set()
    num_sent, num_char = 0, 0
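    # greedily take sentences in descending order of LexRank score,
    # stopping as soon as one of the limits would be exceeded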
    for i in sorted(scores, key=lambda i: scores[i], reverse=True):
        num_sent += 1
        num_char += len(sentences[i])
        if sent_limit is not None and num_sent > sent_limit:
            break
        if char_limit is not None and num_char > char_limit:
            break
        if imp_require is not None and acc_scores / sum_scores >= imp_require:
            break
        indexes.add(i)
        acc_scores += scores[i]

    if len(indexes) > 0:
        summary_sents = [sentences[i] for i in sorted(indexes)]
    else:
        summary_sents = sentences

    return summary_sents
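A minimal usage sketch for the function above. The import path (summpy.lexrank) and the placeholder text are assumptions, not part of the original example:

# -*- coding: utf-8 -*-
# hypothetical usage of the LexRank-based summarizer (Python 2, as above)
from summpy import lexrank

text = u'...'  # Japanese unicode text to be summarized (placeholder)
# select at most 3 sentences
for sent in lexrank.summarize(text, sent_limit=3):
    print sent.encode('utf-8')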
Example #2
def test_iter_docs():
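    # walk every bz2 file under data/extracted, split each document into
    # sentences, and print the word-segmented result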
    data_root = os.path.dirname(os.path.abspath(__file__)) + '/../data/extracted'
    for fname in iter_files(data_root):
        f = bz2.BZ2File(fname)
        for doc_str in iter_docs(f):
            print '-' * 70
            #print doc_str
            doc_str = doc_str.decode('utf-8')
            sents = tools.sent_splitter_ja(doc_str)
            for sent in sents:
                words = tools.word_segmenter_ja(sent)
                print '^', u'|'.join(words).encode('utf-8')
Example #3
 def __iter__(self):
     for file_count, fname in enumerate(data.iter_files(self.data_root)):
         if self.test_ and file_count >= 100:
             break
         f = bz2.BZ2File(fname)
         for doc_str in data.iter_docs(f):
             doc_str = doc_str.decode('utf-8')
             sents = tools.sent_splitter_ja(doc_str, fix_parenthesis=True)
             for sent in sents:
                 sent = sent.strip()
                 if len(sent) == 0:
                     continue
                 words = tools.word_segmenter_ja(sent, baseform=True)
                 yield words
                 # TODO: try segmenting each noun phrase separately vs. keeping noun phrases together.
                 #words = tools.word_segmenter_ja(sent, baseform=True, np=False)
                 #yield words
         f.close()
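For context, __iter__ above is a method of a streaming corpus class that yields one list of base-form words per sentence. A sketch of how such a class might be assembled and fed to gensim's Word2Vec, which accepts any restartable iterable of token lists; the class name and constructor are assumptions, not shown in the original:

# hypothetical wrapper; the enclosing class definition is not shown above
class SentenceCorpus(object):
    def __init__(self, data_root, test_=False):
        self.data_root = data_root
        self.test_ = test_

    # __iter__ exactly as defined above

from gensim.models import Word2Vec
# Word2Vec iterates the corpus more than once (vocab build, then training),
# so __iter__ must be restartable -- a plain generator object would not be
model = Word2Vec(SentenceCorpus('data/extracted'))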
Example #4
File: lexrank.py Project: nus/summpy
def summarize(text, sent_limit=None, char_limit=None, imp_require=None, debug=False, **lexrank_params):
    """
    Args:
      text: text to be summarized (unicode string)
      sent_limit: summary length (the number of sentences)
      char_limit: summary length (the number of characters)
      imp_require: cumulative LexRank score [0.0-1.0]

    Returns:
      list of extracted sentences
    """
    debug_info = {}
    sentences = list(tools.sent_splitter_ja(text))
    scores, sim_mat = lexrank(sentences, **lexrank_params)
    sum_scores = sum(scores.itervalues())
    acc_scores = 0.0
    indexes = set()
    num_sent, num_char = 0, 0
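    # greedily take sentences in descending order of LexRank score,
    # stopping as soon as one of the limits would be exceeded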
    for i in sorted(scores, key=lambda i: scores[i], reverse=True):
        num_sent += 1
        num_char += len(sentences[i])
        if sent_limit is not None and num_sent > sent_limit:
            break
        if char_limit is not None and num_char > char_limit:
            break
        if imp_require is not None and acc_scores / sum_scores >= imp_require:
            break
        indexes.add(i)
        acc_scores += scores[i]

    if len(indexes) > 0:
        summary_sents = [sentences[i] for i in sorted(indexes)]
    else:
        summary_sents = sentences

    if debug:
        debug_info.update({"sentences": sentences, "scores": scores})

    return summary_sents, debug_info
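This variant returns a (summary_sents, debug_info) pair, so callers unpack both; debug_info carries the split sentences and their scores when debug=True. A usage sketch, with the import path and placeholder text assumed:

# hypothetical usage of the debug-enabled variant
from summpy import lexrank

text = u'...'  # placeholder Japanese text
summary, debug_info = lexrank.summarize(text, sent_limit=5, debug=True)
# debug_info['scores'] maps sentence index to LexRank score
for i, score in sorted(debug_info['scores'].items()):
    print i, score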
Example #5
File: mcp_summ.py Project: nakagami/summpy
def summarize(text, char_limit, sent_len_min=None):
    '''
    Summarize by solving a maximum coverage problem.

    Args:
      text: text to be summarized (unicode)
      char_limit: maximum number of characters
      sent_len_min: sentences with length less than or equal to this are
        excluded from the summary.

    Returns: list of summary sentences
      [
        u'こんにちは.',
        u'私は飯沼ではありません.',
        ...
      ]
    '''

    sents = tools.sent_splitter_ja(text)

    # pulp variable names must be utf-8 encoded byte strings
    words_list = [
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]
    # compute word weights: tf(w)
    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    # exclude sentences that should not appear in the summary
    # (filter words_list in step so sentence ids stay aligned with sents)
    if sent_len_min is not None:
        valid_indices = [i for i, s in enumerate(sents) if len(s) > sent_len_min]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    # c
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))

    # a: whether each word appears in each sentence
    word_contain = dict()
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # set up the decision variables
    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, add the objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add the constraints
    # summary length limit: sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # z <= sum(a*x), for every word
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices]
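A usage sketch for the maximum-coverage summarizer; the import path (summpy.mcp_summ) and placeholder text are assumptions:

# hypothetical usage; summarize to at most 100 characters,
# skipping sentences of 5 characters or fewer
from summpy import mcp_summ

text = u'...'  # placeholder Japanese text
summary = mcp_summ.summarize(text, char_limit=100, sent_len_min=5)
print u''.join(summary).encode('utf-8')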
Example #6
File: mcp_summ.py Project: nus/summpy
def summarize(text, char_limit, sentence_filter=None, debug=False):
    '''
    select sentences in terms of maximum coverage problem

    Args:
      text: text to be summarized (unicode string)
      char_limit: summary length (the number of characters)
      sentence_filter: optional predicate; sentences for which it returns
        False are excluded from the summary

    Returns:
      tuple of (list of extracted sentences, debug_info dict)
    Reference:
      Hiroya Takamura, Manabu Okumura.
      Text summarization model based on maximum coverage problem and its
      variant. (section 3)
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.222.6945
    '''
    debug_info = {}

    sents = list(tools.sent_splitter_ja(text))
    words_list = [
        # pulp variable names should be utf-8 encoded byte strings
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]

    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    if sentence_filter is not None:
        valid_indices = [i for i, s in enumerate(sents) if sentence_filter(s)]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))  # c

    word_contain = dict()  # a
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, set objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add constraints
    # limit summary length: sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # for each term, z <= sum(a*x)
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices], debug_info
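The nus/summpy variant replaces sent_len_min with an arbitrary sentence_filter predicate and returns a (sentences, debug_info) pair. A usage sketch; the filter threshold and placeholder text are illustrative assumptions:

# hypothetical usage; drop sentences of 5 characters or fewer
text = u'...'  # placeholder Japanese text
summary, _ = summarize(text, char_limit=100,
                       sentence_filter=lambda s: len(s) > 5)
print u''.join(summary).encode('utf-8')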