def summarize(text, sent_limit=None, char_limit=None, imp_require=None):
    '''
    Args:
      text: text to be summarized (unicode string)
      sent_limit: maximum number of sentences in the summary
      char_limit: maximum number of characters in the summary
      imp_require: [0.0 - 1.0] select sentences until their cumulative
        importance exceeds this fraction of the total score

    Returns:
      list of extracted sentences
    '''
    sentences = tools.sent_splitter_ja(text)
    scores, sim_mat = lexrank(sentences)
    sum_scores = sum(scores.itervalues())
    acc_scores = 0.0
    indexes = set()
    num_sent, num_char = 0, 0
    # greedily pick sentences in descending order of LexRank score
    for i in sorted(scores, key=lambda i: scores[i], reverse=True):
        num_sent += 1
        num_char += len(sentences[i])
        if sent_limit is not None and num_sent > sent_limit:
            break
        if char_limit is not None and num_char > char_limit:
            break
        if imp_require is not None and acc_scores / sum_scores >= imp_require:
            break
        indexes.add(i)
        acc_scores += scores[i]

    if len(indexes) > 0:
        summary_sents = [sentences[i] for i in sorted(indexes)]
    else:
        summary_sents = sentences

    return summary_sents
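# A minimal usage sketch for the function above (illustrative only: the demo
# function name is invented, and `text` is a placeholder unicode string;
# `tools` and `lexrank` are the helpers this module already imports).
def _demo_lexrank_summarize(text):
    # print a three-sentence LexRank summary of `text`
    for sent in summarize(text, sent_limit=3):
        print sent.encode('utf-8')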
def test_iter_docs():
    data_root = os.path.dirname(os.path.abspath(__file__)) + '/../data/extracted'
    for fname in iter_files(data_root):
        f = bz2.BZ2File(fname)
        for doc_str in iter_docs(f):
            print '-' * 70
            #print doc_str
            doc_str = doc_str.decode('utf-8')
            sents = tools.sent_splitter_ja(doc_str)
            for sent in sents:
                words = tools.word_segmenter_ja(sent)
                print '^', u'|'.join(words).encode('utf-8')
def __iter__(self):
    for file_count, fname in enumerate(data.iter_files(self.data_root)):
        if self.test_ and file_count >= 100:
            break
        f = bz2.BZ2File(fname)
        for doc_str in data.iter_docs(f):
            doc_str = doc_str.decode('utf-8')
            sents = tools.sent_splitter_ja(doc_str, fix_parenthesis=True)
            for sent in sents:
                sent = sent.strip()
                if len(sent) == 0:
                    continue
                words = tools.word_segmenter_ja(sent, baseform=True)
                yield words
                # TODO: compare splitting into individual noun phrases
                # vs. keeping them merged.
                #words = tools.word_segmenter_ja(sent, baseform=True, np=False)
                #yield words
        f.close()
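# This __iter__ yields one token list per sentence, i.e. the restartable
# corpus shape gensim's Word2Vec (pre-4.0 API) expects. A hedged sketch,
# assuming the enclosing class is named `SentenceCorpus` and takes the data
# root in its constructor (both hypothetical; adapt to the real class):
def _demo_train_word2vec(data_root):
    from gensim.models import Word2Vec
    corpus = SentenceCorpus(data_root)  # class defining __iter__ above
    # size/min_count values are illustrative, not tuned
    return Word2Vec(corpus, size=100, min_count=5)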
def summarize(text, sent_limit=None, char_limit=None, imp_require=None,
              debug=False, **lexrank_params):
    """
    Args:
      text: text to be summarized (unicode string)
      sent_limit: summary length (the number of sentences)
      char_limit: summary length (the number of characters)
      imp_require: cumulative LexRank score [0.0-1.0]

    Returns:
      tuple of (list of extracted sentences, debug info dict)
    """
    debug_info = {}
    sentences = list(tools.sent_splitter_ja(text))
    scores, sim_mat = lexrank(sentences, **lexrank_params)
    sum_scores = sum(scores.itervalues())
    acc_scores = 0.0
    indexes = set()
    num_sent, num_char = 0, 0
    for i in sorted(scores, key=lambda i: scores[i], reverse=True):
        num_sent += 1
        num_char += len(sentences[i])
        if sent_limit is not None and num_sent > sent_limit:
            break
        if char_limit is not None and num_char > char_limit:
            break
        if imp_require is not None and acc_scores / sum_scores >= imp_require:
            break
        indexes.add(i)
        acc_scores += scores[i]

    if len(indexes) > 0:
        summary_sents = [sentences[i] for i in sorted(indexes)]
    else:
        summary_sents = sentences

    if debug:
        debug_info.update({"sentences": sentences, "scores": scores})

    return summary_sents, debug_info
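# Hedged usage sketch for the debug path above (the demo name is invented,
# `text` is a placeholder): the 'sentences'/'scores' keys are exactly what
# debug_info is populated with, and `scores` is keyed by sentence index.
def _demo_inspect_scores(text):
    summary, debug_info = summarize(text, char_limit=200, debug=True)
    for i, sent in enumerate(debug_info['sentences']):
        print i, debug_info['scores'][i], sent.encode('utf-8')
    return summary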
def summarize(text, char_limit, sent_len_min=None):
    '''
    Summarize by solving a maximum coverage problem.

    Args:
      text: text to be summarized (unicode string)
      char_limit: maximum number of characters in the summary
      sent_len_min: exclude sentences whose length is <= this value

    Returns:
      list of summary sentences, e.g.
      [ u'こんにちは.', u'私は飯沼ではありません.', ... ]
    '''
    sents = tools.sent_splitter_ja(text)
    # pulp variable names must be utf-8 encoded
    words_list = [
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]

    # compute term weights: tf(w)
    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    # exclude sentences that should not appear in the summary,
    # keeping words_list aligned with sents
    if sent_len_min is not None:
        valid = [i for i, s in enumerate(sents) if len(s) > sent_len_min]
        sents = [sents[i] for i in valid]
        words_list = [words_list[i] for i in valid]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    # c
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))

    # a: whether each word appears in each sentence
    word_contain = dict()
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # set up the variables
    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, add the objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add the constraints
    # sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # sum(a*x) >= z, for every word
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices]
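# The ILP assembled above is the maximum coverage formulation of
# Takamura & Okumura. In the notation of the comments (w: term weight,
# c: sentence length, a: term-in-sentence indicator, K: char_limit,
# x/z: binary selection variables for sentences/terms):
#
#   maximize    sum_j  w_j * z_j
#   subject to  sum_i  c_i  * x_i  <= K
#               sum_i  a_ij * x_i  >= z_j   for every term j
#               x_i, z_j in {0, 1}
#
# so a term contributes to the objective only if at least one selected
# sentence contains it.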
def summarize(text, char_limit, sentence_filter=None, debug=False):
    '''
    select sentences in terms of maximum coverage problem

    Args:
      text: text to be summarized (unicode string)
      char_limit: summary length (the number of characters)

    Returns:
      tuple of (list of extracted sentences, debug info dict)

    Reference:
      Hiroya Takamura, Manabu Okumura.
      Text summarization model based on maximum coverage problem and its
      variant. (section 3)
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.222.6945
    '''
    debug_info = {}

    sents = list(tools.sent_splitter_ja(text))
    # pulp variables should be utf-8 encoded
    words_list = [
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]

    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    if sentence_filter is not None:
        valid_indices = [i for i, s in enumerate(sents) if sentence_filter(s)]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))  # c

    word_contain = dict()  # a
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, set objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add constraints
    # limit summary length: sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # for each term, sum(a*x) >= z
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices], debug_info
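# Usage sketch (the demo name and limits are illustrative): sentence_filter
# generalizes the earlier sent_len_min parameter; the lambda below reproduces
# it by dropping sentences of 10 characters or fewer.
def _demo_mcp_summarize(text):
    sents, _ = summarize(text, char_limit=150,
                         sentence_filter=lambda s: len(s) > 10)
    for s in sents:
        print s.encode('utf-8')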