import numpy
from gensim import matutils

import tools  # project-local helper module (Japanese word segmentation)


def mean_word_vecs(model, positive=[], negative=[], skip_unknown=False):
    '''
    Compute a combined vector for the given words from a gensim Word2Vec
    model. This code is based on gensim.Word2Vec.most_similar.
    Returns None if none of the words is in the vocabulary.
    '''
    model.init_sims()
    # add weights for each word, if not already present; default to 1.0 for
    # positive and -1.0 for negative words
    positive = [(word, 1.0) for word in positive]
    negative = [(word, -1.0) for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, numpy.ndarray):
            mean.append(weight * word)
        elif word in model.vocab:
            mean.append(weight * model.syn0norm[model.vocab[word].index])
            #all_words.add(model.vocab[word].index)
        elif not skip_unknown:
            # unknown word: segment it and recurse over the pieces
            words = tools.word_segmenter_ja(word, np=False)
            words = [w for w in words if len(w.strip()) > 0]
            mean_ = mean_word_vecs(model, positive=words, skip_unknown=True)
            if mean_ is not None:
                mean.append(weight * mean_)
            #raise KeyError("word '%s' not in vocabulary" % word)
    if not mean:
        #raise ValueError("cannot compute similarity with no input")
        return None
    mean = matutils.unitvec(numpy.array(mean).mean(axis=0)).astype(numpy.float32)
    return mean
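# Usage sketch (not from the source): assumes a pre-4.0 gensim Word2Vec model
# saved at the hypothetical path 'word2vec.model'; the function above relies
# on the old model.vocab / model.syn0norm attributes.
from gensim.models import word2vec

model = word2vec.Word2Vec.load('word2vec.model')  # hypothetical path
# combined vector for "東京 + ニュース - スポーツ"; None if no word is known
vec = mean_word_vecs(model, positive=[u'東京', u'ニュース'], negative=[u'スポーツ'])
if vec is not None:
    # the result is already L2-normalized, so it can be passed to most_similar
    print(model.most_similar(positive=[vec], topn=5))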
import collections

import networkx
import numpy
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import pairwise_distances

import tools  # project-local helper module (Japanese word segmentation)


def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9):
    """
    compute centrality score of sentences.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and similarity is greater or
        equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank

    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )

    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    """
    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric="cosine")

    if continuous:
        linked_rows, linked_cols = numpy.where(sim_mat > 0)
    else:
        linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        weight = sim_mat[i, j] if continuous else 1.0
        graph.add_edge(i, j, {"weight": weight})

    scores = networkx.pagerank_scipy(graph, alpha=alpha, max_iter=1000)
    return scores, sim_mat
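# Usage sketch (not from the source): the sentences are illustrative and the
# project's Japanese segmenter (tools.word_segmenter_ja) must be importable.
sentences = [u'こんにちは.', u'私の名前は飯沼です.', u'コーヒーが好きです.']
scores, sim_mat = lexrank(sentences, continuous=True)

# rank sentence indices by centrality, highest first
ranked = sorted(scores, key=lambda i: scores[i], reverse=True)
for i in ranked:
    print(sentences[i].encode('utf-8'))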
import bz2
import os

import tools  # project-local helper module (sentence splitting / segmentation)
from data import iter_files, iter_docs  # project-local corpus helpers (assumed import path)


def test_iter_docs():
    data_root = os.path.dirname(os.path.abspath(__file__)) + '/../data/extracted'
    for fname in iter_files(data_root):
        f = bz2.BZ2File(fname)
        for doc_str in iter_docs(f):
            print '-' * 70
            #print doc_str
            doc_str = doc_str.decode('utf-8')
            sents = tools.sent_splitter_ja(doc_str)
            for sent in sents:
                words = tools.word_segmenter_ja(sent)
                print '^', u'|'.join(words).encode('utf-8')
def __iter__(self):
    for file_count, fname in enumerate(data.iter_files(self.data_root)):
        if self.test_ and file_count >= 100:
            break
        f = bz2.BZ2File(fname)
        for doc_str in data.iter_docs(f):
            doc_str = doc_str.decode('utf-8')
            sents = tools.sent_splitter_ja(doc_str, fix_parenthesis=True)
            for sent in sents:
                sent = sent.strip()
                if len(sent) == 0:
                    continue
                words = tools.word_segmenter_ja(sent, baseform=True)
                yield words
                # TODO: try segmenting each noun phrase separately vs.
                # keeping noun phrases merged.
                #words = tools.word_segmenter_ja(sent, baseform=True, np=False)
                #yield words
        f.close()
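# Context sketch (not from the source): an iterator like the __iter__ above
# (anything that yields one token list per sentence) can be passed directly
# to gensim's Word2Vec trainer, assuming a pre-4.0 gensim. A plain list of
# token lists stands in for the corpus class here; parameters and the output
# path are arbitrary.
from gensim.models import word2vec

toy_corpus = [
    [u'私', u'の', u'名前', u'は', u'飯沼', u'です'],
    [u'コーヒー', u'が', u'好き', u'です'],
]
model = word2vec.Word2Vec(toy_corpus, size=100, window=5, min_count=1, workers=2)
model.save('word2vec.model')  # hypothetical output path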
import collections

import networkx
import numpy
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import pairwise_distances

import tools  # project-local helper module (Japanese word segmentation)


def lexrank(sentences, sim_threshold=.1, alpha=0.9):
    '''
    Compute the importance (centrality) score of each sentence.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      sim_threshold: link two sentences with an edge if their similarity is
        greater than or equal to sim_threshold.
      alpha: the damping factor of PageRank

    Returns: tuple of
      - dict mapping sentence index -> score, e.g. {0: 0.003, 1: 0.002, ...}
      - the sentence similarity matrix
    '''
    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute pairwise sentence similarities and find the index pairs
    # (row, column) whose similarity is at or above the threshold
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')
    linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # build the similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        graph.add_edge(i, j)

    scores = networkx.pagerank_scipy(graph, alpha=alpha, max_iter=1000)
    return scores, sim_mat
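# Summarization sketch (not from the source): turn the centrality scores into
# an extractive summary by keeping the top-k sentences and re-emitting them in
# document order. k=2 and the text are arbitrary; tools.sent_splitter_ja is
# the project's sentence splitter.
text = u'こんにちは.私の名前は飯沼です.コーヒーが好きです.紅茶も好きです.'
sentences = list(tools.sent_splitter_ja(text))
scores, _ = lexrank(sentences)

top = sorted(sorted(scores, key=lambda i: scores[i], reverse=True)[:2])
summary = [sentences[i] for i in top]
print(u''.join(summary).encode('utf-8'))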
import collections

import pulp

import tools  # project-local helper module (sentence splitting / word segmentation)


def summarize(text, char_limit, sent_len_min=None):
    '''
    Summarize by solving a maximum coverage problem.

    Args:
      text: text to be summarized (unicode)
      char_limit: maximum number of characters in the summary
      sent_len_min: exclude sentences whose length is at most this value.

    Returns:
      list of summary sentences
      [
        u'こんにちは.',
        u'私は飯沼ではありません.',
        ...
      ]
    '''
    sents = list(tools.sent_splitter_ja(text))
    # pulp variable names must be utf-8 encoded
    words_list = [
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]

    # compute word weights: tf(w)
    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    # drop sentences that should not appear in the summary
    # (filter sents and words_list together so they stay aligned)
    if sent_len_min is not None:
        valid_indices = [i for i, s in enumerate(sents) if len(s) > sent_len_min]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    # c
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))

    # a: whether a word occurs in a sentence
    word_contain = dict()
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # declare the variables
    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, add the objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add the constraints
    # sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # sum(a*x) >= z, for every word
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices]
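# Usage sketch (not from the source): the text, character limit and minimum
# sentence length are illustrative.
text = (u'私の名前は飯沼です.コーヒーが好きです.'
        u'趣味は自然言語処理です.短い文.')
for sent in summarize(text, char_limit=40, sent_len_min=5):
    print(sent.encode('utf-8'))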
import collections

import pulp

import tools  # project-local helper module (sentence splitting / word segmentation)


def summarize(text, char_limit, sentence_filter=None, debug=False):
    '''
    select sentences in terms of maximum coverage problem

    Args:
      text: text to be summarized (unicode string)
      char_limit: summary length (the number of characters)

    Returns:
      list of extracted sentences

    Reference:
      Hiroya Takamura, Manabu Okumura.
      Text summarization model based on maximum coverage problem and its
      variant. (section 3)
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.222.6945
    '''
    debug_info = {}

    sents = list(tools.sent_splitter_ja(text))
    words_list = [
        # pulp variables should be utf-8 encoded
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]

    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    if sentence_filter is not None:
        valid_indices = [i for i, s in enumerate(sents) if sentence_filter(s)]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))  # c

    word_contain = dict()  # a
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, set objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add constraints
    # limit summary length: sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # for each term, sum(a*x) >= z
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices], debug_info
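# Usage sketch (not from the source): sentence_filter receives each sentence
# and returns whether it may appear in the summary; the lambda below mimics
# the older sent_len_min argument. Text and limits are illustrative.
text = u'私の名前は飯沼です.コーヒーが好きです.趣味は自然言語処理です.短い.'
summary, _ = summarize(
    text, char_limit=40,
    sentence_filter=lambda s: len(s) > 5  # drop very short sentences
)
print(u''.join(summary).encode('utf-8'))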