Example #1
def mean_word_vecs(model, positive=[], negative=[], skip_unknown=False):
    '''
    Compute a combined vector by adding up word vectors from a gensim Word2Vec model.
    This code is based on gensim.Word2Vec.most_similar.
    Return None if none of the words are in the vocabulary.
    '''
    model.init_sims()

    # add weights for each word, if not already present; default to 1.0 for
    # positive and -1.0 for negative words
    positive = [(word, 1.0) for word in positive]
    negative = [(word, -1.0) for word in negative]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, numpy.ndarray):
            mean.append(weight * word)
        elif word in model.vocab:
            mean.append(weight * model.syn0norm[model.vocab[word].index])
            #all_words.add(model.vocab[word].index)
        elif not skip_unknown:
            words = tools.word_segmenter_ja(word, np=False)
            words = [w for w in words if len(w.strip()) > 0]
            mean_ = mean_word_vecs(model, positive=words, skip_unknown=True)
            if mean_ is not None:
                mean.append(weight * mean_)
            #raise KeyError("word '%s' not in vocabulary" % word)

    if not mean:
        #raise ValueError("cannot compute similarity with no input")
        return None

    mean = matutils.unitvec(numpy.array(mean).mean(axis=0)).astype(numpy.float32)
    return mean
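A minimal usage sketch (not part of the snippet above): it assumes a pre-trained Japanese model loaded through the old gensim API that exposes model.vocab and model.syn0norm, as used in the function, and the model path shown is only a placeholder. Because mean_word_vecs returns unit-normalized vectors, the dot product of two results is their cosine similarity.

# usage sketch -- old-API gensim model assumed; the path is a placeholder
import numpy
from gensim.models import Word2Vec

model = Word2Vec.load('path/to/ja_word2vec.model')

v1 = mean_word_vecs(model, positive=[u'東京', u'観光'])
v2 = mean_word_vecs(model, positive=[u'京都', u'旅行'])

if v1 is not None and v2 is not None:
    # both vectors are unit length, so the dot product is the cosine similarity
    print(numpy.dot(v1, v2))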
Example #2
File: lexrank.py  Project: nus/summpy
def lexrank(sentences, continuous=False, sim_threshold=0.1, alpha=0.9):
    """
    compute centrality score of sentences.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      continuous: if True, apply continuous LexRank. (see reference)
      sim_threshold: if continuous is False and the similarity is greater
        than or equal to sim_threshold, link the sentences.
      alpha: the damping factor of PageRank

    Returns: tuple
      (
        {
          # sentence index -> score
          0: 0.003,
          1: 0.002,
          ...
        },
        similarity_matrix
      )
    
    Reference:
      Günes Erkan and Dragomir R. Radev.
      LexRank: graph-based lexical centrality as salience in text
      summarization. (section 3)
      http://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html
    """
    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute similarities between sentences
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric="cosine")

    if continuous:
        linked_rows, linked_cols = numpy.where(sim_mat > 0)
    else:
        linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # create similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        weight = sim_mat[i, j] if continuous else 1.0
        graph.add_edge(i, j, weight=weight)

    scores = networkx.pagerank_scipy(graph, alpha=alpha, max_iter=1000)
    return scores, sim_mat
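A minimal usage sketch (assumed, not from the original file); it requires summpy's tools module (the MeCab-based word_segmenter_ja) that the function above calls internally.

# usage sketch -- summpy's Japanese tokenizer is assumed to be importable
sentences = [u'こんにちは.', u'私の名前は飯沼です.', u'今日は天気が良いですね.']
scores, sim_mat = lexrank(sentences, continuous=True)

# print sentences ranked by centrality, highest first
for i in sorted(scores, key=scores.get, reverse=True):
    print(u'{:.4f}\t{}'.format(scores[i], sentences[i]))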
Example #3
def test_iter_docs():
    data_root = os.path.dirname(os.path.abspath(__file__)) + '/../data/extracted'
    for fname in iter_files(data_root):
        f = bz2.BZ2File(fname)
        for doc_str in iter_docs(f):
            print '-' * 70
            #print doc_str
            doc_str = doc_str.decode('utf-8')
            sents = tools.sent_splitter_ja(doc_str)
            for sent in sents:
                words = tools.word_segmenter_ja(sent)
                print '^', u'|'.join(words).encode('utf-8')
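The test above only makes sense given what iter_files and iter_docs are expected to yield; the sketch below is an assumption about their behavior (walking the extracted-Wikipedia directory and splitting WikiExtractor-style <doc> ... </doc> blocks), not the actual summpy implementation.

# assumed behavior of iter_files / iter_docs (the real implementation may differ)
import os

def iter_files(root):
    # yield the path of every file under root
    for dirpath, _, fnames in os.walk(root):
        for fname in sorted(fnames):
            yield os.path.join(dirpath, fname)

def iter_docs(fileobj):
    # WikiExtractor wraps each article in <doc ...> ... </doc>;
    # yield each article body as a raw, undecoded string
    buf, in_doc = [], False
    for line in fileobj:
        if line.startswith('<doc'):
            buf, in_doc = [], True
        elif line.startswith('</doc>'):
            in_doc = False
            yield ''.join(buf)
        elif in_doc:
            buf.append(line)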
Example #4
 def __iter__(self):
     for file_count, fname in enumerate(data.iter_files(self.data_root)):
         if self.test_ and file_count >= 100:
             break
         f = bz2.BZ2File(fname)
         for doc_str in data.iter_docs(f):
             doc_str = doc_str.decode('utf-8')
             sents = tools.sent_splitter_ja(doc_str, fix_parenthesis=True)
             for sent in sents:
                 sent = sent.strip()
                 if len(sent) == 0:
                     continue
                 words = tools.word_segmenter_ja(sent, baseform=True)
                 yield words
                 # TODO: try segmenting each noun phrase separately vs. keeping noun phrases as single tokens.
                 #words = tools.word_segmenter_ja(sent, baseform=True, np=False)
                 #yield words
         f.close()
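This __iter__ evidently streams tokenized sentences for word-vector training. Below is a minimal, self-contained sketch of feeding such an iterable to gensim's Word2Vec; the old-API parameter name size matches the syn0norm-era API used in Example #1, and the toy sentences, parameter values, and file name are illustrative assumptions only.

# minimal training sketch -- gensim's (old-API) Word2Vec consumes exactly what
# __iter__ yields: an iterable of token lists
from gensim.models import Word2Vec

toy_sentences = [
    [u'私', u'の', u'名前', u'は', u'飯沼', u'です'],
    [u'今日', u'は', u'天気', u'が', u'良い'],
]
model = Word2Vec(toy_sentences, size=50, window=5, min_count=1, workers=1)
model.save('toy_word2vec.model')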
Example #5
def lexrank(sentences, sim_threshold=.1, alpha=0.9):
    '''
    Compute the importance (centrality) score of each sentence.

    Args:
      sentences: [u'こんにちは.', u'私の名前は飯沼です.', ... ]
      sim_threshold: link two sentences with an edge if their similarity is
        greater than or equal to sim_threshold.
      alpha: the damping factor of PageRank

    Returns: tuple of
      (
        {0: 0.003, 1: 0.002, ...},  # sentence index -> score
        similarity_matrix
      )
    '''
    graph = networkx.DiGraph()

    # sentence -> tf
    sent_tf_list = []
    for sent in sentences:
        words = tools.word_segmenter_ja(sent)
        tf = collections.Counter(words)
        sent_tf_list.append(tf)

    sent_vectorizer = DictVectorizer(sparse=True)
    sent_vecs = sent_vectorizer.fit_transform(sent_tf_list)

    # compute pairwise sentence similarities and find the index pairs (row, col) at or above the threshold
    sim_mat = 1 - pairwise_distances(sent_vecs, sent_vecs, metric='cosine')
    linked_rows, linked_cols = numpy.where(sim_mat >= sim_threshold)

    # build the similarity graph
    graph.add_nodes_from(range(sent_vecs.shape[0]))
    for i, j in zip(linked_rows, linked_cols):
        if i == j:
            continue
        graph.add_edge(i, j)

    scores = networkx.pagerank_scipy(graph, alpha=alpha, max_iter=1000)
    return scores, sim_mat
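Since the function returns both the index-to-score dict and the similarity matrix, a caller typically picks the top-scoring sentences and emits them in document order. A minimal sketch of that assumed usage:

# assumed usage: pick the top-k central sentences, then restore document order
sentences = [u'こんにちは.', u'私の名前は飯沼です.', u'今日は天気が良いですね.']
scores, sim_mat = lexrank(sentences)

k = 2
top = sorted(sorted(scores, key=scores.get, reverse=True)[:k])
for i in top:
    print(sentences[i])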
Example #6
def summarize(text, char_limit, sent_len_min=None):
    '''
    Summarize the text by solving a maximum coverage problem.

    Args:
      text: text to be summarized (unicode)
      char_limit: maximum summary length in characters
      sent_len_min: sentences whose length is less than or equal to this are
        excluded from the summary.

    Returns: list of summary sentences
      [
        u'こんにちは.',
        u'私は飯沼ではありません.',
        ...
      ]
    '''

    sents = tools.sent_splitter_ja(text)

    # pulp variable names must be utf-8 encoded
    words_list = [
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]
    # compute word weights tf(w)
    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    # drop sentences that should not appear in the summary;
    # words_list has to stay aligned with sents
    if sent_len_min is not None:
        valid_indices = [i for i, s in enumerate(sents) if len(s) > sent_len_min]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    # c
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))

    # a: whether each word appears in each sentence
    word_contain = dict()
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # define the decision variables
    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, add the objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add the constraints
    # sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # sum(a*x) >= z, for every word
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices]
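A minimal usage sketch (assumed); it needs the MeCab-based sentence splitter and tokenizer from summpy's tools module that the function calls internally.

# assumed usage: extract a summary of at most 100 characters
text = (u'こんにちは. 私の名前は飯沼です. 今日は天気が良いですね. '
        u'明日は雨が降るかもしれません.')
for sent in summarize(text, char_limit=100, sent_len_min=5):
    print(sent)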
Example #7
def summarize(text, char_limit, sent_len_min=None):
    '''
    Summarize the text by solving a maximum coverage problem.

    Args:
      text: text to be summarized (unicode)
      char_limit: maximum summary length in characters
      sent_len_min: sentences whose length is less than or equal to this are
        excluded from the summary.

    Returns: list of summary sentences
      [
        u'こんにちは.',
        u'私は飯沼ではありません.',
        ...
      ]
    '''

    sents = tools.sent_splitter_ja(text)

    # pulp variable names must be utf-8 encoded
    words_list = [
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]
    # compute word weights tf(w)
    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    # drop sentences that should not appear in the summary;
    # words_list has to stay aligned with sents
    if sent_len_min is not None:
        valid_indices = [i for i, s in enumerate(sents) if len(s) > sent_len_min]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    # c
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))

    # a: whether each word appears in each sentence
    word_contain = dict()
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # define the decision variables
    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, add the objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add the constraints
    # sum(c*x) <= K
    prob += pulp.lpSum([sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids
                        ]) <= char_limit, 'lengthRequirement'
    # sum(a*x) >= z, for every word
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_]
             for id_ in sent_ids]) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices]
Example #8
File: mcp_summ.py  Project: nus/summpy
def summarize(text, char_limit, sentence_filter=None, debug=False):
    '''
    select sentences in terms of maximum coverage problem

    Args:
      text: text to be summarized (unicode string)
      char_limit: summary length (the number of characters)
      sentence_filter: optional predicate; sentences for which it returns
        False are dropped before optimization
      debug: unused in this snippet (debug_info is returned empty)

    Returns:
      tuple of (list of extracted sentences, debug_info dict)

    Reference:
      Hiroya Takamura, Manabu Okumura.
      Text summarization model based on maximum coverage problem and its
      variant. (section 3)
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.222.6945
    '''
    debug_info = {}

    sents = list(tools.sent_splitter_ja(text))
    words_list = [
        # pulp variable names should be utf-8 encoded
        [w.encode('utf-8') for w in tools.word_segmenter_ja(s)]
        for s in sents
    ]

    tf = collections.Counter()
    for words in words_list:
        for w in words:
            tf[w] += 1.0

    if sentence_filter is not None:
        valid_indices = [i for i, s in enumerate(sents) if sentence_filter(s)]
        sents = [sents[i] for i in valid_indices]
        words_list = [words_list[i] for i in valid_indices]

    sent_ids = [str(i) for i in range(len(sents))]  # sentence id
    sent_id2len = dict((id_, len(s)) for id_, s in zip(sent_ids, sents))  # c

    word_contain = dict()  # a
    for id_, words in zip(sent_ids, words_list):
        word_contain[id_] = collections.defaultdict(lambda: 0)
        for w in words:
            word_contain[id_][w] = 1

    prob = pulp.LpProblem('summarize', pulp.LpMaximize)

    # x
    sent_vars = pulp.LpVariable.dicts('sents', sent_ids, 0, 1, pulp.LpBinary)
    # z
    word_vars = pulp.LpVariable.dicts('words', tf.keys(), 0, 1, pulp.LpBinary)

    # first, set objective function: sum(w*z)
    prob += pulp.lpSum([tf[w] * word_vars[w] for w in tf])

    # next, add constraints
    # limit summary length: sum(c*x) <= K
    prob += pulp.lpSum(
        [sent_id2len[id_] * sent_vars[id_] for id_ in sent_ids]
    ) <= char_limit, 'lengthRequirement'
    # for each term, sum(a*x) >= z
    for w in tf:
        prob += pulp.lpSum(
            [word_contain[id_][w] * sent_vars[id_] for id_ in sent_ids]
        ) >= word_vars[w], 'z:{}'.format(w)

    prob.solve()
    # print("Status:", pulp.LpStatus[prob.status])

    sent_indices = []
    for v in prob.variables():
        # print v.name, "=", v.varValue
        if v.name.startswith('sents') and v.varValue == 1:
            sent_indices.append(int(v.name.split('_')[-1]))

    return [sents[i] for i in sent_indices], debug_info
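For reference, the integer program that the pulp code above constructs can be written compactly as follows, where w_j is the tf weight of term j, c_i the length of sentence i, a_ij = 1 if sentence i contains term j, K = char_limit, and x_i, z_j are the binary selection variables; this simply restates the code, following the maximum-coverage formulation cited in the docstring.

\max \sum_j w_j z_j
\quad \text{s.t.} \quad
\sum_i c_i x_i \le K, \qquad
\sum_i a_{ij} x_i \ge z_j \;\; \forall j, \qquad
x_i \in \{0,1\}, \; z_j \in \{0,1\}.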