Exemplo n.º 1
0
    def test_idf_metrics(self):
        """Check ``_compute_idf`` against hand-computed values.

        The corpus has six sentences (one of them empty).  Every expected
        value is the sentence count divided by the number of sentences the
        word occurs in; repeated occurrences inside one sentence count once.
        """
        corpus = [
            tuple(text.split())
            for text in (
                "this sentence is simple sentence",
                "this is simple sentence yes is too too too",
                "not every sentence makes me happy",
                "yes",
                "",  # splits to nothing -> the empty sentence ()
                "every day is happy day",
            )
        ]
        metrics = LexRankSummarizer()._compute_idf(corpus)

        # Words grouped by the number of sentences that contain them.
        expected = {}
        expected.update(dict.fromkeys(("too", "not", "makes", "me", "day"), 6/1))
        expected.update(dict.fromkeys(("this", "yes", "simple", "every", "happy"), 6/2))
        expected.update(dict.fromkeys(("is", "sentence"), 6/3))

        self.assertEqual(expected, metrics)
Exemplo n.º 2
0
def test_idf_metrics():
    """Check ``_compute_idf`` against hand-computed smoothed log values.

    The corpus has six sentences (one of them empty).  Each expected value is
    ``log(6 / (1 + df))`` where ``df`` is the number of sentences containing
    the word; repeats inside a single sentence count once.
    """
    corpus = [
        tuple(text.split())
        for text in (
            "this sentence is simple sentence",
            "this is simple sentence yes is too too too",
            "not every sentence makes me happy",
            "yes",
            "",  # splits to nothing -> the empty sentence ()
            "every day is happy day",
        )
    ]
    metrics = LexRankSummarizer()._compute_idf(corpus)

    # Number of sentences in which each word appears.
    document_frequency = {
        "this": 2,
        "is": 3,
        "yes": 2,
        "simple": 2,
        "sentence": 3,
        "too": 1,
        "not": 1,
        "every": 2,
        "makes": 1,
        "me": 1,
        "happy": 2,
        "day": 1,
    }
    expected = {
        word: math.log(6 / (1 + df))
        for word, df in document_frequency.items()
    }
    assert expected == metrics
Exemplo n.º 3
0
def test_idf_metrics():
    """Verify the IDF table ``_compute_idf`` builds for a tiny corpus.

    Six sentences are supplied, including an empty one.  The expected score
    of a word is ``log(6 / (1 + n))`` with ``n`` the number of sentences the
    word appears in.
    """
    # Pre-computed scores for words found in one, two or three sentences.
    idf_in_one = math.log(6/2)
    idf_in_two = math.log(6/3)
    idf_in_three = math.log(6/4)

    corpus = [
        ("this", "sentence", "is", "simple", "sentence"),
        ("this", "is", "simple", "sentence", "yes", "is", "too", "too", "too"),
        ("not", "every", "sentence", "makes", "me", "happy"),
        ("yes",),
        (),
        ("every", "day", "is", "happy", "day"),
    ]
    lexrank = LexRankSummarizer()
    actual = lexrank._compute_idf(corpus)

    wanted = dict(
        this=idf_in_two,
        yes=idf_in_two,
        simple=idf_in_two,
        sentence=idf_in_three,
        too=idf_in_one,
        every=idf_in_two,
        makes=idf_in_one,
        me=idf_in_one,
        happy=idf_in_two,
        day=idf_in_one,
    )
    # "is" and "not" are Python keywords, so they cannot be keyword arguments.
    wanted["is"] = idf_in_three
    wanted["not"] = idf_in_one

    assert wanted == actual
Exemplo n.º 4
0
    def test_idf_metrics(self):
        """Verify the IDF table ``_compute_idf`` builds for a tiny corpus.

        Six sentences are supplied, including an empty one.  The expected
        score of a word is 6 divided by the number of sentences in which the
        word appears.
        """
        raw_sentences = (
            "this sentence is simple sentence",
            "this is simple sentence yes is too too too",
            "not every sentence makes me happy",
            "yes",
            "",  # splits to nothing -> the empty sentence ()
            "every day is happy day",
        )
        sentences = [tuple(raw.split()) for raw in raw_sentences]

        summarizer = LexRankSummarizer()
        metrics = summarizer._compute_idf(sentences)

        # Number of sentences in which each word appears.
        sentence_counts = {
            "this": 2,
            "is": 3,
            "yes": 2,
            "simple": 2,
            "sentence": 3,
            "too": 1,
            "not": 1,
            "every": 2,
            "makes": 1,
            "me": 1,
            "happy": 2,
            "day": 1,
        }
        expected = {word: 6 / count for word, count in sentence_counts.items()}
        self.assertEqual(expected, metrics)
Exemplo n.º 5
0
class LexRankSumy(Baseline):
    """LexRank sentence-ranking baseline backed by the ``sumy`` library.

    Description from https://arxiv.org/pdf/1905.13164.pdf:
    LexRank (Erkan and Radev, 2004) is a widely-used graph-based extractive
    summarizer; we build a graph with paragraphs as nodes and edges weighted
    by tf-idf cosine similarity; we run a PageRank-like algorithm on this
    graph to rank and select paragraphs until the length of the ground-truth
    summary is reached.

    Implementation: wrapper of https://github.com/miso-belica/sumy
    """

    def __init__(self, name, language):
        """Create the baseline.

        Args:
            name: Forwarded to ``Baseline.__init__``.
            language: Language handed to sumy's ``Tokenizer`` when a document
                is parsed.
        """
        super().__init__(name)
        self.language = language
        self.summarizer = LexRankSummarizer()

    def rank_sentences(self, dataset, document_column_name, **kwargs):
        """Score the sentences of every document in one dataset column.

        Args:
            dataset: Object indexable by ``document_column_name``; the
                selected column is iterated document by document.
            document_column_name: Name of the column holding the raw text.
            **kwargs: Accepted but not used by this method.

        Returns:
            Whatever ``Baseline.append_column`` returns when given the
            dataset, one ``{"sentences": ..., "scores": ...}`` dict per
            document, and ``self.name``.
        """
        data = []
        for document in tqdm(dataset[document_column_name]):
            sentences, scores = self.run_single(document)
            data.append({"sentences": sentences, "scores": scores})
        return Baseline.append_column(dataset, data, self.name)

    def run_single(self, document):
        """Split one document into sentences and compute a score for each.

        Args:
            document: Text passed to ``PlaintextParser.from_string``.

        Returns:
            A pair ``(sentences, scores)``: the sentences converted to ``str``
            and the matching entries of the ``power_method`` result as a
            list.  Both lists are empty when the parser finds no sentences.
        """
        parser = PlaintextParser.from_string(document,
                                             Tokenizer(self.language))
        parsed = parser.document

        self.summarizer._ensure_dependencies_installed()

        sentences_words = [
            self.summarizer._to_words_set(s) for s in parsed.sentences
        ]
        if not sentences_words:
            # Must still be a 2-item result: rank_sentences unpacks it into
            # (sentences, scores), which a bare empty tuple would break.
            return [], []

        tf_metrics = self.summarizer._compute_tf(sentences_words)
        idf_metrics = self.summarizer._compute_idf(sentences_words)

        matrix = self.summarizer._create_matrix(sentences_words,
                                                self.summarizer.threshold,
                                                tf_metrics, idf_metrics)
        scores = self.summarizer.power_method(matrix, self.summarizer.epsilon)

        return list(map(str, parsed.sentences)), list(scores)
Exemplo n.º 6
0
def lexrank_scoring(text: str) -> Tuple[List[str], np.ndarray]:
    """
    Score each sentence of a text with the LexRank algorithm.

    The score can be read as the importance of the sentence.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    List[str]
        ``text`` split into a list of sentences.
    np.ndarray
        The importance scores matching the list of sentences; an empty
        array when no sentence was found in ``text``.
    """
    doc = nlp(text)

    # Only nouns, adverbs, adjectives and verbs take part in the scoring.
    content_pos = ('NOUN', 'ADV', 'ADJ', 'VERB')

    # Build the list of sentences and the list of words per sentence.
    sentences = []
    corpus = []
    for sent in doc.sents:
        sentences.append(sent.text)
        # Use the lemma (Token.lemma_) to avoid spelling variations.
        corpus.append(
            [token.lemma_ for token in sent if token.pos_ in content_pos]
        )
    # sentences = [sentence0, sentence1, ...]
    # corpus = [[word0 of sentence0, word1 of sentence0, ...],
    #           [word0 of sentence1, word1 of sentence1, ...], ...]

    if not corpus:
        # Nothing to rank.  NOTE(review): sumy's power_method is expected to
        # divide by the sentence count, so it must not be reached with an
        # empty corpus -- confirm against the installed sumy version.
        return sentences, np.array([])

    # LexRank scoring with the sumy library.
    lxr = LexRankSummarizer()
    tf_metrics = lxr._compute_tf(corpus)
    idf_metrics = lxr._compute_idf(corpus)
    matrix = lxr._create_matrix(corpus, lxr.threshold, tf_metrics, idf_metrics)
    scores = lxr.power_method(matrix, lxr.epsilon)
    # scores = [importance of sentence0, importance of sentence1, ...]

    return sentences, scores