class LexRankSumy(Baseline):
    """LexRank sentence scorer backed by sumy's ``LexRankSummarizer``.

    Description from https://arxiv.org/pdf/1905.13164.pdf:
    LexRank (Erkan and Radev, 2004) is a widely-used graph-based extractive
    summarizer; we build a graph with paragraphs as nodes and edges weighted
    by tf-idf cosine similarity; we run a PageRank-like algorithm on this
    graph to rank and select paragraphs until the length of the ground-truth
    summary is reached.

    Implementation wrapper of https://github.com/miso-belica/sumy
    """

    def __init__(self, name, language):
        """Store the tokenizer language and create the sumy summarizer.

        Args:
            name: Forwarded to ``Baseline.__init__``; also used by
                ``rank_sentences`` as the third argument of
                ``Baseline.append_column``.
            language: Language passed to sumy's ``Tokenizer`` when parsing
                each document.
        """
        super().__init__(name)
        self.language = language
        self.summarizer = LexRankSummarizer()

    def rank_sentences(self, dataset, document_column_name, **kwargs):
        """Score the sentences of every document in one dataset column.

        Args:
            dataset: Indexable by column name; ``dataset[document_column_name]``
                must be iterable and yield one document string per item.
            document_column_name: Name of the column holding the documents.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            Whatever ``Baseline.append_column`` returns when given the
            dataset, one ``{"sentences": ..., "scores": ...}`` record per
            document, and ``self.name``.
        """
        data = []
        for document in tqdm(dataset[document_column_name]):
            sentences, scores = self.run_single(document)
            data.append({"sentences": sentences, "scores": scores})
        return Baseline.append_column(dataset, data, self.name)

    def run_single(self, document):
        """Split one document into sentences and compute a LexRank score each.

        Args:
            document: Plain-text document to parse.

        Returns:
            A ``(sentences, scores)`` pair of equal-length lists: the
            sentences as strings and the value the power method assigned to
            each. Both lists are empty when the parser finds no sentences.
        """
        parser = PlaintextParser.from_string(document, Tokenizer(self.language))
        parsed = parser.document
        summarizer = self.summarizer

        summarizer._ensure_dependencies_installed()
        sentences_words = [summarizer._to_words_set(s) for s in parsed.sentences]
        if not sentences_words:
            # Always hand back a two-element result: rank_sentences unpacks
            # it into (sentences, scores), which a bare empty tuple would
            # break with a ValueError.
            return [], []

        tf_metrics = summarizer._compute_tf(sentences_words)
        idf_metrics = summarizer._compute_idf(sentences_words)
        matrix = summarizer._create_matrix(
            sentences_words,
            summarizer.threshold,
            tf_metrics,
            idf_metrics,
        )
        scores = summarizer.power_method(matrix, summarizer.epsilon)

        return [str(sentence) for sentence in parsed.sentences], list(scores)
def lexrank_scoring(text: str) -> Tuple[List[str], np.ndarray]:
    """
    Score each sentence of a text with the LexRank algorithm.

    The score can be interpreted as the importance of the sentence.

    Parameters
    ----------
    text : str
        Text to analyse.

    Returns
    -------
    List[str]
        ``text`` split into a list of sentences.
    np.ndarray
        Importance scores aligned with the sentence list. Empty when no
        sentence was found in ``text``.
    """
    doc = nlp(text)

    # Build the list of sentences and the list of word lists.
    # sentences = [sentence0, sentence1, ...]
    # corpus = [[s0 word0, s0 word1, ...], [s1 word0, s1 word1, ...], ...]
    sentences = []
    corpus = []
    for sent in doc.sents:
        sentences.append(sent.text)
        # Keep only nouns, adverbs, adjectives and verbs, and use the lemma
        # (Token.lemma_) so inflected forms of a word are counted together.
        corpus.append([
            token.lemma_
            for token in sent
            if token.pos_ in ('NOUN', 'ADV', 'ADJ', 'VERB')
        ])

    if not corpus:
        # Nothing to rank (e.g. empty input). Skip the solver, which is not
        # meant to be run on a zero-sentence matrix, and return empty results.
        return sentences, np.zeros(0)

    # LexRank scoring via the sumy library.
    lxr = LexRankSummarizer()
    tf_metrics = lxr._compute_tf(corpus)
    idf_metrics = lxr._compute_idf(corpus)
    matrix = lxr._create_matrix(corpus, lxr.threshold, tf_metrics, idf_metrics)
    scores = lxr.power_method(matrix, lxr.epsilon)
    # scores = [importance of sentence0, importance of sentence1, ...]

    return sentences, scores