def summarize(sentences, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    # sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    for i in range(len(sentences)):
        sentences[i] = ''.join(str(x) for x in sentences[i])
    graph = summarizer._build_graph(sentences)
    summarizer._set_graph_edge_weights(graph)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # One score per input sentence, in input order; sentences whose node was
    # removed from the graph get a score of 0.
    scores_list = [pagerank_scores.get(sentence, 0) for sentence in sentences]
    return scores_list
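# A minimal usage sketch (not part of the original module) for the score-only
# variant above. The join loop leaves plain strings unchanged, so the input can
# be raw sentence strings as well as token lists; the sample sentences below
# are made up for illustration.
def _demo_sentence_scores():
    sample = [
        "Graph based ranking assigns a score to every sentence.",
        "PageRank runs over the sentence similarity graph.",
        "Completely unrelated filler text.",
    ]
    scores = summarize(sample)
    # Pairs each sentence with its TextRank score; isolated sentences score 0.
    return list(zip(sample, scores))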
def summarize(sentences, similarity_matrix, ratio=0.2, split=False):
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph(list(range(len(sentences))))
    _set_graph_edge_weights(graph, similarity_func=partial(
        cosine_similarity, similarity_matrix))

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(
        sentences, ratio, words=None)

    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)
    # print([(x.index, x.score) for x in extracted_sentences[:2]])

    return _format_results(extracted_sentences, split, score=None)
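# A usage sketch (not from the original module) for the precomputed-similarity
# variant above. It assumes `sentences` are summa-style sentence objects whose
# .token equals their position in the list, matching the integer node ids used
# to build the graph, and that `embeddings` is an (n, d) array with one row per
# sentence.
import numpy as np

def _demo_precomputed_similarity(sentences, embeddings):
    # Normalising the rows turns the dot-product matrix into a cosine-similarity matrix.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit = embeddings / np.clip(norms, 1e-12, None)
    similarity_matrix = unit @ unit.T
    # ratio and split here are illustrative choices, not defaults of the module.
    return summarize(sentences, similarity_matrix, ratio=0.3, split=True)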
def summarize_with_model(text, session, model, model_name, additional_stopwords):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            # Gets a list of processed sentences.
            if paragraph:
                tmp = _clean_text_by_sentences(
                    paragraph, additional_stopwords)
                if tmp:
                    for j, sent in enumerate(tmp):
                        sent.paragraph = paragraph_index
                        # Hacky way to overwrite token
                        sent.token = len(sentences) + j
                    sentences += tmp
                    paragraph_index += 1
    elif lang == "zh" or lang == "ko":  # zh-Hant sometimes gets misclassified as ko
        if model_name != "xling":
            raise ValueError("Only the 'xling' model supports zh.")
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        if model_name != "xling":
            raise ValueError("Only the 'xling' model supports ja.")
        if not JA_SUPPORT:
            raise ImportError("Missing dependencies for Japanese support.")
        sentences = ja_clean_and_cut_sentences(text)
        for i, sent in enumerate(sentences):
            # Hacky way to overwrite token
            sent.token = i
    else:
        return ["Language not supported! (supported languages: en, zh, ja)"], None, lang
    # print([sentence.token for sentence in sentences if sentence.token])

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    similarities = attach_sentence_embeddings(
        session, sentences, model, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph. Keep the return shape
    # consistent with the other branches.
    if len(graph.nodes()) == 0:
        return [], None, lang

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Sorts the sentences by score, highest first.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
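# Rough calling sketch for summarize_with_model (not part of the original
# module). Creating `session` and `model` is backend-specific (the "xling"
# name above points at a multilingual sentence encoder), so both are taken as
# arguments rather than constructed here.
def _demo_model_summary(text, session, model):
    ranked, graph, lang = summarize_with_model(
        text, session, model, model_name="xling", additional_stopwords=None)
    if graph is None:
        # Unsupported language or empty graph: pass the message/empty list through.
        return ranked, lang
    # Sentences come back sorted by score; keep the top three and restore the
    # original document order before displaying them.
    top = sorted(ranked[:3], key=lambda s: s.index)
    return top, lang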
def summarize_custom(text, ratio=0.2, split=False, scores=False, words=None, stopwords=None):
    # Gets a list of processed sentences.
    cleaned_sentences = clean_sentences(text, stopwords=stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = build_graph([sentence.token for sentence in cleaned_sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # Ranks the sentences using the PageRank algorithm and attaches the scores.
    pagerank_scores = _pagerank(graph)
    _add_scores_to_sentences(cleaned_sentences, pagerank_scores)

    # Extracts the most important sentences and restores document order.
    extracted_sentences = _extract_most_important_sentences(
        cleaned_sentences, ratio, words)
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
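# Usage sketch (not from the original module) for summarize_custom with a
# caller-supplied stopword list. Whether that list replaces or extends the
# defaults depends on clean_sentences(); the words and ratio below are
# illustrative only.
def _demo_custom_summary(text):
    extra_stopwords = ["figure", "table", "section"]
    return summarize_custom(text, ratio=0.25, scores=True, stopwords=extra_stopwords)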
def keywords(
        text: str,
        deaccent: bool = False,
        additional_stopwords: List[str] = None) -> Tuple[
            List[Tuple[float, str]],
            Optional[Dict[str, List[str]]],
            Optional[summa.graph.Graph],
            Dict[str, float]]:
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    lang = detect(text)[:2]
    if lang == "en":
        # Gets a dict of word -> lemma
        tokens = _clean_text_by_word(
            text, "english", deacc=deaccent,
            additional_stopwords=additional_stopwords)
        split_text = list(_tokenize_by_word(text))
    elif lang == "zh" or lang == "ko":  # zh-Hant sometimes gets misclassified as ko
        if not ZH_SUPPORT:
            raise ImportError("Missing dependencies for Chinese support.")
        tokens = zh_clean_and_cut_words(text)
        split_text = [x.text for x in tokens]
        tokens = {x.text: x for x in tokens}
    elif lang == "ja":
        if not JA_SUPPORT:
            raise ImportError("Missing dependencies for Japanese support.")
        tokens = ja_clean_and_cut_words(text)
        split_text = [x.text for x in tokens]
        tokens = {x.text: x for x in tokens}
    else:
        print("Language not supported! (supported languages: en, zh, ja)")
        return [], {}, None, {}

    # Creates the graph and adds the edges.
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used.

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if not graph.nodes():
        return [], {}, None, {}

    # Ranks the tokens using the PageRank algorithm. Returns a dict of lemma -> score.
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores)

    # Only English input provides a lemma -> surface-word mapping.
    lemmas_to_word = None
    if lang == "en":
        lemmas_to_word = _lemmas_to_words(tokens)

    return extracted_lemmas, lemmas_to_word, graph, pagerank_scores
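# Sketch (not from the original module) of consuming the keywords() return
# value: ranked (score, lemma) pairs, an optional lemma -> surface-words
# mapping (English only), the word graph, and the raw PageRank scores. It
# assumes the (score, lemma) list is ordered best-first.
def _demo_keywords(text):
    lemmas, lemma_to_words, graph, scores = keywords(text)
    top = []
    for score, lemma in lemmas[:10]:
        # Map each lemma back to a surface form seen in the text when the
        # mapping is available; otherwise keep the lemma itself.
        surface = lemma_to_words.get(lemma, [lemma]) if lemma_to_words else [lemma]
        top.append((surface[0], score))
    return top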
def summarize(text, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            # Gets a list of processed sentences.
            if paragraph:
                tmp = _clean_text_by_sentences(
                    paragraph, additional_stopwords)
                if tmp:
                    for j, sent in enumerate(tmp):
                        sent.paragraph = paragraph_index
                        # Hacky way to overwrite token
                        sent.token = len(sentences) + j
                    sentences += tmp
                    paragraph_index += 1
    elif lang == "zh" or lang == "ko":  # zh-Hant sometimes gets misclassified as ko
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        raise NotImplementedError("No ja support yet.")
    else:
        return ["Language not supported! (supported languages: en, zh)"], None, lang

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    similarities = attach_sentence_embeddings(
        lang, sentences, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph. Keep the return shape
    # consistent with the other branches.
    if len(graph.nodes()) == 0:
        return [], None, lang

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Sorts the sentences by score, highest first.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
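# Sketch (not from the original module) of turning the ranked output of the
# embedding-based summarize() above into a short summary: keep the
# best-scoring sentences, then restore document order. The 0.2 ratio mirrors
# the default used elsewhere in this collection.
def _demo_embedding_summary(text):
    sentences, graph, lang = summarize(text)
    if graph is None:
        # Unsupported language or empty graph: pass the message/empty list through.
        return sentences, lang
    count = max(1, int(len(sentences) * 0.2))
    selected = sorted(sentences[:count], key=lambda s: s.index)
    return selected, lang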