def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
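# --- Example: calling keywords() ---
# A minimal usage sketch, not part of the original module; it assumes the
# function above is importable as-is, and the sample text is illustrative only.
def _example_keywords():
    text = ("Automatic summarization is the process of shortening a text "
            "document with software to create a summary that retains the "
            "most important points of the original document.")
    # Default: top ~20% of lemmas, returned as a newline-joined string.
    print(keywords(text))
    # As a list of (keyword, score) pairs instead.
    print(keywords(text, split=True, scores=True))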
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
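# --- Example: calling summarize() ---
# A usage sketch, not part of the original source. ratio and words are
# alternative criteria: words, when given, is used instead of ratio to bound
# the summary length.
def _example_summarize():
    text = ("TextRank builds a graph with one node per sentence. "
            "Edges are weighted by the lexical overlap between sentences. "
            "PageRank then scores each sentence by its centrality. "
            "The highest-scoring sentences form the extractive summary.")
    print(summarize(text, ratio=0.5))             # keep ~half the sentences
    print(summarize(text, words=20, split=True))  # bound by word count, as a list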
def summarize(text, namscores, original='pagerank', ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph, namscores, original)

    # In 'suraj' mode, each node's PageRank score is rescaled by its namscore
    # (an element-wise product over the graph nodes).
    if original == 'suraj':
        pagerank_scores = {node: pagerank_scores[node] * namscores[node]
                           for node in graph.nodes()}

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
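# --- Sketch: the 'suraj' score combination in isolation ---
# Illustrative only: shows, on toy dicts, how the variant above rescales the
# PageRank scores by the externally supplied namscores.
def _example_suraj_combination():
    pagerank_scores = {"s0": 0.40, "s1": 0.35, "s2": 0.25}  # toy PageRank output
    namscores = {"s0": 0.10, "s1": 0.90, "s2": 0.50}        # toy external scores
    combined = {node: pagerank_scores[node] * namscores[node]
                for node in pagerank_scores}
    # "s1" now outranks "s0": {'s0': 0.04, 's1': 0.315, 's2': 0.125}
    print(combined)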
def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
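# --- Sketch: the ratio/words cutoff applied to lemmas ---
# _extract_tokens is not shown in this listing; the sketch below is an assumed
# simplified reimplementation of the selection criterion it applies: rank
# lemmas by score and keep either the top `ratio` fraction or, when `words` is
# given, the top `words` lemmas, returned as (score, lemma) pairs.
def _extract_tokens_sketch(lemmas, scores, ratio, words):
    ranked = sorted(lemmas, key=lambda lemma: scores[lemma], reverse=True)
    count = int(len(ranked) * ratio) if words is None else words
    return [(scores[lemma], lemma) for lemma in ranked[:count]]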
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
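# --- Sketch: sentence selection by ratio or word budget ---
# _extract_most_important_sentences is not shown in this listing; this sketch
# assumes it ranks sentences by score and then either keeps the top `ratio`
# fraction or greedily fills a `words` budget. It also assumes each sentence
# object exposes .text alongside the .score and .index used above.
def _extract_most_important_sentences_sketch(sentences, ratio, words):
    ranked = sorted(sentences, key=lambda s: s.score, reverse=True)
    if words is None:
        return ranked[:int(len(ranked) * ratio)]
    selected, budget = [], words
    for sentence in ranked:
        length = len(sentence.text.split())
        if length > budget:  # stop at the first sentence that would overflow
            break
        selected.append(sentence)
        budget -= length
    return selected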
import pandas as pd


def summarize(text1, text2, ratio=0.2, words=None, language="english", split=False, scores=False, additional_stopwords=None):
    if not isinstance(text1, str):
        raise ValueError("text1 parameter must be a Unicode object (str)!")
    if not isinstance(text2, str):
        raise ValueError("text2 parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences. Each sentence is created as a
    # syntactic unit carrying its source article number, to be referenced later on.
    sentencesText1 = _clean_text_by_sentences(
        text1, source=0, additional_stopwords=additional_stopwords)
    sentencesText2 = _clean_text_by_sentences(
        text2, source=1, additional_stopwords=additional_stopwords)
    # The texts are re-parsed here (rather than concatenating the two lists
    # above) so that scoring the combined graph does not overwrite the
    # per-text scores already attached to the sentence objects.
    allSentences = _clean_text_by_sentences(
        text1, source=0, additional_stopwords=additional_stopwords) + _clean_text_by_sentences(
        text2, source=1, additional_stopwords=additional_stopwords)

    # Creates the graphs and calculates the similarity coefficient for every pair of nodes.
    graphCombined = _build_graph([sentence.token for sentence in allSentences])
    _set_graph_edge_weights(graphCombined)
    graph1 = _build_graph([sentence.token for sentence in sentencesText1])
    _set_graph_edge_weights(graph1)
    graph2 = _build_graph([sentence.token for sentence in sentencesText2])
    _set_graph_edge_weights(graph2)

    # Removes all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graphCombined)
    _remove_unreachable_nodes(graph1)
    _remove_unreachable_nodes(graph2)

    # PageRank cannot be run on an empty graph.
    if len(graphCombined.nodes()) == 0 or len(graph1.nodes()) == 0 or len(graph2.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score
    pagerank_scores_combined = _pagerank(graphCombined)
    pagerank_scores_1 = _pagerank(graph1)
    pagerank_scores_2 = _pagerank(graph2)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentencesText1, pagerank_scores_1)
    _add_scores_to_sentences(sentencesText2, pagerank_scores_2)
    _add_scores_to_sentences(allSentences, pagerank_scores_combined)

    # Builds a table comparing each sentence's score in the combined graph
    # against its score in its own text's graph.
    sentenceScores = []
    for sentence in allSentences:
        if sentence.token in pagerank_scores_1:
            sentenceScores.append({
                "sentence": sentence.token,
                # .get: the sentence may have been dropped from the combined graph as unreachable
                "combinedScore": pagerank_scores_combined.get(sentence.token),
                "summary1Score": pagerank_scores_1[sentence.token]
            })
        if sentence.token in pagerank_scores_2:
            sentenceScores.append({
                "sentence": sentence.token,
                "combinedScore": pagerank_scores_combined.get(sentence.token),
                "summary2Score": pagerank_scores_2[sentence.token]
            })
    df = pd.DataFrame(sentenceScores)
    print(df)

    print("_____TEXT1_____")
    for sentence in sentencesText1:
        print(sentence)
        print(sentence.score)
    print("_____TEXT2_____")
    for sentence in sentencesText2:
        print(sentence)
        print(sentence.score)
    print("_____TEXT_COMBINED_____")
    for sentence in allSentences:
        print(sentence)
        print(sentence.score)

    # Extracts the most important sentences with the selected criterion, both
    # from the per-text graphs and from the combined graph.
    summary1 = _extract_most_important_sentences(sentencesText1, ratio, words, 0)
    summary2 = _extract_most_important_sentences(sentencesText2, ratio, words, 1)
    summary1_combined = _extract_most_important_sentences(allSentences, ratio, words, 0)
    summary2_combined = _extract_most_important_sentences(allSentences, ratio, words, 1)

    # Compares the two approaches; we want combined_graphs_similarity > separate_graphs_similarity.
    separate_graphs_similarity = _get_similarity(summary1, summary2)
    combined_graphs_similarity = _get_similarity(summary1_combined, summary2_combined)

    # Sorts the extracted sentences by their order of appearance in the original texts.
    summary1.sort(key=lambda s: s.index)
    summary2.sort(key=lambda s: s.index)
    summary1_combined.sort(key=lambda s: s.index)
    summary2_combined.sort(key=lambda s: s.index)

    summary1Text = summaryText(summary1)
    summary2Text = summaryText(summary2)
    summary1CombinedText = summaryText(summary1_combined)
    summary2CombinedText = summaryText(summary2_combined)

    return summary1Text, summary2Text, summary1CombinedText, summary2CombinedText, separate_graphs_similarity, combined_graphs_similarity