def test_rouge_n():
    """ROUGE-N recall (n = 1..4) of one candidate against two reference summaries."""
    candidate = PlaintextParser(
        "pulses may ease schizophrenic voices",
        Tokenizer("english")).document.sentences
    reference1 = PlaintextParser(
        "magnetic pulse series sent through brain may ease schizophrenic voices",
        Tokenizer("english")).document.sentences
    reference2 = PlaintextParser.from_string(
        "yale finds magnetic stimulation some relief to schizophrenics imaginary voices",
        Tokenizer("english")).document.sentences
    # (n, reference, expected recall) — denominators are the reference n-gram counts.
    cases = [
        (1, reference1, 4 / 10), (1, reference2, 1 / 10),
        (2, reference1, 3 / 9), (2, reference2, 0 / 9),
        (3, reference1, 2 / 8), (3, reference2, 0 / 8),
        (4, reference1, 1 / 7), (4, reference2, 0 / 7),
    ]
    for n, reference, expected in cases:
        assert rouge_n(candidate, reference, n) == approx(expected)
def test_rouge_l_summary_level(self):
    """Smoke test: summary-level ROUGE-L runs without raising (no value asserted)."""
    reference = PlaintextParser(
        "one two three four five. one two three four five.",
        Tokenizer("english")).document.sentences
    candidates = PlaintextParser(
        "one two six seven eight. one three eight nine five.",
        Tokenizer("english")).document.sentences
    rouge_l_summary_level(candidates, reference)
def test_union_lcs():
    """Union-LCS of two candidate sentences against a single reference sentence."""
    reference = PlaintextParser(
        "one two three four five",
        Tokenizer("english")).document.sentences
    candidates = PlaintextParser(
        "one two six seven eight. one three eight nine five.",
        Tokenizer("english")).document.sentences
    # Union LCS covers "one two" + "three five" => 4 of the 5 reference words.
    assert _union_lcs(candidates, reference[0]) == approx(4 / 5)
def test_union_lcs(self):
    """unittest twin of the pytest union-LCS check."""
    reference = PlaintextParser(
        "one two three four five",
        Tokenizer("english")).document.sentences
    candidates = PlaintextParser(
        "one two six seven eight. one three eight nine five.",
        Tokenizer("english")).document.sentences
    self.assertAlmostEqual(_union_lcs(candidates, reference[0]), 4 / 5)
def get_summarized(self, input, num_sentences):
    """Summarize *input* with LexRank into *num_sentences* sentences.

    Returns:
        list[str]: the selected sentences as plain strings.
    """
    parser = PlaintextParser(input, Tokenizer("english"))
    sentences = LexRankSummarizer()(parser.document, num_sentences)
    return [str(sentence) for sentence in sentences]
def get_parser(url, tokenizer):
    """Scrape *url* with both HtmlParser and Goose and return the richer parse.

    Args:
        url: page to scrape.
        tokenizer: sumy tokenizer used by both parsers.

    Returns:
        (parser, meta): whichever parser's document contains more words, plus
        a dict of Goose article metadata with the bulky text/link fields removed.
    """
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36",
    ])
    # Scrape Web Page With HTMLParser and Goose and select the best scrape
    html_parser = HtmlParser.from_url(url, tokenizer)
    article = Goose({'browser_user_agent': useragent})
    # Goose raises IndexError when requesting unfamiliar sites; fall back to
    # fetching the raw HTML ourselves. BUG FIX: was a bare `except:`, which
    # also swallowed KeyboardInterrupt/SystemExit — narrowed to Exception.
    try:
        extract = article.extract(url=url)
    except Exception:
        extract = article.extract(raw_html=requests.get(url).text)
    goose_parser = PlaintextParser(extract, tokenizer)
    # Aggregate site metadata, dropping the large content fields.
    meta = {
        k: v for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }
    # Select the parser that recovered more words from the page.
    parser = (
        html_parser
        if len(goose_parser.document.words) < len(html_parser.document.words)
        else goose_parser)
    return parser, meta
def get_best_sentences(text, num=1):
    """Return the *num* top-ranked sentences of *text* as strings.

    BUG FIX: the original called the Python-2-only ``unicode()`` builtin,
    which is a NameError on Python 3; ``str()`` matches the other helpers
    in this module.
    """
    parser = PlaintextParser(text, Tokenizer('english'))
    summarizer = Summarizer(Stemmer('english'))
    summarizer.stop_words = get_stop_words('english')
    return [str(s) for s in summarizer(parser.document, num)]
def _bucket_sentences(sentences):
    """Split sentences into (you_agree, they_agree, other) buckets.

    A sentence is attributed to "you" or "we" depending on which pronoun
    appears first within the opening ~15 characters; sentences mentioning
    neither near the start go to the "other" bucket.
    """
    you_agree, they_agree, other_clause = [], [], []
    for sentence in sentences:
        lower = sentence.lower()
        you_idx = lower.find("you")
        they_idx = lower.find("we")
        if (you_idx == -1 or you_idx > 15) and (they_idx == -1 or they_idx > 15):
            other_clause.append(sentence)
        elif you_idx == -1:
            they_agree.append(sentence)
        elif they_idx == -1:
            you_agree.append(sentence)
        elif you_idx < they_idx:
            you_agree.append(sentence)
        else:
            they_agree.append(sentence)
    return you_agree, they_agree, other_clause


def _html_section(header, items):
    """Render *header* followed by <li> items; empty string when no items."""
    if not items:
        return ""
    return header + "<li>" + "</li><li>".join(items) + "</li>"


def summarize_terms_text(txt):
    """Summarize terms-of-service text and render it as a grouped HTML list.

    Pipeline: normalize to ASCII, keep only sentences matching the module
    ``clause`` regex, KL-summarize them, bucket by who agrees, and return
    the HTML payload as a JSON string.
    """
    text_data = unidecode.unidecode(txt)
    clean_list, pure_list = prepare_for_regex(text_data)
    # Keep the original-form sentence whenever its cleaned form matches a clause.
    data_to_summarize = [
        pure for clean, pure in zip(clean_list, pure_list)
        if re.findall(clause, clean)
    ]
    text_data = " ".join(data_to_summarize)
    parser = PlaintextParser(text_data, Tokenizer(LANGUAGE))
    summarizer = KLSummarizer(Stemmer(LANGUAGE))
    summary = summarizer(parser.document, SENTENCES_COUNT)
    if len(summary) == 0:
        summary = ["No Terms"]
    sentences = [str(x) for x in summary]
    you_agree, they_agree, other_clause = _bucket_sentences(sentences)
    message = HTML_OPEN + "<ul class='rolldown-list' id='myList'>"
    message += _html_section(YOU_AGREE_HEADER, you_agree)
    message += _html_section(THEY_AGREE_HEADER, they_agree)
    message += _html_section(OTHER_HEADER, other_clause)
    message += "</ul></div>"
    return json.dumps(message)
def summarize(text, sentence_count, summarizer_type="lsa"):
    """Summarize *text* into *sentence_count* sentences with the named method.

    *summarizer_type* must be a key of ``AVAILABLE_METHODS``.
    """
    summarizer_cls = AVAILABLE_METHODS[summarizer_type]
    stemmer = Stemmer(LANGUAGE)
    summarizer = summarizer_cls(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    document = PlaintextParser(text, Tokenizer(LANGUAGE)).document  # default language is English
    return summarizer(document, sentence_count)
def get_summary_per_section_edmund(cur_sents, each_summ_num):
    """Edmundson-summarize *cur_sents* and return the sentence texts.

    Returns:
        list[str]: raw text of each selected sentence.
    """
    summarizer = EdmundsonSummarizer()
    parser = PlaintextParser(cur_sents, Tokenizer(LANGUAGE))
    return [line._text for line in summarizer(parser.document, each_summ_num)]
def summary(self, int1, int2):
    """LSA-summarize the corpus slice ``self.corpus[int1:int2]``.

    *int1* and *int2* delimit the region of the corpus (string slicing) to
    summarize; the 20-sentence summary is cached on ``self.summary_text``.
    """
    parser = PlaintextParser(self.corpus[int1:int2], Tokenizer("english"))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    sentences = [s._text for s in summarizer(parser.document, 20)]
    self.summary_text = " ".join(sentences)
    return self.summary_text
def summarize(url, language, sentences):
    """Print an LSA summary of the page at *url*, one sentence per line."""
    text = get_text(url)
    parser = PlaintextParser(text, Tokenizer(language))
    summarizer = LsaSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    for sentence in summarizer(parser.document, sentences):
        print(sentence)
def summarize(string, num_sentence=3):
    """Summarize *string* with sumy.

    Returns the selected sentences joined by single spaces (with a trailing
    space, matching the original concatenation).
    """
    parser = PlaintextParser(string, tknz)
    # NOTE(review): `stop_word` is not a parser attribute in sumy — this line
    # looks like a typo for `summarizer.stop_words`; verify before changing.
    parser.stop_word = get_stop_words(lang)
    parts = [str(sentence) + ' '
             for sentence in summarizer(parser.document, num_sentence)]
    return ''.join(parts)
def get_summary_per_section_lsa(cur_sents, each_summ_num):
    """LSA-summarize *cur_sents* and return the sentence texts.

    BUG FIX: the original constructed a bare ``LsaSummarizer()`` and
    immediately discarded it; only the stemmer-backed instance is needed.

    Returns:
        list[str]: raw text of each selected sentence.
    """
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    parser = PlaintextParser(cur_sents, Tokenizer(LANGUAGE))
    return [line._text for line in summarizer(parser.document, each_summ_num)]
def get_summary(text):
    """Return a 3-sentence LexRank summary as one space-separated paragraph."""
    parser = PlaintextParser(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)
    pieces = []
    for sentence in summary:
        pieces.append(sentence._text)
        pieces.append(" ")
    return "".join(pieces)
def summary(text):
    """Return a Markdown blockquote-bullet summary of *text*.

    Each selected sentence becomes a ``>* sentence`` line separated by
    blank lines.

    BUG FIX: the original called ``str(sentence).decode('ascii', 'ignore')``,
    which raises AttributeError on Python 3 (``str`` has no ``decode``). The
    ASCII round-trip below keeps the original "drop non-ASCII characters"
    intent while working on Python 3.
    """
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    lines = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        clean = str(sentence).encode('ascii', 'ignore').decode('ascii')
        lines.append(">" + "* " + clean + "\n\n")
    return "".join(lines)
def test_rouge_l_sentence_level():
    """Sentence-level ROUGE-L of three candidates against one reference."""
    def sentences_of(text):
        return PlaintextParser(text, Tokenizer("english")).document.sentences

    reference = sentences_of("police killed the gunman")
    candidate1 = sentences_of("police kill the gunman")
    candidate2 = sentences_of("the gunman kill police")
    candidate3 = sentences_of("the gunman police killed")
    assert rouge_l_sentence_level(candidate1, reference) == approx(3 / 4)
    assert rouge_l_sentence_level(candidate2, reference) == approx(2 / 4)
    assert rouge_l_sentence_level(candidate3, reference) == approx(2 / 4)
def summarize(text, text_language, num_sentences):
    """Summarize *text* into *num_sentences* sentences.

    Sentences are concatenated with no separator, matching the original.
    """
    parser = PlaintextParser(text, Tokenizer(text_language))
    # For plain text files: PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(text_language))
    summarizer.stop_words = get_stop_words(text_language)
    return ''.join(str(sentence)
                   for sentence in summarizer(parser.document, num_sentences))
def get_summaries(article, topic):
    """Summarize *article* with several algorithms.

    Returns:
        dict: algorithm display name -> summary string.
    """
    parser = PlaintextParser(article, Tokenizer(LANGUAGE))
    summarizer_list = [
        head_summarizer,
        LsaSummarizer,
        LexRankSummarizer,
        LuhnSummarizer,
        EdmundsonSummarizer,
    ]  # gensim_summarizer intentionally disabled
    summarizers_name = ["Head Summarizer", "LSA", "LexRank", "Luhn", "Edmund"]
    summary_dict = {}
    for Summarizer, name in zip(summarizer_list, summarizers_name):
        summary = ''
        if Summarizer == head_summarizer:
            summary = head_summarizer(article)
        elif Summarizer == gensim_summarizer:
            # Grow the ratio until gensim yields at least 15 words.
            ratio = 0.01
            while len(summary.split()) < 15:
                summary = Summarizer(" ".join(article.split('\n')), ratio=ratio)
                ratio = ratio * 5
            if len(summary.split()) > 40:
                summary = tokenize_article_to_sentences(summary)[0]
        elif Summarizer == EdmundsonSummarizer:
            summarizer = Summarizer()
            summarizer.bonus_words = topic.lower().split()
            summarizer.stigma_words = stop_words
            summarizer.null_words = stop_words
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary += str(sentence)
        else:
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = stop_words
            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                summary += str(sentence)
        # BUG FIX: the original condition was `'( listen)' or '(listen)' in summary`,
        # which is always truthy (a non-empty string literal). Test each substring
        # and strip both spellings of the Wikipedia audio marker.
        if '( listen)' in summary or '(listen)' in summary:
            summary = re.sub(r'\(\s?listen\)', ' ', summary)
        summary_dict[name] = summary
    return summary_dict
def summarize(text, sentence_count, bonus_words, language='english'):
    """Edmundson summary of *text*, weighted toward *bonus_words*."""
    summarizer = EdmundsonSummarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = bonus_words
    # Nonsense token: effectively disables stigma weighting.
    summarizer.stigma_words = ['zdfgthdvndadv']
    summarizer.null_words = stopwords.words('english')
    document = PlaintextParser(text, Tokenizer(language)).document
    return summarizer(document, sentence_count)
def lsa_summaries(filepath, word_count):
    """Load pickled texts and LSA-summarize each to at least *word_count* words.

    For each text, the sentence count is increased until the summarizer
    reports a length of at least *word_count* (or the sentence budget of the
    text is exhausted); the last summary produced is kept.
    """
    # NOTE(review): pickle.load executes arbitrary code — only use on trusted files.
    with open(filepath, 'rb') as filehandle:
        texts_str = pickle.load(filehandle)
    lsa_summary = []
    for t in texts_str:
        parser = PlaintextParser(t, Tokenizer('english'))
        for i in range(len(t.split('.'))):
            summary, len_summary = lsa_summarizer(parser, i)
            if len_summary >= word_count:
                break
        lsa_summary.append(summary)
    return lsa_summary
def test_rouge_l_sentence_level(self):
    """Sentence-level ROUGE-L of three candidates against one reference.

    BUG FIX: the third assertion re-tested candidate2, so candidate3 was
    never checked (compare the pytest twin of this test, which asserts
    candidate3 == 2/4).
    """
    reference = PlaintextParser(
        "police killed the gunman", Tokenizer("english")).document.sentences
    candidate1 = PlaintextParser(
        "police kill the gunman", Tokenizer("english")).document.sentences
    candidate2 = PlaintextParser(
        "the gunman kill police", Tokenizer("english")).document.sentences
    candidate3 = PlaintextParser(
        "the gunman police killed", Tokenizer("english")).document.sentences
    self.assertAlmostEqual(rouge_l_sentence_level(candidate1, reference), 3 / 4)
    self.assertAlmostEqual(rouge_l_sentence_level(candidate2, reference), 2 / 4)
    self.assertAlmostEqual(rouge_l_sentence_level(candidate3, reference), 2 / 4)
def summarize(text, summarizer, sentence_count, bonus_words=None, language='english'):
    """Summarize *text* with the given summarizer class.

    Args:
        text: input text.
        summarizer: a sumy summarizer class (instantiated with a stemmer).
        sentence_count: number of sentences to return.
        bonus_words: Edmundson bonus words; defaults to ['MLK, rights']
            (note: the original default is a single string containing a comma).
        language: tokenizer/stemmer/stop-word language.

    BUG FIX: the original used a mutable default argument
    (``bonus_words=['MLK, rights']``); replaced with the None-sentinel idiom.
    """
    if bonus_words is None:
        bonus_words = ['MLK, rights']
    summarizer = summarizer(Stemmer(language))
    summarizer.stop_words = get_stop_words(language)
    if isinstance(summarizer, EdmundsonSummarizer):
        summarizer.bonus_words = bonus_words
        # Nonsense token: effectively disables stigma weighting.
        summarizer.stigma_words = ['zdfgthdvndadv']
        summarizer.null_words = summarizer.stop_words
    summary = summarizer(
        PlaintextParser(text, Tokenizer(language)).document, sentence_count)
    return summary
def summarizer(sentences):
    """Summarize *sentences* and append each summary sentence to ``summary_list``.

    Args:
        sentences: raw text to summarize.

    Side effects:
        Extends the module-level ``summary_list`` (presumably defined at
        module scope — not visible here; verify).

    BUG FIXES: the original parsed an undefined global ``text`` instead of
    the ``sentences`` parameter, shadowed this function's own name with a
    local variable, and carried a copy-pasted docstring about video files.
    """
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(sentences, Tokenizer(LANGUAGE))
    summ = Summarizer(stemmer)
    summ.stop_words = get_stop_words(LANGUAGE)
    # Collect the summary sentences into the shared list.
    for sentence in summ(parser.document, SENTENCES_COUNT):
        summary_list.append(str(sentence))
def blog_to_sum(tech, blog):
    """Summarize *blog* with the technique named by *tech*.

    Args:
        tech: one of 'lsa', 'luhn', 'klsum', 'textrank', 'lexrank'.
        blog: text to summarize.

    Raises:
        ValueError: for an unknown technique. (The original fell through the
        if/elif chain and crashed with UnboundLocalError on ``results``.)
    """
    docparser = PlaintextParser(blog, Tokenizer("english"))
    techniques = {
        'lsa': lsaa,
        'luhn': luhnn,
        'klsum': klsumm,
        'textrank': textrankk,
        'lexrank': lexrankk,
    }
    try:
        summarize_fn = techniques[tech]
    except KeyError:
        raise ValueError("unknown summarization technique: %r" % (tech,))
    return summarize_fn(docparser)
def textrank_test(doc_dir, ref_dir, summary_length=75, use_stop_words=True, use_lemmatizer=True):
    """Run TextRank over every chapter and print per-document ROUGE-1 scores.

    Args:
        doc_dir (str): Input chapters directory
        ref_dir (str): Reference summaries directory
        summary_length (int): Length of summary

    Side effects: writes one "textrank_summary_<name>" file per chapter.
    """
    docs = sorted(os.listdir(doc_dir))
    refs = sorted(os.listdir(ref_dir))
    documents = []
    references = []
    # Preprocess each chapter/reference pair (headings stripped from chapters).
    for doc_name, ref_name in zip(docs, refs):
        doc, ref = Utils.load_document(doc_dir + "/" + doc_name, ref_dir + "/" + ref_name)
        doc_with_headings = Utils.process_document(doc, use_stop_words, use_lemmatizer)
        documents.append(Utils.remove_headings(doc_with_headings))
        references.append(Utils.process_document(ref, use_stop_words, use_lemmatizer))
    references = Utils.join_docs(references)  # processed references
    rouge_scores = [0.0] * len(documents)
    rouge_index = 0
    for doc_name, ref_name in zip(docs, refs):
        # Re-join the processed tokens into sentences ('ред' is the Devanagari
        # danda sentence delimiter — presumably Hindi text; verify).
        text = 'ред '.join(' '.join(item) for item in documents[rouge_index])
        parser = PlaintextParser(text, Utils.Tokenizer())
        summarizer = TextRankSummarizer()
        # if stopwords: summarizer.stop_words = Utils.load_stop_words()
        summary = summarizer(parser.document, summary_length)
        joined_summary = ""
        with open("textrank_summary_" + doc_name, 'w', encoding="utf-8") as output_file:
            for sentence in summary:
                output_file.write(str(sentence) + "ред\n")
                joined_summary += str(sentence) + " "
        rouge_scores[rouge_index] = Utils.calculate_rouge(
            joined_summary, [references[rouge_index]], 1)
        rouge_index += 1
    print(rouge_scores)
def summarize(selected_text, n=3):
    """Return an *n*-sentence LexRank summary of *selected_text* as one string."""
    # Plaintext parser chosen here; sumy also ships parsers for HTML etc.
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    # LexRank chosen here; other algorithms are also built in.
    from sumy.summarizers.lex_rank import LexRankSummarizer

    parser = PlaintextParser(selected_text, Tokenizer("english"))
    sentences = LexRankSummarizer()(parser.document, n)
    # Sentences are concatenated with no separator, matching the original.
    return ''.join(str(sentence) for sentence in sentences)
def summarize4(self, df):
    """Print LSA, Luhn and Edmundson summaries for each non-empty conclusion.

    Based on http://ai.intelligentonlinetools.com/ml/text-summarization/
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    stopwords = nltk.corpus.stopwords.words('english')
    for row in df['conclusion']:
        # Skip empty / placeholder conclusions.
        if row == '0' or row == '':
            continue
        parser = PlaintextParser(row, Tokenizer(LANGUAGE))

        print("--LsaSummarizer--")
        summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        print("--LuhnSummarizer--")
        summarizer = LuhnSummarizer()
        summarizer.stop_words = stopwords
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        print("--EdmundsonSummarizer--")
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("deep", "learning", "neural")
        summarizer.stigma_words = ("another", "and", "some", "next")
        summarizer.null_words = ("another", "and", "some", "next")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
def apply(text, interest, top_k=5):
    """Rate the sentences of *text* and return two (text, weight) lists.

    Returns:
        tuple: (interesting_sent, top_sent) — all sentences mentioning
        *interest* (case-insensitive) with their weights, and the *top_k*
        highest-rated sentences with theirs.
    """
    LANGUAGE = "english"
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sent_importance = summarizer.rate_sentences(parser.document)
    needle = interest.lower()
    interesting_sent = [
        (sent._text, sent_importance[sent])
        for sent in sent_importance
        if needle in sent._text.lower()
    ]
    top_sent = [(s._text, sent_importance[s])
                for s in summarizer(parser.document, top_k)]
    return (interesting_sent, top_sent)
def summarizer(algo):
    """Summarize the preprocessed spreadsheet text with the chosen algorithm.

    Args:
        algo: 'gensimTextRank' or 'sumyTextRank'.

    Returns:
        str: the summary, or None (after printing usage) for unknown *algo*.

    BUG FIX: the original compared strings with ``is`` (identity), which only
    works by accident of CPython string interning; replaced with ``==``.
    """
    text = excel_preprocessor()
    if algo == 'gensimTextRank':
        return summarize(text, word_count=5000)
    elif algo == 'sumyTextRank':
        parser = PlaintextParser(text, Tokenizer('english'))
        text_summarizer = TextRankSummarizer(Stemmer('english'))
        text_summarizer.stop_words = get_stop_words('english')
        summarized = text_summarizer(parser.document, 25)
        return "\n\n".join(str(sentence) for sentence in summarized)
    else:
        msg = "Accepted Inputs are: \n1. gensimTextRank\n2. sumyTextRank"
        print(msg)