def document(self):
    # a abbr acronym b big blink blockquote cite code
    # dd del dfn dir dl dt em h h1 h2 h3 h4
    # h5 h6 i ins kbd li marquee menu ol pre q
    # s samp strike strong sub sup tt u ul var
    headers = 'h1', 'h2', 'h3'
    annotated_text = self._article.main_text

    paragraphs = []
    for paragraph in annotated_text:
        sentences, current_text = [], ''

        for text, annotations in paragraph:
            if annotations and any(h_tag in annotations for h_tag in headers):
                sentences.append(
                    Sentence(text, self._tokenizer, is_heading=True))
            elif not (annotations and 'pre' in annotations):  # skip <pre> nodes
                current_text += ' ' + text

        new_sentences = self.tokenize_sentences(current_text)
        sentences.extend(
            Sentence(s, self._tokenizer) for s in new_sentences)
        paragraphs.append(Paragraph(sentences))

    return ObjectDocumentModel(paragraphs)
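# A minimal usage sketch, assuming sumy is installed and that document()
# above lives on an HtmlParser-style parser; the URL is a placeholder.
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer

parser = HtmlParser.from_string(
    "<h1>Title</h1><p>Body text. More body text.</p>",
    "http://example.com", Tokenizer("english"))
for paragraph in parser.document.paragraphs:
    for sentence in paragraph.sentences:
        print(sentence.is_heading, sentence)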
def create_sumy_dom(text, tokenizer):
    """Creates a sumy style document from the sentences.

    TODO: Assumes that paragraphs are specified by lines starting with a space
    """
    from sumy.models.dom import Sentence, Paragraph, ObjectDocumentModel

    paragraphs = []
    paragraph = []
    for ii, line in enumerate(text):
        if line[0] != ' ' and ii > 0:
            # Last line was the last one in the paragraph
            paragraphs.append(Paragraph(paragraph))  # dump paragraph
            paragraph = []  # start a new paragraph going forward

        # Process the current line
        paragraph.append(Sentence(line, tokenizer))

        if ii + 1 == len(text):  # if last line, just dump
            paragraphs.append(Paragraph(paragraph))

    return ObjectDocumentModel(tuple(paragraphs))
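# A minimal sketch exercising create_sumy_dom, assuming sumy is installed.
# Per the TODO above, a line starting with a space continues the current
# paragraph and any other line starts a new one.
from sumy.nlp.tokenizers import Tokenizer

lines = [
    "First sentence of paragraph one.",
    " Second sentence of paragraph one.",
    "First sentence of paragraph two.",
]
document = create_sumy_dom(lines, Tokenizer("english"))
print(len(document.paragraphs))  # 2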
def test_sentences_equal(self):
    sentence1 = Sentence("", Tokenizer("czech"))
    sentence2 = Sentence("", Tokenizer("czech"))
    self.assertEqual(sentence1, sentence2)

    sentence1 = Sentence("word another.", Tokenizer("czech"))
    sentence2 = Sentence("word another.", Tokenizer("czech"))
    self.assertEqual(sentence1, sentence2)

    sentence1 = Sentence("word another", Tokenizer("czech"))
    sentence2 = Sentence("another word", Tokenizer("czech"))
    self.assertNotEqual(sentence1, sentence2)
def build_sentence(sentence_as_string, is_heading=False):
    return Sentence(sentence_as_string, _TOKENIZER, is_heading)
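# build_sentence assumes a module-level _TOKENIZER; a minimal sketch of how
# it might be defined and used, assuming sumy's Sentence/Tokenizer API.
from sumy.models.dom import Sentence
from sumy.nlp.tokenizers import Tokenizer

_TOKENIZER = Tokenizer("english")

heading = build_sentence("Chapter 1", is_heading=True)
body = build_sentence("This is an ordinary sentence.")
print(heading.is_heading, body.is_heading)  # True False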
def test_sentences_with_same_words_in_different_order_are_different():
    sentence1 = Sentence("word another", Tokenizer("czech"))
    sentence2 = Sentence("another word", Tokenizer("czech"))

    assert sentence1 != sentence2
def test_same_sentences_equal():
    sentence1 = Sentence("word another.", Tokenizer("czech"))
    sentence2 = Sentence("word another.", Tokenizer("czech"))

    assert sentence1 == sentence2
def test_empty_sentences_equal():
    sentence1 = Sentence("", Tokenizer("czech"))
    sentence2 = Sentence("", Tokenizer("czech"))

    assert sentence1 == sentence2
# Create a list of reference sentences to calculate ROUGE_N scores.
ref_sentences = []
trim_ref_sentences = []
for paragraph in parser._article.main_text:
    for sections in paragraph:
        for sentences in sections:
            try:
                # Trim off super short (likely a few word) sentences.
                if len(sentences) > 35:
                    ref_sentences.append(sentences)
            except TypeError:
                # Catch type errors caused by annotated text, i.e. h1, b, etc.
                print("typeError")
                continue

trim_ref_sentences.extend(
    Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

# Or, for plain text files:
# parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

stemmer = Stemmer(LANGUAGE)

# Define summarizers for the summarizing methods being used.
summarizer_Lsa = Lsa(stemmer)
summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

summarizer_LexRank = LexRank()
summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

summarizer_Edmundson = Edmundson(stemmer)
summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
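# The snippet above stops after setting null_words. A sketch of how the
# Edmundson setup might be completed and its summary scored directly with
# sumy's rouge_n helper; names from the snippet are assumed in scope.
from sumy.evaluation import rouge_n

summarizer_Edmundson.bonus_words = parser.significant_words
summarizer_Edmundson.stigma_words = parser.stigma_words
summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

print("ROUGE-2:", rouge_n(summary_Edmundson, trim_ref_sentences, n=2))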
def summary(article_url):
    url = article_url
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/potato"
    # url = "http://www.encyclopedia.com/plants-and-animals/plants/plants/cabbage"
    # url = "http://www.encyclopedia.com/medicine/diseases-and-conditions/pathology/accident"
    # url = "http://www.encyclopedia.com/earth-and-environment/atmosphere-and-weather/atmospheric-and-space-sciences-atmosphere/air"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # Create a list of reference sentences to calculate ROUGE_N scores.
    ref_sentences = []
    trim_ref_sentences = []
    for paragraph in parser._article.main_text:
        for sections in paragraph:
            for sentences in sections:
                try:
                    # Trim off super short (likely a few word) sentences.
                    if len(sentences) > 35:
                        ref_sentences.append(sentences)
                except TypeError:
                    # Catch type errors caused by annotated text, i.e. h1, b, etc.
                    print("typeError")
                    continue

    trim_ref_sentences.extend(
        Sentence(s, Tokenizer(LANGUAGE)) for s in ref_sentences)

    # Or, for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)

    # Define summarizers for the summarizing methods being used.
    summarizer_Lsa = Lsa(stemmer)
    summarizer_Lsa.stop_words = get_stop_words(LANGUAGE)
    summary_Lsa = summarizer_Lsa(parser.document, SENTENCES_COUNT)

    summarizer_LexRank = LexRank()
    summary_LexRank = summarizer_LexRank(parser.document, SENTENCES_COUNT)

    summarizer_Edmundson = Edmundson(stemmer)
    summarizer_Edmundson.null_words = get_stop_words(LANGUAGE)
    summarizer_Edmundson.bonus_words = parser.significant_words
    summarizer_Edmundson.stigma_words = parser.stigma_words
    summary_Edmundson = summarizer_Edmundson(parser.document, SENTENCES_COUNT)

    # Print each summary, trimming off super short (likely a few word)
    # sentences, then calculate its ROUGE_N scores.
    summary_Lsa_trim = []
    for sentence in summary_Lsa:
        if len(sentence._text) > 20:
            print(sentence)
            summary_Lsa_trim.append(sentence)
    calc_value(summary_Lsa_trim, trim_ref_sentences)
    print('\n')

    summary_LexRank_trim = []
    for sentence in summary_LexRank:
        if len(sentence._text) > 20:
            print(sentence)
            summary_LexRank_trim.append(sentence)
    calc_value(summary_LexRank_trim, trim_ref_sentences)
    print('\n')

    summary_Edmundson_trim = []
    for sentence in summary_Edmundson:
        if len(sentence._text) > 20:
            print(sentence)
            summary_Edmundson_trim.append(sentence)
    calc_value(summary_Edmundson_trim, trim_ref_sentences)

    # max_r_value returns the index of the best model: 0=Ed, 1=Lsa, 2=Lex.
    models = {0: "Edmundson Model", 1: "Lsa Model", 2: "LexRank Model"}
    best_summary = max_r_value(summary_Lsa_trim, summary_LexRank_trim,
                               summary_Edmundson_trim, trim_ref_sentences)
    print(models.get(best_summary) +
          ' is the best model according to an average of the Rouge_3, 2 and 1 tests')

    # Return the summary of the best model.
    if best_summary == 0:
        return summary_Edmundson_trim
    elif best_summary == 1:
        return summary_Lsa_trim
    elif best_summary == 2:
        return summary_LexRank_trim
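# calc_value and max_r_value are not defined in the snippet above; a
# hypothetical sketch of what they might look like, built on sumy's
# rouge_n. max_r_value returns the index summary() expects:
# 0=Edmundson, 1=Lsa, 2=LexRank.
from sumy.evaluation import rouge_n

def calc_value(summary_sentences, reference_sentences):
    # Average the ROUGE-1, ROUGE-2 and ROUGE-3 scores.
    scores = [rouge_n(summary_sentences, reference_sentences, n=n)
              for n in (1, 2, 3)]
    average = sum(scores) / len(scores)
    print("ROUGE 1/2/3 average:", average)
    return average

def max_r_value(lsa, lex, edmundson, reference_sentences):
    averages = [calc_value(candidate, reference_sentences)
                for candidate in (edmundson, lsa, lex)]
    return averages.index(max(averages))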
def to_sentence(text):
    text = to_unicode(text).strip()
    return Sentence(text, Tokenizer("english"))
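# Minimal usage sketch, assuming to_unicode and sumy's Sentence/Tokenizer
# are importable in this module.
sentence = to_sentence("  Hello world.  ")
print(sentence)  # Hello world.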