def test_split_into_words(self):
    sentences1 = PlaintextParser.from_string("One, two two. Two. Three.", Tokenizer("english")).document.sentences
    self.assertEqual(["One", "two", "two", "Two", "Three"], _split_into_words(sentences1))

    sentences2 = PlaintextParser.from_string("two two. Two. Three.", Tokenizer("english")).document.sentences
    self.assertEqual(["two", "two", "Two", "Three"], _split_into_words(sentences2))
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''
    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif isinstance(text, io.IOBase):  # Python 3 has no builtin `file` type; assumes `import io`
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # join with the documented `sep` kwarg instead of a hard-coded newline
    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
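# A minimal usage sketch for summarize() above, assuming this module defines
# LANGUAGE and the usual sumy imports (PlaintextParser, Tokenizer, Stemmer,
# Summarizer, get_stop_words); the sample text is illustrative only.
sample = (
    "Sumy is a library for extractive summarization. "
    "It ships several algorithms. "
    "LSA is one of them."
)
print(summarize(sample, n_sentences=1, sep=' '))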
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # NotImplemented is a constant, not an exception; raise the error class
        raise NotImplementedError("Summary algorithm is not available")

    summarizer.stop_words = get_stop_words(self.LANGUAGE)

    return summarizer(parser.document, length)
def summarizeFile(inputFile):
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    url = findURLS(inputFile)
    urlContent = ''
    if url is not None:
        if url[-1] == '.':
            url = url[:-1]
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent += str(sentence) + '\n'
        except Exception:
            urlContent = ''

    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))

    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary += str(sentence) + '\n'
    except AssertionError:
        return None

    if url is not None:
        return summary + urlContent
    return summary
def get_summary(source_text, compression_factor):
    """
    Given some input source_text, returns its summary based on the
    chosen compression factor.
    """
    summary = {
        'source_text': source_text,
        'compression_factor': compression_factor,
        'summary': '',
        'success': False
    }

    parser = PlaintextParser.from_string(source_text, Tokenizer("english"))
    summ_algo = LexRankSummarizer()
    final_line_num = int(source_text.count('.') / compression_factor)

    try:
        raw_summary = summ_algo(parser.document, final_line_num)
        for sentence in raw_summary:
            summary['summary'] += str(sentence) + ' '
    except Exception:
        pass

    summary['success'] = (len(summary['summary']) != 0)
    return summary
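# Illustrative call for get_summary() above (the input text is made up): callers
# should check the 'success' flag before using the 'summary' field, since any
# summarizer failure is swallowed and reported through it.
result = get_summary("One sentence. Two sentences. Three sentences.", compression_factor=3)
if result['success']:
    print(result['summary'])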
def summarize(corpus, length, algorithm):
    summarizer = None
    summary = ("No compatible summarizer was selected, please use one of these: "
               "textrank, lexrank, luhn, edmundson*, kl, lsa, sumbasic, random "
               "(* doesn't work yet)")
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))

        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join(obj._text for obj in summarizer(parser.document, length))
        return summary
    except Exception as e:
        return str(e)
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))

    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # NotImplemented is a constant, not an exception; raise the error class
        raise NotImplementedError("Summary algorithm is not available")

    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summary = " ".join(obj._text for obj in summarizer(parser.document, length))
    return summary
def test_get_word_ngrams(self):
    sentences = PlaintextParser.from_string("This is a test.", Tokenizer("english")).document.sentences
    correct_ngrams = [("This", "is"), ("is", "a"), ("a", "test")]
    found_ngrams = _get_word_ngrams(2, sentences)
    for ngram in correct_ngrams:
        self.assertTrue(ngram in found_ngrams)
def summarize(string, summary_length=1, language="english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return ". ".join(str(sentence) for sentence in summarizer(parser.document, summary_length))
def sumrise(text, sentences=5):
    # the original default `text=text` referenced an undefined global; require the argument
    if validators.url(text):
        text = web2text.getwebtxt(text)
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = LsaSummarizer()
    # str() on the returned tuple of Sentence objects would give its repr; join the sentences
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, sentences))
    return summary
def summarize(self, extracted_refs, facet_results, max_length=250, mode='citance'):
    '''
    Summarizes the extracted references based on community detection

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
        mode(str) -- can be citance, reference
    '''
    citances = defaultdict(list)
    summarizer = LexRankSummarizer(Stemmer('english'))
    summary = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        citances[t[0]['topic']].append(
            {'refs': t[0]['sentence'],
             'citance': self.clean_citation(t[0]['citation_text'])})

    for topic, citance in citances.items():  # Python 3: items() replaces iteritems()
        # Create graph of citation similarities
        vectorizer = TfidfVectorizer(
            tokenizer=self.tokenize, min_df=1, max_df=len(citances) * .9)
        cit_vectors = vectorizer.fit_transform(
            [e['citance'] for e in citance]).toarray()
        cit_text = {i: v for i, v in enumerate(citance)}
        cit_dict = {i: v for i, v in enumerate(cit_vectors)}
        cits = []
        for e in cit_dict:  # vector (numpy array)
            for e1 in cit_dict:
                if e != e1:
                    simil = self.cossim(cit_dict[e], cit_dict[e1])
                    if simil > 0.1:
                        cits.append((e, e1, simil))

        G = nx.Graph()
        G.add_weighted_edges_from(cits)
        part = community.best_partition(G)
        clusters = defaultdict(list)
        tokenize = SentTokenizer(offsets=False)
        for k, v in part.items():
            clusters[v].extend(tokenize(citance[k]['refs']))

        # clusters includes ref sentences that belong in each cluster
        # Find the most salient sentence in each cluster
        sal_in_cluster = {}  # salient sentences for each cluster
        for i in clusters:
            parser = PlaintextParser.from_string(
                ' '.join(clusters[i]).replace('\\', ''), Tokenizer('english'))
            summ = summarizer(parser.document, 5)
            # 5 is the number of sentences returned by LexRank
            sal_in_cluster[i] = [str(s) for s in summ]  # Python 3: str replaces unicode
        # The most salient sentences in each cluster
        summary[topic.upper()] = \
            self.pick_from_cluster(
                sal_in_cluster, max_length, weighted=False)
    return summary
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence) + " "  # keep a space so sentences do not run together
    return total.strip()
def summary():
    max_sent = 10
    language = 'english'
    url = request.form['summary']
    tokenizer = Tokenizer(language)
    article = alt_extract(url)
    parser = PlaintextParser.from_string(article, tokenizer)
    summary = summarizer(parser, max_sent, language).decode('utf-8')
    return render_template('summary.html', url=url, summary=summary)
def getSummary(self, num_sentences):
    lex_rank = LexRankSummarizer()
    text = str(self.bpLargGetText())
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summary = lex_rank(parser.document, num_sentences)
    sentences = []
    for sent in summary:
        sentences.append(str(sent))
    return sentences
def summarize(self, extracted_refs, facet_results, max_length=250):
    '''
    Summarizes the extracted references based on the facet results

    Args:
        extracted_refs(list) -- results of the method.run (e.g. simple.py)
        facet_results(dict) -- facets for each extracted reference
            Look at data/task1b_results1.json
        max_length(int) -- maximum length of the summary
    '''
    summaries = defaultdict(lambda: defaultdict(list))
    for t in extracted_refs:
        topic = t[0]['topic']
        citance = t[0]['citance_number']
        if isinstance(t[0]['sentence'][0], list):
            logger.warning('Unexpected, should check')
        summaries[topic.upper()][
            facet_results[topic.upper()][str(citance)]['SVM_LABEL']
        ].append([t[0]['citation_text']])

    summarizer = TextRankSummarizer(Stemmer('english'))
    final_summ = defaultdict(lambda: defaultdict(dict))
    ret_summ = defaultdict(list)
    counts = defaultdict(lambda: defaultdict(dict))
    for t in summaries:
        for facet in summaries[t]:
            if len(summaries[t][facet]) > 1:
                summs = list(
                    itertools.chain.from_iterable(summaries[t][facet]))
                parser = PlaintextParser.from_string(
                    ' '.join(summs), Tokenizer('english'))
                summ = summarizer(parser.document, max_length)
                final_summ[t][facet] = [str(sent) for sent in summ]
                counts[t][facet] = len(final_summ[t][facet])
            else:
                final_summ[t][facet] = self.s_t(summaries[t][facet][0])

        i = 0
        while self.w_t.count_words(ret_summ[t]) < max_length:
            added = False
            for fct in final_summ[t]:
                if i < len(final_summ[t][fct]):
                    ret_summ[t].append(final_summ[t][fct][i])
                    added = True
            if not added:
                break  # every facet list is exhausted; avoid looping forever
            i += 1
        while self.w_t.count_words(ret_summ[t]) > max_length:
            ret_summ[t].pop()
    return ret_summ
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        str(sentence) for sentence in summarizer(parser.document, COUNT)
    )
    summary = Summary(content=content, summary=text)
    summary.save()
def summary(text, summarizer_class):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        yield sentence
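# summary() above is a generator, so nothing is summarized until it is consumed.
# A hedged usage sketch; LsaSummarizer stands in for any sumy summarizer class.
first_sentences = list(summary("Some long text. With several sentences.", LsaSummarizer))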
def summarize(filename, num_sentences):
    with open(filename, "r") as myfile:
        data = myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english'))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        # Python 3: decode back to str after stripping non-ASCII characters
        clean = str(sentence).encode('ascii', 'ignore').decode('ascii')
        summary += clean.replace('"', '').replace("'", '').strip() + " "
    return summary
def summarizeText(self, body, numSentences=10):
    """Summarizes body of text to numSentences"""
    parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    # str() already yields text in Python 3; no .decode('utf-8') needed
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def get_summary(self, text):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)  # summarize the document with 3 sentences
    result = ""
    for sentence in summary:
        result += " " + str(sentence)
    return result
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "
    return result
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
def test_real_example(self):
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 2)
    self.assertEqual(len(sentences), 2)
def summarize(text, size=2):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, size)
    summarize_text = ""
    for sentence in summary:
        summarize_text += str(sentence) + " "
    summarize_text = summarize_text.strip()
    return summarize_text
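# Quick usage sketch for summarize() above; the sample text is illustrative and
# size=1 asks LexRank for the single most central sentence.
text = ("LexRank builds a sentence similarity graph. "
        "It ranks sentences by centrality. "
        "The top-ranked sentences form the summary.")
print(summarize(text, size=1))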
def test_issue_5_sigma_can_multiply_matrix_v(self):
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    parser = PlaintextParser.from_string(
        load_resource("articles/sigma_can_multiply_matrix_v.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(english_stemmer)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    self.assertEqual(len(sentences), 20)
def get_summary(text, max_sentences=5):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    summary = []
    for sentence in summarizer(parser.document, max_sentences):
        # decode after encoding; str() of the bytes would yield "b'...'" in Python 3
        summary.append(sentence._text.encode('ascii', 'ignore').decode('ascii'))
    return summary
def get_quotes(raw_text):
    parser = PlaintextParser.from_string(clean_text(raw_text), Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentences.append(sentence)
    return sentences  # hand the collected quotes back to the caller
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
def __init__(self, content):
    sentence_length = '50%'
    parser = PlaintextParser.from_string(content, Tokenizer(self.LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summarized = summarizer(parser.document, sentence_length)
    for sentence in summarized:
        self.SUMMARY += "%s\n\n" % self._sentence(sentence)
    self.WORD_COUNT = self._word_counter(content)
    self.SUMMARY_COUNT = self._word_counter(self.SUMMARY)
def sum_spark(doc):
    parser = PlaintextParser.from_string(doc, Tokenizer('english'))
    summarizer = Summarizer(Stemmer('english'))
    summarizer.stop_words = stop_books
    texts = []
    for sentence in summarizer(parser.document, 2):
        texts.append(str(sentence))
    return texts
def summer_time():
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer

    text_format = entry.get('1.0', tk.END)
    # We can use this parse format for all three when we use raw strings
    parser_config = PlaintextParser.from_string(text_format, Tokenizer("english"))
    summerTime = SummerTime()
    summer_all = summerTime.lex_rank_analysis(parser_config, 2)
    summer_all = summer_all + summerTime.luhn_analysis(parser_config, 2)
    summer_all = summer_all + summerTime.lsa_analysis(parser_config, 2)

    scrubbed = []
    for sentence in summer_all:
        concat = str(sentence) + "\n\n\n"
        concat.replace("", "{")
        concat.replace("", "}")
        scrubbed.append(concat)

    output_display.insert(tk.END, scrubbed)
    print("\nAbout to print summer all results\n")
    print(summer_all)
def models_LUHN_LEX_LSA(article):
    ## Candidate models:
    # Bag of Words
    # FastText
    # word2vec
    # LDA (topic extraction)
    # skip-thoughts
    # doc2vec
    # LSTM
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))
    result = []

    summarizerLUHN = LUHN(stemmer)
    summarizerLUHN.stop_words = stop

    summarizerLEX = LEX(stemmer)
    summarizerLEX.stop_words = stop

    summarizerLSA = LSA(stemmer)
    summarizerLSA.stop_words = stop

    LUHNsentence = summarizerLUHN(parser.document, 1)  # summarize document with one sentence
    LEXsentence = summarizerLEX(parser.document, 1)    # summarize document with one sentence
    LSAsentence = summarizerLSA(parser.document, 1)    # summarize document with one sentence

    for sentence in LUHNsentence:
        LUHNsummary = sentence
    for sentence in LEXsentence:
        LEXsummary = sentence
    for sentence in LSAsentence:
        LSAsummary = sentence

    result.append(LUHNsummary)
    result.append(LEXsummary)
    result.append(LSAsummary)
    return result
def summarize(srt_file, summarizer, n_sentences, language, bonusWords, stigmaWords):
    # Convert the srt file to a plain text document and pass it to the Sumy
    # library (the text summarization library) functions.
    parser = PlaintextParser.from_string(srt_to_doc(srt_file), Tokenizer(language))

    if summarizer == 'ED':
        summarizer = EdmundsonSummarizer()
        with open(bonusWords, "r+") as f:
            bonus_wordsList = [x.strip() for x in f.readlines()]
        with open(stigmaWords, "r+") as f:
            stigma_wordsList = [x.strip() for x in f.readlines()]

        summarizer.bonus_words = bonus_wordsList
        summarizer.stigma_words = stigma_wordsList
        summarizer.null_words = get_stop_words(language)
    else:
        stemmer = Stemmer(language)
        summarizer = SUMMARIZERS[summarizer](stemmer)
        summarizer.stop_words = get_stop_words(language)

    ret = []
    summarizedSubtitles = []
    # The document is now summarized; iterate over the selected sentences
    for sentence in summarizer(parser.document, n_sentences):
        # Index of the sentence
        index = int(re.findall(r"\(([0-9]+)\)", str(sentence))[0])
        # Using the index we determine the subtitle to be selected
        item = srt_file[index]
        summarizedSubtitles.append(item)
        # add the selected subtitle to the result array
        ret.append(srt_item_to_range(item))

    return ret, summarizedSubtitles
def run_sumy(text, algo='KL', sent_count=6):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    stemmer = Stemmer("english")
    if algo == 'KL':
        summarizer = KLSummarizer(stemmer)
    elif algo == 'LexRank':
        summarizer = LexRankSummarizer(stemmer)
    else:
        raise ValueError("Unsupported algo: " + algo)  # avoid an unbound summarizer below
    summarizer.stop_words = get_stop_words("english")
    summary_list = summarizer(parser.document, sent_count)
    return summary_list
def prepare_data(self):
    # Remove short summaries with less than N words
    # Filter by most voted for each group
    # Load document vectors
    self.remove_short_reviews()
    self.lowercase_reviews()
    self.retrieve_top_docs()
    self.tokenize_reviews()

    testdf = pd.DataFrame()
    for topic in self.train_df['defect_topic'].unique():
        # Because we need to have legible sentences, we use raw comments instead of cleaned ones
        text = '\n'.join(self.train_df['comment'].loc[self.train_df['defect_topic'] == topic])
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        self.train_features.append([topic, parser])
        testdf = pd.concat([testdf, self.train_df['comment'].loc[self.train_df['defect_topic'] == topic]], sort=False)
    testdf.to_csv("./data/uc3/defects_summarizer/summary_data.csv")
def summarize_wth_sumy(sumy_summarizer, sentences, summary_len):
    """Summarize text using Sumy summarizer.

    Args:
        sumy_summarizer: sumy summarizer class such as LexRankSummarizer, LsaSummarizer
        sentences: list, list of sentences to summarize
        summary_len: int, number of sentences in summary
    Return:
        list of sentences
    """
    parser = PlaintextParser.from_string(" ".join(sentences), Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = sumy_summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sum_sentences = [
        sentence._text for sentence in summarizer(parser.document, summary_len)
    ]
    return sum_sentences
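# Example call for summarize_wth_sumy() above, assuming LANGUAGE and the sumy
# summarizer classes are already imported in this module; the sentences are made up.
sents = [
    "Graph methods rank sentences by centrality.",
    "Frequency methods count word occurrences.",
    "Both families are extractive.",
]
top = summarize_wth_sumy(LexRankSummarizer, sents, summary_len=1)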
def extract_components(bill_id, url):
    url = url + "/text"
    browser = RoboBrowser(user_agent='Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', history=True)
    browser.open(url)
    content = browser.find("section", class_="legis-body")
    if content is None:
        return []
    sections = content.find_all("section", class_="little-level")
    section_data = content.text
    parser = PlaintextParser.from_string(section_data, Tokenizer("english"))
    summarizer = LsaSummarizer()
    num_sentences = 10 if len(sections) > 10 else len(sections)
    summary = summarizer(parser.document, num_sentences)
    return list(set(summary))
def lexRank_summarize(original_text):
    import sumy
    """
    in a Python shell:
        import nltk
        nltk.download('punkt')
    """
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer

    myParser = PlaintextParser.from_string(original_text, Tokenizer('english'))

    # Creating a summary of 5 sentences.
    lexRank_summarizer = LexRankSummarizer()
    summary = lexRank_summarizer(myParser.document, sentences_count=5)
    for sentence in summary:
        print(sentence)
def TextRank_All(rsc_file, dst_file, count):
    with open(rsc_file, 'r', encoding='utf-8') as fr:
        docs = fr.read().split('###')[1:]
    with open(dst_file, 'w', encoding='utf-8') as fw:
        for doc_string in docs:
            if not doc_string:
                continue
            language = "chinese"
            parser = PlaintextParser.from_string(doc_string, Tokenizer(language))
            stemmer = Stemmer(language)  # language stemmer
            summarizer = TextRankSummarizer(stemmer)  # TextRank algorithm
            summarizer.stop_words = get_stop_words(language)
            for sentence in summarizer(parser.document, count):
                fw.write(str(sentence))
                fw.write('\n')
                print(sentence)
            print('===========================================\n')
            fw.write('===========================================')
def get_summary_lex_rank(self, num_sentence):
    from sumy.parsers.plaintext import PlaintextParser  # other parsers available for HTML etc.
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer  # we're choosing LexRank; other algorithms are also built in

    try:
        parser = HtmlParser.from_url(self.url, Tokenizer("english"))
    except Exception:
        # fall back to the stored body text when the URL cannot be fetched
        parser = PlaintextParser.from_string(self.body, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, num_sentence)
    out = ''
    for sentence in summary:
        out += str(sentence)
    return out
def main(req: func.HttpRequest) -> func.HttpResponse:
    ret = ""
    logging.info('Python HTTP trigger function processed a request.')
    text = str(req.get_body())
    soup = BeautifulSoup(text, features="lxml")
    souped = soup.get_text()
    # math.log raises ValueError for 0, so keep its argument at least 2
    SENTENCES_COUNT = int(math.log(max(souped.count('.'), 2), 4))
    parser = PlaintextParser.from_string(souped, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, max(SENTENCES_COUNT, 2)):
        ret += str(sentence)
    return func.HttpResponse(re.sub(r'\\\w{3}', '', ret))
def sumy_summarizer(text, ratio, summarizer_type):
    num_sent = int(len(text.split(".")) * ratio)
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer_type = summarizer_type.lower()  # accept any capitalization
    if summarizer_type == 'lexrank':
        summarizer_instance = LexRankSummarizer()
    elif summarizer_type == 'reduction':  # the original checked 'reduction' twice
        summarizer_instance = ReductionSummarizer()
    elif summarizer_type == 'lsa':
        summarizer_instance = LsaSummarizer()
    elif summarizer_type == 'luhn':
        summarizer_instance = LuhnSummarizer()
    elif summarizer_type == 'kl':
        summarizer_instance = KLSummarizer()
    else:
        raise ValueError("Unknown summarizer_type: " + summarizer_type)
    summary_values = summarizer_instance(parser.document, num_sent)
    final_summary = []
    for sent in summary_values:
        final_summary.append(str(sent))
    summary_values = convert_to_string(final_summary)
    return summary_values
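# Illustrative invocation of sumy_summarizer() above: a ratio of 0.5 keeps
# roughly half of the sentences (the text here is made up).
short = sumy_summarizer("One. Two. Three. Four.", ratio=0.5, summarizer_type="LexRank")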
def return_paper(url):
    try:
        response = requests.get(url)
        search = BeautifulSoup(response.text, "html.parser")
    except ConnectionError as e:
        return ['', '', '', url]  # keep the shape consistent with the success path

    title = ""
    content = ""
    year = ""
    try:
        title = search.find("div", {"class": "rprt_all"})
        title = title.find('h1')
        title = title.get_text()
    except AttributeError as e:
        pass
    try:
        year = search.find("div", {"class": "cit"})
        year = year.get_text()
        year = re.search(r'\d+', year).group()
        year = int(year)
    except AttributeError as e:
        pass
    try:
        content = search.find("div", {"class": "abstr"})
        content = content.find('p')
        content = content.get_text()
        parser = PlaintextParser.from_string(content, Tokenizer("english"))
        # Using LexRank
        summarizer = LexRankSummarizer()
        # Summarize the document with 2 sentences
        summary = summarizer(parser.document, 2)
        sum_abstract = " "
        for i in summary:
            sum_abstract += str(i)
        content = sum_abstract
    except AttributeError as e:
        pass
    return [title, content, year, url]
def summary_benchmarks(sentences_string):
    '''
    :param sentences_string: all sentences as one string, has been tokenized
    :return:
    '''
    parser = PlaintextParser.from_string(sentences_string, Tokenizer("english"))

    print('=========== Basic Sum ============')
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, 3)  # summarize the document with 3 sentences
    for sentence in summary:
        print(sentence)

    print('=========== LSA ============')
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== LexRank ============')
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== KL Divergence ============')
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== Luhn ============')
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)
def get_task():
    if not request.form:
        return 'Missing parameter'
    if 'language' not in request.form:
        return 'Missing language'
    if 'text' not in request.form:
        return 'Missing text'
    language = request.form.get("language")
    text = request.form.get('text')
    if language == 'english':
        abstract = summarizer.summarize(text)
        if abstract.strip() == '':
            return jsonify({'abstract': text})
        return jsonify({'abstract': abstract})
    elif language == 'chinese':
        tr4s = TextRank4Sentence()
        tr4s.analyze(text=text, lower=True, source='all_filters')
        if len(tr4s.get_key_sentences()) < 3:
            return jsonify({'abstract': text})
        abstract = ''
        for item in tr4s.get_key_sentences(num=3):
            abstract += item.sentence + '\n'
        return jsonify({'abstract': abstract.strip()})
    elif language == 'japanese':
        parser = PlaintextParser.from_string(text, Tokenizer(language))
        stemmer = Stemmer(language)
        summar = LsaSummarizer(stemmer)
        summar.stop_words = get_stop_words(language)
        sentences = summar(parser.document, 3)  # summarize once, reuse the result
        if len(sentences) < 3:
            return jsonify({'abstract': text})
        abstract = ''
        for sentence in sentences:
            abstract += str(sentence) + '\n'
        return jsonify({'abstract': abstract.strip()})
    return language + ' is not supported.'
def test_parse_plaintext():
    parser = PlaintextParser.from_string("""
        Ako sa máš? Ja dobre! A ty?
        No mohlo to byť aj lepšie!!! Ale pohodička.

        TOTO JE AKOŽE NADPIS
        A toto je text pod ním, ktorý je textový.
        A tak ďalej...
    """, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 0
    assert len(document.paragraphs[0].sentences) == 5

    assert len(document.paragraphs[1].headings) == 1
    assert len(document.paragraphs[1].sentences) == 2
def get_summary(text):
    total_string = ""
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = int(text.count(".") * 0.2 + 1)

    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total_string += str(sentence) + " "  # keep a space so sentences do not run together
    return total_string.strip()
def summarize(self):
    txt = self.input_text.toPlainText()
    print(txt)
    parser = None
    # Testing summarizer and checking for errors
    try:
        parser = PlaintextParser.from_string(txt, Tokenizer("english"))
        print("summarizing")
        summarizer = LexRankSummarizer()
        print("summarizing")
        summary = summarizer(parser.document, 4)
        self.output_text.clear()
        for sentence in summary:
            print(sentence)
            self.output_text.insertPlainText(str(sentence))
    except Exception as err:
        print("ERROR: " + str(err))
    finally:
        print("finally")
def get_summary_scores(sentences, summarizer):
    text = '<SENTENCE>'.join(sentences)
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    try:
        rating_dict = summarizer.rate_sentences(parser.document)
    except AttributeError:
        # not every sumy summarizer exposes rate_sentences; fall back to its internal ratings
        rating_dict = summarizer._compute_ratings(parser.document.sentences)
    ratings = np.array([rating_dict[s] for s in parser.document.sentences])
    m = np.min(ratings)
    M = np.max(ratings)
    if m == M:
        ratings = np.ones(len(ratings))
    else:
        ratings = (ratings - m) / (M - m)  # min-max normalize to [0, 1]
    summ_sentences = [str(s) for s in parser.document.sentences]
    return summ_sentences, ratings
def get_info(path):
    with open(path, 'r') as f:
        text = f.read()
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain strings
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))  # PlaintextParser.from_file for files
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SUMMARY", "\n")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    # print(keywords.keywords(text))  # original keywords
    # keywordlist1 = keywords.keywords(text, split=True)  # list of keywords
def sumy_fun(text, lang, method):
    sumy_method_dict = {'lsa': lsa, 'edm': edm, 'luhn': luhn, 'lex': lex}
    if 'en' in lang:
        lang = 'en'
    lang_dict = {'fr': 'french', 'es': 'spanish', 'de': 'german', 'en': 'english'}
    Summarizer = sumy_method_dict[method]
    summary = ''
    LANGUAGE = lang_dict[lang]
    nb_sentence = len(text.split('.'))
    SENTENCES_COUNT = int(nb_sentence * 0.50)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary = summary + ' ' + str(sentence)
    return summary
def backlog_sumy(jenni, input):
    """Parses the backlog of the current channel and creates a summary with sumy"""
    backlog_length = 100
    summary_length = 5
    summarize_type = "sum-basic"
    channel = input.sender
    nick = input.nick
    cmds = input.group().split()
    if len(cmds) > 1 and cmds[1].isdigit():
        backlog_length = int(cmds[1])
    if len(cmds) > 2 and cmds[2].isdigit():
        summary_length = int(cmds[2])
    if len(cmds) > 3:
        summarize_type = cmds[3]
    jenni.say(summarize_type)

    # Backlog is only logged for channels
    if not channel.startswith("#"):
        return

    backlog = read_backlog(jenni, channel, backlog_length)
    backlog_str = "\n".join(backlog)

    # Get summary
    parser = PlaintextParser.from_string(backlog_str, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    # Allow selection of summarizer: pick the method whose name matches the request
    summarizer_class = next(cls for name, cls in AVAILABLE_METHODS.items()
                            if name == summarize_type)
    summarizer = summarizer_class(stemmer)
    if summarizer_class is EdmundsonSummarizer:
        summarizer.null_words = get_stop_words(LANGUAGE)
    else:
        summarizer.stop_words = get_stop_words(LANGUAGE)

    jenni.say("Summary:")
    for sentence in summarizer(parser.document, summary_length):
        jenni.say(str(sentence))
def load_docsets(duc_dir):
    docset_paths = [
        os.path.join(duc_dir, fname) for fname in os.listdir(duc_dir)
    ]
    docset_paths = [path for path in docset_paths if os.path.isdir(path)]
    docsets = {}
    for docset_path in docset_paths:
        print("\n" + docset_path)
        text = load_docset(docset_path)
        textDoc = []
        for dom in text:
            for sentence in dom.sentences[1:]:
                textDoc.append(str(sentence))  # Python 3: str replaces __unicode__()
        x = ' '.join(textDoc)
        x = re.sub("''", '', x)
        x = re.sub('``', '', x)
        x = re.sub('<SLUG>', '', x)
        x = re.sub('</SLUG>', '', x)
        y = PlaintextParser.from_string(x, Tokenizer(LANGUAGE))
        summary = summarizer(y.document, SENTENCES_COUNT)
        folder_name = docset_path.split('/')[-1]
        names = folder_name[:-1] + '.M.250.' + folder_name[-1]
        paths = [
            name + char for name, char in
            zip([names] * 10, ['.A', '.B', '.C', '.D', '.E', '.F', '.G', '.H', '.I', '.J'])
        ]
        for path in paths:
            try:
                groundTruth = PlaintextParser.from_file(
                    GtPath + path, Tokenizer(LANGUAGE))
                res.append(rouge_1(summary, groundTruth.document.sentences))
                print(res[-1])
            except Exception:
                # not every docset has all ten reference summaries
                pass
def _getGenericSummaryText(self, desiredWordCount):
    # Notice that in this function, the Sentence object is internal to the Sumy library.
    # In the rest of this project, the Sentence is the local type. That is also why we use
    # corpus.getSentenceIdByText to get the possible ID of the sentence being used. Since
    # the Sumy library may be using a different sentence tokenizer, the sentence
    # segmentation may not match the one we used, so we may not actually find the ID of
    # the sentence in the corpus.getSentenceIdByText function.
    summarizer, language = self._initGenericSummarizer()
    parser = SumyPlaintextParser.from_string(self.corpus.getAllText(), SumyTokenizer(language))
    # We must pass a sentence count to Sumy, so we divide the specified word count by 5
    # (integer division) to make sure we have enough sentences:
    potentialSummarySentences = summarizer(parser.document, desiredWordCount // 5)
    numWordsInSummary = 0
    finalSummary = []
    for sentence in potentialSummarySentences:
        if isPotentialSummarySentence(sentence._text, len(sentence.words)):
            finalSummary.append(sentence._text)
            numWordsInSummary += len(sentence.words)
            self.usedSentences[self.corpus.getSentenceIdByText(sentence._text)] = True
            if numWordsInSummary >= desiredWordCount:
                break
    return ' '.join(finalSummary)
def get_summary(self, num_sentence):
    out = ''
    try:
        try:
            parser = HtmlParser.from_url(self.url, Tokenizer("english"))
        except Exception:
            # fall back to the stored body text when the URL cannot be fetched
            parser = PlaintextParser.from_string(
                self.body, Tokenizer("english"))
        stemmer = Stemmer('english')
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words('english')
        for sentence in summarizer(parser.document, num_sentence):
            out += str(sentence)
    except Exception:
        return self.body
    return out
def test_real_example(self):
    parser = PlaintextParser.from_string(
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
        "Přerostly až v reparát z jazyka na konci školního roku. "
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
        "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
        "o rok mladších dětí budoval vedoucí pozici. "
        "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
        Tokenizer("czech"))
    summarizer = LuhnSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("czech")

    returned = summarizer(parser.document, 2)
    self.assertEqual(len(returned), 2)
    self.assertEqual(
        to_unicode(returned[0]),
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
    )
    self.assertEqual(
        to_unicode(returned[1]),
        "Připadal si, že je mezi malými dětmi a realizoval se tím, "
        "že si ve třídě o rok mladších dětí budoval vedoucí pozici.")
def summarize(string, summarizer_type):
    """
    Function takes in a string and a summarizer type as input.
    Output is a summary based on the chosen model and the sentence length
    """
    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # from text
    parser = PlaintextParser.from_string(string, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("concatenated\\0bTrkuk4ReA2ysnhY2BaYs.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = summarizers[summarizer_type](stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ''
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + ' '
    return result
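# A hedged sketch of the module-level `summarizers` registry assumed by
# summarize() above; the keys and entries in the original module are unknown,
# these are illustrative.
summarizers = {
    'lsa': LsaSummarizer,
    'lexrank': LexRankSummarizer,
    'luhn': LuhnSummarizer,
}
# summary = summarize(article_text, 'lexrank')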
def clean_article(article):
    """ """
    cleaned = {
        '_id': article['_id'],
        'title': article['title'].encode('ascii', errors='ignore') if article['title'] else None,
        'issue': article['issue'],
        'link': article['link']
    }
    tokens = word_tokenize(article['body'])
    # words = [word for word in tokens if word.isalpha()]
    long_string = ' '.join(tokens)
    parser = PlaintextParser.from_string(long_string, Tokenizer(LANGUAGE))
    for index, sentence in enumerate(
            summarizer(parser.document, SENTENCES_COUNT)):
        cleaned.update({
            'sentence_' + str(index):
                sentence._text.encode('ascii', errors='ignore')
        })
    for key in range(SENTENCES_COUNT):
        nonsense_words = 0
        if cleaned.get('sentence_' + str(key)):
            for word in cleaned['sentence_' + str(key)].split():
                # This .split() is very important
                if not CHECKER.check(word):
                    nonsense_words += 1
            cleaned['nonsense_words' + "_" + str(key)] = nonsense_words
    return cleaned
def summarize(data):
    # use sumy to summarize the text entered
    summarizer = LexRankSummarizer()
    parser = PlaintextParser.from_string(data["text"], Tokenizer("english"))
    summary = summarizer(parser.document, data["lines"])

    # formatted summary
    text = "\t"

    # extract and format the text from the summarizer
    counter = 0
    for line in summary:
        if counter >= 5:
            text += "<br><br>\t"
            counter = 0
        text += str(line) + " "
        counter += 1

    # respond to the user with the summarized text
    emit("response", text)
def summarise(self, filename):
    """
    Generates a summary of the paper.
    :param filename: the name of the file to summarise
    :return: a summary of the paper.
    """
    paper = self.prepare_paper(filename)
    parser = PlaintextParser.from_string(paper, Tokenizer("english"))
    summary = self.summariser(parser.document, self.summary_length)
    # The "1" is only added here to stop the summary breaking the save function - it's a bit of an ungainly hack
    summary = [(str(x), 1) for x in summary]  # Python 3: str replaces unicode
    useful_functions.write_summary(SUMMARY_WRITE_LOC, summary, filename.strip(".txt"))
    for sentence in summary:
        print(sentence)
        print()