def klsum(parser, sentence_count):
    # `language` is expected to be defined at module level.
    summarizer_6 = KLSummarizer(Stemmer(language))
    summarizer_6.stop_words = get_stop_words(language)
    summary_6 = summarizer_6(parser.document, sentence_count)
    temp = ''
    for sentence in summary_6:
        temp = temp + str(sentence)
    return temp
def kl_summarizer(text, stemmer, language, sentences_count):
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    summarizer_kl = KLSummarizer(stemmer)
    summarizer_kl.stop_words = get_stop_words(language)
    sentences = []
    for sentence in summarizer_kl(parser.document, sentences_count):
        sentences.append(str(sentence))
    return "\n".join(sentences)
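# A minimal usage sketch for kl_summarizer above; the sample text and the
# two-sentence budget are illustrative, not taken from the original code.
from sumy.nlp.stemmers import Stemmer

sample_text = (
    "Automatic summarization condenses a document into a few sentences. "
    "Extractive methods select sentences that already appear in the text. "
    "KL-Sum picks the sentences whose word distribution stays closest to the source document. "
    "Abstractive methods generate new sentences instead."
)
print(kl_summarizer(sample_text, Stemmer("english"), "english", 2))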
def __init__():
    # `text` and the summarizer aliases (Lsa, Luhn, LexRank, TxtRank, SumBasic, KL)
    # are expected to be defined at module level.
    LANGUAGE = "english"
    SENTENCES_COUNT = 1
    stemmer = Stemmer(LANGUAGE)

    lsaSummarizer = Lsa(stemmer)
    lsaSummarizer.stop_words = get_stop_words(LANGUAGE)
    luhnSummarizer = Luhn(stemmer)
    luhnSummarizer.stop_words = get_stop_words(LANGUAGE)
    # edmundsonSummarizer.bonus_words = get_bonus_words
    lexrankSummarizer = LexRank(stemmer)
    lexrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    textrankSummarizer = TxtRank(stemmer)
    textrankSummarizer.stop_words = get_stop_words(LANGUAGE)
    sumbasicSummarizer = SumBasic(stemmer)
    sumbasicSummarizer.stop_words = get_stop_words(LANGUAGE)
    klSummarizer = KL(stemmer)
    klSummarizer.stop_words = get_stop_words(LANGUAGE)

    parser = HtmlParser.from_string(text, 0, Tokenizer(LANGUAGE))
    allvariations = []
    for sentence in lsaSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via LSA: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in luhnSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via Luhn: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in lexrankSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via Lexrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in textrankSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via Textrank: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in sumbasicSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via Sumbasic: ")
        print(str(sentence))
        allvariations.append(sentence)
    for sentence in klSummarizer(parser.document, SENTENCES_COUNT):
        # print("Summarizing text via klSum: ")
        print(str(sentence))
        allvariations.append(sentence)
    return allvariations
def test_tf_idf_metric_should_be_real_number():
    """https://github.com/miso-belica/sumy/issues/41"""
    summarizer = KLSummarizer()
    frequencies = summarizer.compute_tf([Sentence("There are five words, jop.", Tokenizer("english"))])

    assert frequencies == {
        "there": 0.2,
        "are": 0.2,
        "five": 0.2,
        "words": 0.2,
        "jop": 0.2,
    }
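# Note on the expected values above: compute_tf normalizes each word's count by the
# total number of words in the sentence, and the test sentence has five distinct
# words that each occur once, so every frequency is 1 / 5 = 0.2.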
def run_sumy(text, algo='KL', sent_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    stemmer = Stemmer("english")
    # Only 'KL' and 'LexRank' are handled; any other value leaves `summarizer` unbound.
    if algo == 'KL':
        summarizer = KLSummarizer(stemmer)
    elif algo == 'LexRank':
        summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    summary_list = summarizer(parser.document, sent_count)
    return summary_list
def klReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = KLSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # print(sentence._text)
        sentencesList.append(sentence._text)
    return sentencesList
def KL(rsc_file, dst_file, count):
    language = "chinese"
    parser = PlaintextParser.from_file(rsc_file, Tokenizer(language), encoding='utf-8')
    stemmer = Stemmer(language)  # language-specific stemmer
    summarizer = KLSummarizer(stemmer)  # KL-Sum algorithm
    summarizer.stop_words = get_stop_words(language)
    with open(dst_file, 'w', encoding='utf-8') as f:
        for sentence in summarizer(parser.document, count):
            f.write(str(sentence))
            f.write('\n')
            print(sentence)
def createSummary(text, language="english", num_sentences=3, method="lexrank"):
    # LANGUAGE = "english"
    # SENTENCES_COUNT = 5
    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

    # Language tokenizer
    tokenizer = Tokenizer(language)
    parser = PlaintextParser.from_string(text, tokenizer)

    # word stemming
    stemmer = Stemmer(language)

    if method == "lexrank":
        summarizer = LexRankSummarizer(stemmer)
    elif method == "lsa":
        summarizer = LSASummarizer(stemmer)
    elif method == "luhn":
        summarizer = LuhnSummarizer(stemmer)
    elif method == "kl":
        summarizer = KLSummarizer(stemmer)
    else:
        raise Exception(f'Unknown summarization method: {method}')

    summarizer.stop_words = get_stop_words(language)

    result = []
    for sentence in summarizer(parser.document, num_sentences):
        result.append(str(sentence))
    return result
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summary = " ".join([obj._text for obj in summarizer(parser.document, length)])
    return summary
def find_relevant_quote(book_id, chapter, num_sentences=1, technique='luhn'):
    """
    Create an extractive summary for a chapter of the book.

    Parameters:
        book_id: (str) the book identifier
        chapter: the chapter number to summarize
        num_sentences: how many sentences to extract
        technique: which sumy summarizer to use

    Returns:
        sentences: the extracted sentences
    """
    chapter_filename = get_data_filename(book_id, 'book_chapters', chapter)
    parser = PlaintextParser.from_file(chapter_filename, Tokenizer("english"))
    if technique == 'lsa':
        summarizer = LsaSummarizer()
    elif technique == 'lexrank':
        summarizer = LexRankSummarizer()
    elif technique == 'textrank':
        summarizer = TextRankSummarizer()
    elif technique == 'kl':
        summarizer = KLSummarizer()
    elif technique == 'random':
        summarizer = RandomSummarizer()
    elif technique == 'reduction':
        summarizer = ReductionSummarizer()
    elif technique == 'sumbasic':
        summarizer = SumBasicSummarizer()
    else:
        summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return summary
def KLSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, sentences)
    # for sentence in summary:
    #     print(sentence)
    return summary
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        raise NotImplementedError("Summary algorithm is not available")
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
def klsum_summarize(original_text, s):
    global cnt
    cnt += 1
    if cnt % 100 == 0:
        print(cnt)

    from sumy.summarizers.kl import KLSummarizer
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.plaintext import PlaintextParser

    parser = PlaintextParser.from_string(original_text, Tokenizer('english'))
    kl_summarizer = KLSummarizer()
    kl_summary = kl_summarizer(parser.document, sentences_count=s)

    # Concatenate the summary sentences
    res = ""
    for sentence in kl_summary:
        res += str(sentence) + '.' + "\n"
    return res
def summarize_terms_text(txt):
    text_data = unidecode.unidecode(txt)
    clean_list, pure_list = prepare_for_regex(text_data)

    data_to_summarize = []
    for clean, pure in zip(clean_list, pure_list):
        if re.findall(clause, clean):
            data_to_summarize.append(pure)

    text_data = " ".join(data_to_summarize)
    parser = PlaintextParser(text_data, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = KLSummarizer(stemmer)
    summary = summarizer(parser.document, SENTENCES_COUNT)
    if len(summary) == 0:
        summary = ["No Terms"]
    sentences = [str(x) for x in summary]

    message = HTML_OPEN + "<ul class='rolldown-list' id='myList'>"
    you_agree = []
    they_agree = []
    other_clause = []
    for sentence in sentences:
        # TODO: logging in the future
        lower = sentence.lower()
        you_idx = lower.find("you")
        they_idx = lower.find("we")
        if (you_idx == -1 or you_idx > 15) and (they_idx == -1 or they_idx > 15):
            other_clause.append(sentence)
        elif you_idx == -1:
            they_agree.append(sentence)
        elif they_idx == -1:
            you_agree.append(sentence)
        elif you_idx < they_idx:
            you_agree.append(sentence)
        else:
            they_agree.append(sentence)

    if len(you_agree) > 0:
        message += YOU_AGREE_HEADER + "<li>"
        message += "</li><li>".join(you_agree)
        message += "</li>"
    if len(they_agree) > 0:
        message += THEY_AGREE_HEADER + "<li>"
        message += "</li><li>".join(they_agree)
        message += "</li>"
    if len(other_clause) > 0:
        message += OTHER_HEADER + "<li>"
        message += "</li><li>".join(other_clause)
        message += "</li>"
    message += "</ul></div>"
    return json.dumps(message)
def summarize_url(url, summarizer):
    # E.G. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing ', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def KLSummary(document, sentences):
    parser = PlaintextParser.from_string(document, Tokenizer("english"))
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, sentences)
    results = []
    for sentence in summary:
        results.append(str(sentence))
    return results
def __init__(self):
    """
    Oracle summariser is not an actual, usable summariser. It extracts the best
    possible sentences from the paper by comparing them to the gold summaries.
    It represents the high-water mark in the ROUGE score it is possible for a
    summariser to achieve.
    """
    self.summary_length = 10
    self.summariser = KLSummarizer()
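# The docstring above describes an oracle: choose the sentences that score best
# against the gold summaries. A minimal, self-contained sketch of that idea,
# using greedy unigram-F1 overlap as a rough stand-in for ROUGE; the helper
# names and the scoring choice are assumptions, not part of the original code.
from collections import Counter


def _unigram_f1(candidate, gold):
    # Unigram-overlap F1 between a candidate summary and the gold summary.
    cand_counts = Counter(candidate.lower().split())
    gold_counts = Counter(gold.lower().split())
    overlap = sum((cand_counts & gold_counts).values())
    if overlap == 0:
        return 0.0
    precision = overlap / sum(cand_counts.values())
    recall = overlap / sum(gold_counts.values())
    return 2 * precision * recall / (precision + recall)


def oracle_sentences(paper_sentences, gold_summary, summary_length=10):
    # Greedily add the sentence that most improves overlap with the gold summary.
    chosen = []
    remaining = list(paper_sentences)
    while remaining and len(chosen) < summary_length:
        best = max(remaining, key=lambda s: _unigram_f1(" ".join(chosen + [s]), gold_summary))
        chosen.append(best)
        remaining.remove(best)
    return chosen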
def apply_text_model(self, text: str) -> list:
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    init_sents = text2sentences(parser._text)
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, self.summary_n_sent)
    summary_ind = []
    for summary_sent in summary:
        summary_ind.append(init_sents.index(str(summary_sent)))
    return summary_ind
def generate_benchmark_summary(filename, num_summary):
    parser = PlaintextParser.from_file(
        'data/text_summary/' + filename + '.txt', Tokenizer("english"))

    print('=========== Basic Sum ============')
    Basic_Sum_sentences = []
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, num_summary)  # Summarize the document with num_summary sentences
    for sentence in summary:
        print(sentence)
        Basic_Sum_sentences.append(str(sentence))

    print('=========== LSA ============')
    LSA_sentences = []
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        LSA_sentences.append(str(sentence))

    print('=========== LexRank ============')
    LexRank_sentences = []
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        LexRank_sentences.append(str(sentence))

    print('=========== KL Divergence ============')
    KL_sentences = []
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        KL_sentences.append(str(sentence))

    print('=========== Luhn ============')
    Luhn_sentences = []
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, num_summary)
    for sentence in summary:
        print(sentence)
        Luhn_sentences.append(str(sentence))

    return Basic_Sum_sentences, LSA_sentences, LexRank_sentences, KL_sentences, Luhn_sentences
def run_sumy(text, algo='KL', sent_count=6):
    # time0 = time.time()
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    # time1 = time.time()
    stemmer = Stemmer("english")
    # time2 = time.time()
    # Only 'KL' and 'LexRank' are handled; any other value leaves `summarizer` unbound.
    if algo == 'KL':
        summarizer = KLSummarizer(stemmer)
    elif algo == 'LexRank':
        summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    # time3 = time.time()
    summary_list = summarizer(parser.document, sent_count)
    # time4 = time.time()
    # print('Parse time: {} \t Stem time: {} \t Stop words time: {} \t Summarizer time: {}'.format(time1, time2, time3, time4))
    return summary_list
def train(self):
    """
    Initialise the model.

    Sets self.model to a KLSummarizer instance; takes no arguments and
    returns nothing.
    """
    self.model = KLSummarizer()
def get_summarizer():
    if EXTRACTIVE_SUMMARIZATION_ALGO == 'luhn':
        return LuhnSummarizer()
    elif EXTRACTIVE_SUMMARIZATION_ALGO == 'kl':
        return KLSummarizer()
    elif EXTRACTIVE_SUMMARIZATION_ALGO == 'lsa':
        return LsaSummarizer()
    elif EXTRACTIVE_SUMMARIZATION_ALGO == 'textrank':
        return TextRankSummarizer()
    elif EXTRACTIVE_SUMMARIZATION_ALGO == 'lexrank':
        return LexRankSummarizer()
    # Any other value of EXTRACTIVE_SUMMARIZATION_ALGO falls through and returns None.
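# A hedged usage sketch for get_summarizer above; the sample text and the
# three-sentence budget are illustrative, and EXTRACTIVE_SUMMARIZATION_ALGO is
# assumed to be set to one of the handled values.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

parser = PlaintextParser.from_string("Text of the article to be condensed.", Tokenizer("english"))
for sentence in get_summarizer()(parser.document, 3):
    print(sentence)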
def __init__(self, language='english'):
    stemmer = Stemmer(language)
    self.language = language
    self.algorithms = {
        'kl': KLSummarizer(stemmer),
        'lex_rank': LexRankSummarizer(stemmer),
        'lsa': LsaSummarizer(stemmer),
        'text_rank': TextRankSummarizer(stemmer),
    }
    for alg in self.algorithms:
        self.algorithms[alg].stop_words = get_stop_words(language)
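# A minimal sketch of how the algorithm table above might be used; the summarize
# method name and its arguments are assumptions, not part of the original class,
# and PlaintextParser/Tokenizer are assumed to be imported from sumy as in the
# other snippets.
def summarize(self, text, algorithm='kl', sentences_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer(self.language))
    return [str(sentence) for sentence in self.algorithms[algorithm](parser.document, sentences_count)]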
def summarizeKL(text):
    parser_string = PlaintextParser.from_string(text, Tokenizer('english'))

    from sumy.summarizers.kl import KLSummarizer
    print('\nKL')
    summarizer = KLSummarizer()

    # Summarize the document with 2 sentences
    sentences = summarizer(parser_string.document, 2)

    # print('\nSummarizing text from string:')
    summary = ''
    for sent in sentences:
        summary += str(sent)
        summary += '\n'
    return summary
def summarizeKL():
    # Relies on module-level `parser_string` and `parser_file` parsers.
    from sumy.summarizers.kl import KLSummarizer
    print('\nKL')
    summarizer = KLSummarizer()

    # Summarize the document with 2 sentences
    summary = summarizer(parser_string.document, 2)
    print('\nSummarizing text from string:')
    for sent in summary:
        print(sent)

    print('\nSummarizing text from .txt file:')
    summary = summarizer(parser_file.document, 2)
    for sent in summary:
        print(sent)
def evaluate_summary(file_name, input_dir, sent_count, lingua_franca_summary, show_summaries):
    method_name = inspect.stack()[0][3]
    try:
        process_logger.debug("in " + method_name + " method")
        file_model_summary = open(input_dir + file_name + ".model", "r")
        model_summary = file_model_summary.read()

        rouge_scores_dict = {}
        rouge_scores = rouge_evaluation(lingua_franca_summary, model_summary)
        rouge_scores_dict[">>LINGUA FRANCA"] = rouge_scores

        file_summary = open("Test System Summary/" + file_name + "-" + "LINGUA FRANCA" + ".txt", "w")
        file_summary.write(lingua_franca_summary)

        LANGUAGE = "english"
        parser = PlaintextParser.from_file(input_dir + file_name + ".txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)

        lsa_summarizer = LsaSummarizer(stemmer)
        rouge_scores = sumy_summarizers("LSA", lsa_summarizer, parser.document, sent_count,
                                        model_summary, show_summaries, file_name)
        rouge_scores_dict["LSA"] = rouge_scores

        lex_summarizer = LexRankSummarizer(stemmer)
        rouge_scores = sumy_summarizers("LEX RANK", lex_summarizer, parser.document, sent_count,
                                        model_summary, show_summaries, file_name)
        rouge_scores_dict["LEX RANK"] = rouge_scores

        luhn_summarizer = LuhnSummarizer(stemmer)
        rouge_scores = sumy_summarizers("LUHN", luhn_summarizer, parser.document, sent_count,
                                        model_summary, show_summaries, file_name)
        rouge_scores_dict["LUHN"] = rouge_scores

        text_rank_summarizer = TextRankSummarizer(stemmer)
        rouge_scores = sumy_summarizers("TEXT RANK", text_rank_summarizer, parser.document, sent_count,
                                        model_summary, show_summaries, file_name)
        rouge_scores_dict["TEXT RANK"] = rouge_scores

        sum_basic_summarizer = SumBasicSummarizer(stemmer)
        rouge_scores = sumy_summarizers("SUM BASIC", sum_basic_summarizer, parser.document, sent_count,
                                        model_summary, show_summaries, file_name)
        rouge_scores_dict["SUM BASIC"] = rouge_scores

        kl_summarizer = KLSummarizer(stemmer)
        rouge_scores = sumy_summarizers("KL SUM", kl_summarizer, parser.document, sent_count,
                                        model_summary, show_summaries, file_name)
        rouge_scores_dict["KL SUM"] = rouge_scores

        # score_reader(rouge_scores_dict)
        df_rouge, summarizer_list = process_rouge_scores(rouge_scores_dict)
        return df_rouge, summarizer_list
    except Exception as Ex:
        error_logger.error("Exception occurred in " + method_name + "| Exception:" + str(Ex))
        return None
def dada_summarize(content: str, title: str = "") -> dict:
    response = dict()
    content = process_content(content)
    title = process_title(title)

    # textrank [needs newlines to split sentences]
    response["textrank"] = summarize(content)

    # textteaser [needs newlines to split sentences]
    cnt = int(len(content.split('\n')) * 0.3)
    response['textteaser'] = "\n".join(tt.summarize(title, content, count=cnt))

    ### sumy
    parser = PlaintextParser.from_string(content, tokenizer)

    # LSA
    summarizer = LsaSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['lsa'] = "\n".join(sentences)

    # textrank2
    summarizer = TextRankSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['textrank2'] = "\n".join(sentences)

    # lexrank
    summarizer = LexRankSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['lexrank'] = "\n".join(sentences)

    # reduction
    summarizer = ReductionSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['reduction'] = "\n".join(sentences)

    # kl-sum
    summarizer = KLSummarizer(stemmer)
    sentences = [str(i) for i in summarizer(parser.document, SENTENCES_COUNT)]
    response['kl-sum'] = "\n".join(sentences)

    # bert
    response['bert'] = model(content, ratio=0.4)

    return response
def get_summarizers(self, names):
    """Retrieves sumy summarizer algorithms

    Parameters:
        names (list): list of summarizer algorithm names

    Returns:
        dict: summarizers
    """
    summarizers = {}
    for name in names:
        if name == "random":
            from sumy.summarizers.random import RandomSummarizer
            summarizers["random"] = RandomSummarizer(null_stemmer)
        elif name == "luhn":
            from sumy.summarizers.luhn import LuhnSummarizer
            summarizers["luhn"] = LuhnSummarizer(stemmer=null_stemmer)
        elif name == "lsa":
            from sumy.summarizers.lsa import LsaSummarizer
            summarizers["lsa"] = LsaSummarizer(stemmer=null_stemmer)
        elif name == "lexrank":
            from sumy.summarizers.lex_rank import LexRankSummarizer
            summarizers["lexrank"] = LexRankSummarizer(null_stemmer)
        elif name == "textrank":
            from sumy.summarizers.text_rank import TextRankSummarizer
            summarizers["textrank"] = TextRankSummarizer(null_stemmer)
        elif name == "sumbasic":
            from sumy.summarizers.sum_basic import SumBasicSummarizer
            summarizers["sumbasic"] = SumBasicSummarizer(null_stemmer)
        elif name == "kl-sum":
            from sumy.summarizers.kl import KLSummarizer
            summarizers["kl-sum"] = KLSummarizer(null_stemmer)
        elif name == "reduction":
            from sumy.summarizers.reduction import ReductionSummarizer
            summarizers["reduction"] = ReductionSummarizer(null_stemmer)

    for _, summarizer in summarizers.items():
        summarizer.stop_words = frozenset(
            self.stop_words._get_stop_words(custom_stop_words=[]))

    return summarizers
def sumy_summarizer(text, ratio, summarizer_type):
    num_sent = int(len(text.split(".")) * ratio)
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    # Unhandled summarizer_type values leave `summarizer_instance` unbound.
    if (summarizer_type == 'lexrank') or (summarizer_type == 'Lexrank'):
        summarizer_instance = LexRankSummarizer()
    elif (summarizer_type == 'reduction') or (summarizer_type == 'Reduction'):
        summarizer_instance = ReductionSummarizer()
    elif (summarizer_type == 'lsa') or (summarizer_type == 'LSA'):
        summarizer_instance = LsaSummarizer()
    elif (summarizer_type == 'luhn') or (summarizer_type == 'Luhn'):
        summarizer_instance = LuhnSummarizer()
    elif (summarizer_type == 'KL') or (summarizer_type == 'kl'):
        summarizer_instance = KLSummarizer()
    summary_values = summarizer_instance(parser.document, num_sent)
    final_summary = []
    for sent in summary_values:
        final_summary.append(str(sent))
    summary_values = convert_to_string(final_summary)
    return summary_values
def summary_benchmarks(sentences_string):
    '''
    :param sentences_string: all sentences as one string, already tokenized
    :return: None (prints the benchmark summaries)
    '''
    parser = PlaintextParser.from_string(sentences_string, Tokenizer("english"))

    print('=========== Basic Sum ============')
    summarizer = SumBasicSummarizer()
    summary = summarizer(parser.document, 3)  # Summarize the document with 3 sentences
    for sentence in summary:
        print(sentence)

    print('=========== LSA ============')
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== LexRank ============')
    summarizer = LexRankSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== KL Divergence ============')
    summarizer = KLSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)

    print('=========== Luhn ============')
    summarizer = LuhnSummarizer()
    summary = summarizer(parser.document, 3)
    for sentence in summary:
        print(sentence)
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import sys

LANGUAGE = "english"
SENTENCES_COUNT = int(sys.argv[2])
text_file = sys.argv[1]

if __name__ == "__main__":
    parser = PlaintextParser.from_file(text_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = KLSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
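# Invoked from the command line as, for example, `python kl_summary.py document.txt 3`
# to print a three-sentence KL-Sum summary of document.txt; the script filename here
# is hypothetical, only the two positional arguments (input file, sentence count)
# come from the code above.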
def summarizer(stop_words):
    summarizer = KLSummarizer()
    summarizer.stop_words = stop_words
    return summarizer