def getSummarizedList(sqs):
    output = ""
    # Directory checking
    if not os.path.exists(Dir):
        os.makedirs(Dir)
    try:
        summary = open(Dir + "input.txt", "w", encoding='utf-8-sig')
        file = open(Dir + "headline_summary.txt", "w", encoding='utf-8-sig')
    except OSError:  # the original caught `error_to_catch`, an undefined name
        print("!")

    date = ""
    # Filtering data
    for i in sqs:
        title = i.title.rstrip()
        pub_date = dateReformat(i.pub_date)
        # Creating a new per-date dataset
        if pub_date != date:
            if date != "":
                local_summary.close()
                # Redirect print() output into the headline summary file (never restored)
                sys.stdout = file
                # LexRankSummarizer does not work if the number of sentences exceeds ~25:
                #summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
                summarizer = LsaSummarizer(Stemmer(LANGUAGE))
                summarizer.stop_words = get_stop_words(LANGUAGE)
                headline = PlaintextParser.from_file(Dir + date + ".txt", Tokenizer(LANGUAGE))
                for sentence in summarizer(headline.document, SENTENCES_COUNT):
                    print(sentence)
            output = output + pub_date + "\n"
            date = pub_date
            local_summary = open(Dir + date + ".txt", "w", encoding='utf-8-sig')

        local_summary.write(title + ".\n")
        output = output + title + ".\n"

        # Summarize the file of the last post as well, since no newer date will close it
        if title == sqs.latest('pub_date').title.rstrip():
            local_summary.close()
            sys.stdout = file
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)
            headline = PlaintextParser.from_file(Dir + date + ".txt", Tokenizer(LANGUAGE))
            for sentence in summarizer(headline.document, SENTENCES_COUNT):
                print(sentence)

    summary.write(output)
    file.close()
    summary.close()
    testing = readSummarizerResultToList("headline_summary.txt")
    return testing
def summarizeEdmundsonTitle(self, text, SENTENCES_COUNT, LANGUAGE):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    nullWords = get_stop_words(LANGUAGE)
    summarizer = EdmundsonTitleMethod(stemmer, nullWords)
    # (The original also set summarizer.stop_words, which EdmundsonTitleMethod ignores;
    # its stop words are the null_words passed to the constructor.)
    summaryList = summarizer(parser.document, SENTENCES_COUNT)
    # Join with a space; joining on '' glued the sentences together
    summary = ' '.join(str(sentence) for sentence in summaryList)
    return summary
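# A minimal usage sketch of the same pipeline as summarizeEdmundsonTitle above,
# with the (unshown) host class stripped away; the sample text is invented for
# illustration. EdmundsonTitleMethod rates sentences by overlap with detected
# document headings, so it only helps on texts where PlaintextParser finds a
# heading (here, the first line without terminal punctuation).
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.edmundson_title import EdmundsonTitleMethod
from sumy.utils import get_stop_words

example_text = (
    "Automatic Summarization\n\n"
    "Automatic summarization shortens a text while keeping its key points. "
    "Extractive methods select existing sentences from the source. "
    "Abstractive methods generate entirely new sentences."
)
example_parser = PlaintextParser.from_string(example_text, Tokenizer("english"))
title_method = EdmundsonTitleMethod(Stemmer("english"), get_stop_words("english"))
for sentence in title_method(example_parser.document, 1):
    print(sentence)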
def Summarize_Content_Custom(Audio_Text, sentences_count, Summarize_Method):
    # Summarize to at most half of the sentences in the source text
    actual_sentences_count = int(len(sent_tokenize(Audio_Text)) * 0.5)
    parser = PlaintextParser.from_string(Audio_Text, Tokenizer("english"))
    stemmer = Stemmer("english")

    if Summarize_Method == "Gensim":
        # ratio: defines the length of the summary as a proportion of the text
        temp = summarize(Audio_Text, ratio=0.5)
        sen = Counter(sent_tokenize(temp))
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LexRankSummarizer":
        # LexRank: graph-based sentence ranking built on sentence similarity
        summarizer_Lex = LexRankSummarizer(stemmer)
        summarizer_Lex.stop_words = get_stop_words("english")
        summary = summarizer_Lex(parser.document, actual_sentences_count)
        sen = Counter(summary)
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LuhnSummarizer":
        # Luhn: sentence scoring based on the frequency of the most important words
        summarizer_luhn = LuhnSummarizer(stemmer)
        summarizer_luhn.stop_words = get_stop_words("english")
        summary_1 = summarizer_luhn(parser.document, actual_sentences_count)
        sen = Counter(summary_1)
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "LsaSummarizer":
        # LSA: latent semantic analysis over the term-sentence matrix
        summarizer_lsa2 = LsaSummarizer(stemmer)
        summarizer_lsa2.stop_words = get_stop_words("english")
        summary = summarizer_lsa2(parser.document, actual_sentences_count)
        sen = Counter(summary)
        for value in sen.most_common(sentences_count):
            print(value[0])
    elif Summarize_Method == "TextRankSummarizer":
        # TextRank: PageRank-style ranking over the sentence graph
        summarizer_text = TextRankSummarizer(stemmer)
        summarizer_text.stop_words = get_stop_words("english")
        summary = summarizer_text(parser.document, actual_sentences_count)
        sen = Counter(summary)
        for value in sen.most_common(sentences_count):
            print(value[0])
def summarize(srt_file, summarizer, n_sentences, language, bonusWords, stigmaWords):
    # Convert the srt file to a plain text document and pass it to the Sumy
    # text-summarization library.
    parser = PlaintextParser.from_string(srt_to_doc(srt_file), Tokenizer(language))

    if summarizer == 'ED':
        summarizer = EdmundsonSummarizer()
        with open(bonusWords, "r+") as f:
            bonus_wordsList = [x.strip() for x in f.readlines()]
        with open(stigmaWords, "r+") as f:
            stigma_wordsList = [x.strip() for x in f.readlines()]

        summarizer.bonus_words = bonus_wordsList
        summarizer.stigma_words = stigma_wordsList
        summarizer.null_words = get_stop_words(language)
    else:
        stemmer = Stemmer(language)
        summarizer = SUMMARIZERS[summarizer](stemmer)
        summarizer.stop_words = get_stop_words(language)

    ret = []
    summarizedSubtitles = []
    # Each selected sentence carries its original subtitle index in the
    # form "(N)", which is recovered with a regex.
    for sentence in summarizer(parser.document, n_sentences):
        # Index of the sentence
        index = int(re.findall(r"\(([0-9]+)\)", str(sentence))[0])
        # Using the index, determine the subtitle to be selected
        item = srt_file[index]
        # Add the selected subtitle to the result array
        summarizedSubtitles.append(item)
        ret.append(srt_item_to_range(item))

    return ret, summarizedSubtitles
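# The SUMMARIZERS registry used in the else-branch above is not shown in this
# file. A plausible shape, consistent with how it is called (short code ->
# sumy summarizer class taking a stemmer), is sketched below; the exact keys
# are assumptions, not the original definition.
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

SUMMARIZERS = {
    'LSA': LsaSummarizer,
    'LR': LexRankSummarizer,
    'LUHN': LuhnSummarizer,
    'TR': TextRankSummarizer,
}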
def summarize_url(url, summarizer):
    # E.g. url = "http://www.cnn.com/2016/06/12/politics/hillary-clinton-bernie-sanders-meeting-tuesday/index.html"
    print('Summarizing', url)
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if summarizer == 'luhn':
        summarizer = LuhnSummarizer(stemmer)
    elif summarizer == 'edmundson':
        summarizer = ESummarizer(stemmer)
    elif summarizer == 'lsa':
        summarizer = LsaSummarizer(stemmer)
    elif summarizer == 'lex':
        summarizer = LexSummarizer(stemmer)
    elif summarizer == 'text':
        summarizer = TextSummarizer(stemmer)
    elif summarizer == 'sb':
        summarizer = SumBasicSummarizer(stemmer)
    else:
        summarizer = KLSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(summarizer)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return sentences
def summarize(text):
    if isvalid(text):
        # Avoid that an all-capital-letter text yields empty output:
        # lower it first and re-upper the summary at the end.
        all_capital = False
        if text.upper() == text:
            text = text.lower()
            all_capital = True

        if PY2:
            parser = PlaintextParser.from_string(
                text.decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))
        else:
            parser = PlaintextParser.from_string(
                text.encode().decode('ascii', errors='ignore'), Tokenizer(LANGUAGE))

        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = [
            str(s) for s in summarizer(parser.document, sentences_count=n_sentences)
        ]

        if all_capital:
            output_sentences = ' '.join(sentences).upper()
            all_capital = False
        else:
            output_sentences = ' '.join(sentences)

        return output_sentences
    else:
        return ''
def node_page():
    nid = request.args.get('id')
    KDB = client.kg_scrapy
    items = KDB.kg_content.find_one({'_id': nid})
    if items is None:
        return "No content"  # translated; the original returned "没有内容"
    else:
        LANGUAGE = "chinese"
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        # Longer documents get a slightly longer summary
        if len(items['content']) > 500:
            SENTENCES_COUNT = 5
        else:
            SENTENCES_COUNT = 3

        parser = PlaintextParser.from_string(items['content'], Tokenizer(LANGUAGE))
        summary = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            summary.append(str(sentence))

        titles = []
        titles_p = DB.pre_titles.find({"parent": items['_id']})
        for item in titles_p:
            irank, grade, softmax = get_rank(item['title'])
            item['rank'] = irank
            item['softmax'] = softmax
            item['grade'] = grade
            titles.append(item)
        return render_template("node.html", **locals())
def summarizer(request):
    inp = request.POST['geturl']
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    url = str(inp)

    # Persist the requested URL (debug file name kept from the original)
    with open("denemedosyasiU3.txt", "w") as f:
        f.write(url)

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Collect all sentences; the original passed only the last loop variable
    # to the template, which rendered a single sentence.
    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        sentences.append(str(sentence))
    return render(request, 'home.html', {'data1': sentences})
def summarizer(input_obj, SENTENCES_COUNT=2, op='url'):
    LANGUAGE = "english"
    # E.g. input_obj = {'link': "https://sea.pcmag.com/smartphones/17424/apple-iphone-x"}
    parser = None
    if op == 'text':
        text = input_obj['text']
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif op == 'url':
        url = input_obj['link']
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    else:
        print('OP ERROR')

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentences.append(str(sentence))
    return sentences
def summarize(corpus, length, algorithm):
    summarizer = None
    summary = ("No compatible summarizer was selected, please use one of these: "
               "textrank, lexrank, luhn, edmundson*, kl, lsa, sumbasic, random "
               "(* doesn't work yet)")
    algorithm = algorithm.lower()
    try:
        parser = PlaintextParser.from_string(corpus, Tokenizer(LANGUAGE))
        if algorithm == "textrank":
            summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lexrank":
            summarizer = LexRankSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "luhn":
            summarizer = LuhnSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "edmundson":
            summarizer = EdmundsonSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "kl":
            summarizer = KLSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "lsa":
            summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "sumbasic":
            summarizer = SumBasicSummarizer(Stemmer(LANGUAGE))
        elif algorithm == "random":
            summarizer = RandomSummarizer(Stemmer(LANGUAGE))

        if summarizer:
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summary = " ".join(obj._text for obj in summarizer(parser.document, length))
        return summary
    except Exception as e:
        return str(e)
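# Example call for summarize() above (a sketch: it assumes the module-level
# LANGUAGE constant and the summarizer imports are in place; the sample text
# is invented).
example_corpus = ("The quick brown fox jumps over the lazy dog. "
                  "Foxes are small omnivorous mammals. "
                  "Dogs have been domesticated for millennia.")
print(summarize(example_corpus, 1, "lexrank"))  # one LexRank-selected sentence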
def pdfToText(BookPAth, bookid):
    pdfContent = ""
    summaryText = ""
    MEDIA = ROOT_MEDIA + str(BookPAth)
    pdfFileObj = open(MEDIA, 'rb')
    PdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    totalPage = PdfReader.numPages

    # Extract the text of every page
    for page_number in range(totalPage):
        page = PdfReader.getPage(page_number)
        pdfContent += page.extractText()

    with open(RESULT_ROOT + "BookText" + bookid + ".txt", "w") as f:
        f.write(pdfContent)

    LANGUAGE = "czech"
    SENTENCES_COUNT = 50
    parser = PlaintextParser.from_file(
        RESULT_ROOT + "BookText" + bookid + ".txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summaryText += str(sentence)

    with open(RESULT_ROOT + "SummaryText" + bookid + ".txt", "w") as f:
        f.write(summaryText)
    return pdfContent
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''
    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif isinstance(text, io.IOBase):  # the built-in `file` type existed only in Python 2
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    # Honor the sep kwarg; the original hard-coded '\n'
    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
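# Usage sketch for summarize() above, exercising both accepted input types
# (assumes the module-level LANGUAGE constant and that document.txt exists).
one_liner = summarize("First sentence. Second sentence. Third sentence.", 1)
with open("document.txt") as fh:
    from_file = summarize(fh, 2, sep=' ')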
def summarize_article(article, vibe_description_file_path):
    try:
        article_url = article['alternate'][0]['href']
        article_title = article['title']

        article_newspaper = Article(article_url)
        article_newspaper.download()
        article_newspaper.parse()
        article_newspaper.nlp()

        text_content = article_newspaper.text
        update_json_file(vibe_description_file_path, 'textContent', text_content)

        LANGUAGE = 'english'
        parser = HtmlParser.from_url(article_url, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        article_summary = []
        for sentence in summarizer(parser.document, 3):
            article_summary.append(sentence._text)
    except Exception:  # narrowed from a bare except
        print('Error summarizing article')
        return False

    update_json_file(vibe_description_file_path, 'summary', article_summary)
    update_json_file(vibe_description_file_path, 'keywords', article_newspaper.keywords)
    return True
def main(url, num_sentences=10, language='english'):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    for sentence in summarizer(parser.document, num_sentences):
        print(sentence)
def summarize_with_info(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
        summarizer.bonus_words = parser.significant_words
        summarizer.stigma_words = parser.stigma_words
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # was `raise NotImplemented(...)`, which raises the wrong object
        raise NotImplementedError("Summary algorithm is not available")

    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    return summarizer(parser.document, length)
def summarize(self, corpus, length, algorithm):
    parser = PlaintextParser.from_string(corpus, Tokenizer(self.LANGUAGE))
    if algorithm == "textrank":
        summarizer = TextRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lexrank":
        summarizer = LexRankSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "luhn":
        summarizer = LuhnSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "edmundson":
        summarizer = EdmundsonSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "kl":
        summarizer = KLSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "lsa":
        summarizer = LsaSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "sumbasic":
        summarizer = SumBasicSummarizer(Stemmer(self.LANGUAGE))
    elif algorithm == "random":
        summarizer = RandomSummarizer(Stemmer(self.LANGUAGE))
    else:
        # was `raise NotImplemented(...)`, which raises the wrong object
        raise NotImplementedError("Summary algorithm is not available")

    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    summary = " ".join(obj._text for obj in summarizer(parser.document, length))
    return summary
def SumySummarize(text):
    from sumy.parsers.html import HtmlParser
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    LANGUAGE = "english"
    SENTENCES_COUNT = 3

    import nltk
    nltk.download('punkt')

    # url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    s = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        s += str(sentence)
    return s
def get_summary(self, summary_length: int = 10) -> Iterator[str]:
    parser = HtmlParser.from_url(self.link, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, summary_length):
        # Yield str to match the Iterator[str] annotation; the original
        # yielded sumy Sentence objects.
        yield str(sentence)
def summarizeFile(inputFile):
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    # If the file mentions a URL, summarize that page as well
    url = findURLS(inputFile)
    urlContent = ''
    if url is not None:
        if url[-1] == '.':
            url = url[:-1]
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent += str(sentence) + '\n'
        except Exception:
            urlContent = ''

    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary += str(sentence) + '\n'
    except AssertionError:
        return None

    if url is not None:
        return summary + urlContent
    return summary
def compute(self, text):
    text = text.replace("\t", " ").replace("\f", " ").replace("\n", " ").strip()
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summ = " ".join(
        sentence._text
        for sentence in summarizer(parser.document, self.limit_sentences)
    )

    # Extract keywords from the summary if there is one, else from the raw text
    if len(summ) > 0:
        keywords = self.keyword_extractor.extract_keywords(summ)
    else:
        keywords = self.keyword_extractor.extract_keywords(text)
    keywords = [kw[0] for kw in keywords]

    metadata = {
        "keywords": keywords,
        "summary_orig": summ,
        "summary": self.fix_summary(summ),
    }
    return metadata
def get_sumy(text):
    """Get a summary of the text."""  # docstring translated from Chinese ("获取摘要")
    LANGUAGE = "chinese"
    SENTENCES_COUNT = 5
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))

    l = []
    items = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        l.append(str(sentence))
    return l, items
def edmunson(self, text):
    # (comments translated from German)
    # Choose the language
    language = "german"
    # Read the percentage from the slider
    divident = 100 / self.scale.get()
    # Tokenize the text and attach a stemmer to the summarizer
    parser = PlaintextParser.from_string(text, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    # Define the specific word lists.
    # The bonus, stigma and null words are not meant to be used,
    # but the summarizer does not accept empty input, so dummy words are passed.
    summarizer.stop_words = get_stop_words(language)
    summarizer.bonus_words = ["nsdgdf"]
    summarizer.stigma_words = ["mtrtf"]
    summarizer.null_words = ["zngg"]

    # Count the sentences of the document (the original counted them by
    # running the summarizer with a huge sentence budget)
    count = len(parser.document.sentences)
    # Derive the number of summary sentences from the percentage
    sentence_number = round(count / divident)

    # Join the selected sentences into one text
    summary = ""
    for sentence in summarizer(parser.document, sentence_number):
        summary += " " + str(sentence)
    return summary
def filter_stop(words):
    # sumy's get_stop_words expects a lowercase language name ('CZECH' raises
    # LookupError); also hoist the lookup out of the loop.
    stop_words = get_stop_words('czech')
    return [w for w in words if w not in stop_words]
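# Quick check for filter_stop(); "a" and "je" are Czech function words that
# should be covered by sumy's Czech stop-word list, leaving only content words.
print(filter_stop(["a", "vlak", "je", "rychlý"]))  # expected: ['vlak', 'rychlý']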
def post(self):
    """Extract summary (key sentences) from text"""
    data = request.json
    text = data['text']
    num_sentences = data['num_sentences']
    num_sentences = num_sentences if isinstance(num_sentences, int) else DEFAULT_NUM_SENTENCES
    log.debug('num_sentences={}'.format(num_sentences))

    # TODO: check for minimum number of sentences in text?
    summary_sentences = []
    if text:
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = TextRankSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        summary = summarizer(parser.document, num_sentences)
        summary_sentences = [sentence._text for sentence in summary]

    log.debug('response body:\n{}'.format(summary_sentences))
    return summary_sentences, 200, {'Access-Control-Allow-Origin': '*'}
def __init__(self, modelfn=None, classnames=None, language="english",
             explainer=None, summarizer=None, fm=962, topfeaturescount=100,
             sentencescount=6, logger=None):
    self.fm = fm
    self.modelfn = modelfn
    self.classnames = classnames
    self.topfeaturescount = topfeaturescount
    self.language = language
    self.sentencescount = sentencescount

    if explainer is not None:
        self.explainer = explainer
    else:
        self.explainer = lime_text.LimeTextExplainer(class_names=self.classnames)

    if summarizer is not None:
        self.summarizer = summarizer
    else:
        self.summarizer = TextRankSummarizer(Stemmer(self.language))
        self.summarizer.stop_words = get_stop_words(self.language)

    if logger is not None:
        self.log = logger
    else:
        self.log = logging.getLogger()
def get_sumy(
    sentences_count: int = 10, body: str = "", url: Optional[str] = None
) -> str:
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lsa import LsaSummarizer as Summarizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words

    if url is None:
        from sumy.parsers.plaintext import PlaintextParser as Parser
        # was `item = (body)`, which is not a tuple and would unpack the
        # string character by character below
        item = (body,)
    else:
        from sumy.parsers.html import HtmlParser as Parser
        item = (body, url)
        DBG(f'Sumy HTML, url: {url}')

    tokenizer = Tokenizer(LANGUAGE)
    parser = Parser.from_string(*item, tokenizer)
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = summarizer(parser.document, sentences_count)
    return ' '.join(str(sentence) for sentence in summary)
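# Usage sketch for get_sumy(); LANGUAGE and DBG are module-level dependencies
# assumed to exist (e.g. LANGUAGE = "english" and DBG = print). The HTML
# string and URL are invented for illustration.
plain_summary = get_sumy(sentences_count=2, body="Some long article text. " * 40)
html = "<html><body><p>First point. Second point. Third point.</p></body></html>"
html_summary = get_sumy(sentences_count=1, body=html, url="https://example.com/post")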
def models_LUHN_LEX_LSA_2(dataframe):
    LANGUAGE = "english"
    stop = get_stop_words(LANGUAGE)
    size = len(dataframe)
    stemmer = Stemmer(LANGUAGE)

    for i in range(size):
        article = dataframe.loc[i, "post_content"]
        parser = PlaintextParser.from_string(article, Tokenizer(LANGUAGE))

        summarizerLUHN = LUHN(stemmer)
        summarizerLUHN.stop_words = stop
        summarizerLEX = LEX(stemmer)
        summarizerLEX.stop_words = stop
        summarizerLSA = LSA(stemmer)
        summarizerLSA.stop_words = stop

        # Summarize the document with one sentence per model
        LUHNsentence = summarizerLUHN(parser.document, 1)
        LEXsentence = summarizerLEX(parser.document, 1)
        LSAsentence = summarizerLSA(parser.document, 1)

        for sentence1 in LUHNsentence:
            LUHNsummary = sentence1
        for sentence2 in LEXsentence:
            LEXsummary = sentence2
        for sentence3 in LSAsentence:
            LSAsummary = sentence3

        dataframe.loc[i, "LUHN"] = LUHNsummary
        dataframe.loc[i, "LEX"] = LEXsummary
        dataframe.loc[i, "LSA"] = LSAsummary
def store_summary(self):
    for item in self.doc_id_url:
        if item < len(self.document_info):
            s = requests.Session()
            response = s.get(self.doc_id_url[item])
            if response.status_code != 404:
                parser = HtmlParser.from_url(self.doc_id_url[item], Tokenizer("english"))
                stemmer = Stemmer("english")
                summarizer = Summarizer(stemmer)
                summarizer.stop_words = get_stop_words("english")
                for sentence in summarizer(parser.document, 5):
                    print(sentence)
                    # Accumulate as str; concatenating sumy Sentence objects
                    # with + is not supported
                    if item in self.summary:
                        self.summary[item] = self.summary[item] + str(sentence)
                    else:
                        self.summary[item] = str(sentence)
def get_sum():
    new_list, news_text = get_news_link_content()
    LANGUAGE = "english"
    SENTENCES_COUNT = 4

    # or for a URL / plain text file:
    # parser = HtmlParser.from_url("https://en.wikipedia.org/wiki/Automatic_summarization", Tokenizer(LANGUAGE))
    # parser = PlaintextParser.from_file("xixi.txt", Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sum_newss = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sum_newss += str(sentence)
    return sum_newss
def print_news(url, content='title'):
    g = Goose()
    article = g.extract(url=url)

    # If a meta description is available, print that instead of summarizing
    if content == 'full' and article.meta_description:
        print(article.meta_description)
        return

    news_text = article.cleaned_text
    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    if content == 'title' or content == 'full':
        # Print the article title; str() over encoded bytes would print
        # a b'...' artifact in Python 3
        print('\t* ' + article.title)
    if content == 'full':
        # Print an n-sentence summary
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
    return
def find_summary_stopwords():
    p = PlaintextParser.from_file("testtext.txt", Tokenizer("english"))
    s_lsa = LsaSummarizer(Stemmer("english"))
    s_lsa.stop_words = get_stop_words("english")
    for s in s_lsa(p.document, 2):
        print(s)
def sum_from_string(string, language="english", sentences_count=100):
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    # was `summarizer.stem_words = ...`, a typo that left the stop words unset
    summarizer.stop_words = get_stop_words(language)
    sentences = summarizer(parser.document, sentences_count)
    return sentences
def createSummary(text, language="english", num_sentences=3, method="lexrank"):
    # Language tokenizer
    tokenizer = Tokenizer(language)
    parser = PlaintextParser.from_string(text, tokenizer)

    # Word stemming
    stemmer = Stemmer(language)

    if method == "lexrank":
        summarizer = LexRankSummarizer(stemmer)
    elif method == "lsa":
        summarizer = LSASummarizer(stemmer)
    elif method == "luhn":
        summarizer = LuhnSummarizer(stemmer)
    elif method == "kl":
        summarizer = KLSummarizer(stemmer)
    else:
        # was f'... ${method}', a JavaScript-style placeholder inside an f-string
        raise Exception(f'Unknown summarization method: {method}')

    summarizer.stop_words = get_stop_words(language)

    result = []
    for sentence in summarizer(parser.document, num_sentences):
        result.append(str(sentence))
    return result
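# Usage sketch for createSummary() with two of the supported methods;
# the sample text is invented.
doc = ("Sumy implements several extractive summarizers. "
       "LexRank builds a similarity graph over the sentences. "
       "Luhn scores sentences by the density of significant words.")
print(createSummary(doc, num_sentences=1, method="lexrank"))
print(createSummary(doc, num_sentences=1, method="luhn"))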
def get_data_list(URL, file_type=""):
    SUMMARY_SENTENCES_COUNT = 5
    sentences = []
    try:
        LANGUAGE = "english"
        if file_type == "txt":
            parser = HtmlParser.from_string(URL, None, Tokenizer(LANGUAGE))
        elif file_type == "pdf":
            content = read_pdf(URL)
            parser = HtmlParser.from_string(content, None, Tokenizer(LANGUAGE))
        else:
            parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))

        document = parser.document
        stemmer = Stemmer(LANGUAGE)

        from sumy.summarizers.luhn import LuhnSummarizer
        LHS = LuhnSummarizer(stemmer)
        LHS.stop_words = get_stop_words(LANGUAGE)

        print("\nSummary using Luhn Summarizer")
        print("*******************************")
        for sentence in LHS(document, SUMMARY_SENTENCES_COUNT):
            sentences.append(str(sentence))
    except Exception as e:
        print(str(e))
    finally:
        return sentences
def summarize_news(news):
    """
    Parameters
    ----------
    news : list of str
        List of news data to be summarized.

    Returns
    -------
    parsed : list
        The most important sentence from each given article.
    """
    LANGUAGE = "english"
    SENTENCES_COUNT = 1

    parsed = []
    for data in news:
        parser = PlaintextParser.from_string(data, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            parsed.append(sentence)
    return parsed
def summarize(string, summary_length=1, language="english"):
    # Lowercase all-caps input so tokenization does not yield empty output
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return ". ".join(str(sentence) for sentence in summarizer(parser.document, summary_length))
def luhn(self, text_parser):
    assert isinstance(text_parser, plaintext.PlaintextParser)
    # A stemmer could be passed instead: Luhn(EnglishStemmer())
    summarizer = Luhn()
    summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
    return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
def summarize(url):
    summary = []
    parser = HtmlParser.from_url(url, Tokenizer(lang))
    stemmer = Stemmer(lang)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(lang)
    for sentence in summarizer(parser.document, sent):
        summary.append(sentence._text)
    return ' '.join(summary)
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence)
    return total
def lsa(comment, parser, num):
    # `stemmer` and LANGUAGE are expected to be defined at module level
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    LSAstr = ''
    for sentence in summarizer(parser.document, num):
        LSAstr += str(sentence)
    return LSAstr
def retreive_sumy(url):
    # e.g. "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files:
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return summarizer(parser.document, SENTENCES_COUNT)
def summarize(filename, num_sentences):
    with open(filename, "r") as myfile:
        data = myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english'))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        # Strip non-ASCII characters and quotes from each sentence
        # (decode back to str; the original mixed Python 2 bytes/str handling)
        clean = str(sentence).encode('ascii', 'ignore').decode()
        summary += clean.replace('"', '').replace("'", '').strip() + " "
    return summary
def summary(text, summarizer_class):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
        yield sentence
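# summary() above is a generator: nothing runs until it is consumed. A usage
# sketch (assumes the module-level LANGUAGE and SENTENCES_COUNT constants):
from sumy.summarizers.lex_rank import LexRankSummarizer
selected = list(summary("One fact. Another fact. A third fact.", LexRankSummarizer))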
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        str(sentence) for sentence in summarizer(parser.document, COUNT)
    )
    summary = Summary(content=content, summary=text)
    summary.save()
def summary(self, int1, int2):
    # int1, int2 delimit the slice of the corpus (as a string)
    # from which the summary is taken
    parser = PlaintextParser(self.corpus[int1:int2], Tokenizer("english"))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    self.summary_text = " ".join(
        map(lambda x: x._text, summarizer(parser.document, 20)))
    return self.summary_text
def summarizeText(self, body, numSentences=10):
    """Summarizes body of text to numSentences"""
    parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    # The original's str(sentence).decode('utf-8') was Python 2;
    # str is already unicode in Python 3
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def summary(text):
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    short = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # Render each sentence as an ASCII-only Markdown quote bullet
        # (encode/decode replaces the Python 2 .decode('ascii', 'ignore'))
        clean = str(sentence).encode('ascii', 'ignore').decode()
        short = short + ">" + "* " + clean + "\n\n"
    return short
def test_real_example(self):
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 2)
    self.assertEqual(len(sentences), 2)
def summarizeUrl(self, url, numSentences=10):
    """Summarizes text at a given url to numSentences"""
    parser = HtmlParser.from_url(url, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    # As in summarizeText, the Python 2 .decode('utf-8') is unnecessary in Python 3
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def summarize(parser, sentences_count):
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = ""
    for sentence in summarizer(parser.document, sentences_count):
        sentences += " " + str(sentence)
    return sentences
def test_issue_5_sigma_can_multiply_matrix_v(self):
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    parser = PlaintextParser.from_string(
        load_resource("articles/sigma_can_multiply_matrix_v.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(english_stemmer)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    self.assertEqual(len(sentences), 20)
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "
    return result
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
def lsa(self, text_parser):
    assert isinstance(text_parser, plaintext.PlaintextParser)
    # Process the text; a stemmer could be passed instead: LSA(EnglishStemmer())
    summarizer = LSA()
    # Stop words have to be specified explicitly
    summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
    return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
def get_summary(text, max_sentences=5):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    summary = []
    for sentence in summarizer(parser.document, max_sentences):
        # Keep the ASCII-only text of the sentence; str() over the encoded
        # bytes would produce "b'...'" artifacts in Python 3
        summary.append(sentence._text.encode('ascii', 'ignore').decode())
    return summary
def get_lexrank(tweets):
    sens = [Sentence(t, TwokenizeWrapper()) for t in tweets]
    tweet_document = ObjectDocumentModel([Paragraph(sens)])
    LANGUAGE = "english"
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    SENTENCES_COUNT = len(sens)
    lex_ranked = summarizer(tweet_document, SENTENCES_COUNT)
    if len(sens) != len(lex_ranked):
        print('lr error')
    # Map each input sentence to its position in the summarizer output;
    # the original indexed the tuple with a Sentence object (`lex_ranked[s]`),
    # which raises TypeError.
    return [lex_ranked.index(s) for s in sens]
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 20
def summarize(method, length, url):
    html_content = fetch_url(url)
    iso_lang = detect_language(html_content)
    language = SUMY_LANGUAGES[iso_lang]
    stemmer = Stemmer(language)
    parser = HtmlParser.from_string(html_content, url, Tokenizer(language))
    summarizer_class = AVAILABLE_METHODS[method]
    summarizer = build_summarizer(summarizer_class, get_stop_words(language), stemmer, parser)

    sentences = summarizer(parser.document, ItemsCount(length))
    # `unicode` was Python 2; str is the unicode type in Python 3
    summary = ' '.join(str(sentence) for sentence in sentences)
    return summary, iso_lang
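# None of SUMY_LANGUAGES, AVAILABLE_METHODS or build_summarizer is defined in
# this file. The sketch below shows plausible shapes (assumptions, not the
# originals) consistent with how summarize() calls them: an ISO-code-to-sumy
# language map, a method registry, and a builder that wires stop words onto a
# freshly constructed summarizer. The `parser` argument is accepted but unused
# here; an Edmundson variant could use it for bonus/stigma words.
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

SUMY_LANGUAGES = {'en': 'english', 'cs': 'czech', 'de': 'german'}

AVAILABLE_METHODS = {
    'luhn': LuhnSummarizer,
    'lsa': LsaSummarizer,
    'text-rank': TextRankSummarizer,
}

def build_summarizer(summarizer_class, stop_words, stemmer, parser):
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = stop_words
    return summarizer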