def main(url, num_sentences=10, language='english'):
    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    for sentence in summarizer(parser.document, num_sentences):
        print(sentence)
def summarize(text, n_sentences, sep='\n'):
    '''
    Args:
        text (str or file): text itself or file in memory of text
        n_sentences (int): number of sentences to include in summary

    Kwargs:
        sep (str): separator to join summary sentences

    Returns:
        (str) n_sentences-long, automatically-produced summary of text
    '''
    if isinstance(text, str):
        parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    elif hasattr(text, 'read'):  # file-like object; `file` is not a builtin in Python 3
        parser = PlaintextParser.from_file(text, Tokenizer(LANGUAGE))
    else:
        raise TypeError('text must be either str or file')
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    # join with the sep argument instead of a hard-coded '\n'
    return sep.join(str(s) for s in summarizer(parser.document, n_sentences))
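# Hypothetical usage of summarize() above: assumes LANGUAGE and the sumy
# imports (PlaintextParser, Tokenizer, Stemmer, Summarizer, get_stop_words)
# are already in scope; the sample text is purely illustrative.
sample = ("Sumy picks the most salient sentences from a document. "
          "LSA scores each sentence against latent topics. "
          "The result is an extractive summary.")
print(summarize(sample, n_sentences=1))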
def summarizeFile(inputFile):
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    url = findURLS(inputFile)
    urlContent = ''
    if url is not None:
        if url[-1] == '.':
            url = url[:-1]
        try:
            parser = HtmlParser.from_url(url, Tokenizer("english"))
            for sentence in summarizer(parser.document, 3):
                urlContent = urlContent + str(sentence) + '\n'
        except Exception:
            urlContent = ''
    content = inputFile.read()
    parser = PlaintextParser.from_string(content, Tokenizer(LANGUAGE))
    summary = ''
    try:
        for sentence in summarizer(parser.document, SENTENCES_COUNT_1):
            summary = summary + str(sentence) + '\n'
    except AssertionError:
        return None
    if url is not None:
        return summary + urlContent
    return summary
def summarize(string, summary_length=1, language="english"):
    string = string.lower() if string.isupper() else string
    parser = PlaintextParser.from_string(string, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)
    return ". ".join(str(sentence) for sentence in summarizer(parser.document, summary_length))
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stop_words = ("I", "am", "the",)

    sentences = summarizer(document, 10)

    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
def test_single_sentence(self):
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stop_words = ("I", "am", "the",)

    sentences = summarizer(document, 10)

    self.assertEqual(len(sentences), 1)
    self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
def summarize(text):
    total = ""
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        total += str(sentence)
    return total
def lsa(comment, parser, num):
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    LSAstr = ''
    for sentence in summarizer(parser.document, num):
        LSAstr += str(sentence)
    return LSAstr
def summarize(filename, num_sentences):
    with open(filename, "r") as myfile:
        data = myfile.read()
    parser = PlaintextParser.from_string(data, Tokenizer('english'))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    summary = ""
    for sentence in summarizer(parser.document, num_sentences):
        # strip non-ascii characters and quotes; decode back to str, since in
        # Python 3 .encode() returns bytes and bytes.replace() rejects str args
        text = str(sentence).encode('ascii', 'ignore').decode('ascii')
        summary += text.replace('"', '').replace("'", '').strip() + " "
    return summary
def retreive_sumy(url):
    # e.g. "http://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return summarizer(parser.document, SENTENCES_COUNT)
def summary(self, int1, int2):
    # int1, int2 are the places between which to look for
    # the summary to be taken (slicing the corpus as a string)
    parser = PlaintextParser(self.corpus[int1:int2], Tokenizer("english"))
    summarizer = LsaSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")
    self.summary_text = " ".join(
        sentence._text for sentence in summarizer(parser.document, 20))
    return self.summary_text
def summarize(content):
    parser = PlaintextParser.from_string(content.body, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    text = '\n'.join(
        str(sentence) for sentence in summarizer(parser.document, COUNT)
    )
    summary = Summary(content=content, summary=text)
    summary.save()
def summarizeText(self, body, numSentences=10):
    """Summarizes body of text to numSentences"""
    parser = PlaintextParser.from_string(body, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    # str(sentence) is already text in Python 3; no .decode('utf-8') needed
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def summary(text):
    stemmer = Stemmer(LANGUAGE)
    parser = PlaintextParser(text, Tokenizer(LANGUAGE))
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    short = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # str has no .decode() in Python 3; str(sentence) is already text
        short = short + ">* " + str(sentence) + "\n\n"
    return short
def summarize(parser, sentences_count):
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = ""
    for sentence in summarizer(parser.document, sentences_count):
        sentences += " " + str(sentence)
    return sentences
def test_real_example(self):
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 2)

    self.assertEqual(len(sentences), 2)
def summarizeUrl(self, url, numSentences=10):
    """Summarizes text at a given url to numSentences"""
    parser = HtmlParser.from_url(url, Tokenizer(self.LANG))
    stemmer = Stemmer(self.LANG)
    summarizer = SumySummarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANG)
    # str(sentence) is already text in Python 3; no .decode('utf-8') needed
    summary = ' '.join(str(sentence) for sentence in summarizer(parser.document, numSentences))
    return summary
def test_article_example():
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("articles/prevko_cz_1.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 20)

    assert len(sentences) == 20
def summarize(text):
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    result = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        result += str(sentence) + " "
    return result
def test_issue_5_sigma_can_multiply_matrix_v(self):
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    parser = PlaintextParser.from_string(
        load_resource("articles/sigma_can_multiply_matrix_v.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(english_stemmer)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)

    self.assertEqual(len(sentences), 20)
def lsa(self, text_parser):
    assert isinstance(text_parser, plaintext.PlaintextParser)

    # process the text
    summarizer = LSA()  # or LSA(EnglishStemmer()) to enable stemming
    # we have to specify stop words
    summarizer.stop_words = get_stop_words(settings.SUMMARIZER_LANGUAGE)
    return summarizer(text_parser.document, settings.SUMMARIZER_TOP_X_SENTENCES)
def test_document(self):
    document = build_document(
        ("I am the sentence you like", "Do you like me too",),
        ("This sentence is better than that above", "Are you kidding me",)
    )
    summarizer = LsaSummarizer()
    summarizer.stop_words = ("I", "am", "the", "you", "are", "me", "is",
                             "than", "that", "this",)

    sentences = summarizer(document, 2)

    self.assertEqual(len(sentences), 2)
    self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
    self.assertEqual(to_unicode(sentences[1]),
                     "This sentence is better than that above")
def get_summary(text, max_sentences=5):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    summary = []
    for sentence in summarizer(parser.document, max_sentences):
        # keep the plain sentence text; wrapping encoded bytes in str() would
        # produce "b'...'" strings in Python 3
        summary.append(str(sentence))
    return summary
def lsaReferenceSummary(path):
    sentencesList = []
    parser = PlaintextParser.from_file(path, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentencesList.append(sentence._text)
    return sentencesList
def test_dictionary_without_stop_words(self):
    summarizer = LsaSummarizer()
    summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]

    document = build_document(
        ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
        ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
        ("Some relevant sentence", "Some moRe releVant sentEnce",),
    )
    expected = frozenset(["some", "more", "relevant", "sentence"])

    dictionary = summarizer._create_dictionary(document)

    self.assertEqual(expected, frozenset(dictionary.keys()))
def test_issue_5_svd_converges():
    """Source: https://github.com/miso-belica/sumy/issues/5"""
    pytest.skip("Can't reproduce the issue.")

    parser = PlaintextParser.from_string(
        load_resource("articles/svd_converges.txt"),
        Tokenizer("english")
    )
    summarizer = LsaSummarizer(Stemmer("english"))
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)

    assert len(sentences) == 20
def sum_spark(doc):
    parser = PlaintextParser.from_string(doc, Tokenizer('english'))
    summarizer = Summarizer(Stemmer('english'))
    summarizer.stop_words = stop_books
    texts = []
    for sentence in summarizer(parser.document, 2):
        texts.append(str(sentence))
    return texts
def getText(self, sentence_count=None):
    if sentence_count:
        self.SENTENCE_COUNT = sentence_count
    parser = HtmlParser.from_url(self.url, Tokenizer(self.LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(self.LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(self.LANGUAGE)
    text_list = []
    for sentence in summarizer(parser.document, self.SENTENCE_COUNT):
        text_list.append(str(sentence))
    return "\n".join(text_list)
def extract_titles(reviews):
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for item_id, review in reviews.items():  # .iteritems() is Python 2 only
        print("Review: {}".format(review))
        print("\n")
        for sentence in summarizer(build_document_from_string(review), SENTENCES_COUNT):
            print(sentence)
            print("\n")
def summarize_text(textbody):
    parser = PlaintextParser.from_string(textbody, Tokenizer(LANG))
    stemmer = Stemmer(LANG)
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANG)
    summary = summarizer(parser.document, SENTENCE_COUNT)
    summarized_text = ''
    for sentence in summary:
        summarized_text += str(sentence) + ' '
    return summarized_text
def update_db(stored_result, db, query):
    result = resource.list(q=query, cx=search_engine_id).execute()
    query_json = stored_result[0]
    stored_sources = []
    for news in query_json["News"]:
        news_dict = news[-1]
        url = news_dict["source"]
        response = requests.get(news_dict["source"])
        stored_sources.append(news_dict["source"])
        if 'Last-Modified' in response.headers:
            # re-summarize only if the page changed since the stored copy
            if (time.strptime(response.headers['Last-Modified'], "%a, %d %b %Y %H:%M:%S %Z") >
                    time.strptime(news_dict['last_modified'], "%a, %d %b %Y %H:%M:%S %Z")):
                current_result = {}
                parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
                stemmer = Stemmer(language=LANGUAGE)
                summarizer = Summarizer(stemmer)
                summarizer.stop_words = get_stop_words(LANGUAGE)
                summary = summarizer(parser.document, 5)
                summary = '\n'.join(line._text for line in summary)
                current_result['content'] = [summary]
                current_result['source'] = news_dict["source"]
                current_result['last_modified'] = response.headers['Last-Modified']
                news.append(current_result)
    for item in result['items']:
        try:
            if item['link'] not in stored_sources:
                url = item['link']
                if 'pdf' in url or 'xml.gz' in url:
                    continue
                current_result = {}
                current_result['source'] = url
                current_result['content'] = []
                response = requests.get(url)
                parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
                stemmer = Stemmer(language=LANGUAGE)
                summarizer = Summarizer(stemmer)
                summarizer.stop_words = get_stop_words(LANGUAGE)
                summary = summarizer(parser.document, 5)
                summary = '\n'.join(line._text for line in summary)
                current_result['content'].append(summary)
                if 'Last-Modified' in response.headers:
                    current_result['last_modified'] = response.headers['Last-Modified']
                else:
                    current_result['last_modified'] = time.strftime(
                        "%a, %d %b %Y %H:%M:%S %Z", time.gmtime())
                query_json['News'].append([current_result])
        except (urllib.error.HTTPError, TypeError, AttributeError,
                requests.exceptions.SSLError):
            current_result['content'] = ["No results available"]
            continue
    db["news"].save(query_json)
# USING LSA
# Based on term-frequency techniques with singular value decomposition to summarize texts.
from sumy.parsers.plaintext import PlaintextParser  # a plaintext parser; other parsers are available for HTML etc.
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

file = "plain_text.txt"  # name of the plain-text file
parser = PlaintextParser.from_file(file, Tokenizer("english"))
summarizer_lsa = LsaSummarizer()
summary_2 = summarizer_lsa(parser.document, 2)
for sentence in summary_2:
    print(sentence)
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 10


if __name__ == "__main__":
    url = "https://en.wikipedia.org/wiki/Automatic_summarization"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
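# Note: sumy's Tokenizer relies on NLTK tokenization data. If the script above
# raises a missing-resource LookupError, downloading the "punkt" model is
# usually enough (newer NLTK releases may additionally ask for "punkt_tab"):
import nltk
nltk.download('punkt')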
def clicked():
    file = open('testfile.txt', 'a')

    # website to text file as testfile.txt
    html = requests.get(url1.get()).content

    # 1 Recoding
    unicode_str = html.decode("utf8")
    encoded_str = unicode_str.encode("ascii", 'ignore')
    news_soup = BeautifulSoup(encoded_str, "html.parser")
    title = news_soup.find_all('h1')
    z = [re.sub(r'<.+?>', r'', str(b)) for b in title]
    s1 = ''.join(z) + '.' + '\n'
    file.write(s1)

    # find the summary of the text file and store it back into testfile.txt
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    url = url1.get()
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    print("--LsaSummarizer--")
    summarizer = LsaSummarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = (
        "I", "am", "the", "you", "are", "me", "is", "than", "that", "this",
    )
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        file.write(str(sentence))
    file.close()

    # open the text file and divide it into parts 0.txt to 7.txt
    str1 = open('testfile.txt', 'r').read()
    l = str1.split(".")
    for j in range(8):
        part = open('text/' + str(j) + '.txt', 'a')
        part.write(l[j])
        part.close()

    def _patch_faulty_function(self):
        if self.token_key is not None:
            return self.token_key
        timestamp = calendar.timegm(time.gmtime())
        hours = int(math.floor(timestamp / 3600))
        response = requests.get("https://translate.google.com/")
        line = response.text.split('\n')[-1]
        parsed = re.search(r"(?:TKK='(?:(\d+)\.(\d+))';)", line)
        a, b = parsed.groups()
        result = str(hours) + "." + str(int(a) + int(b))
        self.token_key = result
        return result

    # Monkey patch the faulty function, then call it normally.
    Token._get_token_key = _patch_faulty_function

    for k in range(8):
        part_text = open('text/' + str(k) + '.txt', 'r').read()
        if len(part_text) != 0:
            tts = gTTS(part_text)
            tts.save('voice/' + str(k) + '.mp3')

    keyword = open('text/0.txt', 'r').read()
    st = 'googleimagesdownload --keywords "' + keyword + '" --limit 8'
    os.system(st)
    os.system("D:/VideoBeta/VideoBeta.exe")
def get_summary(textss, truereq, numofsent):
    output_sentences = []
    truecount = 0
    # extract the most relevant words (keywords) from the full text
    store = keywords(textss, ratio=0.05)
    store1 = str(store)
    # tokenize the keyword string to remove punctuation
    holdfirst = nltk.word_tokenize(store1)
    # store the full text into a parser object
    parser = PlaintextParser.from_string(textss, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentencess = []
    compare = []
    TEMP_FOLDER = tempfile.gettempdir()
    documents = sent_tokenize(textss)  # sentences of the full text
    summalen = len(documents)  # number of sentences
    stoplist = set('for a of the and to in'.split())
    for sentence in summarizer(parser.document, numofsent):
        hold = str(sentence)
        ttt = nltk.word_tokenize(hold)
        count = 0
        for i in range(len(ttt)):
            for j in range(len(holdfirst)):
                if ttt[i] == holdfirst[j]:
                    count += 1
        compare.append(count)
        sentencess.append(str(sentence))
    # each sentence as a list of words, with stopwords removed
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in documents]
    # defaultdict calls a factory function to supply missing values
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # keep only the words that occur more than once
    texts = [[token for token in text if frequency[token] > 1] for text in texts]
    dictionary = corpora.Dictionary(texts)  # a map of words
    dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
    new_doc = str(textss.encode('utf-8'))  # the utf-8 version of textss (original)
    # convert the utf-8 encoded textss into bag-of-words format: a list of
    # (token_id, token_count) 2-tuples; each word is assumed to be a tokenized
    # and normalized string (either unicode or utf8-encoded)
    new_vec = dictionary.doc2bow(new_doc.lower().split())
    # apply doc2bow to texts (lists of words that occur more than once)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'deerwester.mm'), corpus)
    dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'deerwester.dict'))
    corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'deerwester.mm'))
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    doc = str(textss.encode('utf-8'))
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save(os.path.join(TEMP_FOLDER, 'deerwester.index'))
    index = similarities.MatrixSimilarity.load(os.path.join(TEMP_FOLDER, 'deerwester.index'))
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    newlist = []
    for i in range(summalen):
        newlist.append(documents[sims[i][0]])
        if i == 4:
            break
    for sentencez in newlist:
        hold = str(sentencez)
        ttt = nltk.word_tokenize(hold)
        count = 0
        for i in range(len(ttt)):
            for j in range(len(holdfirst)):
                if ttt[i] == holdfirst[j]:
                    count += 1
        compare.append(count)
        sentencess.append(str(sentencez))
    i = 0
    while i < truereq:
        holdsubs = []
        indexes = compare.index(max(compare))
        doc1 = nlp(str(sentencess[indexes]))
        for word in doc1:
            if word.dep_ == 'nsubj':
                holdsubs.append(word.text.lower())
        if holdsubs and holdsubs[0] not in ('they', 'their', 'both', 'these', 'this'):
            countcomma = str(sentencess[indexes]).count(',')
            if countcomma < 7:
                output_sentences.append(sentencess[indexes])
                i += 1
        del sentencess[indexes]
        del compare[indexes]
    return output_sentences
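# A condensed, standalone sketch of the gensim pipeline used in get_summary()
# above (doc2bow -> LSI -> cosine similarity), run on toy token lists; the
# variable names and data here are illustrative only.
from gensim import corpora, models, similarities

docs = [["human", "machine", "interface"],
        ["survey", "user", "opinion"],
        ["human", "system", "interface"]]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]   # (token_id, count) pairs
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity(lsi[corpus])   # all docs in LSI space

query = dictionary.doc2bow(["human", "interface"])
sims = index[lsi[query]]                             # cosine similarity per doc
print(sorted(enumerate(sims), key=lambda item: -item[1]))  # most similar first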
# ## Package sumy

# In[8]:

import sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# In[9]:

parser = PlaintextParser.from_string(rawdata.news[1], Tokenizer("english"))
stemmer = Stemmer("english")
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words("english")
for sentence in summarizer(parser.document, 6):
    print(sentence)

# In[10]:

testchinese = '温客行一眼就看出周子舒使用的是四季山庄的流云九宫步,狠狠教训了顾湘一顿,就带她离开了。张成岭看出周子舒有一身好武功,只是深藏不露,就主动过来和周子舒寒暄,还给他一块名帖,让他有事去镜湖山庄,张成岭着急给母亲买点心,就先行离开了。周子舒听到孩子们在唱那首五湖盟争夺武林盟主以及琉璃甲的歌谣,不禁感慨江湖的风云多变。周子舒叫醒岸边的摆渡船夫,他要乘船去镜湖山庄,摆渡船夫趁机狮子大开口,周子舒也不还价,摆渡船夫看他一副病恹恹的模样,不忍心敲诈他,温客行带顾湘及时赶来,主动提出送周子舒去镜湖山庄,摆渡船夫不依不饶,拉起周子舒就上船离开了。周子舒远远就发现镜湖山庄犹如人间仙境,他迫不及待赶过去,下船就忘了付钱,遭到摆渡船夫劈头盖脸一顿臭骂,周子舒索性就坐一次霸王船。周子舒施展轻功,很快就进入镜湖山庄的桃林,他沉醉于花香之中,温客行突然从背后偷袭,周子舒只能迎战,两个人交手几个回合,温客行对周子舒心生佩服,请他喝酒小叙,周子舒断然拒绝。周子舒来到镜湖山庄,从管家口中得知镜湖派掌门张玉森久不闻江湖事,他有三个儿子张成峰,张成峦和张成岭,也不许他们掺和江湖门派之争,管家把周子舒安顿到柴房,子时的时候,三秋钉又准时开始发作,周子舒感觉浑身疼痛难忍,只能发动全部功力为自己疗伤,突然听到外面人声嘈杂。周子舒打开门发现镜湖山庄已经变成一片火海,他飞身上屋顶观察,发现带着鬼面具的人在镜湖山庄大肆烧杀抢掠,怀疑是鬼谷的人所为,他立刻下去救人,张玉森,张成峦和张成峰父子三人被抓走,镜湖山庄的人几乎全部被杀,尸横遍野。摆渡船夫保护着张成岭想逃走,被鬼谷的人追杀,周子舒出手相救,掩护着他们俩乘船离开,远远看到温客行坐在华亭伤看热闹。周子舒把摆渡船夫和张成岭带到一间破庙,摆渡船夫说明张玉森曾经救过他的命,他在镜湖山庄门前摆渡三年,就是想等有朝一日报恩,摆渡船夫让张成岭去太湖找三白大侠,张成岭坚决不走。外面阴风阵阵,一群带鬼面具的人冲进来,一个自称吊死鬼的人叫嚣着进来抓张成岭,周子舒因为体力耗尽要静养半个时辰,摆渡船夫和吊死鬼战在一处,他渐渐体力不支被打翻在地,吊死鬼要杀了周子舒,张成岭拼命保护他,顾湘及时赶来,她和黑白无常大打出手,吊死鬼想杀张成岭,摆渡船夫奋不顾身护住他,被打成重伤。顾湘被恶鬼们团团包围,周子舒挣扎着跳起来为顾湘解围,把恶鬼们全部打跑,他因体力不支差点晕倒,温客行赶来抱住周子舒。摆渡船夫因为失血过多奄奄一息,温客行用内力帮他维持,船夫拜托周子舒把张成岭交给五湖盟的赵敬,还让张成岭当场给周子舒跪下磕头,周子舒满口答应,摆渡船夫说完这些话就咽气了。周子舒帮张成岭把摆渡船夫埋葬,张成岭累得精疲力尽,周子舒打算休息一夜再上路,温客行让顾湘生火,把干粮烤了给周子舒和张成岭,周子舒借口不饿不想吃,顾湘对他冷嘲热讽,张成岭也不吃顾湘的干粮,遭到顾湘的训斥,谴责他不知道报恩,张成岭连连向她赔礼道歉。温客行发现张成岭身受重伤,主动提出帮他医治,周子舒坚决不同意,两个人一言不合就大打出手。'
parser = PlaintextParser.from_string(testchinese, Tokenizer("chinese"))
stemmer = Stemmer("chinese")
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words("chinese")
for sentence in summarizer(parser.document, 6):
    print(sentence)
def get_update():
    try:
        from googlesearch import search
    except ImportError:
        print("No module named 'google' found")

    # query to search
    query = "covid-19 google scholar"  # google scholar, Czech Republic update
    update = {}
    for url in search(query, tld="co.in", num=10, stop=2, pause=2):
        print(url)
        web_response = requests.get(url)

        # build the element tree and read the page title
        element_tree = lxml.html.fromstring(web_response.text)
        tree_title_element = element_tree.xpath('//title')[0]
        print("\nText title :", tree_title_element.text_content())
        print("\n")

        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        sentence_list = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentence_list.append(sentence._text)
        sentences = " ".join(sentence_list)
        update[tree_title_element.text_content()] = sentences
    return update
def __init__(self):
    self.lsa_summarizer = LsaSummarizer(stemmer)
    self.lex_rank_summarizer = LexRankSummarizer(stemmer)
    self.lsa_summarizer.stop_words = get_stop_words(LANGUAGE)
    self.lex_rank_summarizer.stop_words = get_stop_words(LANGUAGE)
    self.email_text_parser = SbEmailTextParser()
def mySumD():
    # map each form action to its summarizer class; every branch otherwise
    # shared the same parse/summarize/render logic
    summarizer_classes = {
        'LSA': Summarizer,
        'Luhn': LuhnSummarizer,
        'LexRank': LexSummarizer,
        'TextRank': TextSummarizer,
        'SumBasic': SumSummarizer,
        'KL-Sum': KLSummarizer,
    }
    summarizer_class = summarizer_classes.get(request.form['action'], KLSummarizer)
    url = request.form['url_link']
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = summarizer_class(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    para = "".join(str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))
    return render_template('dependent.html', para=para)
def test_empty_document():
    document = build_document()
    summarizer = LsaSummarizer()

    sentences = summarizer(document, 10)

    assert len(sentences) == 0
def summarize4(self, df):
    # http://ai.intelligentonlinetools.com/ml/text-summarization/
    LANGUAGE = "english"
    SENTENCES_COUNT = 10
    stopwords = nltk.corpus.stopwords.words('english')
    for row in df['conclusion']:
        if row == '0' or row == '':
            continue
        parser = PlaintextParser(row, Tokenizer(LANGUAGE))

        print("--LsaSummarizer--")
        summarizer = LsaSummarizer(Stemmer(LANGUAGE))
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        print("--LuhnSummarizer--")
        summarizer = LuhnSummarizer()
        summarizer.stop_words = stopwords
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)

        print("--EdmundsonSummarizer--")
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("deep", "learning", "neural")
        summarizer.stigma_words = ("another", "and", "some", "next")
        summarizer.null_words = ("another", "and", "some", "next")
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
def summarize(url, number=5):
    parser = HtmlParser.from_url(url, Tokenizer("english"))
    stemmer = Stemmer("english")
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words("english")
    return " ".join(str(i) for i in summarizer(parser.document, number))
def summarize():
    response.content_type = "application/json"
    incoming = request.json
    if incoming.get('token', None) != SECRET_TOKEN:
        return
    channel = incoming.get('channel_id', None)
    if channel is None:
        return
    query = incoming.get('text', '').replace('!tldr', '').strip()
    count = 150
    if query != "":
        try:
            count = int(query) if 10 < int(query) < 1000 else count
        except ValueError:
            pass
    r = requests.post(API_URL + "login",
                      data=json.dumps({"username": USERNAME, "password": PASSWORD}),
                      headers={"Content-type": "application/json"})
    try:
        user = r.json()
    except Exception as e:
        print("BAILING OUT (login):\n{}".format(e))
        return
    userdata = user.get('data', None)
    if userdata is None:
        print("Login failed")
        return
    uid = userdata.get('userId', None)
    authToken = userdata.get('authToken', None)
    if uid is None or authToken is None:
        print("uid or token was invalid")
        return
    r = requests.get(API_URL + "channels.history?roomId={}&count={}".format(channel, count),
                     headers={"X-Auth-Token": authToken, "X-User-Id": uid})
    try:
        history = r.json()
    except Exception as e:
        print("BAILING OUT (history):\n{}".format(e))
        return
    last = history['messages'][1]  # 0 is !tldr
    if last.get('urls', []) != []:
        summaries = []
        for url in last['urls']:
            parser = HtmlParser.from_url(url['url'], Tokenizer(LANGUAGE))
            stemmer = Stemmer(LANGUAGE)
            summarizer = Summarizer(stemmer)
            summarizer.stop_words = get_stop_words(LANGUAGE)
            summaries.append("> {}".format(" ".join(
                str(sentence) for sentence in summarizer(parser.document, SENTENCES_COUNT))))
        return json.dumps({"text": "\n--\n".join(summaries)})
    messages = ". ".join(m['msg'] for m in history['messages'][::-1]
                         if m['msg'] != "" and m.get('bot', None) is None)
    parser = PlaintextParser.from_string(messages, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    return json.dumps({
        "text": "\n--\n".join(
            "> {}".format(str(sentence))
            for sentence in summarizer(parser.document, SENTENCES_COUNT))
    })
def lsaer(text, count):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer_lsa = LsaSummarizer()
    summary_2 = summarizer_lsa(parser.document, count)
    return summary_2
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.sum_basic import SumBasicSummarizer
from sumy.summarizers.kl import KLSummarizer
from sumy.nlp.tokenizers import Tokenizer
import sys


def leadSummariser(document, no_of_sents):
    for sent in document.sentences[:no_of_sents]:
        yield str(sent)


summarisers = {
    "lead": leadSummariser,
    "luhn": LuhnSummarizer(),
    "lsa": LsaSummarizer(),
    "lex_rank": LexRankSummarizer(),
    "text_rank": TextRankSummarizer(),
    "sum_basic": SumBasicSummarizer(),
    "kl": KLSummarizer(),
}

tokenizer = Tokenizer("english")


def to_words(text):  # renamed from `str` to avoid shadowing the builtin
    return text.split(" ")


def extractive(article, title=None):
    raw = article.replace(' <sb>', '').strip()
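# Hypothetical driver for the summarisers mapping above. It works because the
# "lead" generator and the sumy summarizer objects share the same call shape,
# (document, sentence_count); the function name is illustrative, not part of
# the original project.
def summarise_with(name, document, no_of_sents=3):
    summariser = summarisers[name]
    return [str(sent) for sent in summariser(document, no_of_sents)]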
def find_summary_lsa():
    p = PlaintextParser.from_file("testtext.txt", Tokenizer("english"))
    sumlsa = LsaSummarizer()
    su = sumlsa(p.document, 2)
    for s in su:
        print(s)
from modules.sql import dBAdapter
from modules.pre import create_corpus as c
from nltk import sent_tokenize

n_documents = 4

# ----------------------------------------------------------------------------
print("Getting body subtitles from the database started ...")
dbAdapter = dBAdapter.Database()
dbAdapter.open()
dic_subtitles = dict(dbAdapter.selectDic_subtitles_limit(n_documents))
dbAdapter.close()
print("Query finished")

string = sent_tokenize(list(dic_subtitles.values())[0])

from sumy.parsers.plaintext import PlaintextParser  # for tokenization
from sumy.nlp.tokenizers import Tokenizer

parser = PlaintextParser.from_string(
    list(dic_subtitles.values())[0], Tokenizer("spanish"))

from sumy.summarizers.lsa import LsaSummarizer

summarizer_2 = LsaSummarizer()
summary_2 = summarizer_2(parser.document, 10)

summ_list = []
for sentence in summary_2:
    summ_list.append(sentence._text)
summ_text = " ".join(summ_list)
def sumySummarize(filename, language="english", num_sents=1):
    """
    Luhn's algorithm is the most basic:
    1. Ignore stopwords
    2. Determine top words: the most often occurring words in the document are counted up.
    3. Select top words: a small number of the top words are selected to be used for scoring.
    4. Select top sentences: sentences are scored according to how many of the top words
       they contain. The top N sentences are selected for the summary.

    SumBasic uses a simple concept:
    1. Get word probability: p(wi) = ni/N (ni = no. of times word wi occurs,
       N is the total no. of words)
    2. Get sentence score: sj = sum_{wi in sj} p(wi)/|wi| (|wi| = no. of times wi occurs in sj)
    3. Choose the sj with the highest score
    4. Update pnew(wi) = pold(wi)^2 for words in the chosen sentence (we want the
       probability of including the same words again to go down)
    5. Repeat until you reach the desired no. of sentences

    The KL algorithm solves arg min_{S} KL(PD || PS) s.t. len(S) <= # sentences, where
    KL = Kullback-Leibler divergence = sum_{w} PD(w) log(PD(w)/PS(w)),
    PD = unigram word distribution of the entire document, and
    PS = unigram word distribution of the summary (the optimization variable).

    LexRank and TextRank use a PageRank kind of algorithm:
    1. Treat each sentence as a node in the graph
    2. Connect all sentences to get a complete graph (a clique, basically)
    3. Find the similarity between si and sj to get the weight Mij of the edge
       connecting i and j
    4. Solve the eigenvalue problem Mp = p for the similarity matrix M.
    5. L = 0.15 + 0.85*Mp. L gives the final score for each sentence; pick the top sentences.
    LexRank uses a tf-idf modified cosine similarity for M; TextRank uses a different
    similarity metric.

    LSA uses an SVD-based approach:
    1. Get the term-sentence matrix A (rows are terms, columns are sentences),
       normalized with term frequency (tf) only
    2. Do SVD: A = USV' (A = m x n, U = m x n, S = n x n, V = n x n)
    SVD derives the latent semantic structure of the sentences. The k-dimensional
    sub-space captures the key k topics of the entire text structure; it's a mapping
    from n dimensions down to k. If a word combination pattern is salient and recurring
    in the document, this pattern will be captured and represented by one of the
    singular vectors. The magnitude of the corresponding singular value indicates the
    importance degree of this pattern within the document. Any sentence containing this
    word combination pattern will be projected along this singular vector, and the
    sentence that best represents this pattern will have the largest index value with
    this vector. As each particular word combination pattern describes a certain topic
    or concept in the document, each singular vector can be viewed as representing a
    salient topic/concept of the document, with the magnitude of its singular value
    representing its degree of importance. Summarization can therefore be based on
    matrix V: V describes an importance degree of each topic in each sentence, so the
    k'th chosen sentence is the one with the largest index value in the k'th right
    singular vector of matrix V.

    An extension of this is using S·V' as the score for each sentence.
    """
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.nlp.stemmers import Stemmer
    from sumy.utils import get_stop_words
    from sumy.summarizers.luhn import LuhnSummarizer
    from sumy.summarizers.lsa import LsaSummarizer
    from sumy.summarizers.text_rank import TextRankSummarizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
    from sumy.summarizers.sum_basic import SumBasicSummarizer
    from sumy.summarizers.kl import KLSummarizer

    parser = PlaintextParser.from_file(filename, Tokenizer(language))

    def getSummary(sumyAlgorithm):
        sumyAlgorithm.stop_words = get_stop_words(language)
        summary = sumyAlgorithm(parser.document, num_sents)
        sents = " ".join(str(sentence) for sentence in summary)
        return sents

    stemmer = Stemmer(language)
    summaries = {}
    summaries['Luhn'] = getSummary(LuhnSummarizer(stemmer))
    summaries['LSA'] = getSummary(LsaSummarizer(stemmer))
    summaries['TextRank'] = getSummary(TextRankSummarizer(stemmer))
    summaries['LexRank'] = getSummary(LexRankSummarizer(stemmer))
    summaries['SumBasic'] = getSummary(SumBasicSummarizer(stemmer))
    summaries['KL'] = getSummary(KLSummarizer(stemmer))

    print("")
    print("####### From Sumy #######")
    print(summaries)
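# A minimal numpy sketch of the LSA scoring idea described in the docstring of
# sumySummarize(): build a term-sentence matrix, take the SVD, and pick, for
# each of the k strongest topics, the sentence with the largest index value in
# the corresponding right singular vector. The matrix and k are toy values,
# not sumy's actual implementation.
import numpy as np

# rows = terms, columns = sentences; entries are term frequencies
A = np.array([[1, 0, 2, 0],
              [0, 1, 0, 1],
              [1, 1, 0, 0],
              [0, 0, 1, 1]], dtype=float)

U, S, Vt = np.linalg.svd(A, full_matrices=False)

k = 2  # number of latent topics to keep
chosen = {topic: int(np.argmax(np.abs(Vt[topic]))) for topic in range(k)}
print(chosen)  # maps topic index -> index of its most representative sentence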
SENTENCES_COUNT = 4

parser = PlaintextParser.from_file("sampleText.txt", Tokenizer(LANGUAGE))
stemmer = Stemmer(LANGUAGE)

print("\n===== Luhn =====")
summarizerLuhn = LuhnSummarizer(stemmer)
summarizerLuhn.stop_words = get_stop_words(LANGUAGE)
for sentenceLuhn in summarizerLuhn(parser.document, SENTENCES_COUNT):
    print(sentenceLuhn, "\n")

print("\n===== TextRank =====")
summarizerTR = TextRankSummarizer(stemmer)
summarizerTR.stop_words = get_stop_words(LANGUAGE)
for sentenceTR in summarizerTR(parser.document, SENTENCES_COUNT):
    print(sentenceTR, "\n")

print("\n===== LSA =====")
summarizerLSA = LsaSummarizer(stemmer)
summarizerLSA.stop_words = get_stop_words(LANGUAGE)
for sentenceLSA in summarizerLSA(parser.document, SENTENCES_COUNT):
    print(sentenceLSA, "\n")

print("\n===== Edmundson =====")
summarizerEd = EdmundsonSummarizer(stemmer)
summarizerEd.bonus_words = ('focus', 'proposed', 'method', 'describes')
summarizerEd.stigma_words = ('example',)  # one-element tuple, not a bare string
summarizerEd.null_words = ('literature', 'however')
for sentenceEd in summarizerEd(parser.document, SENTENCES_COUNT):
    print(sentenceEd, "\n")
rouge_scores = list()
for file in os.listdir('datafiles'):
    with codecs.open('datafiles/' + file, 'r', encoding='utf-8', errors='ignore') as f:
        parser = PlaintextParser.from_string(f.read().replace('\n', ' '), UrduTokenizer)
        objectDocModel = parser.document
        print(objectDocModel.sentences)
        print(objectDocModel.paragraphs)
        print(objectDocModel.words)
        print(objectDocModel.headings)
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words("Urdu")
        summ = summarizer(parser.document, SENTENCES_COUNT)
        with open('dataresults/' + file.split('.')[0] + '.txt', 'w') as fw:
            for sentence in summ:
                evaluated_sentences.append(sentence)
                fw.writelines(str(sentence))
        # list of rouge scores (bigrams)
        res = rouge_1(evaluated_sentences, objectDocModel.sentences)
        rouge_scores.append(res)
        evaluated_sentences.clear()
def func(file1, username, wc):
    Summary = ""
    packet = BytesIO()
    packet.seek(0)
    filename = "/" + file1
    config = {
        "apiKey": "AIzaSyDvTZQo3KQIWvDmMwP16ItJ_DaJEylIGrc",
        "authDomain": "fir-android-c7a0d.firebaseapp.com",
        "databaseURL": "https://fir-android-c7a0d.firebaseio.com",
        "storageBucket": "fir-android-c7a0d.appspot.com"
    }
    firebase = pyrebase.initialize_app(config)
    stor = firebase.storage()
    stor.child(filename).download("T3.pdf")
    pdf_document = "T3.pdf"
    doc = fitz.open(pdf_document)
    page_Count = doc.pageCount
    for v in range(int(page_Count)):
        page1 = doc.loadPage(v)
        text = page1.getText("text")

        # normalize whitespace; note str.replace returns a new string, so the
        # original char-by-char loop that discarded its result had no effect
        text = re.sub(r'[\t\n]', ' ', text)
        text = re.sub(r' {2,}', ' ', text)

        # keep only sentence spans free of ':', '!', '-', '(' and ')'
        text1 = ""
        text3 = ""
        t = 0
        flag1 = 0
        for i in range(t, len(text)):
            if text[i] == '.':
                for j in range(i + 1, len(text)):
                    if text[j] == '.':
                        text1 = text[i:j]
                        for k in text1:
                            flag1 = 0
                            if k in {':', '!', '-', '(', ')'}:
                                flag1 = 1
                                break
                        break
                if flag1 == 1:
                    continue
                text3 = text3 + text[i:j]
                t = j

        # note: r is 0 here, so this cleanup loop never executes as written
        r = 0
        for i in range(r - 1):
            if text3[i] == '.' and text3[i + 1] != ' ':
                text3 = text3.replace(text3[i + 1], '')
        r = len(text3)

        w_count = int(wc)
        W_Count = 0
        if w_count == 0:
            w_count = 50
        else:
            W_Count = w_count

        counters = 0
        for p in text3:
            if p == " ":
                counters += 1
        if counters < 20:
            continue

        LANGUAGE = "english"
        SENTENCES_COUNT = W_Count
        parser = PlaintextParser.from_string(text3, Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        out = ""
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            out += str(sentence)
        if out == "":
            out = "Not enough words in this page to summarize"
        Summary = Summary + "\n\n Page No : " + str(v + 1) + "\n\n"
        Summary = Summary + " " + out

    # drop everything up to and including the first period
    lengther = 0
    for i in Summary:
        lengther += 1
        if i == '.':
            break
    out1 = Summary[lengther:len(Summary)]
    out = "Summary\n\n\n Page No: 1\n\n" + out1

    outfile = 'final.txt'
    with open(outfile, "w+") as filer:
        filer.write(out)
    bucket = storage.bucket()
    blob = bucket.blob(str(username) + '/' + 'final.txt')
    blob.upload_from_filename(outfile)
    return out
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.edmundson import EdmundsonSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer

LANGUAGE = "english"
SENTENCES_COUNT = 10

url = "https://en.wikipedia.org/wiki/Artificial_intelligence"
parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

summary1 = ""
print("\n\n")
print("--LsaSummarizer--")
summarizer = LsaSummarizer(Stemmer(LANGUAGE))
summarizer.stop_words = get_stop_words(LANGUAGE)
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    summary1 += str(sentence)
    summary1 += " "
with open("summarised_text.txt", "w", encoding="utf8") as myfile:
    myfile.write("\n\nLSA:\n")
    myfile.write(summary1)

summary2 = ""
print("\n\n")
print("--LuhnSummarizer--")
summarizer = LuhnSummarizer()
def summarize(SENTENCES_COUNT):
    try:
        speak.Speak("Please Enter the U r l for summarization")
        url = easygui.textbox(
            msg='Enter url for which you want summarization:',
            title='Summarization').split()[0]
        title = getTextFromURL(url)
        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
        # or for plain text files
        # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)
        string_dict = {}
        for idx, sentence in enumerate(summarizer(parser.document, SENTENCES_COUNT)):
            string_dict[idx] = str(sentence)

        document = Document()
        document.add_heading('Summarization of ' + str(title), 0)
        p = document.add_paragraph(
            'Summarizing your article in crisp {} points'.format(SENTENCES_COUNT))
        for idx, sent in zip(string_dict.keys(), string_dict.values()):
            adding_break = p.add_run()
            adding_break.add_break()
            p = document.add_paragraph(sent)
            adding_break = p.add_run()
            adding_break.add_break()
        document.save(sumydir + '\\' + 'summarization.docx')
        speak.Speak("Summarization was saved to the following path")
        easygui.msgbox(msg='Summarized file saved in this file ' + sumydir + '\\' + 'summarization.docx',
                       title='Summarization')
    except Exception as e:
        speak.Speak('Sorry My mistake please provide your feedback regarding this error')
        easygui.exceptionbox(str(e))
try:
    news_response = urlopen(news_req)
except urllib.error.HTTPError:
    pass
try:
    news_soup = BeautifulSoup(news_response, features='html.parser')
except http.client.IncompleteRead:
    pass

article = news_soup.find_all('p')
final_result = ''
for i in range(len(article) - 1):
    final_result += article[i].text + " "

file_name = ticker + '-' + str(index) + '.txt'
parser = PlaintextParser.from_string(str(final_result), Tokenizer('english'))
stemmer = Stemmer('english')
summarizer = Summarizer(stemmer)
summarizer.stop_words = get_stop_words('english')

print('writing: ' + file_name)
f = open(file_name, "a")
for sentence in summarizer(parser.document, 2):
    f.write(str(sentence))
f.close()
def summarize(request):
    """Responds to any HTTP request.
    Args:
        request (flask.Request): HTTP request object.
    Returns:
        The response text or any set of values that can be turned into a
        Response object using `make_response
        <http://flask.pocoo.org/docs/1.0/api/#flask.Flask.make_response>`.
    """
    try:
        if request.method == 'OPTIONS':
            # Allows GET requests from any origin with the Content-Type
            # header and caches the preflight response for 3600s
            headers = {
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Methods': 'GET, POST, PUT, PATCH, DELETE, OPTIONS',
                'Access-Control-Allow-Headers': 'DNT,User-Agent,X-Requested-With,If-Modified-Since,Cache-Control,Content-Type,Range,Authorization',
                'Access-Control-Expose-Headers': 'Content-Length,Content-Range',
                'Access-Control-Max-Age': '3600'
            }
            return ('', 204, headers)
        headers = {
            'Access-Control-Allow-Origin': '*',
        }
        request_json = request.get_json()
        document = request_json['value']
    except Exception:
        # for local testing with `py main.py`
        headers = None
        document = request['value']
    finally:
        parser = PlaintextParser.from_string(document, Tokenizer("english"))
        summaries = {}
        number_pool = [0, 1, 2, 3]
        random.shuffle(number_pool)
        print(number_pool)
        for i in range(len(number_pool)):
            if number_pool[i] == 0:
                summarizer = LexRankSummarizer()
            if number_pool[i] == 1:
                summarizer = LuhnSummarizer()
            if number_pool[i] == 2:
                summarizer = LsaSummarizer(Stemmer("english"))
                summarizer.stop_words = get_stop_words("english")
            if number_pool[i] == 3:
                summarizer = PureNLTKSummarizer()
            summary = summarizer(parser.document, 3)
            sum_string = []
            for sentence in summary:
                sum_string.append(str(sentence))
            summaries[f'{i}'] = " ".join(sum_string)
        if headers is None:
            return summaries
        return (summaries, 200, headers)
    return summary


# %%
df = pd.read_pickle('cnn_dataset_10k.pkl')

# %%
df['summary_LexRank'] = ''
df['summary_Luhn'] = ''
df['summary_LSA'] = ''

# %%
lex_summarizer = LexRankSummarizer()
luhn_summarizer = LuhnSummarizer()
lsa_summarizer = LsaSummarizer()
rouge = Rouge()

for i, r in df.iterrows():
    parser = PlaintextParser.from_string(df['text'].iloc[i], Tokenizer("english"))
    sentence_amount = 5

    sentences = lex_summarizer(parser.document, sentence_amount)
    df['summary_LexRank'].iloc[i] = append_summaries(sentences)

    sentences = luhn_summarizer(parser.document, sentence_amount)
    df['summary_Luhn'].iloc[i] = append_summaries(sentences)
def get(self, request):
    query = request.GET['query']
    query = query.lower()
    query = re.sub(r'[^\w\s]', '', query)
    response_json = {}
    fact_check = requests.get(
        'https://factchecktools.googleapis.com/v1alpha1/claims:search',
        params={'query': query, 'key': api_key, 'languageCode': 'en-US'})
    db = client["news"]
    if len(fact_check.json()) == 0:
        response_json['Common Myths'] = [{'source': 'No Results Available for this query',
                                          'check': 'Not Available',
                                          'claim': 'Not Available'}]
    else:
        claims = fact_check.json()['claims']
        ratings = [claims[i]['claimReview'][0]['textualRating'] for i in range(len(claims))]
        factcheck = None
        for rating in ratings:
            if rating == 'False' or 'myth' in rating or 'no evidence' in rating:
                factcheck = False
        if factcheck == False:
            response_json['Common Myths'] = []
            for claim in claims:
                current_result = {}
                current_result['source'] = claim['claimReview'][0]['url']
                current_result['check'] = claim['claimReview'][0]['textualRating']
                current_result['claim'] = claim['text']
                response_json['Common Myths'].append(current_result)
        else:
            response_json['Common Myths'] = [{'source': 'No Results Available for this query',
                                              'check': 'Not Available',
                                              'claim': 'Not Available'}]
    stored_queries = db["news"].find({'_id': query})
    stored_result = []
    for q in stored_queries:
        stored_result.append(q)
    is_stored = len(stored_result) != 0
    if is_stored:
        if request.GET['update'] == 'True':
            update_db.after_response(stored_result, db, query)
        response_json["News"] = []
        query_json = stored_result[0]
        for news in query_json["News"]:
            latest_news = news[-1]
            current_dict = {}
            current_dict["source"] = latest_news["source"]
            current_dict["content"] = latest_news["content"]
            response_json["News"].append(current_dict)
        update_faq(query)
        response_json["similar_questions"] = related_questions(query)
        response_json["summary"] = query_json["summary"]
        response_json["hit_again"] = 'True'
        return Response(response_json)
    result = resource.list(q=query, cx=search_engine_id).execute()
    if len(result) == 0 or 'items' not in result:
        response_json['News'] = [{'source': 'No Results Available for this query',
                                  'content': 'Not Available'}]
    else:
        url = None
        extractor = extractors.ArticleExtractor()
        response_json['News'] = []
        content_summary = ''
        if not is_stored:
            for item in result['items']:
                try:
                    url = item['link']
                    if 'pdf' in url or 'xml.gz' in url:
                        continue
                    if url in ('https://www.cdc.gov/coronavirus/2019-ncov/faq.html',
                               'https://www.cdc.gov/coronavirus/2019-ncov/hcp/faq.html'):
                        page = requests.get("https://www.cdc.gov/coronavirus/2019-ncov/faq.html")
                        soup = BeautifulSoup(page.content, 'html.parser')
                        page_results = soup.find_all('div', attrs={'class': 'card bar'})
                        for content in page_results:
                            question = content.find('span', attrs={'role': 'heading'}).contents[0]
                            question = re.sub(r'[^\w\s]', '', question.lower())
                            answer = content.find('div', attrs={'class': 'card-body'}).find('p').getText()
                            if len(answer) != 0 and is_similar(query, question, 0.5):
                                current_result = {}
                                current_result['source'] = url
                                current_result['content'] = []
                                current_result['content'].append(answer)
                                response_json['News'].append(current_result)
                                content_summary = content_summary + answer
                    else:
                        response = requests.get(url)
                        stemmer = Stemmer(language=LANGUAGE)
                        summarizer = Summarizer(stemmer)
                        summarizer.stop_words = get_stop_words(LANGUAGE)
                        parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
                        summary = summarizer(parser.document, 5)
                        summary = '\n'.join([line._text for line in summary])
                        current_result = {}
                        current_result['source'] = url
                        current_result['content'] = []
                        current_result['content'].append(summary)
                        content_summary = content_summary + summary
                        if 'Last-Modified' in response.headers:
                            current_result['last_modified'] = response.headers['Last-Modified']
                        else:
                            current_result['last_modified'] = time.strftime(
                                "%a, %d %b %Y %H:%M:%S %Z", time.gmtime())
                        response_json['News'].append(current_result)
                except (urllib.error.HTTPError, TypeError, AttributeError,
                        requests.exceptions.SSLError):
                    current_result['content'] = ["No results available"]
                    continue
        response_json['summary'] = get_summary(content_summary)
        db_json = {}
        db_json['News'] = response_json['News']
        db_json['summary'] = response_json['summary']
        for i, news in enumerate(db_json['News']):
            url = news['source']
            response = requests.get(url)
            headers = response.headers
            if 'Last-Modified' in headers:
                last_modified = headers['Last-Modified']
            else:
                last_modified = time.strftime("%a, %d %b %Y %H:%M:%S %Z", time.gmtime())
            db_json['News'][i]['last_modified'] = last_modified
        db_json['News'] = [[json] for json in db_json['News']]
        db_json['_id'] = query
        db["news"].insert_one(db_json)
    update_faq(query)
    response_json["similar_questions"] = related_questions(query)
    response_json["hit_again"] = 'False'
    return Response(response_json)
def test_empty_document(self):
    document = build_document()
    summarizer = LsaSummarizer()

    sentences = summarizer(document, 10)

    self.assertEqual(len(sentences), 0)
txtSummary.write("\n\n*** LEXRANK NEGATIVE ***\n")
print("*** LEXRANK NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSummarizer = LuhnSummarizer()
summary = LSummarizer(parser.document, 1)
txtSummary.write("\n\n*** LUHN NEGATIVE ***\n")
print("")
print("*** LUHN NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSASummarizer = LsaSummarizer()
summary = LSASummarizer(parser.document, 1)
txtSummary.write("\n\n*** LSA NEGATIVE ***\n")
print("")
print("*** LSA NEGATIVE ***")
for sentence in summary:
    txtSummary.write(str(sentence))
    print(sentence)

LSA2Summarizer = LsaSummarizer(Stemmer("english"))
LSA2Summarizer.stop_words = get_stop_words("english")
txtSummary.write("\n\n*** LSA W/ STOP WORDS NEGATIVE ***\n")
print("")
print("*** LSA W/ STOP WORDS NEGATIVE ***")
for sentence in LSA2Summarizer(parser.document, 1):
def textteaser_test():
    summary = open("summary_list.txt", "a", encoding='utf-8-sig')
    sys.stdout = summary

    # obtain the input article from a url
    # url = "http://www.nytimes.com/2016/11/17/us/politics/donald-trump-administration-twitter.html?ref=politics"
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))

    # obtain the input article from plain text files
    parser = PlaintextParser.from_file("input_sample.txt", Tokenizer(LANGUAGE))

    # define the language; by default it is English
    stemmer = Stemmer(LANGUAGE)

    # SumBasic algorithm
    summarizer = SumBasicSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("SumBasic:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LSA algorithm
    summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("Latent Semantic Analysis:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # TextRank algorithm
    summarizer = TextRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("TextRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # LexRank algorithm
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print("LexRank:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    # Featured-LexRank algorithm
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        title = f.readline()
    with open('input_sample.txt', 'r', encoding='utf-8-sig') as f:
        text = f.read()
    tt = TextTeaser()
    sentences = tt.summarize(title, text)
    file = open("tt.txt", "w", encoding='utf-8-sig')
    print("Featured-LexRank:")
    for sentence in sentences:
        file.write("%s\n" % sentence)
    file.close()
    parser = PlaintextParser.from_file("tt.txt", Tokenizer(LANGUAGE))
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
    print("\n")

    summary.close()