def wiki_summary(wikipage, toc, relevant_toc, mode='multi', wikisec=True, word_count=150):
    wikisum = wikipage.summary
    wikisum_len = len(wikisum.split())
    if toc:  # non-empty table of contents
        sum_fields = list(set(toc).difference(set(relevant_toc)))
        if mode == 'multi':
            sum_sum = summarize_wiki_page(wikipage, sum_fields, wikisec=wikisec)
            sum_total = wikisum + sum_sum
        elif mode == 'single':
            text = []
            subset_dict = grab_wikisec_toc(wikipage, sum_fields)
            for key, content in subset_dict.items():
                text.append(content)
            try:
                sum_sum = wikisum + ''.join(text)
            except TypeError:
                sum_sum = wikisum
            sum_sum_len = len(sum_sum.split())
            if sum_sum_len < word_count:
                sum_total = sum_sum
            else:
                sum_total = summarize(sum_sum, word_count=word_count)
    else:
        if wikisum_len < word_count:
            sum_total = wikisum
        else:
            sum_total = summarize(wikisum, word_count=word_count)
    return sum_total
def text_splitter(text):
    k = keywords(text, words=8, lemmatize=True).split('\n')
    kwords = ', '.join(k)
    text += '. --END'
    print(text)
    print(summarize(text, 0.3))
    return (summarize(text, 0.3), kwords)
def frontpage():
    if request.method == 'GET':
        return render_template('homepage.html')
    elif request.method == 'POST':
        if request.form['paragraph_text']:
            text = request.form['paragraph_text']
            with open('run/src/static/ori_text.txt', 'w+') as f:
                f.write(text)
            try:
                summ1 = summarization(text)
                with open('run/src/static/textfile1.txt', 'w+') as f:
                    f.write(summ1)
                summ2 = summarize(text)
                with open('run/src/static/textfile2.txt', 'w+') as f:
                    f.write(summ2)
                return redirect('/result')
            except ValueError:
                return render_template('homepage.html')
        elif request.files['fileselect']:
            text_file = request.files['fileselect']
            filename = secure_filename(text_file.filename)
            text_file.save(os.path.join("run/src/static/ori_text.txt"))
            with open('run/src/static/ori_text.txt', 'r') as f:
                content = f.read()
            summ3 = summarization(content)
            with open('run/src/static/textfile1.txt', 'w+') as f:
                f.write(summ3)
            summ4 = summarize(content)
            with open('run/src/static/textfile2.txt', 'w+') as f:
                f.write(summ4)
            return redirect('/result')
        else:
            return render_template('homepage.html')
def textlize(request):
    result = ''
    form = TextInputForm(request.POST)
    if request.method == 'POST':
        if form.is_valid():
            raw_text = form.cleaned_data["text_in"]
            cleaned_text = get_only_text(raw_text)
            selected = form.cleaned_data.get('slize_size')
            if selected:
                size_dict = {
                    "slize_pointfive": 0.005,
                    "slize_one": 0.01,
                    "slize_five": 0.05,
                    "slize_ten": 0.10,
                    "slize_twenty": 0.20,
                    "slize_thirty": 0.30,
                    "slize_forty": 0.40,
                    "slize_fifty": 0.50,
                }
                for k, v in size_dict.items():
                    if selected == k:
                        result = summarize(cleaned_text, ratio=v)
                        return render(request, 'text/textlize.html',
                                      {'form': form, 'result': result})
            else:
                result = summarize(cleaned_text, ratio=0.25)
                return render(request, 'text/textlize.html',
                              {'form': form, 'result': result})
    else:
        form = TextInputForm()
    return render(request, 'text/textlize.html', {'form': form})
def find_keywords(filepaths):
    print("Finding Keywords...")
    file_keywords = []
    files = []
    documents = []
    for fp in filepaths:
        ext = os.path.splitext(fp)[-1].lower()
        if ext == ".pdf":
            # keywords_set = clean_keywords(read_pdf(fp))
            text = ''
            pdfFileObj = open(fp, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)  # legacy PyPDF2 (<3.0) API
            num = pdfReader.numPages
            for i in range(num):
                pageObj = pdfReader.getPage(i)
                text += pageObj.extractText()
            pdfFileObj.close()
            if text != '':
                document = summarize(text)
            else:
                # fall back to the metadata file when no text could be extracted
                meta_path = os.path.dirname(fp) + '\\metadata.csv'
                des = pd.read_csv(meta_path, encoding='unicode_escape')
                try:
                    description = des['Description'][0]
                except KeyError:
                    description = des['Title'][0]
                document = description
            keywords_set = clean_keywords(keywords_from_summary(document))
            files.append(fp)
        elif ext == '.rtf':
            files.append(fp)
            with open(fp, 'r') as file:
                text = file.read()
            document_t = rtf_to_text(text).replace('\n', ' ').replace('\t', ' ')
            keywords_set = clean_keywords(keywords_from_summary(summarize(document_t)))
            document = document_t
        elif ext == '.docx':
            text = getText(fp)
            document = text
            keywords_set = clean_keywords(keywords_from_summary(summarize(text)))
            files.append(fp)
        else:
            files.append(fp)
            meta_path = os.path.dirname(fp) + '\\metadata.csv'
            des = pd.read_csv(meta_path, encoding='unicode_escape')
            try:
                description = des['Description'][0]
            except KeyError:
                description = des['Title'][0]
            document = description
            keywords_set = clean_keywords(keywords_from_summary(description))
        file_keywords.append(keywords_set)
        documents.append(document)
    # print(documents)
    return file_keywords, documents
def get_summary(url: str) -> str:
    if 'wikipedia' in urllib.parse.urlparse(url).netloc:
        sentences = get_wiki_text(url)
    else:
        sentences = get_article_text(url)
    if len(sentences.split()) > 2000:
        return summarize(sentences, word_count=1000)
    else:
        return summarize(sentences, ratio=0.5)
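
# Hedged usage sketch, not from the original source: it assumes get_wiki_text /
# get_article_text exist as referenced above and that the fetched page yields
# multi-sentence text (gensim's summarize raises ValueError on single-sentence input).
def _demo_get_summary():
    sample_url = 'https://en.wikipedia.org/wiki/Automatic_summarization'  # hypothetical URL
    try:
        print(get_summary(sample_url))
    except ValueError:
        print('Fetched text is too short for gensim to summarize')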
def bm25(input_seq):
    inp = ''.join(input_seq)
    ten = summarize(inp, ratio=0.1, split=True)
    thirty = summarize(inp, ratio=0.3, split=True)
    forty = summarize(inp, ratio=0.4, split=True)
    fifty = summarize(inp, ratio=0.5, split=True)
    return ten, thirty, forty, fifty


# ============================================================================
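
# Hedged usage sketch, not from the original source: shows how the four
# ratio-based sentence lists returned by bm25() might be unpacked. The sample
# text is hypothetical; gensim logs a warning for inputs under ten sentences
# but still returns a result.
def _demo_bm25():
    paragraphs = [
        "Automatic summarization condenses a document into a shorter version. ",
        "Extractive methods select existing sentences, while abstractive methods generate new ones. ",
        "TextRank is a common graph-based extractive algorithm. ",
        "It ranks sentences by their similarity to the rest of the document. ",
    ]
    ten, thirty, forty, fifty = bm25(paragraphs)
    # number of selected sentences grows with the ratio
    print(len(ten), len(thirty), len(forty), len(fifty))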
def reumir(self, text):
    res = summarize(text, ratio=0.1)
    print(res)
    res2 = summarize(text, word_count=100)
    print(res2)
    print(keywords(text, ratio=0.1))
def _doc_summarizer_eng(table, input_col, hold_cols=None,
                        result_type='summarized_document',
                        new_col_name='summarized_document',
                        ratio=None, num_sentence=1):
    doc_col = table[input_col].values
    len_doc_col = len(doc_col)
    if hold_cols is None:
        out_table = table.copy()
    else:
        out_table = table[hold_cols]
    table_list = []
    for i in range(len_doc_col):
        try:
            len_doc = len(summarize(doc_col[i], ratio=1, split=True))
        except ValueError as e:
            if str(e) == "input must have more than one sentence":
                summarized_doc = doc_col[i]
                summarized_sents = [doc_col[i]]
                _num_sentence = 1
            else:
                raise
        else:
            if ratio is not None:
                _num_sentence = np.maximum(int(len_doc * ratio), 1)
                _ratio = ratio
            else:
                _num_sentence = np.minimum(len_doc, num_sentence)
                _ratio = (_num_sentence / len_doc if len_doc != 0 else 1)
            summarized_doc = summarize(doc_col[i], ratio=_ratio, split=False)
            summarized_sents = summarize(doc_col[i], ratio=_ratio, split=True)
        if result_type == 'summarized_document':
            summarized_col = [summarized_doc]
        else:
            summarized_col = np.insert(
                np.transpose([summarized_sents[0:_num_sentence]]), 0, i + 1, axis=1)
        table_list.append(summarized_col)
    result_table = np.concatenate(table_list, axis=0)
    if result_type == 'summarized_document':
        out_table[new_col_name] = result_table
    else:
        out_table = pd.DataFrame(result_table, columns=['doc_id', 'sentence'])
        out_table['doc_id'] = out_table['doc_id'].astype(int)
    return {'out_table': out_table}
def get_speech():
    global out
    driver.get("https://www.moneycontrol.com/annual-report/"
               + company_name + "/directors-report/" + cd + "#" + cd)
    # director's speech
    director_speech = driver.find_element_by_xpath('//div[@class="report_data"]').text
    d = re.match('.*\\n', director_speech).group()
    # ds = director_speech.rstrip("\n")
    # ds = re.sub('\n', ' ', director_speech)
    ds = re.sub(d, ' ', director_speech)
    driver.get("https://www.moneycontrol.com/annual-report/"
               + company_name + "/chairmans-speech/" + cd + "#" + cd)
    chairman_speech = driver.find_element_by_xpath('//div[@class="report_data"]').text
    c = re.match('.*\\n', chairman_speech).group()
    # cs = re.sub('\n', ' ', chairman_speech)
    cs = re.sub(c, " ", chairman_speech)
    # keywords from the whole speeches
    ds_keyword_list = keywords(ds, words=20, split=True, lemmatize=True)
    cs_keyword_list = keywords(cs, words=20, split=True, lemmatize=True)
    # print(keyword_list)
    ds_keyword_tags = dict(nltk.pos_tag(ds_keyword_list))
    cs_keyword_tags = dict(nltk.pos_tag(cs_keyword_list))
    ds_keywords_final = [word for word in ds_keyword_tags.keys()
                         if ds_keyword_tags[word] == 'NN']
    cs_keywords_final = [word for word in cs_keyword_tags.keys()
                         if cs_keyword_tags[word] == 'NN']
    # also need to remove the company name if it is in the list
    # print(keywords_final[:5])
    # summarization
    ds_summ = summarize(ds, word_count=100)
    cs_summ = summarize(cs, word_count=100)
    driver.get("https://www.moneycontrol.com/annual-report/"
               + company_name + "/directors-report/" + cd + "#" + cd)
    time.sleep(3)
    page = driver.page_source
    driver.quit()
    soup = BeautifulSoup(page, 'html.parser')
    container = soup.find_all('div', attrs={'class': 'report_data'})
    soup_string = str(container)
    out = json.dumps([
        {'summary': cs_summ, 'keywords': cs_keywords_final[:5], 'fullCont': cs},
        {'summary': ds_summ, 'keywords': ds_keywords_final[:5], 'fullCont': soup_string},
    ])
    return out
def get_summary(text):
    num_of_words = len(text.split())
    print('[info] total size: ' + str(num_of_words))
    if num_of_words >= 5000:
        return summarize(text, 0.05)
    elif num_of_words >= 3000:
        return summarize(text, 0.1)
    elif num_of_words >= 1000:
        return summarize(text, 0.2)
    else:
        return summarize(text, 0.3)
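
# Hedged usage sketch, not from the original source: the tiers above keep
# roughly 5% of a 6,000-word text, 10% of a 4,000-word text, 20% of a
# 2,000-word text, and 30% of anything shorter. The text argument is assumed
# to be a multi-sentence string loaded elsewhere.
def _demo_get_summary_tiers(text):
    summary = get_summary(text)
    print('[info] summary size: ' + str(len(summary.split())))
    return summary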
def summarizeText(text):
    """Summarize each text in the list if needed."""
    text_summarized = []
    for txt in text:
        try:
            if summarize(txt) == '':
                text_summarized.append(txt)
            else:
                text_summarized.append(summarize(txt))
        except ValueError:
            # gensim raises ValueError for input with fewer than two sentences
            text_summarized.append(txt)
    return text_summarized
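
# Hedged usage sketch, not from the original source: short or single-sentence
# entries pass through unchanged, longer entries are replaced by their gensim
# summary. The sample strings are hypothetical.
def _demo_summarizeText():
    docs = [
        "One short sentence.",  # too short, returned as-is
        ("Gensim builds a sentence graph and ranks nodes with a variant of TextRank. "
         "Highly ranked sentences are kept in the summary. "
         "Low-ranked sentences are dropped. "
         "The result is an extractive summary of the original passage."),
    ]
    return summarizeText(docs)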
def create_summarized_feature(x):
    str_local = ""
    try:
        if len(x.split()) > 200:
            str_local = summarize(x, word_count=200)
        else:
            str_local = x
    except ValueError:
        # gensim cannot summarize single-sentence input, so fall back to a
        # RAKE-based reconstruction and summarize that instead
        str_local_Error = ". ".join(rake_implement(x))
        str_local = summarize(str_local_Error, word_count=200)
        print("Can't summarize this text because the input has only one sentence. "
              "Replacing it with a RAKE-based summary instead.")
    return str_local
def summarize(text, word_count=120):
    if num_sentences(text) > 1:
        try:
            word_count_summary = summarizer.summarize(text, word_count=word_count)
        except ValueError:
            return text
        if word_count_summary:
            return word_count_summary
        else:
            ratio_summary = summarizer.summarize(text, ratio=0.2)
            if ratio_summary:
                return ratio_summary
    return text
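
# Hedged usage sketch, not from the original source: num_sentences and the
# summarizer module are assumed to be the helpers this file already imports
# (gensim.summarization.summarizer). Single-sentence input falls through to
# the original text instead of raising.
def _demo_summarize_wrapper():
    print(summarize("Only one sentence here."))  # returned unchanged
    long_text = ("First sentence about the topic. Second sentence adds detail. "
                 "Third sentence gives an example. Fourth sentence concludes.")
    print(summarize(long_text, word_count=20))  # word-budgeted extractive summary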
def main():
    """NLP app with Streamlit."""
    st.title("NLP with streamlit")
    st.subheader("Natural Language Processing on the Go")

    # tokenization
    if st.checkbox("show tokens and lemma", False):
        st.markdown("Tokenize your Text")
        message = st.text_area("Enter your text", "Type Here")
        if st.button("Analyze"):
            nlp_result = text_analyzer(message)
            st.json(nlp_result)

    # named entity recognition
    if st.checkbox("show Named Entities", False):
        st.markdown("Extract entities from your Text")
        message = st.text_area("Enter your text", "Type Here")
        if st.button("Extract"):
            nlp_result = entity_analyzer(message)
            st.json(nlp_result)

    # sentiment analysis
    if st.checkbox("show Sentiment Analysis", False):
        st.markdown("Sentiment of your Text")
        message = st.text_area("Enter text", "Type Here")
        if st.button("Analyze"):
            blob = TextBlob(message)
            result_sentiment = blob.sentiment
            st.success(result_sentiment)

    # text summarization
    if st.checkbox("show Text Summarization", False):
        st.markdown("Summarize your Text")
        message = st.text_area("Enter text", "Type Here")
        summary_options = st.selectbox("Choose your summarizer", ("gensim", "sumy"))
        if st.button("Summarize"):
            if summary_options == 'sumy':
                st.text("Using sumy...")
                summary_result = sumy_summarizer(message)
            elif summary_options == 'gensim':
                st.text("Using gensim summarizer")
                summary_result = summarize(message)
            else:
                st.warning("Using default summarizer")
                st.text("Using Gensim")
                summary_result = summarize(message)
            st.success(summary_result)
def summarize_text(text, ratio=0.05):
    sentences = sent_tokenize(text)
    # drop very long sentences (50+ tokens) before summarizing
    sentences = list(filter(lambda x: len(word_tokenize(x)) < 50, sentences))
    text = '\n'.join(sentences)
    if len(sentences) > 1:
        return ' '.join(summarize(text, ratio=ratio, split=True))
    return ""
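
# Hedged usage sketch, not from the original source: sentences of 50+ tokens
# are filtered out before summarizing, so transcripts with run-on lines still
# produce output. sent_tokenize/word_tokenize are the NLTK functions assumed
# to be imported above.
def _demo_summarize_text(transcript):
    condensed = summarize_text(transcript, ratio=0.1)
    return condensed if condensed else transcript  # fall back to the raw text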
def summary_summarize_task(self, pk):
    '''
    Retrieve the CSV file from S3, read it into a dataframe, summarize each
    document, and collect the summaries and statistics in a dictionary.
    '''
    progress_recorder = ProgressRecorder(self)  # create progress recorder object
    doc = Summary_Documents.objects.get(pk=pk)  # get the document ref from the database
    documentName = str(doc.document)  # get the name of the doc
    aws_id = os.environ.get('AWS_ACCESS_KEY_ID')  # AWS access
    aws_secret = os.environ.get('AWS_SECRET_ACCESS_KEY')  # AWS access
    REGION = 'eu-west-1'
    client = boto3.client(
        's3',
        region_name=REGION,
        aws_access_key_id=aws_id,
        aws_secret_access_key=aws_secret,
    )  # create the client to retrieve the file from storage
    bucket_name = "doc-sort-file-upload"
    object_key = documentName
    csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
    body = csv_obj['Body']
    csv_string = body.read().decode('utf-8')
    data = pd.read_csv(StringIO(csv_string))  # read csv into dataframe
    documents = data['content']
    docs_summarized = 0
    docs_not_summarized = 0
    total_docs = 0
    documents_len = []
    summary_len = []
    result = []  # holds the summary (or a placeholder) for each document
    count = 0
    for doc in documents:  # iterate through the documents
        documents_len.append(len(doc))
        summary = summarize(doc, ratio=0.03)  # get summary
        summary_len.append(len(summary))
        total_docs += 1
        if not summary:  # gensim returns an empty string for very short documents
            result.append("Document too short")
            docs_not_summarized += 1
        else:
            result.append(summary)
            docs_summarized += 1
        progress_recorder.set_progress(count + 1, len(documents))  # update progress
        count += 1  # update count
    summary_dict = {}
    # Adding lists and counters as values
    summary_dict["Result"] = result
    summary_dict["Total_docs"] = total_docs
    summary_dict["Docs_summarized"] = docs_summarized
    summary_dict["Docs_not_summarized"] = docs_not_summarized
    summary_dict["Documents_len"] = documents_len
    summary_dict["Summary_len"] = summary_len
    return summary_dict
def get_summary(url, ratio):
    news = Article(url, language='ko')
    news.download()
    news.parse()
    return summarize(news.text, ratio=ratio)
def endpoint(event, context):
    body = json.loads(event['body'])
    text = body['text']
    # Strip out questions
    # new_text = [x for x in text.split('\n') if x[-1] != '?']
    # Remove single words
    # new_text = '\n'.join([x for x in new_text if len(x.split()) > 1])

    # Summarize
    summary = summarize(text, .5)
    # Strip out questions
    summary = [x for x in summary.split('\n') if x[-1] != '?']
    # Remove single words
    summary = '\n'.join([x for x in summary if len(x.split()) > 1])
    response = {
        "statusCode": 200,
        "headers": {
            'Access-Control-Allow-Origin': '*',
            'Access-Control-Allow-Credentials': True,
        },
        "body": json.dumps(summary)
    }
    return response
def gensim_summarize():
    from gensim.summarization.summarizer import summarize
    data = dict(default_data)
    data['message'] = "Summarize long text - Usage: 'text' POST parameter"
    params = {}
    if request.method == 'GET':
        return jsonify(data)
    params = request.form  # POST data
    if not params:
        data['error'] = 'Missing parameters'
        return jsonify(data)
    if 'text' not in params:
        data['error'] = '[text] parameter not found'
        return jsonify(data)
    if 'word_count' not in params:
        word_count = None
    else:
        word_count = int(params['word_count'])
    data['summarize'] = summarize(text=params['text'], word_count=word_count)
    return jsonify(data)
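
# Hedged client-side sketch, not from the original source: the host and route
# are hypothetical placeholders; it only shows the 'text' and 'word_count'
# form fields the view above expects, and the 'summarize' key it returns.
def _demo_call_summarize_endpoint():
    import requests
    payload = {
        'text': 'First sentence. Second sentence. Third sentence about the topic. Fourth sentence.',
        'word_count': 10,
    }
    resp = requests.post('http://localhost:5000/summarize', data=payload)  # hypothetical URL
    return resp.json().get('summarize')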
def text2sentences(self, text):
    # summarize the text
    # print(summarize(text, ratio=0.3))
    sentences1 = summarize(text, ratio=0.1)
    return self.kkma.sentences(sentences1)
def scrape_news_article(self, response):
    if 'html' in response.url:
        headline = response.css('h3 > span.titletext::text').extract()
        date_publish = response.css('time::text').extract()
        date_publish = date_publish[0]
        article_text = response.css('.textblock p::text').extract()
        article_text = ''.join(article_text)
        hot_words = self.get_hotwords(article_text)
        top_key_words = [(kw[0] + ', ') for kw in Counter(hot_words).most_common(7)]
        keywords = ''.join(top_key_words)
        author = ''
        subject = response.url.split('/')[3]
        summary = summarize(article_text)
        link = response.url
        headline = list(map(lambda x: x.strip(), headline))
        articleItem = NewsScrapingItem(headline=headline, date_publish=date_publish,
                                       article_text=article_text, author=author,
                                       subject=subject, keywords=keywords,
                                       summary=summary, link=link)
        yield articleItem
def csummary(bot, update, args):
    url_link = ' '.join(args[1:])
    title, raw_content = extract_article(url_link)
    if title == "0" and raw_content == "0":
        bot.send_message(chat_id=update.message.chat_id,
                         text="Unable to access the URL")
    elif raw_content == "0":
        bot.send_message(chat_id=update.message.chat_id,
                         text="No content is read")
    elif raw_content is None:
        bot.send_message(chat_id=update.message.chat_id,
                         text="Unable to retrieve content: perhaps it's too long")
    else:
        summary = summarize(raw_content, 0.5, int(args[0]))
        analysed_text = "Title: " + title + "\nSummary:\n" + summary
        original_len = count_words(raw_content)
        summarised_len = count_words(summary)
        if summarised_len >= original_len:
            bot.send_message(
                chat_id=update.message.chat_id,
                text="Failed to summarize: perhaps your content is too short "
                     "or the chunk sentence size is too large")
        else:
            # prevent over-flooding
            # text_splitter(bot, update, analysed_text)
            bot.send_message(chat_id=update.message.chat_id, text=analysed_text)
            data_text = ("\nOriginal length: " + str(original_len) + " words"
                         "\nSummary length: " + str(summarised_len) + " words"
                         "\nProportion of original length: "
                         + str(summarised_len / original_len))
            bot.send_message(chat_id=update.message.chat_id, text=data_text)
def bbc_dataset_rouge():
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    test_data = get_bbc_dataset_files()
    generic_baseline, generic_improved, generic_improved_redundancy_removal = None, None, None
    for v in test_data:
        text = v['full']
        summary = v['summary']
        scores = scorer.score(summarize(text, ratio=0.4), summary)
        generic_baseline = update_score(generic_baseline, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=False), summary)
        generic_improved = update_score(generic_improved, scores)
        scores = scorer.score(
            improved_summarize(text, ratio=0.4, redundancy_removal=True), summary)
        generic_improved_redundancy_removal = update_score(
            generic_improved_redundancy_removal, scores)
    total_news = len(test_data)
    return {
        'generic_baseline': get_score_avg(generic_baseline, total_news),
        'generic_improved_redundancy_removal':
            get_score_avg(generic_improved_redundancy_removal, total_news),
        'generic_improved': get_score_avg(generic_improved, total_news),
    }
def index(request):
    text = ""
    summarized_text = ""
    message = ""
    if request.method == 'POST':
        form = ImageUpload(request.POST, request.FILES)
        if form.is_valid():
            try:
                form.save()
                image = request.FILES['image']
                image = image.name
                path = settings.MEDIA_ROOT
                pathz = path + "/images/" + image
                text = pytesseract.image_to_string(Image.open(pathz))
                text = text.encode("ascii", "ignore")
                text = text.decode()
                # Summary (10% of the original content).
                summarized_text = summarize(text, ratio=0.1)
                os.remove(pathz)
            except Exception:
                message = ("check your filename and ensure it doesn't have any "
                           "space, or check if the image has any text")
    context = {
        'text': text,
        'summarized_text': summarized_text,
        'message': message
    }
    return render(request, 'formpage.html', context)
def summarise(data, context):
    print(context)
    print(data.textPayload)
    event = json.loads(data.textPayload)
    bucket_name = event['bucket']
    file_name = event['name']
    if "speech-to-text.txt" not in file_name:
        return
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.get_blob(file_name)
    # download_as_string returns bytes; decode before summarizing
    text = blob.download_as_string().decode('utf-8')
    body = summarize(text, 0.5)
    filepath_parts = file_name.split('/')
    summary_file = "/".join(filepath_parts[0:len(filepath_parts) - 2])
    blob = bucket.blob(summary_file)
    blob.upload_from_string(body, content_type="text/plain")
    response = {
        "statusCode": 200,
        "body": body
    }
    return json.dumps(response, indent=4)
def gensim_summarize(text, ratio=0.2, word_count=None):
    """
    Input: a paragraph as a text string, plus either the proportion of
    sentences to keep or a word limit for the summary.
    Output: a summary as a text string.
    """
    return summarize(text, ratio, word_count)
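
# Hedged usage sketch, not from the original source: shows the two modes of
# the wrapper above. ratio keeps a proportion of sentences; when word_count is
# given, gensim ignores ratio and caps the summary length instead.
def _demo_gensim_summarize(paragraph):
    by_ratio = gensim_summarize(paragraph, ratio=0.3)
    by_words = gensim_summarize(paragraph, word_count=50)
    return by_ratio, by_words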
def parse_url(self, url):
    from bs4 import BeautifulSoup
    from gensim.summarization.summarizer import summarize
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    summaries = []
    # URL cleaning
    if url is not None:
        url = url.strip('\'"')
    try:
        page = requests.get(url, headers=headers, stream=True)
        soup = BeautifulSoup(page.content, "lxml")
        # print('got soup')
        text = ' '.join(map(lambda p: p.text, soup.find_all('p')))
        # print('got text')
        text_summary = summarize(text)
        # print('got summary')
    except Exception:
        text_summary = ''
    summaries.append(text_summary)
    return summaries
def SKMN(input1, sentcount):
    summarizer = KeysentenceSummarizer(tokenize=komoran_tokenizer,
                                       similarity='textrank', min_sim=0.3)
    tr = TextRank()
    tr.loadSents(RawSentence(input1),
                 lambda sent: filter(lambda x: x not in stop_words, preprocessing(input1)))
    tr.build()
    sents = sent_tokenize(input1)
    # candidate key sentences from several extractors
    keysent1 = summarizer.summarize(sents, topk=sentcount)
    keysent2 = sent_tokenize(summarize(input1, ratio=0.1))
    keysent3 = sent_tokenize(tr.summarize(sentcount))
    keysent4 = summarizer.summarize(sents, topk=sentcount * 2)
    Mixlist = mixlist(mixlist(mixlist(keysent1, keysent2), keysent3), keysent4)
    mixlen = len(Mixlist)
    countlist = [0] * mixlen
    countlist = mcountlist(Mixlist, keysent1, countlist)
    countlist = mcountlist(Mixlist, keysent2, countlist)
    countlist = mcountlist(Mixlist, keysent3, countlist)
    countlist = mcountlist(Mixlist, keysent4, countlist)
    maxnum = max(countlist)
    rcountlist = [0] * mixlen
    for i in range(mixlen):
        rcountlist[i] = maxnum - countlist[i] + 1
    from queue import PriorityQueue
    que = PriorityQueue()
    for i in range(mixlen):
        que.put((rcountlist[i], Mixlist[i]))
    # return the sentcount sentences that appear in the most extractor outputs
    return '\n'.join([que.get()[1] for _ in range(sentcount)])
def scrape(self, response):
    articleItem = response.meta['item']
    articleItem['headline'] = response.css('h3::text').extract()
    articleItem['date_publish'] = response.css("time::attr('title')").extract()
    articleItem['article_text'] = response.css('.css-1jftgse p::text').extract()
    article_text = ''.join(articleItem['article_text'])
    articleItem['author'] = response.css(
        ".css-134vnn1 section:nth-child(3) li a span::text").extract()
    articleItem['author'] = ','.join(articleItem['author'])
    txt_blob = TextBlob(article_text)
    articleItem['sentiment'] = txt_blob.sentiment
    key_words = self.get_hotwords(article_text)
    top_key_words = [(kw[0] + ', ') for kw in Counter(key_words).most_common(7)]
    articleItem['keywords'] = ''.join(top_key_words)
    articleItem['summary'] = summarize(article_text, ratio=0.2)
    articleItem['link'] = response.url
    if 'Sekundäre Navigation' in articleItem['headline']:
        articleItem['headline'].remove('Sekundäre Navigation')
    for i in articleItem['date_publish']:
        pattern = re.compile(r'\d{2}.\d{2}.\d{4}')
        result = re.search(pattern, i)
        articleItem['date_publish'] = result.group()
    yield articleItem