def test_tokenize_japanese_sentence():
    tokenizer = Tokenizer('japanese')
    assert tokenizer.language == 'japanese'

    sentence = 'この文章を、正しくトークン化したい。'
    expected = ('この', '文章', 'を', '正しく', 'トークン', '化', 'し', 'たい')
    assert expected == tokenizer.to_words(sentence)

def test_tokenize_chinese_sentence():
    tokenizer = Tokenizer('chinese')
    assert tokenizer.language == 'chinese'

    sentence = '好用的文档自动化摘要程序。'
    expected = ('好用', '的', '文档', '自动化', '摘要', '程序')
    assert expected == tokenizer.to_words(sentence)

def test_tokenize_sentence():
    tokenizer = Tokenizer("english")
    words = tokenizer.to_words("I am a very nice sentence with comma, but..")

    expected = (
        "I", "am", "a", "very", "nice", "sentence",
        "with", "comma",
    )
    assert expected == words

def test_tokenize_japanese_paragraph():
    tokenizer = Tokenizer('japanese')
    expected = (
        '1つ目の文章です。',
        'その次は何が来ますか?',
        '「2つ目の文章」です。'
    )
    paragraph = '1つ目の文章です。その次は何が来ますか? 「2つ目の文章」です。'
    assert expected == tokenizer.to_sentences(paragraph)

def test_tokenize_chinese_paragraph():
    tokenizer = Tokenizer('chinese')
    expected = (
        '我正在为这个软件添加中文支持。',
        '这个软件是用于文档摘要!',
        '这个软件支持网页和文本两种输入格式?'
    )
    paragraph = '我正在为这个软件添加中文支持。这个软件是用于文档摘要!这个软件支持网页和文本两种输入格式?'
    assert expected == tokenizer.to_sentences(paragraph)

def test_tokenize_paragraph():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("""
        I am a very nice sentence with comma, but..
        This is next sentence. "I'm bored", said Pepek.
        Ou jee, duffman is here.
    """)

    expected = (
        "I am a very nice sentence with comma, but..",
        "This is next sentence.",
        '"I\'m bored", said Pepek.',
        "Ou jee, duffman is here.",
    )
    assert expected == sentences

def test_ensure_czech_tokenizer_available():
    tokenizer = Tokenizer("czech")
    assert "czech" == tokenizer.language

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    assert expected == sentences

def test_ensure_czech_tokenizer_available(self):
    tokenizer = Tokenizer("czech")
    self.assertEqual("czech", tokenizer.language)

    sentences = tokenizer.to_sentences("""
        Měl jsem sen, že toto je sen. Bylo to také zvláštní.
        Jakoby jsem plaval v moři rekurze.
    """)

    expected = (
        "Měl jsem sen, že toto je sen.",
        "Bylo to také zvláštní.",
        "Jakoby jsem plaval v moři rekurze.",
    )
    self.assertEqual(expected, sentences)

def test_slovak_alias_into_czech_tokenizer(self):
    tokenizer = Tokenizer("slovak")
    self.assertEqual(tokenizer.language, "slovak")

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    self.assertEqual(expected, sentences)

def test_slovak_alias_into_czech_tokenizer():
    tokenizer = Tokenizer("slovak")
    assert tokenizer.language == "slovak"

    sentences = tokenizer.to_sentences("""
        Je to veľmi fajn. Bodaj by nie.
        Ale na druhej strane čo je to oproti inému?
        To nechám na čitateľa.
    """)

    expected = (
        "Je to veľmi fajn.",
        "Bodaj by nie.",
        "Ale na druhej strane čo je to oproti inému?",
        "To nechám na čitateľa.",
    )
    assert expected == sentences

def lexrank_summarize(input_dict):
    """Return the summarized text string given the raw_dict from get_raw_dict()

    input:
        input_dict: dictionary containing document content under the "RAW_TEXT" key
                    (first input from get_raw_dict())
    """
    summarizer_lr = LexRankSummarizer()
    lr_fulltext = ""
    for key, value in input_dict.items():
        text = value["RAW_TEXT"]
        parser = PlaintextParser.from_string(text, Tokenizer("english"))
        summary = summarizer_lr(parser.document, 1)[0]
        lr_fulltext += str(summary) + " "
    return lr_fulltext

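# A minimal usage sketch for lexrank_summarize() above, assuming the same sumy
# imports (LexRankSummarizer, PlaintextParser, Tokenizer) are in scope. The
# document keys and text below are made-up placeholders; the only shape the
# function relies on is that each value stores its text under "RAW_TEXT".
sample_input = {
    "doc_1": {"RAW_TEXT": "Sumy is a library for automatic text summarization. "
                          "It implements several extractive algorithms."},
    "doc_2": {"RAW_TEXT": "LexRank ranks sentences by graph centrality. "
                          "The top-ranked sentence of each document is kept."},
}
# Prints one LexRank-selected sentence per document, joined into a single string.
print(lexrank_summarize(sample_input))
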
def summarize(self, documents, posts, max_sents):
    # print('Preprocessing...')
    documents = self.preprocess(documents)
    summaries = []
    for doc in documents:
        # normalize line breaks and collapse double spaces
        doc = doc.replace('\n', ' ').replace('  ', ' ').replace('\r', ' ').replace('  ', ' ')
        parser = PlaintextParser.from_string(doc, Tokenizer('english'))
        summary = ''
        summary_sents = self.summarizer(parser.document, max_sents)
        for sentence in summary_sents:
            # print(sentence)
            summary += sentence._text + ' '
        summaries.append(summary.strip())
    return summaries

def summary_from_text(self, text, count):
    sentences = []
    article = ''
    try:
        parser = PlaintextParser.from_string(text, Tokenizer(self.language))
        self.save_article(parser)
        article = text
        for sentence in self.summarizer(parser.document, count):
            sentences.append(str(sentence))
    except LinAlgError:
        # fall back to the raw text when the summarizer cannot decompose it
        sentences.append(text)
    except Exception:
        sentences = []
    return article, sentences

def summarize():
    """ Returns summary of articles """
    text = request.form['text']
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    final = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        final.append(str(sentence))

    return jsonify(summary=final)

def summarize_doc(self, bullets=3):
    parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    master = ""
    sentence_counter = 0
    for sentence in summarizer(parser.document, bullets):
        print(sentence)
        master = str(master) + str(sentence)
        sentence_counter += 1
    print(sentence_counter)
    return master

def slititle(sena):
    # Creates a temporary text file since sumy works that way
    with open('Titlefile___.txt', 'w') as tefil:
        tefil.write(sena)

    thefile = "Titlefile___.txt"  # name of the plain-text file
    parser = PlaintextParser.from_file(thefile, Tokenizer("english"))
    summarizer = LsaSummarizer()
    summary = summarizer(parser.document, 1)  # Reduce the document to 1 sentence
    os.remove('Titlefile___.txt')
    return str(summary[0]).split('.')[0]

def summarize(self, fields):
    """Return the summary for a hit, to facilitate building a bulk update."""
    assert self.content_field in fields
    content = fields[self.content_field][0]
    language = fields[self.lang_field][0] if self.lang_field in fields else 'en'
    language = LANGUAGE_MAP[language]

    parser = PlaintextParser.from_string(content, Tokenizer(language))
    stemmer = Stemmer(language)
    summarizer = LexRankSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(language)

    sentences = [str(s) for s in summarizer(parser.document, self.count)]
    summary = ' '.join(sentences)
    return summary

def extract_summary_keywords(trend, urls, titles):
    total_articles_content = extract_text(urls)
    keywords = extract_keywords_from_all_text(total_articles_content, titles)

    # write the collected article text to a temporary file next to this module
    current_path = os.path.dirname(os.path.realpath(__file__))
    current_path = current_path + '\\' + trend + '.txt'
    with open(current_path, 'w') as the_file:
        the_file.write(total_articles_content)

    parser = PlaintextParser.from_file(current_path, Tokenizer(LANGUAGE))
    os.remove(current_path)

    sentences = ''
    for sentence in summarizer(parser.document, 12):
        sentences = sentences + ' ' + str(sentence)

    replaced_syn = replacesynonym(sentences)
    matches = tool.check(sentences)
    correct_summary = language_check.correct(sentences, matches)
    return correct_summary, keywords

def summy_lex_rank_process_article_file(file_path):
    sents = []
    with io.open(file_path, 'r', encoding='utf-8') as article_file:
        for line in article_file:
            if line.find('@highlight') != -1:
                break
            line = line.strip()
            sents.extend(sent_tokenize(line))

    parser = PlaintextParser.from_string(' '.join(sents), Tokenizer('english'))
    summarizer = LexRankSummarizer()

    # Summarize the document to NUM_SUM_SENTS sentences
    sums = summarizer(parser.document, NUM_SUM_SENTS)
    res_list = []
    for summary in sums:
        res_list.append(str(summary))
    return res_list

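# A hedged usage sketch for summy_lex_rank_process_article_file() above,
# assuming NUM_SUM_SENTS is defined (e.g. 2) and nltk's sent_tokenize plus the
# sumy imports are available. The sample file mimics CNN/DailyMail-style
# articles, where the body comes first and reference highlights follow
# '@highlight' markers; only the body is summarized.
sample_article = (
    "The first body sentence of a toy article. "
    "The second body sentence adds more detail.\n"
    "\n"
    "@highlight\n"
    "\n"
    "A reference highlight that the function ignores.\n"
)
with io.open("sample_article.txt", "w", encoding="utf-8") as f:
    f.write(sample_article)
print(summy_lex_rank_process_article_file("sample_article.txt"))
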
def get_summary(article):
    url = article
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    parsed_articled = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        parsed_articled += (str(sentence) + "\n")
    return parsed_articled

def sumy_sum(filename):
    text = pdf_text(filename)
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    final = ""
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        # clean up line breaks, non-ASCII characters and repeated whitespace
        text = str(sentence)
        text = text.replace('\n', ' ')
        text = text.replace('\r', '')
        text = re.sub(r'[^\x00-\x7f]', r' ', text)
        text = re.sub(r'\s+', r" ", text)
        final += str(text)
    print(final)

def get_content(url_name, extras):
    LANGUAGE = "english"
    SENTENCES_COUNT = 5
    url = str(url_name)

    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    summarizer = Summarizer(Stemmer(LANGUAGE))
    summarizer.stop_words = extras

    sentence = None  # ensure `sentence` exists even if summarization fails before the first iteration
    try:
        result = []
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            result.append(str(sentence))
        result = ' '.join(map(str, result))
        return {'result': result, 'source': url}
    except Exception as e:
        print('fail: ', e)
        return extras, sentence

def make_summary(language, sentence_count, algorithm, text=None, url=None):
    tokenizer = Tokenizer(language)
    if url:
        parser = HtmlParser.from_url(url, tokenizer)
    else:
        parser = PlaintextParser.from_string(text, tokenizer)

    stemmer = Stemmer(language)
    summarizer = build_summarizer(algorithm, stemmer)
    summarizer.stop_words = stopwords.words(language)

    lines = []
    for sentence in summarizer(parser.document, sentence_count):
        lines.append(str(sentence))
    return '\n'.join(lines)

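# A hedged usage sketch for make_summary() above. build_summarizer() is not
# shown here, so the algorithm name "lsa" is only an assumption about what it
# accepts; the example text is a placeholder. NLTK's stopword corpus must be
# available (nltk.download("stopwords")) for stopwords.words(language) to work.
example_text = (
    "Sumy parses a document into sentences, scores them with the chosen "
    "algorithm, and returns the top-ranked ones. "
    "This sketch asks for a two-sentence summary of a plain-text string."
)
print(make_summary("english", 2, "lsa", text=example_text))
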
def gen_sum(document, n, alg="LSA"):
    parser = PlaintextParser.from_string(document, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if alg == "LSA":
        return run_LSA(stemmer, parser.document)
    elif alg == "LexRank":
        return run_LexRank(stemmer, parser.document)
    elif alg == "TextRank":
        return run_TextRank(stemmer, parser.document)
    elif alg == "Luhn":
        return run_Luhn(stemmer, parser.document)
    elif alg == "SumBasic":
        return run_SumBasic(stemmer, parser.document, n)
    else:
        exit("Unknown extractive summarization algorithm!")

def __init__(self, transcript_file_path, summary_number):
    """
    Input a transcript_file_path in the form of a string and a summary_number
    denoting the number of sentences requested in the summary.
    """
    self.transcript_file = transcript_file_path
    with open(self.transcript_file, "r") as transcript:
        full_transcript_text = transcript.read()
    self.tokenized_transcript = sent_tokenize(full_transcript_text)

    LANGUAGE = "english"  # lowercase language name, as expected by sumy's stop-word lookup
    parser = PlaintextParser.from_file(self.transcript_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    self.summary = summarizer(parser.document, summary_number)

def summarize():
    """ Returns summary of articles """
    text = request.form['text']
    # parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    parser = PlaintextParser.from_string(text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    final = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        final.append(str(sentence))

    length = len(final)
    return render_template('paraphrase.html', report=final, length=length)

def summarized():
    # Request data from the form
    URL = request.form['url-field']
    SENTENCES_COUNT = int(request.form['quantity'])  # form values arrive as strings
    LANGUAGE = "english"

    # Summarization
    parser = HtmlParser.from_url(URL, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        sentences.append(sentence)
    return sentences

def main(args=None):
    summarizer, document, items_count, reference_summary = handle_arguments()

    evaluated_sentences = summarizer(document, items_count)
    reference_document = PlaintextParser.from_string(reference_summary, Tokenizer(language))
    reference_sentences = reference_document.document.sentences

    for name, evaluate_document, evaluate in AVAILABLE_EVALUATIONS:
        if evaluate_document:
            result = evaluate(evaluated_sentences, document.sentences)
        else:
            result = evaluate(evaluated_sentences, reference_sentences)
        print("%s: %f" % (name, result))

    return 0

def summarize_sentences(sentences: str, language="english") -> list:
    # Prepare the sentences
    corpus_maker = EnglishCorpus()
    preprocessed_sentences = corpus_maker.preprocessing(sentences)
    preprocessed_sentence_list = corpus_maker.make_sentence_list(preprocessed_sentences)
    corpus = corpus_maker.make_corpus()
    parser = PlaintextParser.from_string(" ".join(corpus), Tokenizer(language))

    # Call the summarization algorithm and do the summarization
    summarizer = LexRankSummarizer()
    summarizer.stop_words = get_stop_words(language)
    summary = summarizer(document=parser.document, sentences_count=len(corpus) * 2 // 10)

    return summary

def test_document_is_all_in_upper_case():
    """
    When all words are in upper case, the plaintext parser takes the first line
    as a heading and the LexRank algorithm raises "ZeroDivisionError: float
    division by zero" because there is no sentence left to summarize.
    See https://github.com/miso-belica/sumy/issues/25
    """
    parser = PlaintextParser.from_string(
        "JUST WRITING SOME TEXT. TO TEST CASE. WITH ZERO SENTENCES RETURNED. FROM TOKENIZER.",
        Tokenizer("english")
    )
    summarizer = LexRankSummarizer(stem_word)
    summarizer.stop_words = get_stop_words("english")

    sentences = summarizer(parser.document, 20)
    assert len(sentences) == 0

def button_clicked(self):
    self.label.setText("Recording you!")
    self.update()

    LANGUAGE = "english"
    SENTENCES_COUNT = 5  # collects the x number of relevant sentences.

    import speech_recognition as sr

    for index, name in enumerate(sr.Microphone.list_microphone_names()):
        print("Microphone with name \"{1}\" found for `Microphone(device_index={0})`".format(index, name))

    # obtain audio from the microphone
    r = sr.Recognizer()
    commentary = ""
    while True:
        with sr.Microphone(1) as source:
            print("Say something!")
            audio = r.listen(source)

        # recognize speech using Google Speech Recognition
        try:
            speech = r.recognize_google(audio)
            print("You said " + speech)
            # use default Google API key `r.recognize_google(audio, key = implied)`
            commentary = commentary + speech  # + ". "
            if "Corpus" in speech:
                # saying "Corpus" stops recording; strip the spoken keyword (either casing) from the transcript
                commentary = commentary.replace("Corpus", "").replace("corpus", "")
                break
        except sr.UnknownValueError:
            print("Speech Recognition didn't catch that")
        except sr.RequestError as e:
            print("Could not request results from Speech Recognition service; {0}".format(e))
        except KeyboardInterrupt:
            break

    print("PRE-PUNCTUATION:", commentary, "\n")
    commentary = (fastpunct.punct([commentary], batch_size=32))
    print("PUNCTUATED", commentary, "\n")

    parser = PlaintextParser.from_string(commentary, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    print("SUMMARIZED TEXT:")
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

def get_summary(self):
    # The text has one sentence per line, so split on newlines
    sentences = [t for t in self._text.split('\n')]
    for i in range(1):
        print(sentences[i])

    # Build the morphological analyzer
    analyzer = Analyzer(
        [
            UnicodeNormalizeCharFilter(),
            RegexReplaceCharFilter(r'[(\)「」、。]', ' ')  # replace (, ), 「, 」, 、 and 。 with spaces
        ],
        JanomeTokenizer(),
        [
            POSKeepFilter(['名詞', '形容詞', '副詞', '動詞']),
            ExtractAttributeFilter('base_form')
        ]  # keep only the base forms of nouns, adjectives, adverbs and verbs
    )

    # Join the extracted words with spaces.
    # The trailing '。' lets tinysegmenter split the result back into sentences later.
    corpus = [' '.join(analyzer.analyze(s)) + '。' for s in sentences]
    """
    for i in range(2):
        print(corpus[i])
    """
    # Example analyzer output for the first two lines:
    # 転職 Advent Calendar 2016 - Qiita 14 日 目 なる 少し ポエム 含む。
    # 今年 11 月 SIer Web サービス 会社 転職 する。

    """
    from sumy.parsers.plaintext import PlaintextParser
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.summarizers.lex_rank import LexRankSummarizer
    """

    # Re-tokenize the joined corpus with tinysegmenter
    parser = PlaintextParser.from_string(''.join(corpus), Tokenizer('japanese'))

    # Extract a two-sentence summary with LexRank
    summarizer = LexRankSummarizer()
    summarizer.stop_words = [' ']  # a space is counted as a word, so treat it as a stop word to exclude it
    self.summary = summarizer(document=parser.document, sentences_count=2)

    # Print the original sentences
    for sentence in self.summary:
        print(sentences[corpus.index(sentence.__str__())])

def summarize():
    rows = store.get_row_by_status(1)
    for row in rows:
        parser = PlaintextParser.from_string(row["content_origin"], Tokenizer(LANGUAGE))
        stemmer = Stemmer(LANGUAGE)
        summarizer = Summarizer(stemmer)
        summarizer.stop_words = get_stop_words(LANGUAGE)

        sentences = list()
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            sentences.append(str(sentence))
        summary = "\n".join(sentences)

        store.update_row(row["id"], {"summary_origin": summary, "status": 2})

def perform_sumy_summarization(data, gold_standard_data):
    stemmer = Stemmer(LANGUAGE)
    summarizers = [LsaSummarizer(stemmer), TextRankSummarizer(stemmer), LexRankSummarizer(stemmer)]
    summarizer_names = ["Lsa", "TextRank", "LexRank"]
    # print "SUMY Scores: "

    # Read each sentence from 'data' and create a summary of it
    for line, gs_line in zip(data, gold_standard_data):
        # Only consider the content part of the text. Changed it from unicode to normal string
        # summarized_text = line["content"].encode('ascii', 'ignore')
        summarized_text = line["content"]
        if gs_line == "\n":
            continue
        gold_standard = gs_line

        # Read line by line instead of reading the entire file
        parser = PlaintextParser.from_string(summarized_text, Tokenizer(LANGUAGE))

        idx = 0
        for summarizer in summarizers:
            # Store the scores in a dictionary
            output_scores[line["index"]] = []
            sumy_dict_key = str(line["index"]) + summarizer_names[idx]
            sumy_scores_dict[sumy_dict_key] = []
            summarizer.stop_words = get_stop_words(LANGUAGE)
            # print "SUMY with", summarizer

            for sentence in summarizer(parser.document, SENTENCES_COUNT):
                if line["index"] in output_scores:
                    output_scores[line["index"]] = []
                if sumy_dict_key in sumy_scores_dict[sumy_dict_key]:
                    continue

                # Store output in a dictionary in the form of a key-value pair
                # Example --> 1: 'with the exception of the elderly and the youth'
                output_scores[int(line["index"])].append({"sumy_rouge_unigrams": calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 1)})
                output_scores[int(line["index"])].append({"sumy_rouge_bigrams": calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 2)})
                output_scores[int(line["index"])].append({"sumy_rouge_l": calculate_rouge_l_score(line["index"], gold_standard, str(sentence))})
                output_scores[int(line["index"])].append({"sumy_rouge_s": calculate_rouge_s_score(line["index"], gold_standard, str(sentence), 2)})

                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 1))
                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_n_score(line["index"], gold_standard, str(sentence), 2))
                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_l_score(line["index"], gold_standard, str(sentence)))
                sumy_scores_dict[sumy_dict_key].append(calculate_rouge_s_score(line["index"], gold_standard, str(sentence), 2))
            idx += 1

def summarize_sumy(doc, case):
    summary = ""
    file_doc = open("temp.txt", "w", encoding='utf-8')
    file_doc.write(doc)
    file_doc.close()

    parser = PlaintextParser.from_file("temp.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    if case == 1:
        summarizer = LexRankSummarizer(stemmer)
    else:
        summarizer = LsaSummarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary += str(sentence) + ' '
    return summary

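# A small usage sketch for summarize_sumy() above, assuming the LANGUAGE and
# SENTENCES_COUNT module constants (e.g. "english" and 3) and the sumy imports
# used elsewhere in this file. case=1 selects LexRank; any other value selects LSA.
document_text = "First sentence of a toy document. Second sentence with more words. Third and final sentence."
lexrank_summary = summarize_sumy(document_text, case=1)
lsa_summary = summarize_sumy(document_text, case=2)
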
def summarize(url):
    # url = "https://www.programmableweb.com/api/google-custom-search"
    parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    # or for plain text files
    # parser = PlaintextParser.from_file("document.txt", Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    summary = []
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        summary.append(sentence)
        # print(sentence)
    return summary

def sumy_summarization(input_text):
    sumy_output = {}
    start_time = time.time()

    parser = PlaintextParser.from_string(input_text, Tokenizer("english"))

    # using the LexRank summarizer, which is similar to the spaCy approach used above
    lex_summarizer = LexRankSummarizer()

    # use the top 5 sentences as the summary
    summarized_sentences = lex_summarizer(parser.document, 5)
    summary_sentences = [str(sentence) for sentence in summarized_sentences]

    sumy_output['summary'] = ' '.join(summary_sentences)
    sumy_output['total_time'] = time.time() - start_time
    return sumy_output

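# A hedged usage sketch for sumy_summarization() above; the article text is a
# placeholder. The function returns a dict with the joined 'summary' string and
# the wall-clock 'total_time' in seconds.
article = (
    "Extractive summarizers pick whole sentences from the source text. "
    "LexRank builds a sentence similarity graph and ranks sentences by centrality. "
    "The highest-ranked sentences are then joined into the summary."
)
result = sumy_summarization(article)
print(result['summary'])
print("took %.3f s" % result['total_time'])
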
def test_real_example(self):
    """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
    parser = PlaintextParser.from_string(
        load_resource("snippets/prevko.txt"),
        Tokenizer("czech")
    )
    summarizer = LsaSummarizer(Stemmer("czech"))
    summarizer.stop_words = get_stop_words("czech")

    sentences = summarizer(parser.document, 2)
    self.assertEqual(len(sentences), 2)
    self.assertEqual(
        to_unicode(sentences[0]),
        "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
    )
    self.assertEqual(
        to_unicode(sentences[1]),
        "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
        "což se chlapci ani trochu nelíbilo.")

def summarize():
    message = sys.stdin.readlines()
    print(message[0])
    jsonMsg = json.loads(message[0])
    TEXT = jsonMsg[0]
    LANGUAGE = jsonMsg[1]
    SENTENCES_COUNT = 1

    parser = PlaintextParser.from_string(TEXT, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

def url_summarize():
    if not Validation.validate_api_request(request=request, required_properties=["url", "ratio"]):
        return API.api_response(
            API.failure_code,
            "Request does not match defined schema. Check documentation")

    json_content = request.get_json()
    parser = HtmlParser.from_url(url=json_content['url'], tokenizer=Tokenizer('english'))
    text_from_url = ' '.join(str(sentence) for sentence in parser.document.sentences)

    summary = gensim.summarization.summarize(text=text_from_url, ratio=json_content['ratio'])
    return API.api_response(API.success_code, summary)

def test_tokenize_sentences_with_abbreviations():
    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("There are people who are weird, e.g. normal people. These people know you.")

    expected = (
        "There are people who are weird, e.g. normal people.",
        "These people know you.",
    )
    assert expected == sentences