def test_annotated_text(self):
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))
    document = parser.document

    self.assertEqual(len(document.paragraphs), 2)

    self.assertEqual(len(document.paragraphs[0].headings), 1)
    self.assertEqual(len(document.paragraphs[0].sentences), 1)
    self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                     "Toto je nadpis prvej úrovne")
    self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                     "Toto je prvý odstavec a to je fajn.")

    self.assertEqual(len(document.paragraphs[1].headings), 0)
    self.assertEqual(len(document.paragraphs[1].sentences), 2)
    self.assertEqual(to_unicode(document.paragraphs[1].sentences[0]),
                     "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
    self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                     "Aj súbory majú predsa city.")
def summarize_text(request):
    if request.html:
        parser = HtmlParser.from_file(file_path=request.html, url=request.url,
                                      tokenizer=Tokenizer(LANGUAGE))
    else:
        # The original passed request.html here as well, which is empty in
        # this branch; a plain-text source is assumed instead (request.text
        # is a guess at the actual attribute name).
        parser = PlaintextParser.from_file(file_path=request.text,
                                           tokenizer=Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html)
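# A minimal smoke test for summarize_text, assuming only the attributes the
# handler actually reads. FakeRequest is a hypothetical stand-in, not the
# original request class, and page.html is assumed to exist on disk.
class FakeRequest:
    html = "page.html"                        # path to a saved HTML file
    url = "http://www.example.com/page.html"  # base URL for relative links
    title = "Example page"

    def send_html(self, rendered):
        print(rendered)

summarize_text(FakeRequest())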
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))
    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1
    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2
    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
    print(urls[i])
    print("-------------------------------------------------------------------------------------")
    html = requests.get(urls[i]).content
    webpage_content = get_article(html)
    print(webpage_content.strip())
    print("-------------------------------------------------------------------------------------\n\n\n")
    print("===============================================================================================================")
    print("\n\n\n")
'''

# From a chosen file
root = tk.Tk().withdraw()  # hides the default tkinter root window
# Only allow web page files currently
filepath = askopenfilename(filetypes=[("Webpage", ["*.html", "*.rtf"])])
parser = HtmlParser.from_file(filepath, url="", tokenizer=Tokenizer('english'))


def sumy_parser():
    print("------------------------- sumy parser --------------------------------------------")
    try:
        webpage_content = ""
        for s in parser.document.sentences:
            webpage_content += str(s) + "\n"
        print(webpage_content.strip())
    except Exception as error:  # the original bare except swallowed the failure reason
        print("parser failed - error:", error)
    print("------------------------- end of sumy parser -------------------------------------\n\n\n")
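# The fragment above defines sumy_parser but never calls it; presumably the
# script invokes it directly once the file has been chosen:
sumy_parser()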
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.kl import KLSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 5  # an integer count; the original string "5" was a typo

if __name__ == "__main__":
    directory = "~/dropbox/17-18/573/AQUAINT/nyt/2000/"
    # TODO: Get list of files and loop each file
    filename = "20000101_NYT"
    process_file = "doc.txt"  # directory + filename
    url = "file:///home/unclenacho/school/573/src/doc.txt"

    parser = HtmlParser.from_file(process_file, None, Tokenizer(LANGUAGE))
    # parser = PlaintextParser.from_file(process_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
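# A minimal sketch of the TODO above: loop over every file in the AQUAINT
# directory instead of the single hard-coded doc.txt. It reuses summarizer,
# LANGUAGE, and SENTENCES_COUNT from the block above; parsing each file as
# HTML follows the original, though PlaintextParser may be the better fit
# for raw AQUAINT text.
import os

aquaint_dir = os.path.expanduser("~/dropbox/17-18/573/AQUAINT/nyt/2000/")
for name in sorted(os.listdir(aquaint_dir)):
    parser = HtmlParser.from_file(os.path.join(aquaint_dir, name), None,
                                  Tokenizer(LANGUAGE))
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)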