def test_annotated_text(self):
        path = expand_resource_path("snippets/paragraphs.html")
        url = "http://www.snippet.org/paragraphs.html"
        parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

        document = parser.document

        self.assertEqual(len(document.paragraphs), 2)

        self.assertEqual(len(document.paragraphs[0].headings), 1)
        self.assertEqual(len(document.paragraphs[0].sentences), 1)

        self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                         "Toto je nadpis prvej úrovne")
        self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                         "Toto je prvý odstavec a to je fajn.")

        self.assertEqual(len(document.paragraphs[1].headings), 0)
        self.assertEqual(len(document.paragraphs[1].sentences), 2)

        self.assertEqual(
            to_unicode(document.paragraphs[1].sentences[0]),
            "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
        self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                         "Aj súbory majú predsa city.")
Example #2
from ftfy import fix_text
from sumy.nlp.stemmers import Stemmer
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.utils import get_stop_words

# LANGUAGE, SENTENCES_COUNT, Summarizer and generate_html come from the
# surrounding module and are not shown in this snippet.


def summarize_text(request):
    if request.html:
        parser = HtmlParser.from_file(file_path=request.html,
                                      url=request.url,
                                      tokenizer=Tokenizer(LANGUAGE))
    else:
        # Assumption: the original passed request.html here as well, which is
        # falsy on this branch; request.text stands in for the plain-text path.
        parser = PlaintextParser.from_file(file_path=request.text,
                                           tokenizer=Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)
    sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html)
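
summarize_text expects a request-like object carrying html, url and title attributes plus a send_html callback (all visible in the snippet above). A minimal stub for exercising it locally, assuming the module-level names noted in the imports comment are defined; the class itself is hypothetical:

class FakeRequest:
    """Hypothetical stand-in for the snippet's request object."""

    def __init__(self, html, url, title):
        self.html = html  # path to an HTML file, or falsy to take the plain-text branch
        self.url = url
        self.title = title

    def send_html(self, html):  # the snippet only needs this one callback
        print(html)


summarize_text(FakeRequest("page.html", "http://example.com/page", "Example page"))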
Example #3
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1

    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2

    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
Example #4
# Commented-out demo that fetched each URL and printed the extracted article;
# the opening triple quote and the loop header were cut off in this snippet:
'''
for i in range(len(urls)):
    print(urls[i])
    print("-------------------------------------------------------------------------------------")
    html = requests.get(urls[i]).content
    webpage_content = get_article(html)
    print(webpage_content.strip())
    print("-------------------------------------------------------------------------------------\n\n\n")

print("===============================================================================================================")
print("\n\n\n")
'''

# From a chosen file
import tkinter as tk
from tkinter.filedialog import askopenfilename

from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.html import HtmlParser

root = tk.Tk()
root.withdraw()  # hide the default tkinter root window
filepath = askopenfilename(filetypes=[("Webpage", ["*.html", "*.rtf"])])  # only HTML/RTF files for now
parser = HtmlParser.from_file(filepath, url="", tokenizer=Tokenizer('english'))

def sumy_parser():
    print(
        "------------------------- sumy parser --------------------------------------------"
    )
    try:
        webpage_content = "\n".join(str(s) for s in parser.document.sentences)
        print(webpage_content.strip())
    except Exception as exc:  # a bare except hid the actual error
        print("parser failed - error:", exc)
    print(
        "------------------------- end of sumy parser -------------------------------------\n\n\n"
    )
Example #5
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.kl import KLSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 5  # a number, not a string

if __name__ == "__main__":
    directory = "~/dropbox/17-18/573/AQUAINT/nyt/2000/"

    # TODO: get the list of files and loop over each one

    filename = "20000101_NYT"

    process_file = "doc.txt"  # directory + filename

    url = "file://home/unclenacho/school/573/src/doc.txt"
    parser = HtmlParser.from_file(process_file, None, Tokenizer(LANGUAGE))

    # parser = PlaintextParser.from_file(process_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
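
Since doc.txt is plain text, the commented-out PlaintextParser line above is usually the better fit. A sketch of that variant, reusing the constants from the snippet:

from sumy.parsers.plaintext import PlaintextParser

parser = PlaintextParser.from_file(process_file, Tokenizer(LANGUAGE))
for sentence in summarizer(parser.document, SENTENCES_COUNT):
    print(sentence)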