def test_annotated_text(self):
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))
    document = parser.document

    self.assertEqual(len(document.paragraphs), 2)

    self.assertEqual(len(document.paragraphs[0].headings), 1)
    self.assertEqual(len(document.paragraphs[0].sentences), 1)
    self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                     "Toto je nadpis prvej úrovne")
    self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                     "Toto je prvý odstavec a to je fajn.")

    self.assertEqual(len(document.paragraphs[1].headings), 0)
    self.assertEqual(len(document.paragraphs[1].sentences), 2)
    self.assertEqual(to_unicode(document.paragraphs[1].sentences[0]),
                     "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
    self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                     "Aj súbory majú predsa city.")
def summarize_text(request):
    if request.html:
        parser = HtmlParser.from_file(file_path=request.html, url=request.url,
                                      tokenizer=Tokenizer(LANGUAGE))
    else:
        # The original passed request.html here as well, which is empty in
        # this branch; a plain-text source is assumed instead (request.text
        # is a guess at the actual attribute name).
        parser = PlaintextParser.from_file(file_path=request.text,
                                           tokenizer=Tokenizer(LANGUAGE))

    stemmer = Stemmer(LANGUAGE)
    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    sentences = [fix_text(str(s)) for s in summarizer(parser.document, SENTENCES_COUNT)]
    html = generate_html(sentences, fix_text(request.title)).render()
    request.send_html(html)
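# A minimal smoke test for summarize_text, assuming only the attributes the
# handler actually reads. FakeRequest is a hypothetical stand-in, not the
# original request class, and page.html is assumed to exist on disk.
class FakeRequest:
    html = "page.html"                        # path to a saved HTML file
    url = "http://www.example.com/page.html"  # base URL for relative links
    title = "Example page"

    def send_html(self, rendered):
        print(rendered)

summarize_text(FakeRequest())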
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))
    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1
    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2
    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
    print(urls[i])
    print("-------------------------------------------------------------------------------------")
    html = requests.get(urls[i]).content
    webpage_content = get_article(html)
    print(webpage_content.strip())
    print("-------------------------------------------------------------------------------------\n\n\n")
    print("===============================================================================================================")
    print("\n\n\n")
'''

# From a chosen file
root = tk.Tk().withdraw()  # hides the default tkinter root window
# Only allow web page files currently
filepath = askopenfilename(filetypes=[("Webpage", ["*.html", "*.rtf"])])
parser = HtmlParser.from_file(filepath, url="", tokenizer=Tokenizer('english'))


def sumy_parser():
    print("------------------------- sumy parser --------------------------------------------")
    try:
        webpage_content = ""
        for s in parser.document.sentences:
            webpage_content += str(s) + "\n"
        print(webpage_content.strip())
    except Exception as error:  # the original bare except swallowed the failure reason
        print("parser failed - error:", error)
    print("------------------------- end of sumy parser -------------------------------------\n\n\n")
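# The fragment above defines sumy_parser but never calls it; presumably the
# script invokes it directly once the file has been chosen:
sumy_parser()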
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.kl import KLSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

LANGUAGE = "english"
SENTENCES_COUNT = 5  # an integer count; the original string "5" was a typo

if __name__ == "__main__":
    directory = "~/dropbox/17-18/573/AQUAINT/nyt/2000/"
    # TODO: Get list of files and loop each file
    filename = "20000101_NYT"
    process_file = "doc.txt"  # directory + filename
    url = "file:///home/unclenacho/school/573/src/doc.txt"

    parser = HtmlParser.from_file(process_file, None, Tokenizer(LANGUAGE))
    # parser = PlaintextParser.from_file(process_file, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)
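# A minimal sketch of the TODO above: loop over every file in the AQUAINT
# directory instead of the single hard-coded doc.txt. It reuses summarizer,
# LANGUAGE, and SENTENCES_COUNT from the block above; parsing each file as
# HTML follows the original, though PlaintextParser may be the better fit
# for raw AQUAINT text.
import os

aquaint_dir = os.path.expanduser("~/dropbox/17-18/573/AQUAINT/nyt/2000/")
for name in sorted(os.listdir(aquaint_dir)):
    parser = HtmlParser.from_file(os.path.join(aquaint_dir, name), None,
                                  Tokenizer(LANGUAGE))
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)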