Python HTMLDocument.parse 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: cosrlib.document.html

클래스/타입: HTMLDocument

메소드/함수: parse

hotexamples.com에서의 예제들: 12

Python HTMLDocument.parse - 12개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python cosrlib.document.html.HTMLDocument.parse의 실제 사용 예제 중 가장 높은 평가를 받은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

HTMLDocument(23)

parse(6)

get_all_words(2)

get_hyperlinks(2)

get_title(2)

get_domain_paid_words(1)

get_external_hyperlinks(1)

get_internal_hyperlinks(1)

get_summary(1)

get_url(1)

get_url_words(1)

get_word_groups(1)

parse_canonical_url(1)

예제 #1

파일 보기

파일: test_encoding.py 프로젝트: JBaba/cosr-back

def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Check encoding detection against one W3C "input byte stream" fixture.

    ``p_w3c_test`` selects the fixture file name and ``p_expected_encoding``
    is the encoding name the detector must report, or ``None`` when no
    encoding is expected to be detected.  When a sibling ``.headers`` file
    exists, its contents (minus the first 14 bytes, stripped) are passed
    to the document as the ``content-type`` header.
    """
    from cosrlib.document.html import HTMLDocument

    fixture_name = "the-input-byte-stream-%s.html" % p_w3c_test
    test_file = os.path.join(
        "tests/testdata/html_w3c_encoding_testcases", fixture_name
    )
    headers_file = test_file + ".headers"

    headers = {}
    with open(test_file, "rb") as f:
        if os.path.isfile(headers_file):
            with open(headers_file, "rb") as hf:
                raw_header = hf.read()
            # NOTE(review): 14 presumably skips a leading "Content-Type: "
            # prefix -- confirm against the fixture files.
            headers["content-type"] = raw_header[14:].strip()

        html = f.read()

    doc = HTMLDocument(html, url=None, headers=headers)

    detected = doc.encoding.detect()
    if p_expected_encoding is None:
        assert detected is None
    else:
        assert detected.name == p_expected_encoding

    # Parsing is exercised as well; the test only requires it not to raise.
    doc.parse()

예제 #2

파일 보기

파일: test_formatting.py 프로젝트: mlinksva/cosr-back

 def format_html_title(title, url=None):
     """Wrap *title* in a minimal HTML page, parse it and format its title.

     The page is built by %-interpolating *title* into a fixed template,
     loaded into an ``HTMLDocument`` (with the optional *url*), parsed, and
     handed to ``format_title`` together with an empty dict.  The value
     returned by ``format_title`` is returned unchanged.
     """
     markup = """
         <html><head><title>%s</title></head><body>Hello</body></html>
     """ % title
     document = HTMLDocument(markup, url=url)
     document.parse()
     formatted = format_title(document, {})
     return formatted

예제 #3

파일 보기

파일: test_encoding.py 프로젝트: JBaba/cosr-back

def test_encoding_xml():
    """The encoding named in an XML declaration must be detected.

    The document starts with an ``<?xml ... encoding="shift_jis"?>``
    declaration; detection must report ``shift_jis`` and a subsequent
    ``parse()`` must not raise.
    """
    from cosrlib.document.html import HTMLDocument

    xml_prefixed_page = """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html>
<html  lang="en" ></html>
    """
    doc = HTMLDocument(xml_prefixed_page)

    detected = doc.encoding.detect()
    assert detected.name == "shift_jis"

    doc.parse()

예제 #4

파일 보기

파일: test_encoding.py 프로젝트: x0rzkov/cosr-back

def test_encoding_x_user_defined():
    """A ``<meta charset="x-user-defined">`` must be detected under that name.

    After checking the detected encoding name, ``parse()`` is called to
    make sure the document can still be parsed without raising.
    """
    from cosrlib.document.html import HTMLDocument

    page = """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    doc = HTMLDocument(page)

    detected = doc.encoding.detect()
    assert detected.name == "x-user-defined"

    doc.parse()

예제 #5

파일 보기

파일: test_encoding.py 프로젝트: x0rzkov/cosr-back

def test_encoding_xml():
    """Detect the encoding declared in a leading XML declaration.

    The input begins with ``<?xml version="1.0" encoding="shift_jis"?>``;
    the detector must report ``shift_jis`` and ``parse()`` must then run
    without raising.
    """
    from cosrlib.document.html import HTMLDocument

    source = """<?xml version="1.0" encoding="shift_jis"?><!DOCTYPE html>
<html  lang="en" ></html>
    """
    doc = HTMLDocument(source)

    encoding = doc.encoding.detect()
    assert encoding.name == "shift_jis"

    doc.parse()

예제 #6

파일 보기

파일: test_encoding.py 프로젝트: JBaba/cosr-back

def test_reparse():
    """Title bytes of an iso-8859-15 page must come back re-encoded.

    The page declares ``iso-8859-15`` and its title contains the single
    byte 0xE9.  After ``parse()``, ``get_title()`` must return the title
    with that character as the two-byte sequence 0xC3 0xA9.
    """
    from cosrlib.document.html import HTMLDocument

    latin9_page = """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    doc = HTMLDocument(latin9_page)

    detected = doc.encoding.detect()
    assert detected.name == "iso8859-15"

    # A re-parsing of the document should be triggered, gumbo only accepts utf-8
    doc.parse()

    title = doc.get_title()
    assert title == "Mac\xc3\xa9o"

예제 #7

파일 보기

파일: test_encoding.py 프로젝트: JBaba/cosr-back

def test_encoding_aliases():
    """Both ``tis-620`` and ``windows-874`` must be detected as ``cp874``.

    Each page is loaded into its own ``HTMLDocument``, the detected
    encoding name is checked, and ``parse()`` is called to make sure it
    does not raise.
    """
    from cosrlib.document.html import HTMLDocument

    aliased_pages = (
        """<html><head><meta charset="tis-620"></head><body>Hello</body></html>""",
        """<html><head><meta charset="windows-874"></head><body>Hello</body></html>""",
    )

    for page in aliased_pages:
        doc = HTMLDocument(page)
        assert doc.encoding.detect().name == "cp874"
        doc.parse()

예제 #8

파일 보기

파일: test_encoding.py 프로젝트: x0rzkov/cosr-back

def test_reparse():
    """Parsing an iso-8859-15 page must yield a re-encoded title.

    The title holds the single byte 0xE9 under a declared ``iso-8859-15``
    charset; once parsed, ``get_title()`` must return it with that
    character as the bytes 0xC3 0xA9.
    """
    from cosrlib.document.html import HTMLDocument

    markup = """<html><head><meta charset="iso-8859-15"><title>Mac\xe9o</title></head></html>"""
    doc = HTMLDocument(markup)

    encoding = doc.encoding.detect()
    assert encoding.name == "iso8859-15"

    # A re-parsing of the document should be triggered, gumbo only accepts utf-8
    doc.parse()

    parsed_title = doc.get_title()
    assert parsed_title == "Mac\xc3\xa9o"

예제 #9

파일 보기

파일: test_encoding.py 프로젝트: x0rzkov/cosr-back

def test_encoding_aliases():
    """Charset aliases ``tis-620`` and ``windows-874`` both map to ``cp874``.

    For each page a fresh ``HTMLDocument`` is built, the detected encoding
    name is asserted, and ``parse()`` is invoked to check it runs without
    raising.
    """
    from cosrlib.document.html import HTMLDocument

    pages = [
        """<html><head><meta charset="tis-620"></head><body>Hello</body></html>""",
        """<html><head><meta charset="windows-874"></head><body>Hello</body></html>""",
    ]

    for markup in pages:
        doc = HTMLDocument(markup)
        detected = doc.encoding.detect()
        assert detected.name == "cp874"
        doc.parse()

예제 #10

파일 보기

파일: test_encoding.py 프로젝트: x0rzkov/cosr-back

def test_encoding_w3c(p_w3c_test, p_expected_encoding):
    """Run encoding detection on one W3C "input byte stream" test page.

    ``p_w3c_test`` is interpolated into the fixture file name;
    ``p_expected_encoding`` is the encoding name that must be detected, or
    ``None`` if detection is expected to return nothing.  An optional
    ``<fixture>.headers`` file supplies the ``content-type`` header (its
    contents with the first 14 bytes removed, then stripped).
    """
    from cosrlib.document.html import HTMLDocument

    fixtures_dir = "tests/testdata/html_w3c_encoding_testcases"
    test_file = os.path.join(
        fixtures_dir, "the-input-byte-stream-%s.html" % p_w3c_test
    )
    headers_path = test_file + ".headers"

    headers = {}
    with open(test_file, "rb") as f:
        if os.path.isfile(headers_path):
            with open(headers_path, "rb") as hf:
                header_bytes = hf.read()
            # NOTE(review): the 14-byte offset presumably drops a leading
            # "Content-Type: " prefix -- confirm against the fixtures.
            headers["content-type"] = header_bytes[14:].strip()

        html = f.read()

    doc = HTMLDocument(html, url=None, headers=headers)

    result = doc.encoding.detect()
    if p_expected_encoding is None:
        assert result is None
    else:
        assert result.name == p_expected_encoding

    # The parse step has no assertion of its own; it must simply not raise.
    doc.parse()

예제 #11

파일 보기

파일: test_encoding.py 프로젝트: JBaba/cosr-back

def test_encoding_x_user_defined():
    """Detection must report ``x-user-defined`` when the meta tag declares it.

    The document is parsed afterwards to check that ``parse()`` does not
    raise for this encoding.
    """
    from cosrlib.document.html import HTMLDocument

    markup = """<html><head><meta charset="x-user-defined"></head><body>Hello</body></html>"""
    doc = HTMLDocument(markup)

    encoding = doc.encoding.detect()
    assert encoding.name == "x-user-defined"

    doc.parse()

예제 #12

파일 보기

파일: test_formatting.py 프로젝트: bakztfuture/cosr-back

 def format_html_title(title, url=None):
     """Render *title* inside a small UTF-8 HTML page and format its title.

     *title* is %-interpolated into a fixed template that declares
     ``<meta charset="UTF-8">``.  The page is loaded into an
     ``HTMLDocument`` (with the optional *url*), parsed, and passed to
     ``format_title`` with an empty dict; that call's result is returned.
     """
     page = """
         <html><head><meta charset="UTF-8"><title>%s</title></head><body>Hello</body></html>
     """ % title
     document = HTMLDocument(page, url=url)
     document.parse()
     result = format_title(document, {})
     return result