Python process_record 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: trafilatura.core

메소드/함수: process_record

hotexamples.com에서의 예제들: 4

Python process_record - 예제 4개를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python trafilatura.core.process_record의 실제 사용 예제 가운데 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

def test_input():
    '''Check how the HTML loader and both extraction entry points treat unusable input.'''
    # Keyword arguments shared by the current and the legacy entry point.
    options = {'xml_output': False, 'tei_output': False, 'target_language': None}
    minimal_page = '<html><body>XYZ</body></html>'

    # A non-string input produces no tree, a minimal document does.
    assert utils.load_html(123) is None
    assert utils.load_html(minimal_page) is not None

    # Missing content gives no result: first extract(), then the legacy name.
    for entry_point in (extract, process_record):
        assert entry_point(None, 'url', '0000', **options) is None

예제 #2

파일 보기

def load_mock_page(url, xml_flag=False, langcheck=None):
    '''Load a cached sample page and run it through process_record.

    Args:
        url: lookup key into MOCK_PAGES giving the cached file name; it is
            also passed to process_record as the document URL.
        xml_flag: forwarded to process_record as xml_output.
        langcheck: forwarded to process_record as target_language.

    Returns:
        The value returned by process_record for the loaded page.
    '''
    filepath = os.path.join(TEST_DIR, 'cache', MOCK_PAGES[url])
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale-dependent default, which may silently mis-decode the sample.
        with open(filepath, 'r', encoding='utf-8') as inputf:
            htmlstring = inputf.read()
    except UnicodeDecodeError:
        # Fallback for samples that are not valid UTF-8: ISO-8859-1 maps
        # every byte to a character, so this read cannot fail on decoding.
        with open(filepath, 'r', encoding='ISO-8859-1') as inputf:
            htmlstring = inputf.read()
    return process_record(htmlstring, url, '0000', xml_output=xml_flag, tei_output=False, target_language=langcheck)

예제 #3

파일 보기

파일: unit_tests.py 프로젝트: vbarbaresi/trafilatura

def test_input():
    '''Check that strings, bytes and compressed input are loaded or rejected as expected.'''
    # A non-string, non-bytes input produces no tree.
    assert utils.load_html(123) is None

    # Text and byte inputs, including non-ASCII and Latin-1 encoded content,
    # must all yield a tree.
    parseable_inputs = (
        '<html><body>ÄÖÜ</body></html>',
        b'<html><body>\x2f\x2e\x9f</body></html>',
        '<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1'),
    )
    for markup in parseable_inputs:
        assert utils.load_html(markup) is not None

    # Missing content gives no extraction result.
    assert extract(None, 'url', '0000', target_language=None) is None

    # GZip: the raw bytes of the compressed fixture are passed straight to extract().
    archive_path = os.path.join(RESOURCES_DIR, 'webpage.html.gz')
    with open(archive_path, 'rb') as gzfile:
        compressed = gzfile.read()
    assert 'Long story short,' in extract(compressed)

    # The legacy entry point behaves like extract() for missing content.
    assert process_record(None, 'url', '0000', target_language=None) is None

예제 #4

파일 보기

def test_input():
    '''Check that loaded strings/trees are handled properly.

    Covers dubious-HTML detection, rejection of unsupported input types,
    loading of text and byte documents, GZip input and Unicode normalization.
    '''
    assert utils.is_dubious_html('This is a string.') is True
    assert utils.is_dubious_html(b'This is a string.') is True

    # An unsupported input type is rejected with an explanatory TypeError.
    with pytest.raises(TypeError) as err:
        utils.load_html(123)
    assert 'incompatible' in str(err.value)

    assert utils.load_html('<html><body>ÄÖÜ</body></html>') is not None
    assert utils.load_html(b'<html><body>\x2f\x2e\x9f</body></html>') is not None
    assert utils.load_html('<html><body>\x2f\x2e\x9f</body></html>'.encode('latin-1')) is not None

    # Each call needs its own context manager: the first exception raised
    # inside a pytest.raises block ends the block, so a second statement
    # placed in the same block would never be executed.
    with pytest.raises(TypeError):
        extract(None, 'url', '0000', target_language=None)
    # legacy
    # NOTE(review): assumes process_record raises TypeError for None just
    # like extract does -- confirm against the library version under test.
    with pytest.raises(TypeError):
        process_record(None, 'url', '0000', target_language=None)

    # GZip: the raw bytes of the compressed fixture are passed straight to extract().
    with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
        myinput = gzfile.read()
    assert 'Long story short,' in extract(myinput)

    # unicode normalization: the decomposed sequence must come out composed
    assert utils.normalize_unicode('A\u0308ffin') != 'A\u0308ffin'
    testresult = extract('<html><body><p>A\u0308ffin</p></body></html>', config=ZERO_CONFIG)
    assert testresult != 'A\u0308ffin'
    assert testresult == 'Äffin'