Python extract_article 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: misinformation.extractors

메소드/함수: extract_article

hotexamples.com에서의 예제들: 7

Python extract_article - 7개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python misinformation.extractors.extract_article의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

예제 #1

파일 보기

def test_extract_article_custom_publication_datetime_selector():
    """Extract with a custom publication datetime selector and check that field.

    The inline site config overrides both the ``publication_datetime`` and
    ``content`` selectors with XPath expressions. Only the extracted
    ``publication_datetime`` is compared with the expected JSON data.
    """
    # Build the response from the saved HTML page
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Read the expected extraction result
    expected_filename = "addictinginfo.com-1_extracted_data_default_custom_publication_datetime_selector.json"
    expected = article_from_json_file(
        os.path.join(UNIT_TEST_DATA_DIR, expected_filename))

    # Site config supplied inline as YAML
    site_config_yaml = """
        site_name: 'example.com'
        start_url: 'http://addictinginfo.com/category/news/'
        article:
            publication_datetime:
                select_method: 'xpath'
                select_expression: '//time[contains(concat(" ", normalize-space(@class), " "), " entry-date ")]/@datetime'
                match_rule: 'single'
            content:
                select_method: 'xpath'
                select_expression: '//div[@class="entry entry-content"]'
                match_rule: 'single'
    """
    site_config = yaml.load(site_config_yaml, Loader=yaml.FullLoader)

    # Only the publication datetime field is checked in this test
    extracted = extract_article(response, site_config)
    actual_datetime = extracted["publication_datetime"]
    expected_datetime = expected["publication_datetime"]
    assert actual_datetime == expected_datetime

예제 #2

파일 보기

def test_extract_article_custom_content_selector():
    """Extract with a custom content selector and check the ``content`` field.

    The inline site config overrides only the ``content`` selector with an
    XPath expression. Only the extracted ``content`` is compared with the
    expected JSON data.
    """
    # Build the response from the saved HTML page
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Read the expected extraction result
    expected_filename = "addictinginfo.com-1_extracted_data_default_custom_content_selector.json"
    expected = article_from_json_file(
        os.path.join(UNIT_TEST_DATA_DIR, expected_filename))

    # Site config supplied inline as YAML
    site_config_yaml = """
        site_name: 'example.com'
        article:
            content:
                select_method: 'xpath'
                select_expression: '//div[@class="entry entry-content"]'
                match_rule: 'single'

    """
    site_config = yaml.load(site_config_yaml, Loader=yaml.FullLoader)

    # Only the content field is checked in this test
    extracted = extract_article(response, site_config)
    actual_content = extracted["content"]
    expected_content = expected["content"]
    assert actual_content == expected_content

예제 #3

파일 보기

def test_extract_article_default_with_crawl_info():
    """Extract with no custom selectors while passing crawl info.

    The whole extracted article is compared with the expected JSON data.
    """
    # Build the response from the saved HTML page
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Read the expected extraction result
    expected_filename = "addictinginfo.com-1_extracted_data_default_with_crawl_info.json"
    expected = article_from_json_file(
        os.path.join(UNIT_TEST_DATA_DIR, expected_filename))

    # Site config without an ``article`` section
    site_config_yaml = """
        site_name: 'example.com'
        start_url: 'http://addictinginfo.com/category/news/'
    """
    site_config = yaml.load(site_config_yaml, Loader=yaml.FullLoader)

    # Stand-in for the crawl database entry handed to the extractor.
    # NOTE(review): the crawl_id contains a comma, which is unusual for a
    # UUID-like value - confirm it matches the expected JSON fixture.
    crawl_fields = {
        "crawl_id": "bdbcf1cf-e4,1f-4c10-9958-4ab1b07e46ae",
        "crawl_datetime": "2018-10-17T20:25:34.234567+0000",
    }
    crawl_info = MockDBEntry(**crawl_fields)

    # The full article must match
    extracted = extract_article(response, site_config, crawl_info)
    assert extracted == expected

예제 #4

파일 보기

def validate_extract_article(response, config, expected):
    """Assert that the article extracted from ``response`` matches ``expected``.

    Calls ``extract_article(response, config)`` and compares the result with
    ``expected`` field by field, in this order: title, byline, publication
    datetime, plain content, plain text. The first mismatch raises
    ``AssertionError``.
    """
    extracted = extract_article(response, config)
    checked_fields = (
        'title',
        'byline',
        'publication_datetime',
        'plain_content',
        'plain_text',
    )
    for field in checked_fields:
        assert extracted[field] == expected[field]

예제 #5

파일 보기

def test_extract_article_default():
    """Extract with no custom selectors and compare the whole article.

    The inline site config has no ``article`` section. The complete
    extraction result must equal the expected JSON data.
    """
    # Build the response from the saved HTML page
    page_path = os.path.join(UNIT_TEST_DATA_DIR,
                             "addictinginfo.com-1_article.html")
    response = response_from_html_file(page_path)

    # Read the expected extraction result
    expected_filename = "addictinginfo.com-1_extracted_data_default.json"
    expected = article_from_json_file(
        os.path.join(UNIT_TEST_DATA_DIR, expected_filename))

    # Site config supplied inline as YAML
    site_config_yaml = """
        site_name: 'example.com'
        start_url: 'http://addictinginfo.com/category/news/'
    """
    site_config = yaml.load(site_config_yaml, Loader=yaml.FullLoader)

    # The full article must match
    extracted = extract_article(response, site_config)
    assert extracted == expected

예제 #6

파일 보기

def test_extract_article_with_no_data_has_all_fields_present_but_null():
    """Extract from a page with no article markup and compare the full result.

    The expected article lists every field; ``title``, ``byline``,
    ``publication_datetime`` and ``metadata`` are expected to be ``None``.
    """
    # Minimal page containing a single div and no article structure
    html = """<html>
    <head></head>
    <body>
        <div>
            No article here.
        </div>
    </body>
    </html>"""
    response_kwargs = {
        "url": "http://example.com",
        "body": html,
        "encoding": "utf-8",
    }
    response = TextResponse(**response_kwargs)

    # Site config carrying only the site name
    site_config_yaml = """
    site_name: 'example.com'
    """
    site_config = yaml.load(site_config_yaml, Loader=yaml.FullLoader)

    # ``content`` and ``plain_content`` are expected to hold the same markup
    expected_markup = "<div>No article here.</div>"
    expected = {
        'site_name': "example.com",
        'article_url': "http://example.com",
        'title': None,
        'byline': None,
        'publication_datetime': None,
        'content': expected_markup,
        'plain_content': expected_markup,
        'plain_text': [{'text': 'No article here.'}],
        'metadata': None,
    }

    # The full article must match
    extracted = extract_article(response, site_config)
    assert extracted == expected

예제 #7

파일 보기

def test_extract_datetime_works_with_multiple_dates():
    """Extract from a page whose datetime selector matches several paragraphs.

    The page holds two date paragraphs around a text paragraph. The config
    selects all paragraph texts with the ``comma_join`` match rule and a
    single datetime format. The full extraction result is compared, with
    ``publication_datetime`` expected to be ``"2006-05-15T00:00:00"``.
    """
    # Page with two dates inside the selected div
    html = """<html>
    <head></head>
    <body>
        <div class="subarticle">
            <p>October 22, 2018</p>
            <p>Article text here.</p>
            <p>May 15, 2006</p>
        </div>
    </body>
    </html>"""
    response_kwargs = {
        "url": "http://example.com",
        "body": html,
        "encoding": "utf-8",
    }
    response = TextResponse(**response_kwargs)

    # Site config with custom datetime and content selectors
    site_config_yaml = """
    site_name: 'example.com'
    article:
        publication_datetime:
            select_method: 'xpath'
            select_expression: '//div[@class="subarticle"]/p/text()'
            match_rule: 'comma_join'
            datetime_formats:
              - 'MMMM D YYYY'
        content:
            select_method: 'xpath'
            select_expression: '//div[@class="subarticle"]'
            match_rule: 'single'
    """
    site_config = yaml.load(site_config_yaml, Loader=yaml.FullLoader)

    # ``content`` and ``plain_content`` are expected to hold the same markup
    expected_markup = '<div><p>October 22, 2018</p><p>Article text here.</p><p>May 15, 2006</p></div>'
    expected_paragraphs = [
        {"text": "October 22, 2018"},
        {"text": "Article text here."},
        {"text": "May 15, 2006"},
    ]
    expected = {
        'site_name': 'example.com',
        'article_url': 'http://example.com',
        'title': None,
        'byline': None,
        'publication_datetime': "2006-05-15T00:00:00",
        'content': expected_markup,
        'plain_content': expected_markup,
        'plain_text': expected_paragraphs,
        'metadata': None,
    }

    # The full article must match
    extracted = extract_article(response, site_config)
    assert extracted == expected