def test_extract_article_custom_publication_datetime_selector():
    """Check publication_datetime extraction with a configured XPath selector.

    The article HTML fixture is run through ``extract_article`` with a
    config that supplies explicit ``publication_datetime`` and ``content``
    selectors; only the resulting ``publication_datetime`` is compared
    against the stored expectation.
    """
    # HTML fixture wrapped in a response object
    page = response_from_html_file(
        os.path.join(UNIT_TEST_DATA_DIR, "addictinginfo.com-1_article.html"))

    # Stored expectation for this selector configuration
    expected = article_from_json_file(
        os.path.join(
            UNIT_TEST_DATA_DIR,
            "addictinginfo.com-1_extracted_data_default_custom_publication_datetime_selector.json",
        ))

    # Site config with custom selectors for the datetime and the content
    config_yaml = """
    site_name: 'example.com'
    start_url: 'http://addictinginfo.com/category/news/'
    article:
        publication_datetime:
            select_method: 'xpath'
            select_expression: '//time[contains(concat(" ", normalize-space(@class), " "), " entry-date ")]/@datetime'
            match_rule: 'single'
        content:
            select_method: 'xpath'
            select_expression: '//div[@class="entry entry-content"]'
            match_rule: 'single'
    """
    settings = yaml.load(config_yaml, Loader=yaml.FullLoader)

    # Only the publication datetime is under test here
    extracted = extract_article(page, settings)
    field = "publication_datetime"
    assert extracted[field] == expected[field]
def test_extract_article_custom_content_selector():
    """Check content extraction with a configured XPath content selector.

    Only the ``content`` field of the extracted article is compared
    against the stored expectation.
    """
    # HTML fixture wrapped in a response object
    page = response_from_html_file(
        os.path.join(UNIT_TEST_DATA_DIR, "addictinginfo.com-1_article.html"))

    # Stored expectation for this selector configuration
    expected = article_from_json_file(
        os.path.join(
            UNIT_TEST_DATA_DIR,
            "addictinginfo.com-1_extracted_data_default_custom_content_selector.json",
        ))

    # Site config that overrides just the content selector
    config_yaml = """
    site_name: 'example.com'
    article:
        content:
            select_method: 'xpath'
            select_expression: '//div[@class="entry entry-content"]'
            match_rule: 'single'
    """
    settings = yaml.load(config_yaml, Loader=yaml.FullLoader)

    # Only the content field is under test here
    extracted = extract_article(page, settings)
    field = "content"
    assert extracted[field] == expected[field]
def test_extract_article_default_with_crawl_info():
    """Check default extraction when a crawl-info object is also passed.

    The whole extracted article is compared against the stored
    expectation.
    """
    # HTML fixture wrapped in a response object
    page = response_from_html_file(
        os.path.join(UNIT_TEST_DATA_DIR, "addictinginfo.com-1_article.html"))

    # Stored expectation for the crawl-info variant
    expected = article_from_json_file(
        os.path.join(
            UNIT_TEST_DATA_DIR,
            "addictinginfo.com-1_extracted_data_default_with_crawl_info.json",
        ))

    # Minimal site config: no custom selectors
    config_yaml = """
    site_name: 'example.com'
    start_url: 'http://addictinginfo.com/category/news/'
    """
    settings = yaml.load(config_yaml, Loader=yaml.FullLoader)

    # Stand-in for the crawl record passed as the third argument.
    # NOTE(review): the crawl_id contains a comma ("e4,1f"), which is unusual
    # for a UUID-shaped value; presumably the expected JSON fixture carries
    # the same string -- confirm before changing either side.
    crawl_record = MockDBEntry(
        crawl_id="bdbcf1cf-e4,1f-4c10-9958-4ab1b07e46ae",
        crawl_datetime="2018-10-17T20:25:34.234567+0000",
    )

    # Full-article comparison
    extracted = extract_article(page, settings, crawl_record)
    assert extracted == expected
def validate_extract_article(response, config, expected):
    """Extract an article and assert selected fields match ``expected``.

    Args:
        response: Response object handed to ``extract_article``.
        config: Site configuration handed to ``extract_article``.
        expected: Mapping holding the expected value for each checked field.

    Raises:
        AssertionError: If any checked field differs from ``expected``.
    """
    extracted = extract_article(response, config)

    # Fields are checked in this order: title, byline, publication datetime,
    # plain content, plain text.
    checked_fields = (
        'title',
        'byline',
        'publication_datetime',
        'plain_content',
        'plain_text',
    )
    for field in checked_fields:
        assert extracted[field] == expected[field], f"mismatch in {field!r}"
def test_extract_article_default():
    """Check extraction with a config that defines no custom selectors.

    The whole extracted article is compared against the stored
    expectation.
    """
    # HTML fixture wrapped in a response object
    page = response_from_html_file(
        os.path.join(UNIT_TEST_DATA_DIR, "addictinginfo.com-1_article.html"))

    # Stored expectation for the default configuration
    expected = article_from_json_file(
        os.path.join(
            UNIT_TEST_DATA_DIR,
            "addictinginfo.com-1_extracted_data_default.json",
        ))

    # Minimal site config: no custom selectors
    config_yaml = """
    site_name: 'example.com'
    start_url: 'http://addictinginfo.com/category/news/'
    """
    settings = yaml.load(config_yaml, Loader=yaml.FullLoader)

    # Full-article comparison
    extracted = extract_article(page, settings)
    assert extracted == expected
def test_extract_article_with_no_data_has_all_fields_present_but_null():
    """Check the article produced for a page holding only a bare ``<div>``.

    The expected article still carries every key; ``title``, ``byline``,
    ``publication_datetime`` and ``metadata`` are ``None``.
    """
    # Inline page with no article markup
    html = (
        "<html> <head></head> <body> "
        "<div> No article here. </div> "
        "</body> </html>"
    )
    page = TextResponse(url="http://example.com", body=html, encoding="utf-8")

    # Minimal site config
    config_yaml = """
    site_name: 'example.com'
    """
    settings = yaml.load(config_yaml, Loader=yaml.FullLoader)

    expected = {
        'site_name': "example.com",
        'article_url': "http://example.com",
        'title': None,
        'byline': None,
        'publication_datetime': None,
        'content': "<div>No article here.</div>",
        'plain_content': "<div>No article here.</div>",
        'plain_text': [{'text': 'No article here.'}],
        'metadata': None,
    }

    # Full-article comparison
    extracted = extract_article(page, settings)
    assert extracted == expected
def test_extract_datetime_works_with_multiple_dates():
    """Check datetime extraction when the selector matches several text nodes.

    The ``publication_datetime`` selector picks up every ``<p>`` text node
    in the sub-article (two dates plus body text) and uses the
    ``comma_join`` match rule; the expected article carries
    ``2006-05-15T00:00:00`` as its publication datetime.
    """
    # Inline page with two date paragraphs around the body text
    html = (
        "<html> <head></head> <body> "
        '<div class="subarticle"> '
        "<p>October 22, 2018</p> "
        "<p>Article text here.</p> "
        "<p>May 15, 2006</p> "
        "</div> </body> </html>"
    )
    page = TextResponse(url="http://example.com", body=html, encoding="utf-8")

    # Site config with a multi-match datetime selector and one datetime format
    config_yaml = """
    site_name: 'example.com'
    article:
        publication_datetime:
            select_method: 'xpath'
            select_expression: '//div[@class="subarticle"]/p/text()'
            match_rule: 'comma_join'
            datetime_formats:
                - 'MMMM D YYYY'
        content:
            select_method: 'xpath'
            select_expression: '//div[@class="subarticle"]'
            match_rule: 'single'
    """
    settings = yaml.load(config_yaml, Loader=yaml.FullLoader)

    expected = {
        'site_name': 'example.com',
        'article_url': 'http://example.com',
        'title': None,
        'byline': None,
        'publication_datetime': "2006-05-15T00:00:00",
        'content': '<div><p>October 22, 2018</p><p>Article text here.</p><p>May 15, 2006</p></div>',
        'plain_content': '<div><p>October 22, 2018</p><p>Article text here.</p><p>May 15, 2006</p></div>',
        'plain_text': [
            {"text": "October 22, 2018"},
            {"text": "Article text here."},
            {"text": "May 15, 2006"},
        ],
        'metadata': None,
    }

    # Full-article comparison
    extracted = extract_article(page, settings)
    assert extracted == expected