def extract_article_data(source):
    """Extract an ArticleData instance from a DHNet article page.

    Args:
        source: either a file-like object (anything with a ``read`` method)
            whose content is the article HTML, or a URL string to fetch.

    Returns:
        A ``(article_data, html_content)`` tuple. ``article_data`` is None
        when the page has no parsable main content block; both values are
        None when fetching the URL yielded a 404.

    Raises:
        HTTPError: for any HTTP failure other than 404.
    """
    if hasattr(source, "read"):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            # A vanished article (404) is an expected outcome, not an
            # error; every other HTTP failure is propagated to the caller.
            if e.code == 404:
                return None, None
            raise

    soup = make_soup_from_html_content(html_content)
    main_content = soup.find("div", {"id": "maincontent"})

    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find("div", {"id": "articleText"})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            # No intro paragraph: extract the body with the flag cleared so
            # the first paragraph is treated as regular text.
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        # Collect every category of outgoing link found on the page.
        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)
        all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links
        updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER)

        fetched_datetime = datetime.today()

        new_article = ArticleData(
            source, title,
            pub_date, pub_time, fetched_datetime,
            updated_tagged_urls,
            category, author_name,
            intro, text
        )
        return new_article, html_content
    else:
        return None, html_content
def extract_article_data_from_html(html_content, source_url):
    """Extract an ArticleData instance from a LaLibre article page.

    Args:
        html_content: raw HTML of the article page.
        source_url: URL the page was fetched from (stored on the article).

    Returns:
        A ``(article_data, html_content)`` tuple. ``article_data`` is None
        when the page has no main content container or no title.
    """
    soup = make_soup_from_html_content(html_content)
    main_content = soup.find('div', {'id': 'mainContent'})

    # Guard against pages missing the expected container (matches the
    # defensive check used by the DHNet extractor) instead of raising
    # AttributeError on `None.h1`.
    if main_content is None or not main_content.h1:
        return None, html_content

    title = main_content.h1.contents[0].strip()

    category = extract_category(main_content)
    author = extract_author_name(main_content)
    pub_date, pub_time = extract_date(main_content)
    fetched_datetime = datetime.today()

    intro = extract_intro(main_content)
    text_content, in_text_links = extract_text_content_and_links(main_content)

    # Collect every category of outgoing link found on the page.
    embedded_audio_links = ipm_utils.extract_embedded_audio_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    associated_tagged_urls = ipm_utils.extract_and_tag_associated_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    bottom_links = ipm_utils.extract_bottom_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    embedded_content_links = extract_embedded_content_links(main_content)
    all_links = in_text_links + associated_tagged_urls + bottom_links + embedded_content_links + embedded_audio_links
    updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.LALIBRE_SAME_OWNER)

    new_article = ArticleData(source_url, title,
                              pub_date, pub_time, fetched_datetime,
                              updated_tagged_urls,
                              category, author,
                              intro, text_content)

    return new_article, html_content