def extract_article_data(source):
    """Extract article data from a 7sur7 page.

    'source' is either a file-like object or a URL.
    """
    # if it's a file we just open it, assume it's an article and extract the article data
    if hasattr(source, 'read'):
        html_data = source.read()
    # if it's a URL we need to check whether it points to a photo album, the frontpage or a true article
    else:
        html_data = utils.fetch_html_content(source)
        page_type = detect_page_type(source)
        if page_type == IS_FRONTPAGE:
            return None, None
        elif page_type == MAYBE_ARTICLE:
            raise ValueError("We couldn't determine whether this was an article or the frontpage, please check")

    if is_404_page(html_data):
        return None, html_data

    # for all other genuine articles
    soup = bs4.BeautifulSoup(html_data)

    if soup.find("head").find("title").contents[0] == "301 Moved Permanently":
        return None, html_data
    else:
        title = extract_title(soup)

        author_box = soup.find(attrs={"class": "author"})
        author_name = extract_author_name(author_box)
        pub_date, pub_time = extract_date_and_time(author_box)
        # original_source = extract_source(author_box)

        intro, tagged_urls_from_intro = extract_intro(soup)
        category = extract_category(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        tagged_urls_read_more_box = extract_links_from_read_more_box(soup)
        tagged_urls_sidebar_box = extract_links_from_sidebar_box(soup)
        tagged_urls_embedded_media = extract_embedded_media(soup)

        tagged_urls = (tagged_urls_intext + tagged_urls_read_more_box + tagged_urls_sidebar_box
                       + tagged_urls_embedded_media + tagged_urls_from_intro)
        updated_tagged_urls = tagging.update_tagged_urls(tagged_urls, SEPTSURSEPT_SAME_OWNER)

        # print generate_test_func('same_owner', 'septsursept', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'same_owner', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/septsursept')

        return (ArticleData(source, title, pub_date, pub_time, dt.datetime.now(),
                            updated_tagged_urls, category, author_name, intro, text),
                html_data)
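
# Hedged usage sketch, not part of the original parser: a thin wrapper showing how the
# file-like branch above can be exercised from a locally saved page. The helper name and
# the sample path are hypothetical.
def _example_extract_from_sample_file(sample_path):
    # open a saved 7sur7 page and run the extractor on the file object; the URL branch
    # (page-type detection before download) is skipped, but the 404/301 checks still apply
    with open(sample_path) as f:
        article, raw_html = extract_article_data(f)
    return article
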
def extract_article_data(source): """ """ if hasattr(source, "read"): html_content = source.read() else: try: html_content = fetch_html_content(source) except HTTPError as e: if e.code == 404: return None, None else: raise except Exception: raise soup = make_soup_from_html_content(html_content) main_content = soup.find("div", {"id": "maincontent"}) if main_content and main_content.h1: title = remove_text_formatting_markup_from_fragments(main_content.h1.contents) pub_date, pub_time = extract_date_from_maincontent(main_content) category = extract_category_from_maincontent(main_content) author_name = extract_author_name_from_maincontent(main_content) article_text = main_content.find("div", {"id": "articleText"}) if article_has_intro(article_text): intro = extract_intro_from_articletext(article_text) text, in_text_links = extract_text_content_and_links_from_articletext(article_text) else: intro = u"" text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False) audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES) sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES) bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES) embedded_content_links = extract_links_to_embedded_content(main_content) all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER) fetched_datetime = datetime.today() # print generate_test_func('twizz_stream', 'dhnet', dict(tagged_urls=updated_tagged_urls)) # save_sample_data_file(html_content, source, 'twizz_stream', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/dhnet') # import os # generate_unittest("links_tweet_with_emoji", "dhnet", dict(urls=updated_tagged_urls), html_content, source, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/dhnet"), True) new_article = ArticleData( source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author_name, intro, text ) return new_article, html_content else: return None, html_content
def extract_article_data(source):
    """Extract article data from a new-style lesoir.be page.

    'source' is either a file-like object or a URL.
    """
    if hasattr(source, 'read'):
        html_data = source.read()
    else:
        try:
            source = convert_utf8_url_to_ascii(source)
            html_data = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                return None, None
            else:
                raise
        except Exception:
            raise

    soup = bs4.BeautifulSoup(html_data)

    # this is how we detect paywalled articles
    main_content = soup.find(attrs={"id": "main-content"})
    if main_content.h2 and main_content.h2.find(attrs={'class': 'ir locked'}):
        title = extract_title(soup)
        return (ArticleData(source, title, constants.NO_DATE, constants.NO_TIME, datetime.today(),
                            [], [constants.NO_CATEGORY_NAME], None, None, constants.PAYWALLED_CONTENT),
                html_data)
    else:
        title = extract_title(soup)
        author_name = extract_author_name(soup)
        intro, links_from_intro = extract_intro(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        category = extract_category(soup)
        sidebar_links = extract_links_from_sidebar_box(soup)
        article_tags = extract_article_tags(soup)

        embedded_media_from_top_box = extract_links_to_embedded_content(soup)
        embedded_media_from_bottom = extract_embedded_media_from_bottom(soup)
        embedded_media_in_article = extract_embedded_media_in_article(soup)
        embedded_media = embedded_media_from_top_box + embedded_media_from_bottom + embedded_media_in_article

        all_links = tagged_urls_intext + sidebar_links + article_tags + embedded_media + links_from_intro
        pub_date, pub_time = extract_date_and_time(soup)
        fetched_datetime = datetime.today()

        updated_tagged_urls = tagging.update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

        # print generate_test_func('embedded_storify_top_box', 'lesoir_new', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'embedded_storify_top_box', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls, category, author_name, intro, text),
                html_data)
def extract_article_data(source): """ """ if hasattr(source, 'read'): html_content = source.read() else: source = convert_utf8_url_to_ascii(source) try: html_content = fetch_html_content(source) except urllib2.HTTPError as err: if err.code == 404: return None, "<html><head><title>404</title></head><body></body></html>" else: raise err hxs = HtmlXPathSelector(text=html_content) if is_page_error_404(hxs): return None, html_content else: category = hxs.select("//p[starts-with(@class, 'fil_ariane')]/a//text()").extract() #old version title = hxs.select("//div[@id='article']/h1/text()").extract()[0] # new version: # title = hxs.select("//div[@id='article']/article//h1/text()").extract()[0] pub_date, pub_time = extract_date(hxs) author = hxs.select("//p[@class='auteur']/text()").extract()[0] fetched_datetime = datetime.today() intro, intro_links = extract_intro_and_links(hxs) content, content_links = extract_content_and_links(hxs) associated_links = extract_associated_links(hxs) all_links = intro_links + content_links + associated_links updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER) # import os # generate_unittest("embedded_dailymotion_video", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True) return (ArticleData(source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author, intro, content), html_content)
def extract_article_data(source): """ """ if hasattr(source, 'read'): html_content = source.read() else: html_content = fetch_html_content(source) soup = make_soup_from_html_content(html_content, additional_massage_functions=coveritlive_title_massage_functions) if is_page_error_404(soup): return None, html_content else: content = soup.find('div', {'id': 'content'}) category = extract_category(content) article = soup.find('div', {'id': 'article'}) title = extract_title(article) pub_date, pub_time = extract_date(article) author = extract_author_name(article) fetched_datetime = datetime.today() intro, intro_links = extract_intro_and_links(article) content, content_links = extract_content_and_links(article) associated_links = extract_associated_links(article) embedded_media = extract_embedded_media(article) all_links = intro_links + content_links + associated_links + embedded_media updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER) #print generate_test_func('intext_links_tagging', 'sudpresse', dict(tagged_urls=updated_tagged_urls)) #save_sample_data_file(html_content, source.name, 'intext_links_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudpresse') return ArticleData(source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author, intro, content), html_content
def extract_article_data_new_style(source, hxs):
    """Extract article data from a new-style lavenir.net article page."""
    category = hxs.select("//nav [contains(@id,'breadcrumb')]//li").extract()

    datetime_string = hxs.select("//div [@class='row content']//time/@datetime").extract()
    if not datetime_string:
        raise ValueError("Could not find the date, update the parser")

    parsed_datetime = datetime_from_iso8601(datetime_string[0])
    pub_date, pub_time = parsed_datetime.date(), parsed_datetime.time()
    fetched_datetime = datetime.now()

    title = hxs.select("//header//h1/text()").extract()
    if not title:
        raise ValueError("Could not find the title, update the parser")
    title = title[0]

    content_hxs = hxs.select("//div [@class='entry-content']")
    author_fragments = content_hxs.select(".//p [@class='copyright']/text()").extract()
    author = ''.join([remove_text_formatting_markup_from_fragments(author_fragments, strip_chars='\r\n\t ')])

    intro, intro_links = extract_intro_and_links_new(content_hxs)
    content, content_links = extract_content_and_links_new(content_hxs)

    other_div_hxs = content_hxs.select("//div [@class='entry-content']/div [not(contains(@class, 'entry-'))]")
    content_media_links = extract_links_from_other_divs(other_div_hxs)

    related_links = extract_related_links(hxs)
    media_links = extract_links_from_embbeded_media(content_hxs)
    tag_links = extract_links_from_tags(hxs)

    all_links = it.chain(intro_links, content_links, media_links, content_media_links, related_links, tag_links)
    updated_tagged_urls = update_tagged_urls(all_links, LAVENIR_SAME_OWNER)

    article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                               updated_tagged_urls, category, author, intro, content)
    return article_data
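
# Hedged sketch, not part of the original module: datetime_from_iso8601() is assumed to turn
# the <time datetime="..."> attribute (e.g. "2013-04-16T08:51:00+02:00") into a datetime
# object. A minimal stand-in under that assumption could look like the function below; the
# real helper may support more ISO 8601 variants. It reuses the module's datetime import.
def _datetime_from_iso8601_sketch(iso_string):
    # keep only the date and time part (first 19 characters), dropping any timezone offset
    naive_part = iso_string[:19]
    return datetime.strptime(naive_part, "%Y-%m-%dT%H:%M:%S")
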
def extract_article_data_from_html(html_content, source_url):
    """Extract article data from a lalibre.be page, given its raw HTML and source URL."""
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find('div', {'id': 'mainContent'})

    if main_content.h1:
        title = main_content.h1.contents[0].strip()
    else:
        return None, html_content

    category = extract_category(main_content)
    author = extract_author_name(main_content)
    pub_date, pub_time = extract_date(main_content)
    fetched_datetime = datetime.today()

    intro = extract_intro(main_content)

    text_content, in_text_links = extract_text_content_and_links(main_content)

    embedded_audio_links = ipm_utils.extract_embedded_audio_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    associated_tagged_urls = ipm_utils.extract_and_tag_associated_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    bottom_links = ipm_utils.extract_bottom_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    embedded_content_links = extract_embedded_content_links(main_content)

    all_links = in_text_links + associated_tagged_urls + bottom_links + embedded_content_links + embedded_audio_links
    updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.LALIBRE_SAME_OWNER)

    # generate_unittest("vuvox_without_title", 'lalibre', dict(updated_tagged_urls=updated_tagged_urls),
    #                   html_content, source_url,
    #                   os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lalibre"),
    #                   save_file=True)

    new_article = ArticleData(source_url, title, pub_date, pub_time, fetched_datetime,
                              updated_tagged_urls, category, author, intro, text_content)
    return new_article, html_content
def extract_article_data(source): """ source is either a file-like object, or a url. """ if hasattr(source, 'read'): html_content = source.read() else: html_content = fetch_html_content(source) soup = make_soup_from_html_content(html_content) story = soup.find('div', {'id': 'story'}) category = extract_category(story) title = extract_title(story) pub_date, pub_time = extract_date(story) author = extract_author_name(story) sidebar_links = extract_links(soup) intro = extract_intro(story) content, intext_links = extract_text_content(story) fetched_datetime = datetime.today() embedded_content_links = extract_links_from_embedded_content(story) all_links = sidebar_links + intext_links + embedded_content_links updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER) # generate_unittest("links_overload", "lesoir", dict(urls=updated_tagged_urls), html_content, source.name, "/Users/sevas/PycharmProjects/csxj-crawler-dev/tests/datasources/test_data/lesoir", True) return ArticleData(source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author, intro, content), html_content
def extract_article_data_old(source, hxs):
    """Process an old-style lavenir.net article page."""
    article_detail_hxs = hxs.select("//div[@id='content']/div[starts-with(@class,'span-3 article-detail')]")

    category = hxs.select("//div[@id='content']/*[1]/p/a/text()").extract()

    intro_h1s = article_detail_hxs.select(".//div[@id='intro']/h1/text()").extract()
    title = ''
    if len(intro_h1s) == 1:
        title = intro_h1s[0].strip()
    else:
        return None, None

    # all the date stuff
    # raw_date = article_detail_hxs.select(".//div[@id='intro']//li[@id='liDate']/*").extract()
    raw_date = ''.join([t.strip() for t in article_detail_hxs.select(".//div[@id='intro']//li[@id='liDate']//text()").extract()])
    pub_date, pub_time = extract_publication_date(raw_date)
    fetched_datetime = datetime.today()

    # author(s)
    raw_author = article_detail_hxs.select("./div/ul/li[@class='author']/text()").extract()
    author = None
    if raw_author:
        author = raw_author[0].strip()

    # intro
    intro = None
    raw_intro = article_detail_hxs.select("./div/div[@class='intro ']//text()").extract()
    if raw_intro:
        intro = ''.join([fragment.strip() for fragment in raw_intro])

    # in photoset pages, the structure is a bit different
    if not intro:
        raw_intro = article_detail_hxs.select("./div/div[@class='intro']//text()").extract()
        if raw_intro:
            intro = ''.join([fragment.strip() for fragment in raw_intro])

    # detect photoset
    full_class = article_detail_hxs.select("./@class").extract()[0]
    if 'article-with-photoset' in full_class.split(" "):
        title = u"{0}|{1}".format("PHOTOSET", title)

    all_links = list()

    # content
    article_body = article_detail_hxs.select("./div/div[@class='article-body ']")
    content = article_body.select(".//p//text()").extract()
    all_links.extend(extract_links_from_article_body(article_body))
    all_links.extend(extract_links_from_highlight_section(article_body.select('../..')))

    # associated sidebar links
    sidebar_links = article_detail_hxs.select("./div/div[@class='article-side']/div[@class='article-related']//li/a")
    all_links.extend(extract_sidebar_links(sidebar_links))

    # bottom links
    bottom_box = hxs.select('//div[@class="span-3 lire-aussi"]//a')
    all_links.extend(extract_bottom_links(bottom_box))

    updated_tagged_urls = update_tagged_urls(all_links, LAVENIR_SAME_OWNER)

    article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                               updated_tagged_urls, category, author, intro, content)
    return article_data
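
# Hedged sketch, not part of the original module: extract_article_data_old() and
# extract_article_data_new_style() both take a pre-built XPath selector, so a top-level entry
# point presumably builds it and routes to the right parser. Both the HtmlXPathSelector
# construction (mirroring the sudinfo parser above) and the dispatch test on the new-style
# 'entry-content' div are assumptions, not the module's actual rule.
def _extract_article_data_dispatch_sketch(source, html_content):
    hxs = HtmlXPathSelector(text=html_content)
    if hxs.select("//div [@class='entry-content']"):
        return extract_article_data_new_style(source, hxs)
    return extract_article_data_old(source, hxs)
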