def get_frontpage_toc():
    html_data = fetch_html_content('http://www.lesoir.be')
    hxs = HtmlXPathSelector(text=html_data)

    # main stories
    list_items = hxs.select("//div [@id='main-content']//ul/li")
    headlines_links = list_items.select("./h2/a | ./h3/a")

    # just for the blog count statistics
    blog_block = hxs.select("//div [@class='bottom-content']//div [@class='block-blog box']//h5/a")

    # mainly soccer
    sport_block = hxs.select("//div [@class='bottom-content']//div [@class='block-sport']")
    sports_links = sport_block.select(".//h2/a | .//aside//li/a")

    # bottom sections
    bottom_news_links = hxs.select("//div [@class='bottom-content']//div [@class='block-articles']//a")

    all_links_hxs = itertools.chain(headlines_links, blog_block, sports_links, bottom_news_links)
    regular_articles_hxs, all_paywalled_hxs = separate_paywalled_articles(all_links_hxs)

    titles_and_urls = [extract_title_and_url(link) for link in regular_articles_hxs]
    paywalled_titles_and_urls = [extract_title_and_url(link) for link in all_paywalled_hxs]

    articles_toc, blogpost_toc = separate_news_and_blogposts(titles_and_urls)

    return ([(title, reconstruct_full_url(url)) for (title, url) in articles_toc],
            blogpost_toc,
            [(title, reconstruct_full_url(url)) for (title, url) in paywalled_titles_and_urls])
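
# Illustrative usage sketch, not part of the original scrapers: every get_frontpage_toc()
# variant in this collection returns a 3-tuple (article items, blogpost items, paywalled items),
# each a list of (title, url) pairs. A caller might summarise the result as below; the helper
# name is hypothetical.
def print_frontpage_summary():
    articles, blogposts, paywalled = get_frontpage_toc()
    print("%d articles, %d blogposts, %d paywalled items" % (len(articles), len(blogposts), len(paywalled)))
    for title, url in articles:
        print(u" - %s (%s)" % (title, url))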
def get_frontpage_toc(): url = "http://www.dhnet.be" html_content = fetch_html_content(url) soup = make_soup_from_html_content(html_content) main_content = soup.find("div", {"id": "maincontent"}) if main_content: all_titles_and_urls = [] # so, the list here is a combination of several subcontainer types. # processing every type separately first_title, first_url = get_first_story_title_and_url(main_content) all_titles_and_urls.append((first_title, first_url)) # this will pick up the 'annouceGroup' containers with same type in the 'regions' div first_announce_groups = main_content.findAll( "div", {"class": "announceGroupFirst announceGroup"}, recursive=True ) announce_groups = main_content.findAll("div", {"class": "announceGroup"}, recursive=True) # all those containers have two sub stories for announce_group in chain(first_announce_groups, announce_groups): titles_and_urls = extract_title_and_link_from_anounce_group(announce_group) all_titles_and_urls.extend(titles_and_urls) return [(title, "http://www.dhnet.be%s" % url) for (title, url) in all_titles_and_urls], [], [] else: return [], [], []
def extract_article_data(source):
    # 'source' is either a file-like object or a url.
    # If it's a file, we just read it, assume it's an article and extract the article data.
    if hasattr(source, 'read'):
        html_data = source.read()
    # If it's a url, we need to check whether it's a photo album, a link to the frontpage or an actual article.
    else:
        html_data = utils.fetch_html_content(source)
        page_type = detect_page_type(source)
        if page_type == IS_FRONTPAGE:
            return None, None
        elif page_type == MAYBE_ARTICLE:
            raise ValueError("We couldn't determine whether this was an article or the frontpage, please check")

    if is_404_page(html_data):
        return (None, html_data)

    # for all other actual articles
    soup = bs4.BeautifulSoup(html_data)
    if soup.find("head").find("title").contents[0] == "301 Moved Permanently":
        return (None, html_data)
    else:
        title = extract_title(soup)
        author_box = soup.find(attrs={"class": "author"})
        author_name = extract_author_name(author_box)
        pub_date, pub_time = extract_date_and_time(author_box)
        # original_source = extract_source(author_box)
        intro, tagged_urls_from_intro = extract_intro(soup)
        category = extract_category(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        tagged_urls_read_more_box = extract_links_from_read_more_box(soup)
        tagged_urls_sidebar_box = extract_links_from_sidebar_box(soup)
        tagged_urls_embedded_media = extract_embedded_media(soup)

        tagged_urls = (tagged_urls_intext + tagged_urls_read_more_box + tagged_urls_sidebar_box
                       + tagged_urls_embedded_media + tagged_urls_from_intro)
        updated_tagged_urls = tagging.update_tagged_urls(tagged_urls, SEPTSURSEPT_SAME_OWNER)

        # print generate_test_func('same_owner', 'septsursept', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'same_owner', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/septsursept')

        return (ArticleData(source, title, pub_date, pub_time, dt.datetime.now(),
                            updated_tagged_urls, category, author_name, intro, text),
                html_data)
def extract_article_data(source): """ """ if hasattr(source, 'read'): html_content = source.read() else: html_content = fetch_html_content(source) return extract_article_data_from_html(html_content, source)
def extract_article_data(source): """ """ if hasattr(source, "read"): html_content = source.read() else: try: html_content = fetch_html_content(source) except HTTPError as e: if e.code == 404: return None, None else: raise except Exception: raise soup = make_soup_from_html_content(html_content) main_content = soup.find("div", {"id": "maincontent"}) if main_content and main_content.h1: title = remove_text_formatting_markup_from_fragments(main_content.h1.contents) pub_date, pub_time = extract_date_from_maincontent(main_content) category = extract_category_from_maincontent(main_content) author_name = extract_author_name_from_maincontent(main_content) article_text = main_content.find("div", {"id": "articleText"}) if article_has_intro(article_text): intro = extract_intro_from_articletext(article_text) text, in_text_links = extract_text_content_and_links_from_articletext(article_text) else: intro = u"" text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False) audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES) sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES) bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES) embedded_content_links = extract_links_to_embedded_content(main_content) all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER) fetched_datetime = datetime.today() # print generate_test_func('twizz_stream', 'dhnet', dict(tagged_urls=updated_tagged_urls)) # save_sample_data_file(html_content, source, 'twizz_stream', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/dhnet') # import os # generate_unittest("links_tweet_with_emoji", "dhnet", dict(urls=updated_tagged_urls), html_content, source, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/dhnet"), True) new_article = ArticleData( source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author_name, intro, text ) return new_article, html_content else: return None, html_content
def get_frontpage_toc(): """ Fetch links to articles listed on the 'Le Soir' front page. For each of them, extract the relevant data. """ url = 'http://www.lesoir.be' html_content = fetch_html_content(url) soup = make_soup_from_html_content(html_content) # Here we have interlaced <ul>'s with a bunch of random other shit that # need some filtering stories_containers = soup.findAll('ul', {'class': 'stories_list grid_6'}) articles_toc, blogpost_toc = [], [] for container in stories_containers: all_stories = set(container.findAll('li', recursive=False)) main_stories = set(container.findAll('li', {'class': 'stories_main clearfix'}, recursive=False)) other_stories = all_stories - main_stories # So, in _some_ lists of stories, the first one ('main story') has its title in an <h1> # and the rest in an <h2> # Also, some have two columns stories. # Beautiful soup indeed. for item in main_stories: title, url = (item.h1.a.get('title'), item.h1.a.get('href')) if is_external_blog(url): blogpost_toc.append((title, url)) else: articles_toc.append((title, url)) for item in other_stories: if element_has_two_columns_stories(item): # For some reason, those links don't have a 'title' attribute. # Love this. def extract_title_and_link(item): return item.h2.a.contents[0], item.h2.a.get('href') for story in get_two_columns_stories(item): title, url = extract_title_and_link(story) if is_external_blog(url): blogpost_toc.append((title, url)) else: articles_toc.append((title, url)) else: title, url = (item.h2.a.get('title'), item.h2.a.get('href')) if is_external_blog(url): blogpost_toc.append((title, url)) else: articles_toc.append((title, url)) return [(title, 'http://www.lesoir.be{0}'.format(url)) for (title, url) in articles_toc], blogpost_toc, []
def get_frontpage_toc():
    hostname_url = 'http://www.lalibre.be'
    html_content = fetch_html_content(hostname_url)
    soup = make_soup_from_html_content(html_content)

    article_list_container = soup.find('div', {'id': 'mainContent'})
    announces = article_list_container.findAll('div', {'class': 'announce'}, recursive=False)

    def extract_title_and_link(announce):
        title, url = announce.h1.a.contents[0], announce.h1.a.get('href')
        return title, '{0}{1}'.format(hostname_url, url)

    return [extract_title_and_link(announce) for announce in announces], [], []
def extract_article_data(source):
    if hasattr(source, 'read'):
        html_data = source.read()
    else:
        try:
            source = convert_utf8_url_to_ascii(source)
            html_data = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                return None, None
            else:
                raise
        except Exception:
            raise

    soup = bs4.BeautifulSoup(html_data)

    # this is how we detect paywalled articles
    if soup.find(attrs={"id": "main-content"}).h2 and soup.find(attrs={"id": "main-content"}).h2.find(attrs={'class': 'ir locked'}):
        title = extract_title(soup)
        return (ArticleData(source, title, constants.NO_DATE, constants.NO_TIME, datetime.today(),
                            [], [constants.NO_CATEGORY_NAME], None, None, constants.PAYWALLED_CONTENT),
                html_data)
    else:
        title = extract_title(soup)
        author_name = extract_author_name(soup)
        intro, links_from_intro = extract_intro(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        category = extract_category(soup)
        sidebar_links = extract_links_from_sidebar_box(soup)
        article_tags = extract_article_tags(soup)
        embedded_media_from_top_box = extract_links_to_embedded_content(soup)
        embedded_media_from_bottom = extract_embedded_media_from_bottom(soup)
        embedded_media_in_article = extract_embedded_media_in_article(soup)
        embedded_media = embedded_media_from_top_box + embedded_media_from_bottom + embedded_media_in_article
        all_links = tagged_urls_intext + sidebar_links + article_tags + embedded_media + links_from_intro
        pub_date, pub_time = extract_date_and_time(soup)
        fetched_datetime = datetime.today()
        updated_tagged_urls = tagging.update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

        # print generate_test_func('embedded_storify_top_box', 'lesoir_new', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'embedded_storify_top_box', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls, category, author_name, intro, text),
                html_data)
def get_frontpage_toc(): url = "http://{0}/info".format(LEVIF_NETLOC) html_data = fetch_html_content(url) hxs = HtmlXPathSelector(text=html_data) h1_breaking_news_links = hxs.select("//div [@id='body']/div/div[@class='frame col_650']/div [@class='frame breakingNews']//div[@class='teaserContent']//h1/a") h2_breaking_news_links = hxs.select("//div [@id='body']/div/div[@class='frame col_650']/div [@class='frame breakingNews']//div[@class='teaserContent']//h2/a") other_news = hxs.select("//div [@id='body']/div/div[@class='frame col_650']/div [@class='frame teaserRow2 clearfix']//div[@class='teaserContent']/../h1/a") titles_and_urls = [extract_title_and_url(link_hxs) for link_hxs in chain(h1_breaking_news_links, h2_breaking_news_links, other_news)] frontpage_items, blogposts = split_news_and_blogposts(titles_and_urls) return frontpage_items, blogposts, []
def get_frontpage_toc(): frontpage_url = "http://{0}".format(LAVENIR_NETLOC) html_data = fetch_html_content(frontpage_url) hxs = HtmlXPathSelector(text=html_data) story_links = hxs.select("//div[@id='content']//div[starts-with(@class, 'fr-row')]//h3/a") more_story_links = hxs.select("//div[@id='content']//div[starts-with(@class, 'fr-section')]//h3/a") local_sport_links = hxs.select("//div[@id='content']//div[contains(@class, 'article-with-photo')]//h2/a") nopic_story_list = hxs.select("//div[@id='content']//ul[@class='nobullets']//li//div[contains(@class, 'item-title')]//a") all_links = it.chain(story_links, more_story_links, local_sport_links, nopic_story_list) all_items = [extract_title_and_url(link_hxs) for link_hxs in all_links] news_items, blogpost_items = separate_blogposts(all_items) return [(title, expand_full_url(url)) for (title, url) in news_items if url not in BLACKLIST], list(blogpost_items), []
def extract_article_data(source): """ """ if hasattr(source, 'read'): html_content = source.read() else: source = convert_utf8_url_to_ascii(source) try: html_content = fetch_html_content(source) except urllib2.HTTPError as err: if err.code == 404: return None, "<html><head><title>404</title></head><body></body></html>" else: raise err hxs = HtmlXPathSelector(text=html_content) if is_page_error_404(hxs): return None, html_content else: category = hxs.select("//p[starts-with(@class, 'fil_ariane')]/a//text()").extract() #old version title = hxs.select("//div[@id='article']/h1/text()").extract()[0] # new version: # title = hxs.select("//div[@id='article']/article//h1/text()").extract()[0] pub_date, pub_time = extract_date(hxs) author = hxs.select("//p[@class='auteur']/text()").extract()[0] fetched_datetime = datetime.today() intro, intro_links = extract_intro_and_links(hxs) content, content_links = extract_content_and_links(hxs) associated_links = extract_associated_links(hxs) all_links = intro_links + content_links + associated_links updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER) # import os # generate_unittest("embedded_dailymotion_video", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True) return (ArticleData(source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author, intro, content), html_content)
def extract_main_content_links(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)
    all_links = soup.findAll('a', recursive=True)

    def extract_url_and_title(item):
        url = item.get('href')
        # guard against empty <a> tags before indexing into contents
        if item.contents and item.contents[0]:
            title = item.contents[0]
        else:
            # yes, this happens
            title = 'No Title found'
        return url, title

    return [extract_url_and_title(l) for l in all_links]
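
# Illustrative follow-up: extract_main_content_links() returns a (url, title) pair for every
# <a> tag on the page, including anchors without an href (url is then None), so callers likely
# want to filter those out. 'some_frontpage_url' is a placeholder, not a real endpoint.
some_frontpage_url = "http://www.example.com/"
all_pairs = extract_main_content_links(some_frontpage_url)
usable_links = [(url, title) for (url, title) in all_pairs if url is not None]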
def get_frontpage_toc(): url = "http://{0}/info".format(RTBFINFO_NETLOC) html_data = fetch_html_content(url) hxs = HtmlXPathSelector(text=html_data) main_story = hxs.select("//div [@id='mainContent']//article//h2//a") featured_stories = hxs.select("//div [@id='mainContent']//section/article//h3//a") anchored_stories = hxs.select("//div [@id='mainContent']//div [starts-with(@class, 'anchor')]//ul//a") chronic_title_hxs = hxs.select("//div [@id='mainContent']//div [@class='second chronic']/div [@class='illuBox']//p") chronic_links = [chronic_hxs for chronic_hxs in chronic_title_hxs.select("../@href").extract()] chronic_titles = [c.strip() for c in chronic_title_hxs.select(".//text()").extract()] chronic_stories = zip(chronic_titles, chronic_links) titles_and_urls = [extract_title_and_url(link_hxs) for link_hxs in chain(main_story, featured_stories, anchored_stories)] + chronic_stories return titles_and_urls, [], []
def extract_article_data(source): """ """ if hasattr(source, 'read'): html_content = source.read() else: html_content = fetch_html_content(source) soup = make_soup_from_html_content(html_content, additional_massage_functions=coveritlive_title_massage_functions) if is_page_error_404(soup): return None, html_content else: content = soup.find('div', {'id': 'content'}) category = extract_category(content) article = soup.find('div', {'id': 'article'}) title = extract_title(article) pub_date, pub_time = extract_date(article) author = extract_author_name(article) fetched_datetime = datetime.today() intro, intro_links = extract_intro_and_links(article) content, content_links = extract_content_and_links(article) associated_links = extract_associated_links(article) embedded_media = extract_embedded_media(article) all_links = intro_links + content_links + associated_links + embedded_media updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER) #print generate_test_func('intext_links_tagging', 'sudpresse', dict(tagged_urls=updated_tagged_urls)) #save_sample_data_file(html_content, source.name, 'intext_links_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudpresse') return ArticleData(source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author, intro, content), html_content
def extract_article_data(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                return None, None
            else:
                raise
        except Exception:
            raise

    hxs = HtmlXPathSelector(text=html_content)

    old_style_content_hxs = hxs.select("//div[@id='content']")
    if old_style_content_hxs:
        return extract_article_data_old(source, hxs), html_content
    else:
        return extract_article_data_new_style(source, hxs), html_content
def extract_article_data(source): """ source is either a file-like object, or a url. """ if hasattr(source, 'read'): html_content = source.read() else: html_content = fetch_html_content(source) soup = make_soup_from_html_content(html_content) story = soup.find('div', {'id': 'story'}) category = extract_category(story) title = extract_title(story) pub_date, pub_time = extract_date(story) author = extract_author_name(story) sidebar_links = extract_links(soup) intro = extract_intro(story) content, intext_links = extract_text_content(story) fetched_datetime = datetime.today() embedded_content_links = extract_links_from_embedded_content(story) all_links = sidebar_links + intext_links + embedded_content_links updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER) # generate_unittest("links_overload", "lesoir", dict(urls=updated_tagged_urls), html_content, source.name, "/Users/sevas/PycharmProjects/csxj-crawler-dev/tests/datasources/test_data/lesoir", True) return ArticleData(source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author, intro, content), html_content