# Imports required by the functions below.
from datetime import datetime
from itertools import chain
from urllib2 import HTTPError


def get_frontpage_toc():
    url = "http://www.dhnet.be"
    html_content = fetch_html_content(url)
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find("div", {"id": "maincontent"})
    if main_content:
        all_titles_and_urls = []

        # The front page combines several subcontainer types,
        # so each type is processed separately.
        first_title, first_url = get_first_story_title_and_url(main_content)
        all_titles_and_urls.append((first_title, first_url))

        # Pick up the 'announceGroup' containers of the same type in the 'regions' div.
        first_announce_groups = main_content.findAll(
            "div", {"class": "announceGroupFirst announceGroup"}, recursive=True
        )
        announce_groups = main_content.findAll("div", {"class": "announceGroup"}, recursive=True)

        # All those containers hold two sub-stories each.
        for announce_group in chain(first_announce_groups, announce_groups):
            titles_and_urls = extract_title_and_link_from_anounce_group(announce_group)
            all_titles_and_urls.extend(titles_and_urls)

        return [(title, "http://www.dhnet.be%s" % url) for (title, url) in all_titles_and_urls], [], []
    else:
        return [], [], []
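# Hedged usage sketch for the parser above. The two trailing empty lists in
# the return value are left untouched (their meaning is not defined in this
# function), so the demo below only consumes the (title, url) pairs.
def _demo_dhnet_frontpage():
    titles_and_urls, _, _ = get_frontpage_toc()
    for title, url in titles_and_urls:
        print '%s -> %s' % (title, url)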
def extract_article_data(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_content_from_url(source)

    soup = make_soup_from_html_content(html_content)

    main_article = soup.find('div', {'id': 'mainArticle'})
    if main_article:
        title = extract_title(main_article)
        category = extract_category(main_article)
        pub_date, pub_time = extract_date_and_time(main_article)
        fetched_datetime = datetime.now()
        links = extract_links(main_article)
        author = None
        embedded_links, content = extract_links_and_text_content(main_article)
        intro = extract_intro(main_article)

        all_links = links + embedded_links
        article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                                   all_links, category, author, intro, content)
        return article_data, html_content
    else:
        return None, html_content
def extract_article_data(source):
    """
    Extract the article data from a DHNet article page.
    `source` is either a file-like object or a URL.
    Returns (ArticleData, html_content), or (None, None) when the page is a 404.
    """
    if hasattr(source, "read"):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            # Articles sometimes disappear; treat a 404 as 'no article'.
            if e.code == 404:
                return None, None
            else:
                raise

    soup = make_soup_from_html_content(html_content)

    main_content = soup.find("div", {"id": "maincontent"})
    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find("div", {"id": "articleText"})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)

        all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links
        updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER)

        fetched_datetime = datetime.today()

        new_article = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                                  updated_tagged_urls, category, author_name, intro, text)
        return new_article, html_content
    else:
        return None, html_content
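# Hedged usage sketch: extract_article_data() accepts either a URL or any
# object with a .read() method, and returns (None, ...) when no article is
# found. This assumes ArticleData exposes the title passed to its constructor
# as a 'title' attribute; the sample URL and file name are illustrative only.
def _demo_extract_article(source):
    article, raw_html = extract_article_data(source)
    if article is None:
        print 'no article extracted (possibly a 404 page)'
    else:
        print article.title

# _demo_extract_article('http://www.dhnet.be/some/article/url')   # from the web
# _demo_extract_article(open('saved_article.html'))               # from a saved page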
def get_frontpage_toc():
    """
    Fetch links to articles listed on the 'Le Soir' front page.
    For each of them, extract the relevant data.
    """
    url = 'http://www.lesoir.be'
    html_content = fetch_html_content(url)
    soup = make_soup_from_html_content(html_content)

    # The front page interlaces <ul>'s with unrelated markup,
    # so the story lists need some filtering.
    stories_containers = soup.findAll('ul', {'class': 'stories_list grid_6'})

    articles_toc, blogpost_toc = [], []

    for container in stories_containers:
        all_stories = set(container.findAll('li', recursive=False))
        main_stories = set(container.findAll('li', {'class': 'stories_main clearfix'}, recursive=False))
        other_stories = all_stories - main_stories

        # In some story lists, the first item (the 'main story') has its title
        # in an <h1> and the rest in an <h2>. Some lists also contain
        # two-column stories.
        for item in main_stories:
            title, url = item.h1.a.get('title'), item.h1.a.get('href')
            if is_external_blog(url):
                blogpost_toc.append((title, url))
            else:
                articles_toc.append((title, url))

        for item in other_stories:
            if element_has_two_columns_stories(item):
                # For some reason, those links don't have a 'title' attribute,
                # so the title is taken from the link text instead.
                def extract_title_and_link(item):
                    return item.h2.a.contents[0], item.h2.a.get('href')

                for story in get_two_columns_stories(item):
                    title, url = extract_title_and_link(story)
                    if is_external_blog(url):
                        blogpost_toc.append((title, url))
                    else:
                        articles_toc.append((title, url))
            else:
                title, url = item.h2.a.get('title'), item.h2.a.get('href')
                if is_external_blog(url):
                    blogpost_toc.append((title, url))
                else:
                    articles_toc.append((title, url))

    return [(title, 'http://www.lesoir.be{0}'.format(url)) for (title, url) in articles_toc], blogpost_toc, []
def get_frontpage_toc():
    url = 'http://sudpresse.be/'
    html_content = fetch_content_from_url(url)
    soup = make_soup_from_html_content(html_content)

    column1 = soup.find('div', {'class': 'column col-01'})
    headlines = extract_headlines_from_column_1(column1)
    column3 = soup.find('div', {'class': 'column col-03'})
    headlines.extend(extract_headlines_from_column_3(column3))

    regional_headlines = make_full_url(url, get_regional_toc())
    headlines.extend(regional_headlines)

    return make_full_url(url, headlines), [], []
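# make_full_url() is defined elsewhere in this module. Based on how it is
# called above (a base URL plus a list of (title, url) pairs), a plausible
# sketch is the hypothetical reimplementation below; the real helper may
# differ:
import urlparse

def make_full_url_sketch(base_url, titles_and_urls):
    # urljoin resolves relative hrefs against the site root and leaves
    # absolute URLs untouched.
    return [(title, urlparse.urljoin(base_url, url)) for (title, url) in titles_and_urls]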
def get_frontpage_toc():
    hostname_url = 'http://www.lalibre.be'
    html_content = fetch_html_content(hostname_url)
    soup = make_soup_from_html_content(html_content)

    article_list_container = soup.find('div', {'id': 'mainContent'})
    announces = article_list_container.findAll('div', {'class': 'announce'}, recursive=False)

    def extract_title_and_link(announce):
        title, url = announce.h1.a.contents[0], announce.h1.a.get('href')
        return title, '{0}{1}'.format(hostname_url, url)

    return [extract_title_and_link(announce) for announce in announces], [], []
def get_frontpage_toc():
    url = 'http://www.rtl.be/info/'
    html_content = fetch_content_from_url(url)
    soup = make_soup_from_html_content(html_content)

    maincontent = soup.find('div', {'class': 'mainContent'})
    first_articles = extract_first_articles(maincontent)
    small_articles = extract_small_articles(maincontent)
    modules_articles = extract_headlines_from_modules(maincontent)

    all_articles = first_articles + small_articles + modules_articles
    news_items, blogposts = separate_news_and_blogposts(all_articles)
    return [make_full_url(title_and_url) for title_and_url in news_items], list(blogposts), []
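# separate_news_and_blogposts() is another helper defined elsewhere. A
# plausible sketch, assuming it partitions (title, url) pairs with an
# is_external_blog() predicate like the one the Le Soir parser uses; the
# real helper may behave differently (the list(blogposts) call above
# suggests it may not return a plain list):
def separate_news_and_blogposts_sketch(titles_and_urls):
    news_items = [item for item in titles_and_urls if not is_external_blog(item[1])]
    blogposts = [item for item in titles_and_urls if is_external_blog(item[1])]
    return news_items, blogposts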
def extract_main_content_links(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)
    all_links = soup.findAll('a', recursive=True)

    def extract_url_and_title(item):
        url = item.get('href')
        # Some <a> tags genuinely have no text content; guard against both
        # an empty contents list and an empty first fragment.
        if item.contents and item.contents[0]:
            title = item.contents[0]
        else:
            title = 'No Title found'
        return url, title

    return [extract_url_and_title(l) for l in all_links]
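# Usage sketch: like the extractors above, this helper works on a URL or an
# open file. The file name is illustrative.
def _demo_main_content_links():
    with open('saved_page.html') as f:
        for url, title in extract_main_content_links(f):
            print url, '->', title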
def extract_article_data(source):
    """
    `source` is either a file-like object, or a URL.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content,
                                       additional_massage_functions=coveritlive_title_massage_functions)

    if is_page_error_404(soup):
        return None, html_content
    else:
        content = soup.find('div', {'id': 'content'})
        category = extract_category(content)

        article = soup.find('div', {'id': 'article'})
        title = extract_title(article)
        pub_date, pub_time = extract_date(article)
        author = extract_author_name(article)
        fetched_datetime = datetime.today()

        intro, intro_links = extract_intro_and_links(article)
        content, content_links = extract_content_and_links(article)

        associated_links = extract_associated_links(article)
        embedded_media = extract_embedded_media(article)

        all_links = intro_links + content_links + associated_links + embedded_media
        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                           updated_tagged_urls, category, author, intro, content), html_content
def extract_article_data_from_html(html_content, source_url):
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find('div', {'id': 'mainContent'})

    if main_content.h1:
        title = main_content.h1.contents[0].strip()
    else:
        return None, html_content

    category = extract_category(main_content)
    author = extract_author_name(main_content)
    pub_date, pub_time = extract_date(main_content)
    fetched_datetime = datetime.today()

    intro = extract_intro(main_content)
    text_content, in_text_links = extract_text_content_and_links(main_content)

    embedded_audio_links = ipm_utils.extract_embedded_audio_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    associated_tagged_urls = ipm_utils.extract_and_tag_associated_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    bottom_links = ipm_utils.extract_bottom_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    embedded_content_links = extract_embedded_content_links(main_content)

    all_links = in_text_links + associated_tagged_urls + bottom_links + embedded_content_links + embedded_audio_links
    updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.LALIBRE_SAME_OWNER)

    new_article = ArticleData(source_url, title, pub_date, pub_time, fetched_datetime,
                              updated_tagged_urls, category, author, intro, text_content)
    return new_article, html_content
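# Usage sketch for the offline variant above: because it takes raw HTML plus
# the original URL, it can replay pages saved to disk (e.g. for regression
# tests). The path below is illustrative.
def _demo_replay_saved_article(saved_path, original_url):
    with open(saved_path) as f:
        article, html_content = extract_article_data_from_html(f.read(), original_url)
    return article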
def extract_article_data(source):
    """
    `source` is either a file-like object, or a URL.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)

    story = soup.find('div', {'id': 'story'})

    category = extract_category(story)
    title = extract_title(story)
    pub_date, pub_time = extract_date(story)
    author = extract_author_name(story)

    sidebar_links = extract_links(soup)

    intro = extract_intro(story)
    content, intext_links = extract_text_content(story)

    fetched_datetime = datetime.today()

    embedded_content_links = extract_links_from_embedded_content(story)
    all_links = sidebar_links + intext_links + embedded_content_links
    updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

    return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                       updated_tagged_urls, category, author, intro, content), html_content
def get_regional_toc():
    url = 'http://sudpresse.be/regions'
    html_content = fetch_content_from_url(url)
    soup = make_soup_from_html_content(html_content)

    return extract_regional_headlines(soup.find('div', {'id': 'content_first'}))
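# Usage sketch: the regional headlines come back site-relative, so callers
# (see the Sudpresse get_frontpage_toc() above) resolve them against the
# site root with make_full_url() before use.
def _demo_regional_headlines():
    for title, url in make_full_url('http://sudpresse.be/', get_regional_toc()):
        print title, url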