Example #1
def get_frontpage_toc():
    html_data = fetch_html_content('http://www.lesoir.be')
    hxs = HtmlXPathSelector(text=html_data)

    # main stories
    list_items = hxs.select("//div [@id='main-content']//ul/li")
    headlines_links = list_items.select("./h2/a | ./h3/a")

    # just for the blog count statistics
    blog_block = hxs.select("//div [@class='bottom-content']//div [@class='block-blog box']//h5/a")

    # mainly soccer
    sport_block = hxs.select("//div [@class='bottom-content']//div [@class='block-sport']")
    sports_links = sport_block.select(".//h2/a | .//aside//li/a")

    # bottom sections
    bottom_news_links = hxs.select("//div [@class='bottom-content']//div [@class='block-articles']//a")

    all_links_hxs = itertools.chain(headlines_links, blog_block, sports_links, bottom_news_links)
    regular_articles_hxs, all_paywalled_hxs = separate_paywalled_articles(all_links_hxs)

    titles_and_urls = [extract_title_and_url(link) for link in regular_articles_hxs]
    paywalled_titles_and_urls = [extract_title_and_url(link) for link in all_paywalled_hxs]

    articles_toc, blogpost_toc = separate_news_and_blogposts(titles_and_urls)
    return ([(title, reconstruct_full_url(url)) for (title, url) in articles_toc],
            blogpost_toc,
            [(title, reconstruct_full_url(url)) for (title, url) in paywalled_titles_and_urls])
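
# The helpers used above (extract_title_and_url, separate_paywalled_articles,
# separate_news_and_blogposts, reconstruct_full_url) are defined elsewhere in the
# crawler. As a rough idea of the first one, a hypothetical sketch that reads the
# anchor text and href of each selected <a> node:
def extract_title_and_url(link_hxs):
    title = u''.join(link_hxs.select("./text()").extract()).strip()
    url = link_hxs.select("./@href").extract()[0]
    return title, url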
Example #2
def get_frontpage_toc():
    url = "http://www.dhnet.be"
    html_content = fetch_html_content(url)
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find("div", {"id": "maincontent"})
    if main_content:
        all_titles_and_urls = []

        # The list here is a combination of several subcontainer types,
        # so each type is processed separately.
        first_title, first_url = get_first_story_title_and_url(main_content)
        all_titles_and_urls.append((first_title, first_url))

        # this picks up the 'announceGroup' containers of the same type in the 'regions' div
        first_announce_groups = main_content.findAll(
            "div", {"class": "announceGroupFirst announceGroup"}, recursive=True
        )
        announce_groups = main_content.findAll("div", {"class": "announceGroup"}, recursive=True)

        # all those containers have two sub stories
        for announce_group in chain(first_announce_groups, announce_groups):
            titles_and_urls = extract_title_and_link_from_anounce_group(announce_group)
            all_titles_and_urls.extend(titles_and_urls)

        return [(title, "http://www.dhnet.be%s" % url) for (title, url) in all_titles_and_urls], [], []
    else:
        return [], [], []
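
# The URL reconstruction above assumes every extracted href is site-relative. A slightly
# more defensive variant (a sketch, not the project's code) goes through urlparse.urljoin,
# which resolves relative hrefs against the site root and leaves absolute links untouched:
from urlparse import urljoin

def to_absolute_dhnet_url(url):
    return urljoin("http://www.dhnet.be", url)

# the return statement could then read:
# return [(title, to_absolute_dhnet_url(url)) for (title, url) in all_titles_and_urls], [], []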
Example #3
def extract_article_data(source):
    # source is either a file-like object or a URL.
    # if it's a file-like object, we just read it, assume it's an article and extract the article data

    if hasattr(source, 'read'):
        html_data = source.read()
    # if it's a URL, we need to check whether it's a photo album, a link to the frontpage or a real article
    else:
        html_data = utils.fetch_html_content(source)

        page_type = detect_page_type(source)
        if page_type == IS_FRONTPAGE:
            return None, None
        elif page_type == MAYBE_ARTICLE:
            raise ValueError("We couldn't define if this was an article or the frontpage, please check")

    if is_404_page(html_data):
        return (None, html_data)

    # for all the other real articles
    soup = bs4.BeautifulSoup(html_data)

    if soup.find("head").find("title").contents[0] == "301 Moved Permanently":
        return (None, html_data)

    else:
        title = extract_title(soup)

        author_box = soup.find(attrs={"class": "author"})
        author_name = extract_author_name(author_box)
        pub_date, pub_time = extract_date_and_time(author_box)

        # original_source = extract_source(author_box)

        intro, tagged_urls_from_intro = extract_intro(soup)

        category = extract_category(soup)

        text, tagged_urls_intext = extract_text_content_and_links(soup)

        tagged_urls_read_more_box = extract_links_from_read_more_box(soup)

        tagged_urls_sidebar_box = extract_links_from_sidebar_box(soup)

        tagged_urls_embedded_media = extract_embedded_media(soup)

        tagged_urls = tagged_urls_intext + tagged_urls_read_more_box + tagged_urls_sidebar_box + tagged_urls_embedded_media + tagged_urls_from_intro

        updated_tagged_urls = tagging.update_tagged_urls(tagged_urls, SEPTSURSEPT_SAME_OWNER)

        # print generate_test_func('same_owner', 'septsursept', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'same_owner', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/septsursept')

        return (ArticleData(source, title, pub_date, pub_time, dt.datetime.now(),
                updated_tagged_urls,
                category, author_name,
                intro, text),
                html_data)
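
# Hypothetical usage sketch (the file path is a placeholder). The function accepts a
# URL or a file-like object and signals the non-article cases through None values:
# (None, None) for a frontpage URL, (None, html_data) for a 404 page or a
# '301 Moved Permanently' response, and (ArticleData(...), html_data) for a real article.
with open("saved_7sur7_article.html") as f:
    article_data, raw_html = extract_article_data(f)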
Example #4
def extract_article_data(source):
    """
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    return extract_article_data_from_html(html_content, source)
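
# The hasattr(source, 'read') check is plain duck typing: anything file-like is treated
# as an already-fetched document, everything else as a URL to download. An illustrative
# sketch (the markup is a throwaway placeholder) showing that an in-memory buffer
# satisfies the check just like an open file does:
from StringIO import StringIO

buffered_source = StringIO("<html><head><title>t</title></head><body></body></html>")
article_data, html_content = extract_article_data(buffered_source)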
Example #5
def extract_article_data(source):
    """
    """
    if hasattr(source, "read"):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404:
                return None, None
            else:
                raise
        except Exception:
            raise

    soup = make_soup_from_html_content(html_content)
    main_content = soup.find("div", {"id": "maincontent"})

    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find("div", {"id": "articleText"})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)
        all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links

        updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER)

        fetched_datetime = datetime.today()

        # print generate_test_func('twizz_stream', 'dhnet', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_content, source, 'twizz_stream', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/dhnet')

        # import os
        # generate_unittest("links_tweet_with_emoji", "dhnet", dict(urls=updated_tagged_urls), html_content, source, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/dhnet"), True)

        new_article = ArticleData(
            source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author_name, intro, text
        )
        return new_article, html_content
    else:
        return None, html_content
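
# fetch_html_content itself is not shown in these examples. A minimal Python 2 sketch of
# what it might look like, built on urllib2 (which is also where the HTTPError caught
# above would come from); a real implementation would likely set a User-Agent, a timeout
# and handle character encodings explicitly.
import urllib2

def fetch_html_content(url):
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()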
Example #6
def get_frontpage_toc():
    """
    Fetch links to articles listed on the 'Le Soir' front page.
    For each of them, extract the relevant data.
    """
    url = 'http://www.lesoir.be'
    html_content = fetch_html_content(url)

    soup = make_soup_from_html_content(html_content)

    # Here we have interlaced <ul>'s with a bunch of random other markup that
    # needs some filtering
    stories_containers = soup.findAll('ul', {'class': 'stories_list grid_6'})

    articles_toc, blogpost_toc = [], []

    for container in stories_containers:
        all_stories = set(container.findAll('li', recursive=False))
        main_stories = set(container.findAll('li', {'class': 'stories_main clearfix'}, recursive=False))

        other_stories = all_stories - main_stories

        # So, in _some_ lists of stories, the first one ('main story') has its title in an <h1>
        # and the rest in an <h2>.
        # Also, some have two-column stories.
        # Beautiful soup indeed.
        for item in main_stories:
            title, url = (item.h1.a.get('title'), item.h1.a.get('href'))
            if is_external_blog(url):
                blogpost_toc.append((title, url))
            else:
                articles_toc.append((title, url))

        for item in other_stories:
            if element_has_two_columns_stories(item):
                # For some reason, those links don't have a 'title' attribute.
                # Love this.
                def extract_title_and_link(item):
                    return item.h2.a.contents[0], item.h2.a.get('href')

                for story in get_two_columns_stories(item):
                    title, url = extract_title_and_link(story)
                    if is_external_blog(url):
                        blogpost_toc.append((title, url))
                    else:
                        articles_toc.append((title, url))
            else:
                title, url = (item.h2.a.get('title'), item.h2.a.get('href'))
                if is_external_blog(url):
                    blogpost_toc.append((title, url))
                else:
                    articles_toc.append((title, url))

    return [(title, 'http://www.lesoir.be{0}'.format(url)) for (title, url) in articles_toc], blogpost_toc, []
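
# is_external_blog is another helper that is not shown here. A plausible sketch (an
# assumption, not the project's actual test): site-relative hrefs are treated as
# lesoir.be articles, full URLs on other hosts as external blog entries.
from urlparse import urlparse

def is_external_blog(url):
    netloc = urlparse(url).netloc
    return bool(netloc) and not netloc.endswith('lesoir.be')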
Example #7
def get_frontpage_toc():
    hostname_url = 'http://www.lalibre.be'
    html_content = fetch_html_content(hostname_url)

    soup = make_soup_from_html_content(html_content)

    article_list_container = soup.find('div', {'id': 'mainContent'})
    announces = article_list_container.findAll('div', {'class': 'announce'}, recursive=False)

    def extract_title_and_link(announce):
        title, url = announce.h1.a.contents[0], announce.h1.a.get('href')
        return title, '{0}{1}'.format(hostname_url, url)

    return [extract_title_and_link(announce) for announce in announces], [], []
Example #8
def extract_article_data(source):

    if hasattr(source, 'read'):
        html_data = source.read()
    else:
        try:
            source = convert_utf8_url_to_ascii(source)
            html_data = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                return None, None
            else:
                raise
        except Exception:
            raise

    soup = bs4.BeautifulSoup(html_data)

    # this is how we detect paywalled articles
    main_content = soup.find(attrs={"id": "main-content"})
    if main_content.h2 and main_content.h2.find(attrs={'class': 'ir locked'}):
        title = extract_title(soup)
        return (ArticleData(source, title, constants.NO_DATE, constants.NO_TIME, datetime.today(),
                            [], [constants.NO_CATEGORY_NAME], None, None,
                            constants.PAYWALLED_CONTENT),
                html_data)

    else:
        title = extract_title(soup)
        author_name = extract_author_name(soup)
        intro, links_from_intro = extract_intro(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        category = extract_category(soup)
        sidebar_links = extract_links_from_sidebar_box(soup)
        article_tags = extract_article_tags(soup)
        embedded_media_from_top_box = extract_links_to_embedded_content(soup)
        embedded_media_from_bottom = extract_embedded_media_from_bottom(soup)
        embedded_media_in_article = extract_embedded_media_in_article(soup)
        embedded_media = embedded_media_from_top_box + embedded_media_from_bottom + embedded_media_in_article
        all_links = tagged_urls_intext + sidebar_links + article_tags + embedded_media + links_from_intro
        pub_date, pub_time = extract_date_and_time(soup)
        fetched_datetime = datetime.today()

        updated_tagged_urls = tagging.update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

        # print generate_test_func('embedded_storify_top_box', 'lesoir_new', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'embedded_storify_top_box', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                updated_tagged_urls,
                category, author_name,
                intro, text),
                html_data)
Example #9
def get_frontpage_toc():
    url = "http://{0}/info".format(LEVIF_NETLOC)
    html_data = fetch_html_content(url)

    hxs = HtmlXPathSelector(text=html_data)

    h1_breaking_news_links = hxs.select("//div [@id='body']/div/div[@class='frame col_650']/div [@class='frame breakingNews']//div[@class='teaserContent']//h1/a")
    h2_breaking_news_links = hxs.select("//div [@id='body']/div/div[@class='frame col_650']/div [@class='frame breakingNews']//div[@class='teaserContent']//h2/a")

    other_news = hxs.select("//div [@id='body']/div/div[@class='frame col_650']/div [@class='frame teaserRow2 clearfix']//div[@class='teaserContent']/../h1/a")

    titles_and_urls = [extract_title_and_url(link_hxs) for link_hxs in chain(h1_breaking_news_links, h2_breaking_news_links, other_news)]

    frontpage_items, blogposts = split_news_and_blogposts(titles_and_urls)

    return frontpage_items, blogposts, []
Example #10
def get_frontpage_toc():
    frontpage_url = "http://{0}".format(LAVENIR_NETLOC)
    html_data = fetch_html_content(frontpage_url)

    hxs = HtmlXPathSelector(text=html_data)

    story_links = hxs.select("//div[@id='content']//div[starts-with(@class, 'fr-row')]//h3/a")
    more_story_links = hxs.select("//div[@id='content']//div[starts-with(@class, 'fr-section')]//h3/a")
    local_sport_links = hxs.select("//div[@id='content']//div[contains(@class, 'article-with-photo')]//h2/a")
    nopic_story_list = hxs.select("//div[@id='content']//ul[@class='nobullets']//li//div[contains(@class, 'item-title')]//a")

    all_links = it.chain(story_links, more_story_links, local_sport_links, nopic_story_list)

    all_items = [extract_title_and_url(link_hxs) for link_hxs in all_links]
    news_items, blogpost_items = separate_blogposts(all_items)

    return [(title, expand_full_url(url)) for (title, url) in news_items if url not in BLACKLIST], list(blogpost_items), []
Example #11
def extract_article_data(source):
    """
    """

    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        source = convert_utf8_url_to_ascii(source)
        try:
            html_content = fetch_html_content(source)
        except urllib2.HTTPError as err:
            if err.code == 404:
                return None, "<html><head><title>404</title></head><body></body></html>"
            else:
                raise err

    hxs = HtmlXPathSelector(text=html_content)

    if is_page_error_404(hxs):
        return None, html_content
    else:
        category = hxs.select("//p[starts-with(@class, 'fil_ariane')]/a//text()").extract()
        # old version:
        title = hxs.select("//div[@id='article']/h1/text()").extract()[0]
        # new version:
        # title = hxs.select("//div[@id='article']/article//h1/text()").extract()[0]

        pub_date, pub_time = extract_date(hxs)
        author = hxs.select("//p[@class='auteur']/text()").extract()[0]
        fetched_datetime = datetime.today()

        intro, intro_links = extract_intro_and_links(hxs)

        content, content_links = extract_content_and_links(hxs)
        associated_links = extract_associated_links(hxs)
        all_links = intro_links + content_links + associated_links
        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        # import os
        # generate_unittest("embedded_dailymotion_video", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True)

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls,
                            category, author,
                            intro, content),
                html_content)
Example #12
def extract_main_content_links(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)
    all_links = soup.findAll('a', recursive=True)

    def extract_url_and_title(item):
        url = item.get('href')
        # guard against anchors with no text content at all (yes, this happens)
        if item.contents and item.contents[0]:
            title = item.contents[0]
        else:
            title = 'No Title found'
        return url, title

    return [extract_url_and_title(l) for l in all_links]
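
# Hypothetical usage (the URL is a placeholder): the function returns a (url, title)
# pair for every <a> on the page, falling back to 'No Title found' for empty anchors.
for url, title in extract_main_content_links("http://www.dhnet.be"):
    print url, title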
Example #13
def get_frontpage_toc():
    url = "http://{0}/info".format(RTBFINFO_NETLOC)
    html_data = fetch_html_content(url)

    hxs = HtmlXPathSelector(text=html_data)

    main_story = hxs.select("//div [@id='mainContent']//article//h2//a")
    featured_stories = hxs.select("//div [@id='mainContent']//section/article//h3//a")
    anchored_stories = hxs.select("//div [@id='mainContent']//div [starts-with(@class, 'anchor')]//ul//a")

    chronic_title_hxs = hxs.select("//div [@id='mainContent']//div [@class='second chronic']/div [@class='illuBox']//p")
    chronic_links = chronic_title_hxs.select("../@href").extract()
    chronic_titles = [c.strip() for c in chronic_title_hxs.select(".//text()").extract()]

    chronic_stories = zip(chronic_titles, chronic_links)

    titles_and_urls = [extract_title_and_url(link_hxs) for link_hxs in chain(main_story, featured_stories, anchored_stories)] + chronic_stories

    return titles_and_urls, [], []
Example #14
def extract_article_data(source):
    """
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content, additional_massage_functions=coveritlive_title_massage_functions)

    if is_page_error_404(soup):
        return None, html_content
    else:
        content = soup.find('div', {'id': 'content'})
        category = extract_category(content)

        article = soup.find('div', {'id': 'article'})
        title = extract_title(article)
        pub_date, pub_time = extract_date(article)
        author = extract_author_name(article)

        fetched_datetime = datetime.today()

        intro, intro_links = extract_intro_and_links(article)
        content, content_links = extract_content_and_links(article)

        associated_links = extract_associated_links(article)
        embedded_media = extract_embedded_media(article)

        all_links = intro_links + content_links + associated_links + embedded_media

        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        #print generate_test_func('intext_links_tagging', 'sudpresse', dict(tagged_urls=updated_tagged_urls))
        #save_sample_data_file(html_content, source.name, 'intext_links_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudpresse')

        return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                           updated_tagged_urls,
                           category, author,
                           intro, content), html_content
Example #15
def extract_article_data(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                return None, None
            else:
                raise
        except Exception:
            raise

    hxs = HtmlXPathSelector(text=html_content)

    old_style_content_hxs = hxs.select("//div[@id='content']")

    if old_style_content_hxs:
        return extract_article_data_old(source, hxs), html_content
    else:
        return extract_article_data_new_style(source, hxs), html_content
Example #16
def extract_article_data(source):
    """
    source is either a file-like object, or a URL.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)
    story = soup.find('div', {'id': 'story'})

    category = extract_category(story)
    title = extract_title(story)
    pub_date, pub_time = extract_date(story)
    author = extract_author_name(story)

    sidebar_links = extract_links(soup)

    intro = extract_intro(story)
    content, intext_links = extract_text_content(story)

    fetched_datetime = datetime.today()

    embedded_content_links = extract_links_from_embedded_content(story)

    all_links = sidebar_links + intext_links + embedded_content_links

    updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

    # generate_unittest("links_overload", "lesoir", dict(urls=updated_tagged_urls), html_content, source.name, "/Users/sevas/PycharmProjects/csxj-crawler-dev/tests/datasources/test_data/lesoir", True)

    return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                       updated_tagged_urls,
                       category, author,
                       intro, content), html_content
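
# Taken together, the frontpage and article extractors compose into a small crawl loop.
# A rough sketch of how they might be wired up for one news source (error handling,
# politeness delays and persistence are omitted):
def crawl_frontpage():
    articles_toc, blogposts_toc, paywalled_toc = get_frontpage_toc()

    extracted = []
    for title, url in articles_toc:
        article_data, raw_html = extract_article_data(url)
        if article_data is not None:
            extracted.append(article_data)
    return extracted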