Example #1
def extract_article_data(source):
    # source is either a file-like object or a URL.
    # If it's a file-like object, read it directly, assume it's an article and extract the article data.

    if hasattr(source, 'read'):
        html_data = source.read()
    # If it's a URL, we need to check whether it points to a photo album, the frontpage or an actual article.
    else:
        html_data = utils.fetch_html_content(source)

        page_type = detect_page_type(source)
        if page_type == IS_FRONTPAGE:
            return None, None
        elif page_type == MAYBE_ARTICLE:
            raise ValueError("We couldn't define if this was an article or the frontpage, please check")

    if is_404_page(html_data):
        return (None, html_data)

    # for all other real articles
    soup = bs4.BeautifulSoup(html_data)

    if soup.find("head").find("title").contents[0] == "301 Moved Permanently":
        return (None, html_data)

    else:
        title = extract_title(soup)

        author_box = soup.find(attrs={"class": "author"})
        author_name = extract_author_name(author_box)
        pub_date, pub_time = extract_date_and_time(author_box)

        # original_source = extract_source(author_box)

        intro, tagged_urls_from_intro = extract_intro(soup)

        category = extract_category(soup)

        text, tagged_urls_intext = extract_text_content_and_links(soup)

        tagged_urls_read_more_box = extract_links_from_read_more_box(soup)

        tagged_urls_sidebar_box = extract_links_from_sidebar_box(soup)

        tagged_urls_embedded_media = extract_embedded_media(soup)

        tagged_urls = tagged_urls_intext + tagged_urls_read_more_box + tagged_urls_sidebar_box + tagged_urls_embedded_media + tagged_urls_from_intro

        updated_tagged_urls = tagging.update_tagged_urls(tagged_urls, SEPTSURSEPT_SAME_OWNER)

        # print generate_test_func('same_owner', 'septsursept', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'same_owner', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/septsursept')

        return (ArticleData(source, title, pub_date, pub_time, dt.datetime.now(),
                updated_tagged_urls,
                category, author_name,
                intro, text),
                html_data)
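
For context, a minimal sketch of how this extractor might be called; the URL and file name below are hypothetical and only illustrate the (article, html_data) return contract visible in the code above.

# Minimal usage sketch (hypothetical caller, not part of the module above).
# The function accepts either a URL or a file-like object and returns an
# (article, html_data) pair; article is None for frontpages, 404s and redirects.
article, html_data = extract_article_data("http://www.7sur7.be/some-article.html")  # hypothetical URL
if article is not None:
    print("extracted article data from %d bytes of HTML" % len(html_data))

with open("saved_article.html") as f:  # hypothetical previously saved page
    article, html_data = extract_article_data(f)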
Example #2
def extract_article_data(source):
    """
    """
    if hasattr(source, "read"):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404:
                return None, None
            else:
                raise
        except Exception:
            # any other error is unexpected, let it propagate
            raise

    soup = make_soup_from_html_content(html_content)
    main_content = soup.find("div", {"id": "maincontent"})

    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find("div", {"id": "articleText"})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)
        all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links

        updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER)

        fetched_datetime = datetime.today()

        # print generate_test_func('twizz_stream', 'dhnet', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_content, source, 'twizz_stream', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/dhnet')

        # import os
        # generate_unittest("links_tweet_with_emoji", "dhnet", dict(urls=updated_tagged_urls), html_content, source, os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/dhnet"), True)

        new_article = ArticleData(
            source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author_name, intro, text
        )
        return new_article, html_content
    else:
        return None, html_content
Example #3
def extract_article_data(source):

    if hasattr(source, 'read'):
        html_data = source.read()
    else:
        try:
            source = convert_utf8_url_to_ascii(source)
            html_data = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404 or e.code == 403:
                return None, None
            else:
                raise
        except Exception:
            # any other error is unexpected, let it propagate
            raise

    soup = bs4.BeautifulSoup(html_data)

    # this is how we detect paywalled articles
    main_content = soup.find(attrs={"id": "main-content"})
    if main_content.h2 and main_content.h2.find(attrs={'class': 'ir locked'}):
        title = extract_title(soup)
        return (ArticleData(source, title, constants.NO_DATE, constants.NO_TIME, datetime.today(),
                [],
                [constants.NO_CATEGORY_NAME], None,
                None, constants.PAYWALLED_CONTENT),
                html_data)

    else:
        title = extract_title(soup)
        author_name = extract_author_name(soup)
        intro, links_from_intro = extract_intro(soup)
        text, tagged_urls_intext = extract_text_content_and_links(soup)
        category = extract_category(soup)
        sidebar_links = extract_links_from_sidebar_box(soup)
        article_tags = extract_article_tags(soup)
        embedded_media_from_top_box = extract_links_to_embedded_content(soup)
        embedded_media_from_bottom = extract_embedded_media_from_bottom(soup)
        embedded_media_in_article = extract_embedded_media_in_article(soup)
        embedded_media = embedded_media_from_top_box + embedded_media_from_bottom + embedded_media_in_article
        all_links = tagged_urls_intext + sidebar_links + article_tags + embedded_media + links_from_intro
        pub_date, pub_time = extract_date_and_time(soup)
        fetched_datetime = datetime.today()

        updated_tagged_urls = tagging.update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

        # print generate_test_func('embedded_storify_top_box', 'lesoir_new', dict(tagged_urls=updated_tagged_urls))
        # save_sample_data_file(html_data, source, 'embedded_storify_top_box', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/lesoir_new')

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                updated_tagged_urls,
                category, author_name,
                intro, text),
                html_data)
Example #4
def extract_article_data(source):
    """
    """

    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        source = convert_utf8_url_to_ascii(source)
        try:
            html_content = fetch_html_content(source)
        except urllib2.HTTPError as err:
            if err.code == 404:
                return None, "<html><head><title>404</title></head><body></body></html>"
            else:
                raise

    hxs = HtmlXPathSelector(text=html_content)

    if is_page_error_404(hxs):
        return None, html_content
    else:
        category = hxs.select("//p[starts-with(@class, 'fil_ariane')]/a//text()").extract()
        # old version:
        title = hxs.select("//div[@id='article']/h1/text()").extract()[0]
        # new version:
        # title = hxs.select("//div[@id='article']/article//h1/text()").extract()[0]

        pub_date, pub_time = extract_date(hxs)
        author = hxs.select("//p[@class='auteur']/text()").extract()[0]
        fetched_datetime = datetime.today()

        intro, intro_links = extract_intro_and_links(hxs)

        content, content_links = extract_content_and_links(hxs)
        associated_links = extract_associated_links(hxs)
        all_links = intro_links + content_links + associated_links
        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        # import os
        # generate_unittest("embedded_dailymotion_video", "sudinfo", dict(urls=updated_tagged_urls), html_content, "csxjdb://sudinfo/2012-03-26/13.05.07/raw_data/7.html", os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/sudinfo"), True)

        return (ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                            updated_tagged_urls,
                            category, author,
                            intro, content),
                html_content)
Example #5
def extract_article_data(source):
    """
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content, additional_massage_functions=coveritlive_title_massage_functions)

    if is_page_error_404(soup):
        return None, html_content
    else:
        content_div = soup.find('div', {'id': 'content'})
        category = extract_category(content_div)

        article = soup.find('div', {'id': 'article'})
        title = extract_title(article)
        pub_date, pub_time = extract_date(article)
        author = extract_author_name(article)

        fetched_datetime = datetime.today()

        intro, intro_links = extract_intro_and_links(article)
        content, content_links = extract_content_and_links(article)

        associated_links = extract_associated_links(article)
        embedded_media = extract_embedded_media(article)

        all_links = intro_links + content_links + associated_links + embedded_media

        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        #print generate_test_func('intext_links_tagging', 'sudpresse', dict(tagged_urls=updated_tagged_urls))
        #save_sample_data_file(html_content, source.name, 'intext_links_tagging', '/Users/judemaey/code/csxj-crawler/tests/datasources/test_data/sudpresse')

        return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                           updated_tagged_urls,
                           category, author,
                           intro, content), html_content
Example #6
def extract_article_data_new_style(source, hxs):
    """ """
    category = hxs.select("//nav [contains(@id,'breadcrumb')]//li").extract()

    datetime_string = hxs.select("//div [@class='row content']//time/@datetime").extract()
    if not datetime_string:
        raise ValueError("Could not find the date, update the parser")

    parsed_datetime = datetime_from_iso8601(datetime_string[0])
    pub_date, pub_time = parsed_datetime.date(), parsed_datetime.time()
    fetched_datetime = datetime.now()

    title = hxs.select("//header//h1/text()").extract()
    if not title:
        raise ValueError("Could not find the title, update the parser")
    title = title[0]

    content_hxs = hxs.select("//div [@class='entry-content']")

    author_fragments = content_hxs.select(".//p [@class='copyright']/text()").extract()
    author = remove_text_formatting_markup_from_fragments(author_fragments, strip_chars='\r\n\t ')

    intro, intro_links = extract_intro_and_links_new(content_hxs)
    content, content_links = extract_content_and_links_new(content_hxs)

    other_div_hxs = content_hxs.select("//div [@class='entry-content']/div [not(contains(@class, 'entry-'))]")
    content_media_links = extract_links_from_other_divs(other_div_hxs)
    related_links = extract_related_links(hxs)
    media_links = extract_links_from_embbeded_media(content_hxs)
    tag_links = extract_links_from_tags(hxs)

    all_links = it.chain(intro_links, content_links, media_links, content_media_links, related_links, tag_links)
    updated_tagged_urls = update_tagged_urls(all_links, LAVENIR_SAME_OWNER)

    article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                               updated_tagged_urls,
                               category, author,
                               intro, content)
    return article_data
Example #7
def extract_article_data_from_html(html_content, source_url):
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find('div', {'id': 'mainContent'})

    if main_content.h1:
        title = main_content.h1.contents[0].strip()
    else:
        return None, html_content

    category = extract_category(main_content)
    author = extract_author_name(main_content)
    pub_date, pub_time = extract_date(main_content)
    fetched_datetime = datetime.today()

    intro = extract_intro(main_content)
    text_content, in_text_links = extract_text_content_and_links(main_content)

    embedded_audio_links = ipm_utils.extract_embedded_audio_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    associated_tagged_urls = ipm_utils.extract_and_tag_associated_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    bottom_links = ipm_utils.extract_bottom_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    embedded_content_links = extract_embedded_content_links(main_content)

    all_links = in_text_links + associated_tagged_urls + bottom_links + embedded_content_links + embedded_audio_links

    updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.LALIBRE_SAME_OWNER)

    # generate_unittest("vuvox_without_title", 'lalibre', dict(updated_tagged_urls=updated_tagged_urls),
    #                   html_content, source_url,
    #                   os.path.join(os.path.dirname(__file__), "../../tests/datasources/test_data/lalibre"),
    #                   save_file=True)

    new_article = ArticleData(source_url, title,
                              pub_date, pub_time, fetched_datetime,
                              updated_tagged_urls,
                              category, author,
                              intro, text_content)

    return new_article, html_content
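
Unlike the other examples, this variant takes already-fetched HTML together with the source URL. A hypothetical driver (assumed here, following the pattern of the other examples, not taken from the project) could wrap it like this:

# Hypothetical driver, assumed for illustration: read or fetch the page first,
# then delegate the actual parsing to extract_article_data_from_html.
def extract_article_data(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)
    return extract_article_data_from_html(html_content, source)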
Example #8
def extract_article_data(source):
    """
    source is either a file-like object, or a url.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)
    story = soup.find('div', {'id': 'story'})

    category = extract_category(story)
    title = extract_title(story)
    pub_date, pub_time = extract_date(story)
    author = extract_author_name(story)

    sidebar_links = extract_links(soup)

    intro = extract_intro(story)
    content, intext_links = extract_text_content(story)

    fetched_datetime = datetime.today()

    embedded_content_links = extract_links_from_embedded_content(story)

    all_links = sidebar_links + intext_links + embedded_content_links

    updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

    # generate_unittest("links_overload", "lesoir", dict(urls=updated_tagged_urls), html_content, source.name, "/Users/sevas/PycharmProjects/csxj-crawler-dev/tests/datasources/test_data/lesoir", True)

    return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                       updated_tagged_urls,
                       category, author,
                       intro, content), html_content
Example #9
def extract_article_data_old(source, hxs):
    """ process an old-style lavenir.net article page"""
    article_detail_hxs = hxs.select("//div[@id='content']/div[starts-with(@class,'span-3 article-detail')]")

    category = hxs.select("//div[@id='content']/*[1]/p/a/text()").extract()
    intro_h1s = article_detail_hxs.select(".//div[@id='intro']/h1/text()").extract()

    title = ''
    if len(intro_h1s) == 1:
        title = intro_h1s[0].strip()
    else:
        return None, None

    # all the date stuff
    #raw_date = article_detail_hxs.select(".//div[@id='intro']//li[@id='liDate']/*").extract()
    raw_date = ''.join([t.strip() for t in article_detail_hxs.select(".//div[@id='intro']//li[@id='liDate']//text()").extract()])
    pub_date, pub_time = extract_publication_date(raw_date)
    fetched_datetime = datetime.today()

    #author(s)
    raw_author = article_detail_hxs.select("./div/ul/li[@class='author']/text()").extract()
    author = None
    if raw_author:
        author = raw_author[0].strip()

    #intro
    intro = None
    raw_intro = article_detail_hxs.select("./div/div[@class='intro ']//text()").extract()
    if raw_intro:
        intro = ''.join([fragment.strip() for fragment in raw_intro])

    # in photoset pages, the structure is a bit different
    if not intro:
        raw_intro = article_detail_hxs.select("./div/div[@class='intro']//text()").extract()
        if raw_intro:
            intro = ''.join([fragment.strip() for fragment in raw_intro])

    #detect photoset
    full_class = article_detail_hxs.select("./@class").extract()[0]
    if 'article-with-photoset' in full_class.split(" "):
        title = u"{0}|{1}".format("PHOTOSET", title)

    all_links = list()

    #content
    article_body = article_detail_hxs.select("./div/div[@class='article-body ']")
    content = article_body.select(".//p//text()").extract()

    all_links.extend(extract_links_from_article_body(article_body))
    all_links.extend(extract_links_from_highlight_section(article_body.select('../..')))

    # associated sidebar links
    sidebar_links = article_detail_hxs.select("./div/div[@class='article-side']/div[@class='article-related']//li/a")
    all_links.extend(extract_sidebar_links(sidebar_links))

    # bottom links
    bottom_box = hxs.select('//div[@class="span-3 lire-aussi"]//a')
    all_links.extend(extract_bottom_links(bottom_box))

    updated_tagged_urls = update_tagged_urls(all_links, LAVENIR_SAME_OWNER)

    article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                               updated_tagged_urls,
                               category, author,
                               intro, content)

    return article_data
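
Examples #6 and #9 both expect a pre-built HtmlXPathSelector instead of a raw source. A hypothetical wrapper (assumed here; the 'entry-content' test is only an illustrative way to tell the two layouts apart, not the project's actual detection logic) could dispatch between them:

# Hypothetical wrapper, assumed for illustration: read or fetch the page, build
# the selector, then dispatch to the old- or new-style lavenir.net parser.
def extract_article_data(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)
    hxs = HtmlXPathSelector(text=html_content)
    # assumed marker for the new layout (see Example #6); real detection logic not shown
    if hxs.select("//div [@class='entry-content']"):
        article_data = extract_article_data_new_style(source, hxs)
    else:
        article_data = extract_article_data_old(source, hxs)
    return article_data, html_content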