Example #1
File: dhnet.py Project: sevas/csxj-crawler
def get_frontpage_toc():
    url = "http://www.dhnet.be"
    html_content = fetch_html_content(url)
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find("div", {"id": "maincontent"})
    if main_content:
        all_titles_and_urls = []

        # The list here is a combination of several subcontainer types,
        # so we process every type separately.
        first_title, first_url = get_first_story_title_and_url(main_content)
        all_titles_and_urls.append((first_title, first_url))

        # this will pick up the 'announceGroup' containers of the same type in the 'regions' div
        first_announce_groups = main_content.findAll(
            "div", {"class": "announceGroupFirst announceGroup"}, recursive=True
        )
        announce_groups = main_content.findAll("div", {"class": "announceGroup"}, recursive=True)

        # all those containers have two sub stories
        for announce_group in chain(first_announce_groups, announce_groups):
            titles_and_urls = extract_title_and_link_from_anounce_group(announce_group)
            all_titles_and_urls.extend(titles_and_urls)

        return [(title, "http://www.dhnet.be%s" % url) for (title, url) in all_titles_and_urls], [], []
    else:
        return [], [], []
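
Every get_frontpage_toc in these modules shares the same return contract: a three-tuple of (titles_and_urls, blogposts, others) lists, with the third list always empty in the examples shown here. A minimal usage sketch; the import path is an assumption, since the examples only name the file dhnet.py:

# usage sketch -- the import path below is an assumption
from csxj.datasources import dhnet

titles_and_urls, blogposts, others = dhnet.get_frontpage_toc()
for title, url in titles_and_urls:
    print('%s (%s)' % (title, url))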
Example #2
def extract_article_data(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_content_from_url(source)

    soup = make_soup_from_html_content(html_content)

    main_article = soup.find('div', {'id': 'mainArticle'})

    if main_article:
        title = extract_title(main_article)
        category = extract_category(main_article)
        pub_date, pub_time = extract_date_and_time(main_article)
        fetched_datetime = datetime.now()

        links = extract_links(main_article)

        author = None
        embedded_links, content = extract_links_and_text_content(main_article)
        intro = extract_intro(main_article)

        all_links = links + embedded_links

        article_data = ArticleData(source, title, pub_date, pub_time, fetched_datetime, all_links, category, author, intro, content)
        return article_data, html_content

    else:
        return None, html_content
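
The hasattr(source, 'read') check makes extract_article_data source-agnostic: it accepts either an already-open file-like object or a url string. Both call styles, sketched below; the url and filename are placeholders, and the function is assumed to be in scope:

# sketch -- assumes extract_article_data from above is in scope
article_data, raw_html = extract_article_data('http://www.example.com/some-article.html')

# any object with a .read() method works, e.g. previously saved html
with open('saved_article.html') as f:
    article_data, raw_html = extract_article_data(f)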
Example #3
File: dhnet.py Project: sevas/csxj-crawler
def extract_article_data(source):
    """
    """
    if hasattr(source, "read"):
        html_content = source.read()
    else:
        try:
            html_content = fetch_html_content(source)
        except HTTPError as e:
            if e.code == 404:
                return None, None
            else:
                raise

    soup = make_soup_from_html_content(html_content)
    main_content = soup.find("div", {"id": "maincontent"})

    if main_content and main_content.h1:
        title = remove_text_formatting_markup_from_fragments(main_content.h1.contents)
        pub_date, pub_time = extract_date_from_maincontent(main_content)
        category = extract_category_from_maincontent(main_content)
        author_name = extract_author_name_from_maincontent(main_content)

        article_text = main_content.find("div", {"id": "articleText"})
        if article_has_intro(article_text):
            intro = extract_intro_from_articletext(article_text)
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text)
        else:
            intro = u""
            text, in_text_links = extract_text_content_and_links_from_articletext(article_text, False)

        audio_content_links = ipm_utils.extract_embedded_audio_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        sidebox_links = ipm_utils.extract_and_tag_associated_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        bottom_links = ipm_utils.extract_bottom_links(main_content, DHNET_NETLOC, DHNET_INTERNAL_SITES)
        embedded_content_links = extract_links_to_embedded_content(main_content)
        all_links = in_text_links + sidebox_links + embedded_content_links + bottom_links + audio_content_links

        updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.DHNET_SAME_OWNER)

        fetched_datetime = datetime.today()

        new_article = ArticleData(
            source, title, pub_date, pub_time, fetched_datetime, updated_tagged_urls, category, author_name, intro, text
        )
        return new_article, html_content
    else:
        return None, html_content
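
Note the asymmetry in the failure returns: a 404 yields (None, None), while a successfully fetched page without usable article content yields (None, html_content). A calling-side sketch that tells the two apart (the url is a placeholder):

# sketch -- assumes extract_article_data from above is in scope
article, html = extract_article_data('http://www.dhnet.be/some/old/article.html')
if article is None and html is None:
    print('article is gone (HTTP 404)')
elif article is None:
    print('page fetched, but no article content was found')
else:
    print(article.title)  # assumes ArticleData exposes its fields as attributes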
Example #4
def get_frontpage_toc():
    """
    Fetch links to articles listed on the 'Le Soir' front page.
    For each of them, extract the relevant data.
    """
    url = 'http://www.lesoir.be'
    html_content = fetch_html_content(url)

    soup = make_soup_from_html_content(html_content)

    # Here we have interlaced <ul>'s mixed in with unrelated markup that
    # needs some filtering
    stories_containers = soup.findAll('ul', {'class': 'stories_list grid_6'})

    articles_toc, blogpost_toc = [], []

    for container in stories_containers:
        all_stories = set(container.findAll('li', recursive=False))
        main_stories = set(container.findAll('li', {'class': 'stories_main clearfix'}, recursive=False))

        other_stories = all_stories - main_stories

        # So, in _some_ lists of stories, the first one ('main story') has its title in an <h1>
        # and the rest in an <h2>.
        # Also, some have two-column stories.
        # Beautiful soup indeed.
        for item in main_stories:
            title, url = (item.h1.a.get('title'), item.h1.a.get('href'))
            if is_external_blog(url):
                blogpost_toc.append((title, url))
            else:
                articles_toc.append((title, url))

        for item in other_stories:
            if element_has_two_columns_stories(item):
                # For some reason, those links don't have a 'title' attribute.
                # Love this.
                def extract_title_and_link(item):
                    return item.h2.a.contents[0], item.h2.a.get('href')

                for story in get_two_columns_stories(item):
                    title, url = extract_title_and_link(story)
                    if is_external_blog(url):
                        blogpost_toc.append((title, url))
                    else:
                        articles_toc.append((title, url))
            else:
                title, url = (item.h2.a.get('title'), item.h2.a.get('href'))
                if is_external_blog(url):
                    blogpost_toc.append((title, url))
                else:
                    articles_toc.append((title, url))

    return [(title, 'http://www.lesoir.be{0}'.format(url)) for (title, url) in articles_toc], blogpost_toc, []
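
The all_stories - main_stories set difference is a compact way to subtract one findAll result from another. A standalone sketch of the same pattern, written against bs4 for convenience (the original project uses the older BeautifulSoup 3 API; bs4 keeps findAll as an alias and its tags are hashable, so the set arithmetic carries over):

from bs4 import BeautifulSoup

html = '''
<ul class="stories_list grid_6">
  <li class="stories_main clearfix"><h1><a href="/a" title="Main story">Main story</a></h1></li>
  <li><h2><a href="/b" title="Other story">Other story</a></h2></li>
</ul>
'''
soup = BeautifulSoup(html, 'html.parser')
container = soup.find('ul', {'class': 'stories_list grid_6'})

all_stories = set(container.findAll('li', recursive=False))
main_stories = set(container.findAll('li', {'class': 'stories_main clearfix'}, recursive=False))
other_stories = all_stories - main_stories

print([li.h2.a.get('title') for li in other_stories])  # -> ['Other story']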
Example #5
def get_frontpage_toc():
    url = 'http://sudpresse.be/'
    html_content = fetch_content_from_url(url)
    soup = make_soup_from_html_content(html_content)

    column1 = soup.find('div', {'class': 'column col-01'})
    headlines = extract_headlines_from_column_1(column1)
    column3 = soup.find('div', {'class': 'column col-03'})
    headlines.extend(extract_headlines_from_column_3(column3))

    # regional headlines are already made absolute here; passing them through
    # make_full_url again below assumes absolute urls are left untouched
    regional_headlines = make_full_url(url, get_regional_toc())
    headlines.extend(regional_headlines)

    return make_full_url(url, headlines), [], []
Example #6
def get_frontpage_toc():
    hostname_url = 'http://www.lalibre.be'
    html_content = fetch_html_content(hostname_url)

    soup = make_soup_from_html_content(html_content)

    article_list_container = soup.find('div', {'id': 'mainContent'})
    announces = article_list_container.findAll('div', {'class': 'announce'}, recursive=False)

    def extract_title_and_link(announce):
        title, url = announce.h1.a.contents[0], announce.h1.a.get('href')
        return title, '{0}{1}'.format(hostname_url, url)

    return [extract_title_and_link(announce) for announce in announces], [], []
Example #7
def get_frontpage_toc():
    url = 'http://www.rtl.be/info/'
    html_content = fetch_content_from_url(url)
    soup = make_soup_from_html_content(html_content)

    maincontent = soup.find('div', {'class': 'mainContent'})

    first_articles = extract_first_articles(maincontent)
    small_articles = extract_small_articles(maincontent)
    modules_articles = extract_headlines_from_modules(maincontent)

    all_articles = first_articles + small_articles + modules_articles

    news_items, blogposts = separate_news_and_blogposts(all_articles)

    return [make_full_url(title_and_url) for title_and_url in news_items], list(blogposts), []
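
separate_news_and_blogposts is not shown in these examples. A plausible reconstruction, assuming it partitions (title, url) pairs with the same kind of is_external_blog predicate used in example #4; the body below is a guess, not the project's code:

def separate_news_and_blogposts(titles_and_urls):
    # hypothetical reconstruction: split on is_external_blog, preserving order
    news_items, blogposts = [], []
    for title, url in titles_and_urls:
        if is_external_blog(url):
            blogposts.append((title, url))
        else:
            news_items.append((title, url))
    return news_items, blogposts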
Example #8
def extract_main_content_links(source):
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)
    all_links = soup.findAll('a', recursive=True)

    def extract_url_and_title(item):
        url = item.get('href')
        if item.contents and item.contents[0]:
            title = item.contents[0]
        else:
            # yes, this happens
            title = 'No Title found'
        return url, title

    return [extract_url_and_title(l) for l in all_links]
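
With bs4, the same extraction can be written more defensively: get_text() flattens nested markup and returns an empty string for empty anchors, so there is no need to index into .contents at all. A sketch, not part of the original project:

def extract_url_and_title_defensive(anchor):
    # bs4 sketch: handles <a><b>some title</b></a> and <a></a> alike
    url = anchor.get('href')
    title = anchor.get_text(strip=True) or 'No Title found'
    return url, title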
Example #9
def extract_article_data(source):
    """
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content, additional_massage_functions=coveritlive_title_massage_functions)

    if is_page_error_404(soup):
        return None, html_content
    else:
        content = soup.find('div', {'id': 'content'})
        category = extract_category(content)

        article = soup.find('div', {'id': 'article'})
        title = extract_title(article)
        pub_date, pub_time = extract_date(article)
        author = extract_author_name(article)

        fetched_datetime = datetime.today()

        intro, intro_links = extract_intro_and_links(article)
        content, content_links = extract_content_and_links(article)

        associated_links = extract_associated_links(article)
        embedded_media = extract_embedded_media(article)

        all_links = intro_links + content_links + associated_links + embedded_media

        updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.SUDINFO_SAME_OWNER)

        return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                           updated_tagged_urls,
                           category, author,
                           intro, content), html_content
Example #10
def extract_article_data_from_html(html_content, source_url):
    soup = make_soup_from_html_content(html_content)

    main_content = soup.find('div', {'id': 'mainContent'})

    if main_content.h1:
        title = main_content.h1.contents[0].strip()
    else:
        return None, html_content

    category = extract_category(main_content)
    author = extract_author_name(main_content)
    pub_date, pub_time = extract_date(main_content)
    fetched_datetime = datetime.today()

    intro = extract_intro(main_content)
    text_content, in_text_links = extract_text_content_and_links(main_content)

    embedded_audio_links = ipm_utils.extract_embedded_audio_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    associated_tagged_urls = ipm_utils.extract_and_tag_associated_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    bottom_links = ipm_utils.extract_bottom_links(main_content, LALIBRE_NETLOC, LALIBRE_ASSOCIATED_SITES)
    embedded_content_links = extract_embedded_content_links(main_content)

    all_links = in_text_links + associated_tagged_urls + bottom_links + embedded_content_links + embedded_audio_links

    updated_tagged_urls = update_tagged_urls(all_links, ipm_utils.LALIBRE_SAME_OWNER)

    new_article = ArticleData(source_url, title,
                              pub_date, pub_time, fetched_datetime,
                              updated_tagged_urls,
                              category, author,
                              intro, text_content)

    return new_article, html_content
Example #11
def extract_article_data(source):
    """
    source is either a file-like object, or a url.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_html_content(source)

    soup = make_soup_from_html_content(html_content)
    story = soup.find('div', {'id': 'story'})

    category = extract_category(story)
    title = extract_title(story)
    pub_date, pub_time = extract_date(story)
    author = extract_author_name(story)

    sidebar_links = extract_links(soup)

    intro = extract_intro(story)
    content, intext_links = extract_text_content(story)

    fetched_datetime = datetime.today()

    embedded_content_links = extract_links_from_embedded_content(story)

    all_links = sidebar_links + intext_links + embedded_content_links

    updated_tagged_urls = update_tagged_urls(all_links, rossel_utils.LESOIR_SAME_OWNER)

    return ArticleData(source, title, pub_date, pub_time, fetched_datetime,
                       updated_tagged_urls,
                       category, author,
                       intro, content), html_content
Example #12
def get_regional_toc():
    url = 'http://sudpresse.be/regions'
    html_content = fetch_content_from_url(url)
    soup = make_soup_from_html_content(html_content)

    return extract_regional_headlines(soup.find('div', {'id': 'content_first'}))