Пример #1
0
def get_regional_toc():
    """Collect the links listed on the sudinfo.be regions page.

    Downloads the regions page, selects every anchor sitting in a list item
    of a ``bloc_section`` div inside the ``first-content w630-300`` div, and
    hands each anchor to ``extract_title_url_from_hxs_a``.

    Returns:
        A list with one ``extract_title_url_from_hxs_a`` result per anchor
        (presumably ``(title, url)`` pairs -- confirm against that helper).
    """
    regions_url = 'http://www.sudinfo.be/8/regions'
    selector = HtmlXPathSelector(text=fetch_content_from_url(regions_url))

    anchors = selector.select("//div[@class='first-content w630-300']//div[@class='bloc_section']//li/a")
    return list(map(extract_title_url_from_hxs_a, anchors))
Пример #2
0
def extract_article_data(source):
    """Build an ``ArticleData`` from an article page.

    Args:
        source: Either an object exposing ``read()`` (its return value is
            used as the HTML) or a value passed to ``fetch_content_from_url``.

    Returns:
        A ``(article_data, html_content)`` tuple. ``article_data`` is ``None``
        when the page has no ``<div id="mainArticle">`` element (or that
        element is falsy); ``html_content`` is always the raw HTML obtained.
    """
    html_content = source.read() if hasattr(source, 'read') else fetch_content_from_url(source)

    article_node = make_soup_from_html_content(html_content).find('div', {'id': 'mainArticle'})
    if not article_node:
        # Nothing to parse: still hand the raw HTML back to the caller.
        return None, html_content

    title = extract_title(article_node)
    category = extract_category(article_node)
    pub_date, pub_time = extract_date_and_time(article_node)
    fetched_datetime = datetime.now()

    sidebar_links = extract_links(article_node)
    embedded_links, content = extract_links_and_text_content(article_node)
    intro = extract_intro(article_node)

    article_data = ArticleData(
        source,
        title,
        pub_date,
        pub_time,
        fetched_datetime,
        sidebar_links + embedded_links,
        category,
        None,  # author: never extracted by this function
        intro,
        content,
    )
    return article_data, html_content
Пример #3
0
def get_frontpage_toc():
    """Assemble the table of contents for the sudpresse.be front page.

    Headlines are gathered, in order, from the ``column col-01`` div, the
    ``column col-03`` div and the regional listing from ``get_regional_toc``.

    Returns:
        A 3-tuple: the result of ``make_full_url`` applied to all collected
        headlines, followed by two empty lists.
    """
    base_url = 'http://sudpresse.be/'
    page = make_soup_from_html_content(fetch_content_from_url(base_url))

    toc = extract_headlines_from_column_1(page.find('div', {'class': 'column col-01'}))
    toc.extend(extract_headlines_from_column_3(page.find('div', {'class': 'column col-03'})))

    # NOTE(review): regional entries go through make_full_url here and again
    # in the return statement below -- presumably harmless; confirm.
    toc.extend(make_full_url(base_url, get_regional_toc()))

    return make_full_url(base_url, toc), [], []
Пример #4
0
def get_frontpage_toc():
    """Assemble the table of contents for the rtl.be/info front page.

    Articles are pulled from the ``mainContent`` div by three extractors
    (first articles, small articles, module headlines), concatenated in that
    order, then split by ``separate_news_and_blogposts``.

    Returns:
        A 3-tuple: a list with ``make_full_url`` applied to every news item,
        a list of the blogposts, and an empty list.
    """
    frontpage_url = 'http://www.rtl.be/info/'
    page = make_soup_from_html_content(fetch_content_from_url(frontpage_url))
    main_block = page.find('div', {'class': 'mainContent'})

    collected = (
        extract_first_articles(main_block)
        + extract_small_articles(main_block)
        + extract_headlines_from_modules(main_block)
    )
    news_items, blogposts = separate_news_and_blogposts(collected)

    news_toc = [make_full_url(item) for item in news_items]
    return news_toc, list(blogposts), []
Пример #5
0
def get_frontpage_toc():
    """Assemble the table of contents for the sudinfo.be front page.

    Anchors are selected with four XPath queries (first-content articles,
    ``octetFun`` links wrapping an ``h3``, then the ``h2`` and ``li`` links of
    the buzz box), converted with ``extract_title_url_from_hxs_a``, and
    followed by the entries from ``get_regional_toc``. The combined list is
    split by ``separate_blogposts_and_news``.

    Returns:
        A 3-tuple: a list of ``(title, url)`` pairs for the news items, where
        each url has gone through ``make_full_url`` and then
        ``convert_utf8_url_to_ascii``; the blogposts as returned by the
        splitter; and an empty list.
    """
    base_url = u'http://www.sudinfo.be/'
    selector = HtmlXPathSelector(text=fetch_content_from_url(base_url))

    # Query order fixes the order of the resulting headlines.
    headline_xpaths = (
        "//div[starts-with(@class, 'first-content')]//div[starts-with(@class, 'article')]//h2/a",
        "//div[@class='octetFun']/a/child::h3/..",
        "//div[@class='buzz exergue clearfix']//h2/a",
        "//div[@class='buzz exergue clearfix']//li/a",
    )
    anchor_groups = [selector.select(xpath) for xpath in headline_xpaths]
    headlines = [
        extract_title_url_from_hxs_a(anchor)
        for anchor in it.chain.from_iterable(anchor_groups)
    ]
    headlines.extend(get_regional_toc())

    news, blogposts = separate_blogposts_and_news(headlines)
    ascii_news = [
        (title, convert_utf8_url_to_ascii(url))
        for title, url in make_full_url(base_url, news)
    ]
    return ascii_news, blogposts, []
Пример #6
0
def get_regional_toc():
    """Return the regional headlines from the sudpresse.be regions page.

    Downloads the page, locates the ``<div id="content_first">`` element and
    returns whatever ``extract_regional_headlines`` produces for it.
    """
    regions_url = 'http://sudpresse.be/regions'
    page = make_soup_from_html_content(fetch_content_from_url(regions_url))

    content_first = page.find('div', {'id': 'content_first'})
    return extract_regional_headlines(content_first)