def get_regional_toc():
    """Fetch the sudinfo.be regions index and return its (title, url) pairs.

    Scrapes every region link inside the 'bloc_section' divs of the
    first-content column.
    """
    regions_url = 'http://www.sudinfo.be/8/regions'
    page = fetch_content_from_url(regions_url)
    selector = HtmlXPathSelector(text=page)

    anchors = selector.select("//div[@class='first-content w630-300']//div[@class='bloc_section']//li/a")

    titles_and_urls = []
    for anchor in anchors:
        titles_and_urls.append(extract_title_url_from_hxs_a(anchor))
    return titles_and_urls
def extract_article_data(source):
    """Parse one article page into an ArticleData instance.

    `source` is either a file-like object (read directly) or a URL string
    (fetched over HTTP). Returns a tuple (article_data, html_content);
    when the page has no 'mainArticle' div, article_data is None.
    """
    if hasattr(source, 'read'):
        html_content = source.read()
    else:
        html_content = fetch_content_from_url(source)

    soup = make_soup_from_html_content(html_content)
    main_article = soup.find('div', {'id': 'mainArticle'})

    # Guard: pages without the main article container yield no data.
    if not main_article:
        return None, html_content

    title = extract_title(main_article)
    category = extract_category(main_article)
    pub_date, pub_time = extract_date_and_time(main_article)
    # NOTE(review): naive local time — presumably acceptable project-wide; confirm.
    fetched_datetime = datetime.now()

    sidebar_links = extract_links(main_article)
    embedded_links, content = extract_links_and_text_content(main_article)
    intro = extract_intro(main_article)
    author = None  # this site exposes no byline

    article_data = ArticleData(source, title, pub_date, pub_time,
                               fetched_datetime,
                               sidebar_links + embedded_links,
                               category, author, intro, content)
    return article_data, html_content
def get_frontpage_toc():
    """Scrape the sudpresse.be frontpage.

    Collects headlines from columns 1 and 3 plus the regional index and
    returns the triple (headlines, [], []) expected by the toc interface.
    """
    frontpage_url = 'http://sudpresse.be/'
    page = fetch_content_from_url(frontpage_url)
    soup = make_soup_from_html_content(page)

    left_column = soup.find('div', {'class': 'column col-01'})
    right_column = soup.find('div', {'class': 'column col-03'})

    headlines = extract_headlines_from_column_1(left_column)
    headlines.extend(extract_headlines_from_column_3(right_column))

    # NOTE(review): regional entries pass through make_full_url here AND in
    # the return below — presumably idempotent for absolute URLs; confirm.
    headlines.extend(make_full_url(frontpage_url, get_regional_toc()))

    return make_full_url(frontpage_url, headlines), [], []
def get_frontpage_toc():
    """Scrape the rtl.be info frontpage.

    Gathers the lead, small, and module articles from the main content
    area, splits blog posts from news, and returns the toc triple
    (news_items_with_full_urls, blogposts, []).
    """
    url = 'http://www.rtl.be/info/'
    soup = make_soup_from_html_content(fetch_content_from_url(url))
    main = soup.find('div', {'class': 'mainContent'})

    candidates = (extract_first_articles(main)
                  + extract_small_articles(main)
                  + extract_headlines_from_modules(main))

    news_items, blogposts = separate_news_and_blogposts(candidates)

    full_url_items = [make_full_url(item) for item in news_items]
    return full_url_items, list(blogposts), []
def get_frontpage_toc():
    """Scrape the sudinfo.be frontpage plus its regional index.

    Collects headline anchors from the first-content column, the
    'octetFun' column and the buzz block, appends the regional toc,
    separates blog posts, and returns (news, blogposts, []) where news
    URLs are made absolute and ASCII-safe.
    """
    BASE_URL = u'http://www.sudinfo.be/'
    hxs = HtmlXPathSelector(text=fetch_content_from_url(BASE_URL))

    col1 = hxs.select("//div[starts-with(@class, 'first-content')]//div[starts-with(@class, 'article')]//h2/a")
    col3 = hxs.select("//div[@class='octetFun']/a/child::h3/..")
    buzz = hxs.select("//div[@class='buzz exergue clearfix']//h2/a")
    buzz.extend(hxs.select("//div[@class='buzz exergue clearfix']//li/a"))

    headlines = [extract_title_url_from_hxs_a(anchor)
                 for anchor in it.chain(col1, col3, buzz)]
    headlines.extend(get_regional_toc())

    news, blogposts = separate_blogposts_and_news(headlines)

    ascii_news = [(title, convert_utf8_url_to_ascii(url))
                  for title, url in make_full_url(BASE_URL, news)]
    return ascii_news, blogposts, []
def get_regional_toc():
    """Fetch the sudpresse.be regions page and extract its headlines."""
    regions_url = 'http://sudpresse.be/regions'
    soup = make_soup_from_html_content(fetch_content_from_url(regions_url))
    content_div = soup.find('div', {'id': 'content_first'})
    return extract_regional_headlines(content_div)