class NRC(object):
    """Scraper for the 'most read' article list on the NRC news website.

    Note: unlike the sibling scrapers this class does not subclass
    HTTPScraper; it only borrows one for its HTTP fetching.
    """

    index_url = "http://www.nrc.nl"
    source = 'NRC - website'

    def __init__(self, project, articleset):
        # Borrow a scraper for its getdoc()
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield an HTMLDocument (url/headline/date set) per listed article."""
        # Follow the link in the "watskeburt" box to the actual list page.
        self.index_url = urljoin(
            self.index_url,
            self.scraper.getdoc(self.index_url).cssselect("div.watskeburt h2 a")[0].get('href'))
        doc = self.scraper.getdoc(self.index_url)
        div = doc.cssselect("div.related")[0]
        # Drop the "retentie" box (if present) so its links are not scraped.
        if div.cssselect("div.retentie"):
            div.cssselect("div.retentie")[0].drop_tree()
        for dl in div.cssselect("dl"):
            article = HTMLDocument()
            article.props.url = urljoin(self.index_url, dl.cssselect("a")[0].get('href'))
            article.props.headline = dl.cssselect("span.title-words")[0].text_content().strip()
            article.props.date = readDate(dl.cssselect("dt.tijd time")[0].get('datetime'))
            yield article

    def _scrape_unit(self, article):
        """Fetch the article page and fill in author (if any) and body text."""
        article.prepare(self.scraper)
        if article.doc.cssselect("div.author"):
            author = article.doc.cssselect("div.author")[0].text_content().strip()
            # BUGFIX: the old code used lstrip("dor"), which strips any leading
            # 'd'/'o'/'r' characters and so mangled author names; remove the
            # Dutch "door" ("by") prefix explicitly instead.
            if author.lower().startswith("door"):
                author = author[len("door"):].strip()
            article.props.author = author
        article.props.text = article.doc.cssselect("#broodtekst")[0]
        yield article
class Volkskrant(HTTPScraper):
    """Scraper for the Volkskrant website's top-5 article list.

    Sets consent cookies for the .volkskrant.nl domain before fetching.
    """

    index_url = "http://www.volkskrant.nl/vk/nl/2/Home/homepage/right.dhtml"
    cookie_url = "http://www.volkskrant.nl/?utm_source=scherm1&utm_medium=button&utm_campaign=Cookiecheck"
    source = 'Volkskrant - website'
    domain = '.volkskrant.nl'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _set_cookies(self):
        # Install cookie-consent cookies so article pages are served.
        for cookie in create_cc_cookies(self.domain):
            self.scraper.opener.cookiejar.set_cookie(cookie)

    def _get_units(self):
        """Yield the URL of each article in the '#top5' list."""
        self._set_cookies()
        doc = self.scraper.getdoc(self.index_url)
        for a in doc.cssselect("#top5 li a"):
            url = urljoin(self.cookie_url, a.get('href'))
            yield url

    def _scrape_unit(self, url):
        """Fetch one article URL and fill headline, author, date and text."""
        article = HTMLDocument(url = url)
        article.prepare(self.scraper)
        article.props.headline = article.doc.cssselect("#articleDetailTitle")[0].text_content()
        time_post = article.doc.cssselect("div.time_post")[0]
        if time_post.cssselect("span.author"):
            author = time_post.cssselect("span.author")[0].text_content().strip()
            # BUGFIX: the old code used lstrip("Dor:"), which strips any
            # leading 'D'/'o'/'r'/':' characters and could eat the start of
            # the author's name; remove the "Door:" ("By:") prefix explicitly.
            if author.lower().startswith("door"):
                author = author[len("door"):].lstrip(":").strip()
            article.props.author = author
            # Drop the author node so only the date remains in time_post.
            time_post.cssselect("span.author")[0].drop_tree()
        article.props.date = readDate(time_post.text_content())
        article.props.text = article.doc.cssselect("#art_box2")[0]
        yield article
class AD(HTTPScraper):
    """Scraper for the Algemeen Dagblad website's most-read list."""

    source = 'Algemeen Dagblad - website'
    index_url = 'http://www.ad.nl'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield the absolute URL of each article in the headline list."""
        doc = self.scraper.getdoc(self.index_url)
        for a in doc.cssselect('#hdr_hvdn_top_list a'):
            href = a.get('href')
            yield urljoin(self.index_url, href)

    def _scrape_unit(self, url):
        """Fetch one article URL and fill author, date, source and text.

        The 'span.author' block contains author and date in free text, e.g.
        "Door: Some Name\\n\\n12-3-13"; parsed with a regex. Falls back to
        today's date when the pattern does not match.
        """
        article = HTMLDocument(url=url)
        article.prepare(self.scraper)
        authordate = article.doc.cssselect('span.author')[0].text_content()
        # Raw string: the original non-raw pattern contained the invalid
        # escape '\-' (a warning in modern Python). In a raw string '\n' is
        # the regex newline escape, so the match is identical.
        p = r"((Bewerkt door)|(Door)):?( |\n)([A-Za-z0-9 ]+)\n\n(([0-9]{1,2}\-){2}[0-9]{1,2})"
        pattern = re.compile(p)
        match = pattern.search(authordate.strip())
        if match:
            article.props.author = match.group(5)   # the name
            article.props.date = readDate(match.group(6))  # d-m-y date
        else:
            article.props.date = date.today()
        # Optional "bron:" (source) suffix; ignore when absent.
        try:
            article.props.source = authordate.split("bron:")[1].strip()
        except IndexError:
            pass
        # NOTE(review): unlike the sibling scrapers this stores a *list* of
        # elements (intro paragraph + body sections), not a single element.
        article.props.text = article.doc.cssselect("section#detail_content p.intro,section.clear")
        article.props.headline = article.doc.cssselect("h1")[0].text
        yield article
class Nu(HTTPScraper):
    """Scraper for nu.nl's top-5 article list."""

    source = 'nu.nl - website'
    index_url = 'http://www.nu.nl'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield the absolute URL of (at most) the first five top-5 links."""
        for a in self.scraper.getdoc(self.index_url).cssselect(".top5 a")[:5]:
            yield urljoin(self.index_url, a.get('href'))

    def _scrape_unit(self, url):
        """Fetch one article URL and fill date, headline, text and author."""
        article = HTMLDocument(url = url)
        article.prepare(self.scraper)
        article.props.date = readDate(article.doc.cssselect("div.dateplace-data")[0].text)
        article.props.headline = article.doc.cssselect("h1")[0].text_content().strip()
        # Remove <script> tags so they do not end up in the article text.
        # (Plain loop: the original built a throwaway list for side effects.)
        for script in article.doc.cssselect("script"):
            script.drop_tree()
        article.props.text = article.doc.cssselect("#leadarticle div.content")[0]
        author = article.doc.cssselect("#leadarticle span.smallprint")
        if author:
            # Byline looks like "| Author Name"; strip the separator chars.
            article.props.author = author[0].text.strip("| ")
        yield article
class Telegraaf(HTTPScraper):
    """Scraper for the Telegraaf website's most-read widget."""

    index_url = "http://www.telegraaf.nl/"
    source = 'Telegraaf - website'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield the URL of each item in the 'meestgelezen' (most read) widget."""
        doc = self.scraper.getdoc(self.index_url)
        for a in doc.cssselect("div.meestgelezenwidget div.pad5")[0].cssselect("li.item a"):
            yield a.get('href')

    def _scrape_unit(self, url):
        """Fetch one article URL and fill date, headline, author and text.

        Selectors handle both regular and premium-content page layouts.
        """
        article = HTMLDocument(url = url)
        article.prepare(self.scraper)
        article.props.date = readDate(
            article.doc.cssselect("#artikel span.datum,#artikel span.datum-premium-content")[0].text_content())
        article.props.headline = article.doc.cssselect("#artikel h1")[0].text_content()
        author = article.doc.cssselect("#artikel span.auteur")
        if author:
            article.props.author = author[0].text_content()
        # Remove <script> tags from the article column before storing text.
        # (Plain loop: the original built a throwaway list for side effects.)
        for script in article.doc.cssselect("#artikelKolom script"):
            script.drop_tree()
        article.props.text = article.doc.cssselect("#artikelKolom,#artikel div.zak-txt-premium-content")[0]
        yield article
def __init__(self, project, articleset):
    # Delegate all HTTP fetching to a borrowed HTTPScraper instance
    # (same pattern as the scraper classes above).
    self.scraper = HTTPScraper(project = project, articleset = articleset)
def __init__(self, project, articleset):
    # Borrow a scraper for its getdoc()
    self.scraper = HTTPScraper(project = project, articleset = articleset)