class NRC(object):
    """Scraper for the 'most read' article list on the NRC news website.

    Note: unlike the sibling scrapers this class does not subclass
    HTTPScraper; it only borrows one for its HTTP fetching.
    """

    index_url = "http://www.nrc.nl"
    source = 'NRC - website'

    def __init__(self, project, articleset):
        # Borrow a scraper for its getdoc()
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield an HTMLDocument (url/headline/date set) per listed article."""
        # Follow the link in the "watskeburt" box to the actual list page.
        self.index_url = urljoin(
            self.index_url,
            self.scraper.getdoc(self.index_url).cssselect("div.watskeburt h2 a")[0].get('href'))
        doc = self.scraper.getdoc(self.index_url)
        div = doc.cssselect("div.related")[0]
        # Drop the "retentie" box (if present) so its links are not scraped.
        if div.cssselect("div.retentie"):
            div.cssselect("div.retentie")[0].drop_tree()
        for dl in div.cssselect("dl"):
            article = HTMLDocument()
            article.props.url = urljoin(self.index_url, dl.cssselect("a")[0].get('href'))
            article.props.headline = dl.cssselect("span.title-words")[0].text_content().strip()
            article.props.date = readDate(dl.cssselect("dt.tijd time")[0].get('datetime'))
            yield article

    def _scrape_unit(self, article):
        """Fetch the article page and fill in author (if any) and body text."""
        article.prepare(self.scraper)
        if article.doc.cssselect("div.author"):
            author = article.doc.cssselect("div.author")[0].text_content().strip()
            # BUGFIX: the old code used lstrip("dor"), which strips any leading
            # 'd'/'o'/'r' characters and so mangled author names; remove the
            # Dutch "door" ("by") prefix explicitly instead.
            if author.lower().startswith("door"):
                author = author[len("door"):].strip()
            article.props.author = author
        article.props.text = article.doc.cssselect("#broodtekst")[0]
        yield article
class Volkskrant(HTTPScraper):
    """Scraper for the Volkskrant website's top-5 article list.

    Sets consent cookies for the .volkskrant.nl domain before fetching.
    """

    index_url = "http://www.volkskrant.nl/vk/nl/2/Home/homepage/right.dhtml"
    cookie_url = "http://www.volkskrant.nl/?utm_source=scherm1&utm_medium=button&utm_campaign=Cookiecheck"
    source = 'Volkskrant - website'
    domain = '.volkskrant.nl'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _set_cookies(self):
        # Install cookie-consent cookies so article pages are served.
        for cookie in create_cc_cookies(self.domain):
            self.scraper.opener.cookiejar.set_cookie(cookie)

    def _get_units(self):
        """Yield the URL of each article in the '#top5' list."""
        self._set_cookies()
        doc = self.scraper.getdoc(self.index_url)
        for a in doc.cssselect("#top5 li a"):
            url = urljoin(self.cookie_url, a.get('href'))
            yield url

    def _scrape_unit(self, url):
        """Fetch one article URL and fill headline, author, date and text."""
        article = HTMLDocument(url = url)
        article.prepare(self.scraper)
        article.props.headline = article.doc.cssselect("#articleDetailTitle")[0].text_content()
        time_post = article.doc.cssselect("div.time_post")[0]
        if time_post.cssselect("span.author"):
            author = time_post.cssselect("span.author")[0].text_content().strip()
            # BUGFIX: the old code used lstrip("Dor:"), which strips any
            # leading 'D'/'o'/'r'/':' characters and could eat the start of
            # the author's name; remove the "Door:" ("By:") prefix explicitly.
            if author.lower().startswith("door"):
                author = author[len("door"):].lstrip(":").strip()
            article.props.author = author
            # Drop the author node so only the date remains in time_post.
            time_post.cssselect("span.author")[0].drop_tree()
        article.props.date = readDate(time_post.text_content())
        article.props.text = article.doc.cssselect("#art_box2")[0]
        yield article
class AD(HTTPScraper):
    """Scraper for the Algemeen Dagblad website's most-read list."""

    source = 'Algemeen Dagblad - website'
    index_url = 'http://www.ad.nl'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield the absolute URL of each article in the headline list."""
        doc = self.scraper.getdoc(self.index_url)
        for a in doc.cssselect('#hdr_hvdn_top_list a'):
            href = a.get('href')
            yield urljoin(self.index_url, href)

    def _scrape_unit(self, url):
        """Fetch one article URL and fill author, date, source and text.

        The 'span.author' block contains author and date in free text, e.g.
        "Door: Some Name\\n\\n12-3-13"; parsed with a regex. Falls back to
        today's date when the pattern does not match.
        """
        article = HTMLDocument(url=url)
        article.prepare(self.scraper)
        authordate = article.doc.cssselect('span.author')[0].text_content()
        # Raw string: the original non-raw pattern contained the invalid
        # escape '\-' (a warning in modern Python). In a raw string '\n' is
        # the regex newline escape, so the match is identical.
        p = r"((Bewerkt door)|(Door)):?( |\n)([A-Za-z0-9 ]+)\n\n(([0-9]{1,2}\-){2}[0-9]{1,2})"
        pattern = re.compile(p)
        match = pattern.search(authordate.strip())
        if match:
            article.props.author = match.group(5)   # the name
            article.props.date = readDate(match.group(6))  # d-m-y date
        else:
            article.props.date = date.today()
        # Optional "bron:" (source) suffix; ignore when absent.
        try:
            article.props.source = authordate.split("bron:")[1].strip()
        except IndexError:
            pass
        # NOTE(review): unlike the sibling scrapers this stores a *list* of
        # elements (intro paragraph + body sections), not a single element.
        article.props.text = article.doc.cssselect("section#detail_content p.intro,section.clear")
        article.props.headline = article.doc.cssselect("h1")[0].text
        yield article
class Nu(HTTPScraper):
    """Scraper for nu.nl's top-5 article list."""

    source = 'nu.nl - website'
    index_url = 'http://www.nu.nl'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield the absolute URL of (at most) the first five top-5 links."""
        for a in self.scraper.getdoc(self.index_url).cssselect(".top5 a")[:5]:
            yield urljoin(self.index_url, a.get('href'))

    def _scrape_unit(self, url):
        """Fetch one article URL and fill date, headline, text and author."""
        article = HTMLDocument(url = url)
        article.prepare(self.scraper)
        article.props.date = readDate(article.doc.cssselect("div.dateplace-data")[0].text)
        article.props.headline = article.doc.cssselect("h1")[0].text_content().strip()
        # Remove <script> tags so they do not end up in the article text.
        # (Plain loop: the original built a throwaway list for side effects.)
        for script in article.doc.cssselect("script"):
            script.drop_tree()
        article.props.text = article.doc.cssselect("#leadarticle div.content")[0]
        author = article.doc.cssselect("#leadarticle span.smallprint")
        if author:
            # Byline looks like "| Author Name"; strip the separator chars.
            article.props.author = author[0].text.strip("| ")
        yield article
class Telegraaf(HTTPScraper):
    """Scraper for the Telegraaf website's most-read widget."""

    index_url = "http://www.telegraaf.nl/"
    source = 'Telegraaf - website'

    def __init__(self, project, articleset):
        self.scraper = HTTPScraper(project = project, articleset = articleset)

    def _get_units(self):
        """Yield the URL of each item in the 'meestgelezen' (most read) widget."""
        doc = self.scraper.getdoc(self.index_url)
        for a in doc.cssselect("div.meestgelezenwidget div.pad5")[0].cssselect("li.item a"):
            yield a.get('href')

    def _scrape_unit(self, url):
        """Fetch one article URL and fill date, headline, author and text.

        Selectors handle both regular and premium-content page layouts.
        """
        article = HTMLDocument(url = url)
        article.prepare(self.scraper)
        article.props.date = readDate(
            article.doc.cssselect("#artikel span.datum,#artikel span.datum-premium-content")[0].text_content())
        article.props.headline = article.doc.cssselect("#artikel h1")[0].text_content()
        author = article.doc.cssselect("#artikel span.auteur")
        if author:
            article.props.author = author[0].text_content()
        # Remove <script> tags from the article column before storing text.
        # (Plain loop: the original built a throwaway list for side effects.)
        for script in article.doc.cssselect("#artikelKolom script"):
            script.drop_tree()
        article.props.text = article.doc.cssselect("#artikelKolom,#artikel div.zak-txt-premium-content")[0]
        yield article
def __init__(self, project, articleset):
    # Delegate all HTTP fetching to a borrowed HTTPScraper instance
    # (same pattern as the scraper classes above).
    self.scraper = HTTPScraper(project = project, articleset = articleset)
def __init__(self, project, articleset):
    # Borrow a scraper for its getdoc()
    self.scraper = HTTPScraper(project = project, articleset = articleset)