Example #1
def spider():

    tasks_execution_log = []

    class LoggingTaskRunner(BaseTaskRunner):
        def __call__(self, task):
            tasks_execution_log.append((self, task.id))
            if task.get('abortme', False):
                raise AbortTask()
            if task.get('skipme', False):
                raise SkipRunner()
            if task.get('retryme', False):
                raise RetryTask()
            if task.get('raise_sth', False):
                raise Exception("This is an exception!")
            return iter([])

    class MyTaskRunner(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyTask)

    class MyTaskRunner2(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyTask)

        def __call__(self, task):
            super(MyTaskRunner2, self).__call__(task)
            yield MyOtherTask(
                task_id='was:' + task.id,
                previous=task)

    class MyOtherTaskRunner(LoggingTaskRunner):
        def match(self, task):
            return isinstance(task, MyOtherTask)

    runners = {
        0: MyTaskRunner(),
        1: MyTaskRunner2(),
        2: MyOtherTaskRunner(),
    }

    spider = Spider()
    ## Register runners in key order (the integer keys just fix the ordering)
    spider.add_runners(runner for _, runner in sorted(runners.items()))

    ## Expose test-only hooks, so tests can inspect the execution log
    ## and the runner instances
    spider._testing = {
        'execution_log': tasks_execution_log,
        'runners': runners,
    }

    return spider
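
The fixture above exercises a runner protocol: match() reports whether a runner handles a given task, while __call__() returns an iterable of follow-up tasks or raises one of the control-flow exceptions. The snippet below is a minimal, self-contained sketch of the dispatch loop those exceptions imply; it illustrates the protocol only, it is not the library's actual scheduler, and the exception classes are local stand-ins for the library's own:

from collections import deque


class AbortTask(Exception):
    """Stand-in for the library's 'drop this task entirely' signal."""


class SkipRunner(Exception):
    """Stand-in for the library's 'let the next runner try' signal."""


class RetryTask(Exception):
    """Stand-in for the library's 're-queue this task' signal."""


def run_queue(first_task, runners):
    queue = deque([first_task])
    while queue:
        task = queue.popleft()
        for runner in runners:
            if not runner.match(task):
                continue
            try:
                # A runner returns (or yields) follow-up tasks to enqueue
                queue.extend(runner(task) or ())
            except SkipRunner:
                continue  # This runner declined; offer the task to the next one
            except AbortTask:
                break  # Abandon the task; no further runners see it
            except RetryTask:
                queue.append(task)  # Put the task back for a later attempt
                break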
Example #2
        assert self.match(task)
        response = task['response']
        content_type, params = cgi.parse_header(
            response['headers'].get('content-type') or 'text/html')
        if content_type != 'text/html':
            return  # Not an HTML page: nothing to scrape here
        tree = lxml.html.fromstring(response.content)
        el = tree.xpath('//h1[@id="firstHeading"]')[0]
        yield WikipediaPage(url=task['url'], title=el.text_content())
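
For reference, cgi.parse_header() splits a header value into the bare media type and a dict of parameters, which is why the unpacked content_type above can be compared directly against 'text/html'. (The cgi module is deprecated since Python 3.11.)

>>> import cgi
>>> cgi.parse_header('text/html; charset=UTF-8')
('text/html', {'charset': 'UTF-8'})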


spider = Spider()
spider.add_runners([
    WikipediaDownloader(),
    WikipediaScraper(),
    LinkExtractor(max_depth=3),
])
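
LinkExtractor's internals are not shown in this example. As a rough illustration of how such a runner could plug into the same match()/__call__() protocol, here is a naive sketch; the class name, the depth bookkeeping, and the assumption that a completed DownloadTask carries its response are illustrative guesses, not the library's implementation:

import lxml.html


class NaiveLinkExtractor(object):  # hypothetical stand-in for LinkExtractor
    def __init__(self, max_depth=3):
        self.max_depth = max_depth

    def match(self, task):
        # Illustrative: only handle download tasks that already have a response
        return isinstance(task, DownloadTask) and task.get('response') is not None

    def __call__(self, task):
        depth = task.get('depth', 0)  # 'depth' is a hypothetical task field
        if depth >= self.max_depth:
            return  # Do not follow links past the configured depth
        tree = lxml.html.fromstring(task['response'].content)
        for href in tree.xpath('//a/@href'):
            yield DownloadTask(url=href, depth=depth + 1)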


if __name__ == '__main__':
    try:
        ## Prepare the storage
        if len(sys.argv) > 1:
            storage = AnydbmStorage(path=sys.argv[1])
        else:
            storage = DictStorage()
        spider.add_runners([storage])

        ## Queue the first task
        task = DownloadTask(url='http://en.wikipedia.org')