Пример #1
0
 def test_task_join_crawl_config(self):
     '''Check how a crawl_config is joined into a task by BaseHandler.

     The expected values below depend on ``self.sample_task_http``, which
     is defined outside this method.
     '''
     # Settings offered by the crawl_config.
     config_headers = {
         'Cookie': 'abc',    # must not replace the task's own Cookie
         'c': 'd',           # new header, expected to be added
     }
     crawl_config = {
         'taskid': 'xxxx',   # must not affect the final task
         'proxy': 'username:password@hostname:port',  # expected in fetch
         'headers': config_headers,  # expected to be merged
     }

     # Fetch section expected after the join.
     expected_fetch = {
         'method': 'GET',
         'proxy': 'username:password@hostname:port',
         'headers': {
             'Cookie': 'a=b',
             'a': 'b',
             'c': 'd',
         },
         'cookies': {
             'c': 'd',
         },
         'timeout': 60,
         'save': 'abc',
     }
     expected_task = {
         'taskid': 'taskid',
         'project': 'project',
         'url': '',
         'fetch': expected_fetch,
         'process': {
             'callback': 'callback',
             'save': [1, 2, 3],
         },
     }

     # Join against a shallow copy so the shared sample dict is not rebound.
     joined = BaseHandler.task_join_crawl_config(
         dict(self.sample_task_http), crawl_config)
     self.assertDictEqual(joined, expected_task)
Пример #2
0
    def on_select_task(self, task):
        '''Stamp project metadata onto a selected task and send it.

        The task mapping is updated in place with ``self.TASK_PACK`` as its
        type and with the owning project's group, md5sum and update time.
        When the project has a truthy ``crawl_config`` it is joined into the
        task through ``BaseHandler.task_join_crawl_config``. The resulting
        task is recorded at the front of the project's ``active_tasks``
        together with the current time, then passed to ``self.send_task``.

        Args:
            task: mapping that must contain ``project``; ``taskid`` and
                ``url`` are also referenced by the log message.

        Returns:
            The task that was sent.

        Raises:
            AssertionError: if ``self.projects`` has no truthy entry for
                ``task['project']``.
        '''
        logger.info('select %(project)s:%(taskid)s %(url)s', task)

        project_info = self.projects.get(task['project'])
        # Raise explicitly instead of using ``assert``: assert statements are
        # removed under ``python -O``, which would let a missing project fall
        # through to the attribute lookups below.
        if not project_info:
            raise AssertionError('no such project')

        task['type'] = self.TASK_PACK
        task['group'] = project_info.group
        task['project_md5sum'] = project_info.md5sum
        task['project_updatetime'] = project_info.updatetime

        # The project's crawl_config is joined only now, at selection time.
        if getattr(project_info, 'crawl_config', None):
            task = BaseHandler.task_join_crawl_config(task, project_info.crawl_config)

        project_info.active_tasks.appendleft((time.time(), task))
        self.send_task(task)
        return task
Пример #3
0
    def on_select_task(self, task):
        '''Called when a task is selected to fetch & process.

        The task mapping is updated in place with the owning project's
        group, md5sum and update time. When the project has a truthy
        ``crawl_config`` it is joined into the task through
        ``BaseHandler.task_join_crawl_config``. The resulting task is
        recorded at the front of the project's ``active_tasks`` together
        with the current time, then passed to ``self.send_task``.

        Args:
            task: mapping that must contain ``project``; ``taskid`` and
                ``url`` are also referenced by the log message.

        Returns:
            The task that was sent.

        Raises:
            AssertionError: if ``self.projects`` has no truthy entry for
                ``task['project']``.
        '''
        logger.info('select %(project)s:%(taskid)s %(url)s', task)

        # inject information about the project
        project_info = self.projects.get(task['project'])
        # Raise explicitly instead of using ``assert``: assert statements are
        # removed under ``python -O``, which would let a missing project fall
        # through to the attribute lookups below.
        if not project_info:
            raise AssertionError('no such project')

        task['group'] = project_info.group
        task['project_md5sum'] = project_info.md5sum
        task['project_updatetime'] = project_info.updatetime

        # lazy join project.crawl_config
        if getattr(project_info, 'crawl_config', None):
            task = BaseHandler.task_join_crawl_config(task, project_info.crawl_config)

        project_info.active_tasks.appendleft((time.time(), task))
        self.send_task(task)
        return task
Пример #4
0
    def on_select_task(self, task):
        '''Called when a task is selected to fetch & process.

        Copies the owning project's group, md5sum and update time onto the
        task mapping (in place), joins the project's ``crawl_config`` into
        the task via ``BaseHandler.task_join_crawl_config`` when that config
        is truthy, pushes ``(time.time(), task)`` onto the left of the
        project's ``active_tasks``, and hands the task to ``self.send_task``.

        Args:
            task: mapping that must contain ``project``; ``taskid`` and
                ``url`` are also referenced by the log message.

        Returns:
            The task that was sent.

        Raises:
            AssertionError: if ``self.projects`` has no truthy entry for
                ``task['project']``.
        '''
        logger.info('select %(project)s:%(taskid)s %(url)s', task)

        # inject information about the project
        project_info = self.projects.get(task['project'])
        # ``assert`` is compiled out under ``python -O``; an explicit raise
        # keeps the unknown-project check (and its AssertionError) in every
        # mode instead of failing later on attribute access.
        if not project_info:
            raise AssertionError('no such project')

        task['group'] = project_info.group
        task['project_md5sum'] = project_info.md5sum
        task['project_updatetime'] = project_info.updatetime

        # lazy join project.crawl_config
        if getattr(project_info, 'crawl_config', None):
            task = BaseHandler.task_join_crawl_config(task, project_info.crawl_config)

        project_info.active_tasks.appendleft((time.time(), task))
        self.send_task(task)
        return task
Пример #5
0
    def test_task_join_crawl_config(self):
        '''Verify the task produced by BaseHandler.task_join_crawl_config.

        The expected values rely on ``self.sample_task_http``, which is
        defined outside this method.
        '''
        proxy = 'username:password@hostname:port'
        crawl_config = {
            'taskid': 'xxxx',  # must not affect the final task
            'proxy': proxy,  # expected to be added to fetch
            'headers': {  # expected to be merged with the task's headers
                'Cookie': 'abc',  # must not replace the existing cookie
                'c': 'd',  # expected to be added as header c
            },
        }

        # Expected headers: the task's own values plus the new 'c' header.
        merged_headers = {'Cookie': 'a=b', 'a': 'b', 'c': 'd'}
        expected = {
            'taskid': 'taskid',
            'project': 'project',
            'url': '',
            'fetch': {
                'method': 'GET',
                'proxy': proxy,
                'headers': merged_headers,
                'cookies': {'c': 'd'},
                'timeout': 60,
                'save': 'abc',
            },
            'process': {
                'callback': 'callback',
                'save': [1, 2, 3],
            },
        }

        task = dict(self.sample_task_http)
        result = BaseHandler.task_join_crawl_config(task, crawl_config)
        self.assertDictEqual(result, expected)