Exemplo n.º 1
0
    def process_handler_result(self, result, task):
        """
        Dispatch a value produced by a task handler.

        Supported values, checked in this order:
        * Task instance: added to the task queue with `add_task`
        * Data instance: passed to the matching data handler; if that
          handler returns something other than None, every item of it
          is processed recursively with the same `task`. Errors raised
          while doing so are reported through `process_handler_error`
          under the name `data_<handler_key>`
        * None: ignored
        * ResponseNotValid instance: a clone of `task` (with
          `refresh_cache=True`) is queued and the `integrity:<name>`
          counter is incremented
        * any other Exception instance: reported through
          `process_handler_error` under the task handler name
        * dict of the form
          {type: "stat", counters: {name: count},
           collections: {name: [item, ...]}}

        Any other value raises SpiderError.
        """

        if isinstance(result, Task):
            self.add_task(result)
            return

        if isinstance(result, Data):
            data_handler = self.find_data_handler(result)
            try:
                produced = data_handler(**result.storage)
                if produced is not None:
                    for item in produced:
                        self.process_handler_result(item, task)
            except Exception as ex:
                self.process_handler_error(
                    'data_%s' % result.handler_key, ex, task
                )
            return

        if result is None:
            return

        if isinstance(result, ResponseNotValid):
            # Retry the same task, bypassing the cached response
            self.add_task(task.clone(refresh_cache=True))
            error_code = result.__class__.__name__.replace('_', '-')
            self.stat.inc('integrity:%s' % error_code)
            return

        if isinstance(result, Exception):
            task_handler = self.find_task_handler(task)
            self.process_handler_error(
                getattr(task_handler, '__name__', 'NONE'), result, task
            )
            return

        if isinstance(result, dict) and result.get('type') == 'stat':
            for counter_name, amount in result['counters'].items():
                self.stat.inc(counter_name, amount)
            for collection_name, members in result['collections'].items():
                for member in members:
                    self.stat.collect(collection_name, member)
            return

        # Covers both non-"stat" dicts and objects of unsupported types
        raise SpiderError('Unknown result type: %s' % result)
Exemplo n.º 2
0
    def add_task(self, task, queue=None, raise_error=False):
        """
        Put a task into a task queue.

        :param task: task object to enqueue
        :param queue: target queue; `self.task_queue` is used when None
        :param raise_error: if true, raise SpiderError for a task whose
            URL does not start with a supported scheme instead of
            logging the problem
        :returns: True if the task was queued, False if it was rejected
            because of its URL
        :raises SpiderMisuseError: if no queue is available
        """

        target_queue = self.task_queue if queue is None else queue
        if target_queue is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')

        if task.priority is not None and task.priority_set_explicitly:
            task.priority_set_explicitly = True
        else:
            # No explicit priority: let the spider pick one
            task.priority = self.generate_task_priority()
            task.priority_set_explicitly = False

        supported_schemes = ('http://', 'https://', 'ftp://',
                             'file://', 'feed://')
        if task.url.startswith(supported_schemes):
            # TODO: keep original task priority if it was set explicitly
            # (the meaning of this TODO is unclear)
            target_queue.put(
                task, priority=task.priority, schedule_time=task.schedule_time
            )
            return True

        self.stat.collect('task-with-invalid-url', task.url)
        message = 'Invalid task URL: %s' % task.url
        if raise_error:
            raise SpiderError(message)
        # Log the current call stack so the origin of the task is visible
        logger.error(
            '%s\nTraceback:\n%s', message, ''.join(format_stack()),
        )
        return False
Exemplo n.º 3
0
    def process_handler_result(self, result, task=None):
        """
        Dispatch a value produced by a task handler.

        Supported values:
        * Task instance: added to the task queue with `add_task`
        * Data instance: passed to the matching data handler; if that
          handler returns something other than None, every item of it
          is processed recursively with the same `task`. Errors raised
          while doing so are reported through `process_handler_error`
          under the name `data_<handler_key>`
        * None or NullTask instance: ignored

        Any other value raises SpiderError.
        """

        if isinstance(result, Task):
            self.add_task(result)
            return

        if isinstance(result, Data):
            data_handler = self.find_data_handler(result)
            try:
                produced = data_handler(**result.storage)
                if produced is not None:
                    for item in produced:
                        self.process_handler_result(item, task)
            except Exception as ex:
                self.process_handler_error(
                    'data_%s' % result.handler_key, ex, task
                )
            return

        # Nothing to do for "empty" results
        if result is None or isinstance(result, NullTask):
            return

        raise SpiderError('Unknown result type: %s' % result)
Exemplo n.º 4
0
 def log_rejected_task(self, task, reason):
     """
     Record the URL of a rejected task in the matching stat collection.

     `reason` must be 'task-try-count' or 'network-try-count';
     any other value raises SpiderError.
     """
     if reason == 'task-try-count':
         collection = 'task-count-rejected'
     elif reason == 'network-try-count':
         collection = 'network-count-rejected'
     else:
         raise SpiderError('Unknown response from '
                           'check_task_limits: %s' % reason)
     self.stat.collect(collection, task.url)
Exemplo n.º 5
0
 def log_rejected_task(self, task, reason):
     """
     Log a rejected task and record its URL in the matching stat
     collection.

     `reason` must be 'task-try-count' or 'network-try-count';
     any other value raises SpiderError (after the debug message
     has been emitted).
     """
     # Pass the arguments to the logger instead of pre-formatting the
     # message, so the string is only built when the record is emitted
     logger_verbose.debug('Task %s is rejected due to %s limit',
                          task.name, reason)
     if reason == 'task-try-count':
         self.stat.collect('task-count-rejected', task.url)
     elif reason == 'network-try-count':
         self.stat.collect('network-count-rejected', task.url)
     else:
         raise SpiderError('Unknown response from '
                           'check_task_limits: %s' % reason)
Exemplo n.º 6
0
    def process_handler_result(self, result, task=None):
        """
        Dispatch a value produced by a task handler.

        Supported values, checked in this order:
        * Task instance: added to the task queue with `add_task`
        * Data instance: passed to the matching data handler; if that
          handler returns something other than None, every item of it
          is processed recursively with the same `task`. Errors raised
          while doing so are reported through `process_handler_error`
          under the name `data_<handler_key>`
        * None: ignored
        * Exception instance: reported through `process_handler_error`
          under the task handler name
        * dict of the form
          {type: "stat", counters: {name: count},
           collections: {name: [item, ...]}}

        Any other value raises SpiderError.
        """

        if isinstance(result, Task):
            self.add_task(result)
            return

        if isinstance(result, Data):
            data_handler = self.find_data_handler(result)
            try:
                produced = data_handler(**result.storage)
                if produced is not None:
                    for item in produced:
                        self.process_handler_result(item, task)
            except Exception as ex:
                self.process_handler_error(
                    'data_%s' % result.handler_key, ex, task
                )
            return

        if result is None:
            return

        if isinstance(result, Exception):
            task_handler = self.find_task_handler(task)
            self.process_handler_error(
                getattr(task_handler, '__name__', 'NONE'), result, task
            )
            return

        if isinstance(result, dict) and result.get('type') == 'stat':
            for counter_name, amount in result['counters'].items():
                self.stat.inc(counter_name, amount)
            for collection_name, members in result['collections'].items():
                for member in members:
                    self.stat.collect(collection_name, member)
            return

        # Covers both non-"stat" dicts and objects of unsupported types
        raise SpiderError('Unknown result type: %s' % result)
Exemplo n.º 7
0
    def add_task(self, task, raise_error=False):
        """
        Put a task into the task queue.

        In parser mode the task is not queued here: it is put into
        `self.parser_result_queue` as a `(task, None)` tuple and the
        method returns None.

        A URL without a supported scheme is joined with `self.base_url`
        (deprecated); if `base_url` is None this is an error.

        :param task: task object to enqueue
        :param raise_error: if true, re-raise any error that happened
            while checking/resolving the task URL instead of logging it
        :returns: True if the task was queued, False if its URL could
            not be processed, None in parser mode
        :raises SpiderMisuseError: if the task queue is not configured
        """

        # MP:
        # ***
        if self.parser_mode:
            self.parser_result_queue.put((task, None))
            return

        if self.task_queue is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')

        if task.priority is not None and task.priority_is_custom:
            task.priority_is_custom = True
        else:
            # No custom priority: let the spider pick one
            task.priority = self.generate_task_priority()
            task.priority_is_custom = False

        absolute_prefixes = ('http://', 'https://', 'ftp://',
                             'file://', 'feed://')
        try:
            if not task.url.startswith(absolute_prefixes):
                if self.base_url is None:
                    raise SpiderError(
                        'Could not resolve relative URL because base_url '
                        'is not specified. Task: %s, URL: %s'
                        % (task.name, task.url))
                warn('Class attribute `Spider::base_url` is deprecated. '
                     'Use Task objects with absolute URLs')
                task.url = urljoin(self.base_url, task.url)
                # Keep the grab_config object (if any) in sync
                if task.grab_config:
                    task.grab_config['url'] = task.url
        except Exception as ex:
            self.stat.collect('task-with-invalid-url', task.url)
            if raise_error:
                raise
            logger.error('', exc_info=ex)
            return False

        # TODO: keep original task priority if it was set explicitly
        self.task_queue.put(task,
                            task.priority,
                            schedule_time=task.schedule_time)
        return True
Exemplo n.º 8
0
    def add_task(self, task, raise_error=False):
        """
        Put a task into the task queue.

        In parser mode the task is not queued here: it is put into
        `self.parser_result_queue` as a `(task, None)` tuple and the
        method returns None.

        :param task: task object to enqueue
        :param raise_error: if true, raise SpiderError for a task whose
            URL does not start with a supported scheme instead of
            logging the problem
        :returns: True if the task was queued, False if it was rejected
            because of its URL, None in parser mode
        :raises SpiderMisuseError: if the task queue is not configured
        """

        # MP:
        # ***
        if self.parser_mode:
            self.parser_result_queue.put((task, None))
            return

        if self.task_queue is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')

        if task.priority is not None and task.priority_set_explicitly:
            task.priority_set_explicitly = True
        else:
            # No explicit priority: let the spider pick one
            task.priority = self.generate_task_priority()
            task.priority_set_explicitly = False

        supported_schemes = ('http://', 'https://', 'ftp://',
                             'file://', 'feed://')
        if task.url.startswith(supported_schemes):
            # TODO: keep original task priority if it was set explicitly
            # (the meaning of this TODO is unclear)
            self.task_queue.put(task, task.priority,
                                schedule_time=task.schedule_time)
            return True

        self.stat.collect('task-with-invalid-url', task.url)
        error = SpiderError('It is not allowed to build Task object with '
                            'relative URL: %s' % task.url)
        if raise_error:
            raise error

        # The error is raised and caught right away only to get a
        # traceback attached to it for logging; this works around
        # http://bugs.python.org/issue23003
        # FIXME: use something less awkward
        try:
            raise error
        except SpiderError as caught:
            logger.error('', exc_info=caught)
        return False
Exemplo n.º 9
0
    def add_task(self, task, raise_error=False):
        """
        Put a task into the task queue (`self.taskq`).

        For every task that is not a NullTask the URL is checked first:
        a URL without a supported scheme is joined with `self.base_url`;
        if `base_url` is None this is an error.

        :param task: task object to enqueue
        :param raise_error: if true, re-raise any error that happened
            while checking/resolving the task URL instead of logging it
        :returns: True if the task was queued, False if its URL could
            not be processed
        :raises SpiderMisuseError: if the task queue is not configured
        """

        if self.taskq is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')

        if task.priority is not None and task.priority_is_custom:
            task.priority_is_custom = True
        else:
            # No custom priority: let the spider pick one
            task.priority = self.generate_task_priority()
            task.priority_is_custom = False

        if not isinstance(task, NullTask):
            absolute_prefixes = ('http://', 'https://', 'ftp://',
                                 'file://', 'feed://')
            try:
                if not task.url.startswith(absolute_prefixes):
                    if self.base_url is None:
                        raise SpiderError(
                            'Could not resolve relative URL because base_url '
                            'is not specified. Task: %s, URL: %s'
                            % (task.name, task.url))
                    task.url = urljoin(self.base_url, task.url)
                    # Keep the grab_config object (if any) in sync
                    if task.grab_config:
                        task.grab_config['url'] = task.url
            except Exception as ex:
                self.add_item('task-with-invalid-url', task.url)
                if raise_error:
                    raise
                logger.error('', exc_info=ex)
                return False

        # TODO: keep original task priority if it was set explicitly
        self.taskq.put(task, task.priority, schedule_time=task.schedule_time)
        return True
Exemplo n.º 10
0
    def run(self):
        """
        Main method. All work is done here.

        Creates the multicurl transport, sets up the default task queue,
        calls `prepare()` and then loops while `self.work_allowed` is
        true. Each iteration:

        * runs the task generator (if it is enabled)
        * if the transport has free threads, tries to load a new task
          from the queue and either processes it, rejects it (task
          limits) or sleeps/stops depending on what was loaded
        * asks the transport to process its handlers
        * saves cacheable network results and processes every ready
          network result

        KeyboardInterrupt sets `self.interrupted` and is re-raised.
        In every case the 'total' timer is stopped and `shutdown()`
        is called before the method exits.
        """

        self.start_timer('total')

        self.transport = MulticurlTransport(self.thread_number)

        try:
            self.setup_default_queue()
            self.prepare()

            self.start_timer('task_generator')
            # The task generator is initialized only in non-slave mode
            if not self.slave:
                self.init_task_generator()
            self.stop_timer('task_generator')

            while self.work_allowed:
                self.start_timer('task_generator')
                if self.task_generator_enabled:
                    self.process_task_generator()
                self.stop_timer('task_generator')

                free_threads = self.transport.get_free_threads_number()
                if free_threads:
                    logger_verbose.debug(
                        'Transport has free resources (%d). '
                        'Trying to add new task (if exists).' % free_threads)

                    # Try five times to get new task and process task
                    # generator because slave parser could aggressively
                    # consume tasks from task queue
                    for x in six.moves.range(5):
                        task = self.load_new_task()
                        if task is None:
                            if not self.transport.active_task_number():
                                self.process_task_generator()
                        elif task is True:
                            # If only delayed tasks in queue
                            break
                        else:
                            # If got some task
                            break

                    # `task` holds the result of the last
                    # load_new_task() call made in the loop above
                    if not task:
                        if not self.transport.active_task_number():
                            logger_verbose.debug('Network transport has no '
                                                 'active tasks')
                            # Nothing is in flight and nothing can
                            # produce new tasks: finish the work
                            if not self.task_generator_enabled:
                                self.stop()
                        else:
                            logger_verbose.debug(
                                'Transport active tasks: %d' %
                                self.transport.active_task_number())
                    elif isinstance(task, NullTask):
                        logger_verbose.debug('Got NullTask')
                        # Sleep only when the transport is idle
                        if not self.transport.active_task_number():
                            if task.sleep:
                                logger.debug('Got NullTask with sleep '
                                             'instruction. Sleeping for'
                                             ' %.2f seconds' % task.sleep)
                                time.sleep(task.sleep)
                    elif isinstance(task, bool) and (task is True):
                        # Take some sleep to not load CPU
                        if not self.transport.active_task_number():
                            time.sleep(0.1)
                    else:
                        logger_verbose.debug(
                            'Got new task from task queue: %s' % task)
                        self.process_task_counters(task)

                        is_valid, reason = self.check_task_limits(task)
                        if not is_valid:
                            logger_verbose.debug('Task %s is rejected due to '
                                                 '%s limit' %
                                                 (task.name, reason))
                            if reason == 'task-try-count':
                                self.add_item('task-count-rejected', task.url)
                            elif reason == 'network-try-count':
                                self.add_item('network-count-rejected',
                                              task.url)
                            else:
                                raise SpiderError('Unknown response from '
                                                  'check_task_limits: %s' %
                                                  reason)
                            # Give the task a chance to handle its own
                            # rejection
                            handler = task.get_fallback_handler(self)
                            if handler:
                                handler(task)
                        else:
                            self.process_new_task(task)
                            self.transport.process_handlers()

                with self.save_timer('network_transport'):
                    logger_verbose.debug('Asking transport layer to do '
                                         'something')
                    self.transport.process_handlers()

                logger_verbose.debug('Processing network results (if any).')
                # Iterate over network transport ready results
                # Each result could be valid or failed
                # Result format: {ok, grab, grab_config_backup, task, emsg}

                # print '[transport iterate results - start]'
                for result in self.transport.iterate_results():
                    if self.is_valid_for_cache(result):
                        with self.save_timer('cache'):
                            with self.save_timer('cache.write'):
                                self.cache.save_response(
                                    result['task'].url, result['grab'])

                    # print '[process network results]'
                    self.process_network_result(result)
                    # print '[done]'
                    self.inc_count('request')

                # print '[transport iterate results - end]'

            logger_verbose.debug('Work done')
        except KeyboardInterrupt:
            print('\nGot ^C signal in process %d. Stopping.' % os.getpid())
            self.interrupted = True
            raise
        finally:
            # This code is executed when the main loop is exited,
            # normally or because of an exception
            self.stop_timer('total')
            self.shutdown()
Exemplo n.º 11
0
    def process_service_result(self, result, task, meta=None):
        """
        Process result submitted from any service to task dispatcher service.

        Result could be (checked in this order):
        * Task instance: added to the spider task queue
        * None: ignored
        * ResponseNotValid-based exception: a clone of `task` is queued
          and the `integrity:<name>` counter is incremented
        * Arbitrary exception: reported through `process_parser_error`;
          for FatalError the exc_info is also put into
          `fatal_error_queue`
        * Network response:
            {ok, ecode, emsg, error_abbr, exc, grab, grab_config_backup}

        Anything else raises SpiderError.

        Exception can come only from parser_service and it always has
        meta {"from": "parser", "exc_info": <...>}
        """

        meta = {} if meta is None else meta
        spider = self.spider

        if isinstance(result, Task):
            spider.add_task(result)
            return

        if result is None:
            return

        if isinstance(result, ResponseNotValid):
            spider.add_task(task.clone())
            error_code = result.__class__.__name__.replace('_', '-')
            spider.stat.inc('integrity:%s' % error_code)
            return

        if isinstance(result, Exception):
            handler_name = 'NA'
            if task:
                handler_name = getattr(
                    spider.find_task_handler(task), '__name__', 'NONE'
                )
            spider.process_parser_error(
                handler_name, task, meta['exc_info'],
            )
            if isinstance(result, FatalError):
                spider.fatal_error_queue.put(meta['exc_info'])
            return

        if not (isinstance(result, dict) and 'grab' in result):
            raise SpiderError('Unknown result received from a service: %s'
                              % result)

        # Network response
        # TODO: Move to network service
        # starts
        spider.log_network_result_stats(result, task)
        # ends
        if task.get('raw'):
            # Raw tasks go to the parser regardless of the response
            response_is_valid = True
        elif result['ok']:
            response_is_valid = spider.is_valid_network_response_code(
                result['grab'].doc.code, task
            )
        else:
            response_is_valid = False

        if response_is_valid:
            spider.parser_service.input_queue.put((result, task))
        else:
            spider.log_failed_network_result(result)
            # Try to do network request one more time
            # TODO:
            # Implement valid_try_limit
            # Use it if request failed not because of network error
            # But because of content integrity check
            if spider.network_try_limit > 0:
                task.setup_grab_config(result['grab_config_backup'])
                spider.add_task(task)
        spider.stat.inc('spider:request')