Example #1
 def _set_response(self, val):
     warn(
         'The `Grab.response` attribute is deprecated. '
         'Use `Grab.doc` instead.',
         stacklevel=3)
     # pylint: disable=assigning-non-slot, attribute-defined-outside-init
     self.doc = val
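This setter pairs with a getter (shown later in this list); a minimal self-contained sketch of how the deprecated `Grab.response` attribute can be wired to `Grab.doc` through a `property`, with a stand-in for grab's `warn` helper (an assumption, not the library's actual code):

import warnings

def warn(msg, stacklevel=2):
    # Stand-in for grab.util.warning.warn (assumed for this sketch).
    warnings.warn(msg, category=DeprecationWarning, stacklevel=stacklevel)

class Grab(object):
    def __init__(self):
        self.doc = None

    def _get_response(self):
        warn('The `Grab.response` attribute is deprecated. '
             'Use `Grab.doc` instead.')
        return self.doc

    def _set_response(self, val):
        warn('The `Grab.response` attribute is deprecated. '
             'Use `Grab.doc` instead.', stacklevel=3)
        self.doc = val

    response = property(_get_response, _set_response)

With this wiring, reading or assigning `g.response` emits the warning and transparently proxies `g.doc`.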
Example #2
File: base.py Project: lorien/grab
    def setup_cache(self, backend='mongodb', database=None,
                    **kwargs):
        """
        Setup cache.

        :param backend: Backend name
            Should be one of the following: 'mongodb', 'mysql' or 'postgresql'.
        :param database: Database name.
        :param kwargs: Additional credentials for backend.

        """
        if database is None:
            raise SpiderMisuseError('setup_cache method requires database '
                                    'option')
        if backend == 'mongo':
            warn('Backend name "mongo" is deprecated. Use "mongodb" instead.')
            backend = 'mongodb'
        mod = __import__('grab.spider.cache_backend.%s' % backend,
                         globals(), locals(), ['foo'])
        backend = mod.CacheBackend(
            database=database, spider=self, **kwargs
        )
        self.cache_reader_service = CacheReaderService(self, backend)
        backend = mod.CacheBackend(
            database=database, spider=self, **kwargs
        )
        self.cache_writer_service = CacheWriterService(self, backend)
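The dummy `['foo']` fromlist is what makes `__import__` return the leaf submodule (here `grab.spider.cache_backend.<backend>`) rather than the top-level `grab` package. The same idiom demonstrated on the standard library:

import os
import os.path

top = __import__('os.path')
leaf = __import__('os.path', globals(), locals(), ['foo'])
assert top is os         # empty fromlist: returns the top-level package
assert leaf is os.path   # non-empty fromlist: returns the submodule itself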
Example #3
 def submit(self, *args, **kwargs):
     warn(
         'Method `Document.submit` is deprecated. '
         'Use `Grab.submit` method instead.',
         stacklevel=3
     )
     self.grab.submit(*args, **kwargs)
Example #4
 def xml_tree(self):
     """
     Return DOM-tree of the document built with XML DOM builder.
     """
     warn('Attribute `grab.xml_tree` is deprecated. '
          'Use `Grab.doc.tree` attribute '
          'AND content_type="xml" option instead.')
     return self.build_xml_tree()
Example #5
 @contextmanager
 def save_timer(self, key):
     warn('Method `Spider::save_timer` is deprecated. '
          'Use `Spider::timer.log_time` method instead.')
     self.timer.start(key)
     try:
         yield
     finally:
         self.timer.stop(key)
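Because of the `@contextmanager` decorator, `save_timer` is used as a `with` block. The warning points at `Spider::timer.log_time`; a minimal self-contained sketch of a timer exposing such a method, assuming a start/stop API shaped like the calls above (not grab's actual `Timer` class):

import time
from contextlib import contextmanager

class Timer(object):
    def __init__(self):
        self.timers = {}
        self._start = {}

    def start(self, key):
        self._start[key] = time.time()

    def stop(self, key):
        elapsed = time.time() - self._start.pop(key)
        self.timers[key] = self.timers.get(key, 0) + elapsed

    @contextmanager
    def log_time(self, key):
        # The replacement named in the deprecation message.
        self.start(key)
        try:
            yield
        finally:
            self.stop(key)

timer = Timer()
with timer.log_time('sleep'):
    time.sleep(0.05)
print(timer.timers['sleep'])  # roughly 0.05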
Example #6
    def load_module(self, name):
        """
        This method is called by Python if CustomImporter.find_module
        does not return None.
        """
        try:
            module = import_module(self.name, 'weblib')
            sys.modules[name] = module
            warn('Module `grab.tools%s` is deprecated. '
                 'Use `weblib%s` module.' % (self.name, self.name))
        except Exception:
            raise ImportError(name)

        return module
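`load_module` is the loader half of a PEP 302 import hook that aliases `grab.tools.*` to `weblib.*`. A sketch of how the finder half and the registration might fit together (inferred from this snippet, not the verbatim grab code; the deprecation warning is elided):

import sys
from importlib import import_module

class CustomImporter(object):
    def find_module(self, name, path=None):
        # Claim imports of grab.tools submodules; keep the trailing
        # part ('.text', '.html', ...) for load_module.
        if name.startswith('grab.tools.'):
            self.name = name[len('grab.tools'):]
            return self
        return None

    def load_module(self, name):
        try:
            # Relative import inside the weblib package, e.g.
            # import_module('.text', 'weblib') for grab.tools.text.
            module = import_module(self.name, 'weblib')
        except ImportError:
            raise ImportError(name)
        sys.modules[name] = module
        return module

# Registering the hook activates the alias:
# sys.meta_path.append(CustomImporter())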
Example #7
    def add_task(self, task, raise_error=False):
        """
        Add task to the task queue.
        """

        # MP:
        # ***
        if self.parser_mode:
            self.parser_result_queue.put((task, None))
            return

        if self.task_queue is None:
            raise SpiderMisuseError('You should configure task queue before '
                                    'adding tasks. Use `setup_queue` method.')
        if task.priority is None or not task.priority_is_custom:
            task.priority = self.generate_task_priority()
            task.priority_is_custom = False
        else:
            task.priority_is_custom = True

        try:
            if not task.url.startswith(
                ('http://', 'https://', 'ftp://', 'file://', 'feed://')):
                if self.base_url is None:
                    msg = 'Could not resolve relative URL because base_url ' \
                          'is not specified. Task: %s, URL: %s'\
                          % (task.name, task.url)
                    raise SpiderError(msg)
                else:
                    warn('Class attribute `Spider::base_url` is deprecated. '
                         'Use Task objects with absolute URLs')
                    task.url = urljoin(self.base_url, task.url)
                    # If task has grab_config object then update it too
                    if task.grab_config:
                        task.grab_config['url'] = task.url
        except Exception as ex:
            self.stat.collect('task-with-invalid-url', task.url)
            if raise_error:
                raise
            else:
                logger.error('', exc_info=ex)
                return False

        # TODO: keep original task priority if it was set explicitly
        self.task_queue.put(task,
                            task.priority,
                            schedule_time=task.schedule_time)
        return True
Example #8
File: base.py Project: lorien/grab
    def setup_queue(self, backend='memory', **kwargs):
        """
        Setup queue.

        :param backend: Backend name
            Should be one of the following: 'memory', 'redis' or 'mongodb'.
        :param kwargs: Additional credentials for backend.
        """
        if backend == 'mongo':
            warn('Backend name "mongo" is deprecated. Use "mongodb" instead.')
            backend = 'mongodb'
        logger.debug('Using %s backend for task queue', backend)
        mod = __import__('grab.spider.queue_backend.%s' % backend,
                         globals(), locals(), ['foo'])
        self.task_queue = mod.QueueBackend(spider_name=self.get_spider_name(),
                                           **kwargs)
Example #9
File: base.py Project: lorien/grab
    def render_stats(self, timing=None):
        if timing is not None:
            warn('Option timing of method render_stats is deprecated.'
                 ' There is no more timing feature.')
        out = ['------------ Stats: ------------']
        out.append('Counters:')

        # Process counters
        items = sorted(self.stat.counters.items(),
                       key=lambda x: x[0], reverse=True)
        for item in items:
            out.append('  %s: %s' % item)
        out.append('')

        out.append('Lists:')
        # Process collections sorted by size desc
        col_sizes = [(x, len(y)) for x, y in self.stat.collections.items()]
        col_sizes = sorted(col_sizes, key=lambda x: x[1], reverse=True)
        for col_size in col_sizes:
            out.append('  %s: %d' % col_size)
        out.append('')

        # Process extra metrics
        if 'download-size' in self.stat.counters:
            out.append('Network download: %s' %
                       metric.format_traffic_value(
                           self.stat.counters['download-size']))
        out.append('Queue size: %s' % (self.task_queue.size()
                                       if self.task_queue else 'NA'))
        out.append('Network streams: %d' % self.thread_number)
        if self._started:
            elapsed = time.time() - self._started
        else:
            elapsed = 0
        hours, seconds = divmod(elapsed, 3600)
        minutes, seconds = divmod(seconds, 60)
        out.append('Time elapsed: %d:%d:%d (H:M:S)' % (
            hours, minutes, seconds))
        out.append('End time: %s' %
                   datetime.utcnow().strftime('%d %b %Y, %H:%M:%S UTC'))
        return '\n'.join(out) + '\n'
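The elapsed-time formatting chains `divmod`: the first call splits the elapsed seconds into hours and a remainder, the second splits that remainder into minutes and seconds. A quick worked example:

elapsed = 3725.0  # 1 hour, 2 minutes, 5 seconds
hours, seconds = divmod(elapsed, 3600)
minutes, seconds = divmod(seconds, 60)
print('Time elapsed: %d:%d:%d (H:M:S)' % (hours, minutes, seconds))
# -> Time elapsed: 1:2:5 (H:M:S)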
Example #10
    def setup(self, **kwargs):
        """
        Setting up Grab instance configuration.
        """

        if 'hammer_mode' in kwargs:
            warn('Option `hammer_mode` is deprecated. Grab does not '
                 'support hammer mode anymore.')
            del kwargs['hammer_mode']

        if 'hammer_timeouts' in kwargs:
            warn('Option `hammer_timeouts` is deprecated. Grab does not '
                 'support hammer mode anymore.')
            del kwargs['hammer_timeouts']

        for key in kwargs:
            if key not in self.config.keys():
                raise error.GrabMisuseError('Unknown option: %s' % key)

        if 'url' in kwargs:
            if self.config.get('url'):
                kwargs['url'] = self.make_url_absolute(kwargs['url'])
        self.config.update(kwargs)
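`setup` validates incoming keys against the keys already present in the config dict, so misspelled options fail fast instead of being silently ignored. A toy sketch of the same pattern (hypothetical `MiniGrab`, not grab's real config machinery):

class GrabMisuseError(Exception):
    pass

class MiniGrab(object):
    def __init__(self):
        self.config = {'url': None, 'timeout': 10}

    def setup(self, **kwargs):
        # Deprecated keys are accepted but dropped.
        for key in ('hammer_mode', 'hammer_timeouts'):
            kwargs.pop(key, None)
        # Anything not already in config is a misuse.
        for key in kwargs:
            if key not in self.config:
                raise GrabMisuseError('Unknown option: %s' % key)
        self.config.update(kwargs)

g = MiniGrab()
g.setup(timeout=5, hammer_mode=True)  # hammer_mode is dropped
print(g.config['timeout'])            # 5
# g.setup(bogus=1) would raise GrabMisuseError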
Example #11
File: base.py Project: lorien/grab
    def __init__(
            self,
            thread_number=None,
            network_try_limit=None, task_try_limit=None,
            request_pause=NULL,
            priority_mode='random',
            meta=None,
            only_cache=False,
            config=None,
            args=None,
            parser_requests_per_process=10000,
            parser_pool_size=1,
            http_api_port=None,
            network_service='threaded',
            grab_transport='pycurl',
            # Deprecated
            transport=None):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to retry a request
            if a network error occurred; use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            while task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        """

        self.fatal_error_queue = Queue()
        self.task_queue_parameters = None
        self.http_api_port = http_api_port
        self._started = None
        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport
        self.parser_requests_per_process = parser_requests_per_process
        self.stat = Stat()
        self.task_queue = None
        if args is None:
            self.args = {}
        else:
            self.args = args
        if config is not None:
            self.config = config
        else:
            self.config = {}
        if meta:
            self.meta = meta
        else:
            self.meta = {}
        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))
        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode
        self.only_cache = only_cache
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')
        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
        self.cache_reader_service = None
        self.cache_writer_service = None
        self.parser_pool_size = parser_pool_size
        self.parser_service = ParserService(
            spider=self,
            pool_size=self.parser_pool_size,
        )
        if transport is not None:
            warn('The "transport" argument of Spider constructor is'
                 ' deprecated. Use "network_service" argument.')
            network_service = transport
        assert network_service in ('threaded',)
        if network_service == 'threaded':
            # pylint: disable=no-name-in-module, import-error
            from grab.spider.network_service.threaded import (
                NetworkServiceThreaded
            )
            self.network_service = NetworkServiceThreaded(
                self, self.thread_number
            )
        self.task_dispatcher = TaskDispatcherService(self)
        if self.http_api_port:
            self.http_api_service = HttpApiService(self)
        else:
            self.http_api_service = None
        self.task_generator_service = TaskGeneratorService(
            self.task_generator(), self,
        )
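Options such as `thread_number` resolve in three steps: an explicit constructor argument wins, then the config dict, then a module-level default. The idiom in isolation (the default value here is an assumption for illustration):

DEFAULT_NETWORK_STREAM_NUMBER = 3  # assumed value, for illustration
config = {'thread_number': 10}
thread_number = None               # not passed to the constructor

resolved = (thread_number or
            int(config.get('thread_number',
                           DEFAULT_NETWORK_STREAM_NUMBER)))
print(resolved)  # 10; falls back to 3 only if config lacks the key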
Example #12
    def __init__(
            self,
            thread_number=None,
            network_try_limit=None, task_try_limit=None,
            request_pause=NULL,
            priority_mode='random',
            meta=None,
            config=None,
            args=None,
            parser_requests_per_process=10000,
            parser_pool_size=1,
            http_api_port=None,
            network_service='threaded',
            grab_transport='pycurl',
            # Deprecated
            transport=None,
            only_cache=False,
        ):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to retry a request
            if a network error occurred; use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            while task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        """

        self.fatal_error_queue = Queue()
        self.task_queue_parameters = None
        self.http_api_port = http_api_port
        self._started = None
        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport
        self.parser_requests_per_process = parser_requests_per_process
        self.stat = Stat()
        self.task_queue = None
        if args is None:
            self.args = {}
        else:
            self.args = args
        if config is not None:
            self.config = config
        else:
            self.config = {}
        if meta:
            self.meta = meta
        else:
            self.meta = {}
        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))
        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode
        if only_cache:
            raise_feature_is_deprecated('Cache feature')
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')
        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
        self.parser_pool_size = parser_pool_size
        self.parser_service = ParserService(
            spider=self,
            pool_size=self.parser_pool_size,
        )
        if transport is not None:
            warn('The "transport" argument of Spider constructor is'
                 ' deprecated. Use "network_service" argument.')
            network_service = transport
        assert network_service in ('threaded',)
        if network_service == 'threaded':
            # pylint: disable=no-name-in-module, import-error
            from grab.spider.network_service.threaded import (
                NetworkServiceThreaded
            )
            self.network_service = NetworkServiceThreaded(
                self, self.thread_number
            )
        self.task_dispatcher = TaskDispatcherService(self)
        if self.http_api_port:
            self.http_api_service = HttpApiService(self)
        else:
            self.http_api_service = None
        self.task_generator_service = TaskGeneratorService(
            self.task_generator(), self,
        )
Example #13
    def __init__(self, thread_number=None,
                 network_try_limit=None, task_try_limit=None,
                 request_pause=NULL,
                 priority_mode='random',
                 meta=None,
                 only_cache=False,
                 config=None,
                 slave=None,
                 args=None,
                 # New options start here
                 taskq=None,
                 # MP:
                 network_result_queue=None,
                 parser_result_queue=None,
                 is_parser_idle=None,
                 shutdown_event=None,
                 mp_mode=False,
                 parser_pool_size=None,
                 parser_mode=False,
                 parser_requests_per_process=10000,
                 # http api
                 http_api_port=None,
                 transport='multicurl',
                 grab_transport='pycurl',
                 ):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to retry a request
            if a network error occurred; use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            while task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        New options:
        * taskq=None,
        * network_result_queue=None,
        """

        if slave is not None:
            raise SpiderConfigurationError(
                'Slave mode is not supported anymore. '
                'Use `mp_mode=True` option to run multiple HTML'
                ' parser processes.')

        # API:
        self.http_api_port = http_api_port

        assert transport in ('multicurl', 'threaded')
        self.transport_name = transport

        assert grab_transport in ('pycurl', 'urllib3')
        self.grab_transport_name = grab_transport

        # MP:
        self.mp_mode = mp_mode
        if self.mp_mode:
            from multiprocessing import Process, Event, Queue
        else:
            from multiprocessing.dummy import Process, Event, Queue

        if network_result_queue is not None:
            self.network_result_queue = network_result_queue
        else:
            self.network_result_queue = Queue()
        self.parser_result_queue = parser_result_queue
        self.is_parser_idle = is_parser_idle
        if shutdown_event is not None:
            self.shutdown_event = shutdown_event
        else:
            self.shutdown_event = Event()
        if not self.mp_mode and parser_pool_size and parser_pool_size > 1:
            raise SpiderConfigurationError(
                'Parser pool size could be only 1 in '
                'non-multiprocess mode')
        self.parser_pool_size = parser_pool_size
        self.parser_mode = parser_mode
        self.parser_requests_per_process = parser_requests_per_process

        self.stat = Stat()
        self.timer = Timer()
        self.task_queue = taskq

        if args is None:
            self.args = {}
        else:
            self.args = args

        if config is not None:
            self.config = config
        else:
            self.config = {}

        if meta:
            self.meta = meta
        else:
            self.meta = {}

        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))

        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode

        self.only_cache = only_cache
        self.cache_pipeline = None
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')

        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
Example #14
 def valid_response_code(self, code, task):
     warn('Method `Spider::valid_response_code` is deprecated. '
          'Use `Spider::is_valid_network_response_code` method or '
          '`Spider::is_valid_network_result` method.')
     return self.is_valid_network_response_code(code, task)
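Shims like this one repeat the same warn-and-delegate shape. A generic helper capturing the pattern (a hypothetical decorator, not part of grab; the response-code check is a toy stand-in):

import functools
import warnings

def deprecated(replacement):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn('Method `%s` is deprecated. Use `%s` instead.'
                          % (func.__name__, replacement),
                          DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)
        return wrapper
    return decorator

class Spider(object):
    def is_valid_network_response_code(self, code, task):
        return code < 400  # toy check, not grab's actual logic

    @deprecated('Spider::is_valid_network_response_code')
    def valid_response_code(self, code, task):
        return self.is_valid_network_response_code(code, task)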
Example #15
 def set_grab_config(self, val):
     warn('Using `grab_config` attribute is deprecated. Override '
          '`create_grab_instance` method instead.')
     self._grab_config = val
Example #16
 def __init__(self, *args, **kwargs):
     warn('You are using XpathSelector from deprecated `grab.selector` '
          'package. Please, switch to `selection` package.')
     super(XpathSelector, self).__init__(*args, **kwargs)
Example #17
 def items(self):
     warn('Attribute `Spider::items` is deprecated. '
          'Use `Spider::stat.collections` attribute instead.')
     return self.stat.collections
Example #18
 def inc_count(self, key, count=1):
     warn('Method `Spider::inc_count` is deprecated. '
          'Use `Spider::stat.inc` method instead.')
     self.stat.inc(key, count)
Example #19
 def test_warn(self):
     out = StringIO()
     with mock.patch('sys.stderr', out):
         warn('abc')
     self.assertTrue('GrabDeprecationWarning: abc' in out.getvalue())
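For this assertion to hold, `warn` must route the message through Python's warnings machinery with a `GrabDeprecationWarning` category: the default `showwarning` hook writes '<file>:<lineno>: GrabDeprecationWarning: <message>' to `sys.stderr`, which the test captures by patching `sys.stderr`. A sketch consistent with the test (the category name is inferred from the expected output):

import warnings

class GrabDeprecationWarning(UserWarning):
    pass

def warn(msg, stacklevel=2):
    warnings.warn(msg, category=GrabDeprecationWarning,
                  stacklevel=stacklevel)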
Example #20
 def form(self):
     warn('The `Grab.form` attribute is deprecated. '
          'Use `Grab.doc.form` instead.')
     return self.doc.form
Example #21
from grab.selector.selector import *  # noqa
from grab.util.warning import warn

warn("Module `grab.selector` is deprecated. Use `selection` package.")
Example #22
    def __init__(self, thread_number=None,
                 network_try_limit=None, task_try_limit=None,
                 request_pause=NULL,
                 priority_mode='random',
                 meta=None,
                 only_cache=False,
                 config=None,
                 slave=None,
                 args=None,
                 # New options start here
                 taskq=None,
                 # MP:
                 network_result_queue=None,
                 parser_result_queue=None,
                 is_parser_idle=None,
                 shutdown_event=None,
                 mp_mode=False,
                 parser_pool_size=None,
                 parser_mode=False,
                 parser_requests_per_process=10000,
                 # http api
                 http_api_port=None,
                 ):
        """
        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to retry a request
            if a network error occurred; use 0 to disable
        * task_try_limit - Limit of tries to execute some task;
            this is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            while task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data
        * retry_rebuild_user_agent - generate new random user-agent for each
            network request which is performed again due to network error
        * args - command line arguments parsed with `setup_arg_parser` method
        New options:
        * taskq=None,
        * network_result_queue=None,
        """

        if slave is not None:
            raise SpiderConfigurationError(
                'Slave mode is not supported anymore. '
                'Use `mp_mode=True` option to run multiple HTML'
                ' parser processes.')

        # API:
        self.http_api_port = http_api_port

        # MP:
        self.mp_mode = mp_mode
        if self.mp_mode:
            from multiprocessing import Process, Event, Queue
        else:
            from multiprocessing.dummy import Process, Event, Queue

        if network_result_queue is not None:
            self.network_result_queue = network_result_queue
        else:
            self.network_result_queue = Queue()
        self.parser_result_queue = parser_result_queue
        self.is_parser_idle = is_parser_idle
        if shutdown_event is not None:
            self.shutdown_event = shutdown_event
        else:
            self.shutdown_event = Event()
        if not self.mp_mode and parser_pool_size and parser_pool_size > 1:
            raise SpiderConfigurationError(
                'Parser pool size could be only 1 in '
                'non-multiprocess mode')
        self.parser_pool_size = parser_pool_size
        self.parser_mode = parser_mode
        self.parser_requests_per_process = parser_requests_per_process

        self.stat = Stat()
        self.timer = Timer()
        self.task_queue = taskq

        if args is None:
            self.args = {}
        else:
            self.args = args

        if config is not None:
            self.config = config
        else:
            self.config = {}

        if meta:
            self.meta = meta
        else:
            self.meta = {}

        self.thread_number = (
            thread_number or
            int(self.config.get('thread_number',
                                DEFAULT_NETWORK_STREAM_NUMBER)))
        self.task_try_limit = (
            task_try_limit or
            int(self.config.get('task_try_limit', DEFAULT_TASK_TRY_LIMIT)))
        self.network_try_limit = (
            network_try_limit or
            int(self.config.get('network_try_limit',
                                DEFAULT_NETWORK_TRY_LIMIT)))

        self._grab_config = {}
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode

        self.only_cache = only_cache
        self.cache_pipeline = None
        self.work_allowed = True
        if request_pause is not NULL:
            warn('Option `request_pause` is deprecated and is not '
                 'supported anymore')

        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False
        self.interrupted = False
Example #23
 def add_item(self, list_name, item):
     warn('Method `Spider::add_item` is deprecated. '
          'Use `Spider::stat.collect` method instead.')
     self.stat.collect(list_name, item)
Example #24
 def stop_timer(self, key):
     warn('Method `Spider::stop_timer` is deprecated. '
          'Use `Spider::timer.stop` method instead.')
     self.timer.stop(key)
Example #25
 def setup_spider_config(cls, config):
     warn('Method `Spider::setup_spider_config` is deprecated. '
          'Use `Spider::update_spider_config` method.')
     cls.update_spider_config(config)
Example #26
 def counters(self):
     warn('Attribute `Spider::counters` is deprecated. '
          'Use `Spider::stat.counters` attribute instead.')
     return self.stat.counters
Example #27
File: task.py Project: lorien/grab
    def __init__(self, name=None, url=None, grab=None, grab_config=None,
                 priority=None, priority_set_explicitly=True,
                 network_try_count=0, task_try_count=1,
                 disable_cache=False, refresh_cache=False,
                 valid_status=None, use_proxylist=True,
                 cache_timeout=None, delay=None,
                 raw=False, callback=None,
                 fallback_name=None,
                 **kwargs):
        """
        Create `Task` object.

        If more than one of the url, grab and grab_config options is
        non-empty, they are processed in the following order:
        * grab overrides grab_config
        * grab_config overrides url

        Args:
            :param name: name of the task. After successful network operation
                task's result will be passed to `task_<name>` method.
            :param url: URL of network document. Any task requires `url` or
                `grab` option to be specified.
            :param grab: configured `Grab` instance. You can use that option in
                case when `url` option is not enough. Do not forget to
                configure `url` option of `Grab` instance because in this case
                the `url` option of `Task` constructor will be overwritten
                with `grab.config['url']`.
            :param priority: priority of the Task. Tasks with lower priority
                will be processed earlier. By default each new task is
                assigned a random priority from the (80, 100) range.
            :param priority_set_explicitly: internal flag which tells whether
                the task priority was assigned manually or generated by the
                spider according to priority generation rules.
            :param network_try_count: you probably will not need to use it.
                It is used internally to control how many times this task was
                restarted due to network errors. The `Spider` instance has
                `network_try_limit` option. When `network_try_count` attribute
                of the task exceeds the `network_try_limit` attribute then
                processing of the task is abandoned.
            :param task_try_count: the same as `network_try_count` but it is
                increased only when you use the `clone` method. You can also
                set it manually. It is useful if you want to restart the task
                after it was cancelled due to multiple network errors. As you
                might have guessed, there is a `task_try_limit` option in the
                `Spider` instance. Together, the `network_try_limit` and
                `task_try_limit` options guarantee that you will not get an
                infinite loop of restarting some task.
            :param disable_cache: if `True` disable cache subsystem.
                The document will be fetched from the Network and it will not
                be saved to cache.
            :param refresh_cache: if `True` the document will be fetched from
                the Network and saved to cache.
            :param valid_status: extra status codes which count as valid
            :param use_proxylist: whether to use the proxy list which was
                configured via the `setup_proxylist` method of the spider
            :param delay: if specified, tells the spider to schedule the task
                and execute it after `delay` seconds
            :param raw: if `raw` is True then the network response is
                forwarded to the corresponding handler without any check of
                HTTP status code or network errors; if `raw` is False (the
                default) then a failed response is put back into the task
                queue, or, if the tries limit is reached, the processing of
                this request is finished.
            :param callback: if you pass some function in `callback` option
                then the network response will be passed to this callback and
                the usual 'task_*' handler will be ignored and no error will be
                raised if such 'task_*' handler does not exist.
            :param fallback_name: the name of the method that is called when
                the spider gives up on the task (due to multiple network
                errors)

            Any non-standard named arguments passed to the `Task` constructor
            will be saved as attributes of the object. You can get their
            values later as attributes or with the `get` method, which allows
            you to use a default value if the attribute does not exist.
        """

        if name == 'generator':
            # The name "generator" is restricted because
            # `task_generator` handler could not be created because
            # this name is already used for special method which
            # generates new tasks
            raise SpiderMisuseError('Task name could not be "generator"')

        self.name = name

        if url is None and grab is None and grab_config is None:
            raise SpiderMisuseError('Either url, grab or grab_config argument '
                                    'of Task constructor should not be None')

        if url is not None and grab is not None:
            raise SpiderMisuseError('Options url and grab could not be used '
                                    'together')

        if url is not None and grab_config is not None:
            raise SpiderMisuseError('Options url and grab_config could not be '
                                    'used together')

        if grab is not None and grab_config is not None:
            raise SpiderMisuseError(
                'Options grab and grab_config could not be used together')

        if grab:
            self.setup_grab_config(grab.dump_config())
        elif grab_config:
            self.setup_grab_config(grab_config)
        else:
            self.grab_config = None
            self.url = url

        if valid_status is None:
            self.valid_status = []
        else:
            self.valid_status = valid_status

        self.process_delay_option(delay)
        self.cache_timeout = cache_timeout
        if cache_timeout is not None:
            warn(
                'Option `cache_timeout` is deprecated and'
                ' is not supported anymore'
            )

        self.fallback_name = fallback_name
        self.priority_set_explicitly = priority_set_explicitly
        self.priority = priority
        self.network_try_count = network_try_count
        self.task_try_count = task_try_count
        self.disable_cache = disable_cache
        self.refresh_cache = refresh_cache
        self.use_proxylist = use_proxylist
        self.raw = raw
        self.callback = callback
        self.coroutines_stack = []
        for key, value in kwargs.items():
            setattr(self, key, value)
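The trailing loop turns every non-standard keyword argument into an instance attribute, and the docstring mentions a `get` method with a default. The same pattern in isolation (toy class, not the real Task):

class Record(object):
    def __init__(self, name=None, **kwargs):
        self.name = name
        for key, value in kwargs.items():
            setattr(self, key, value)

    def get(self, key, default=None):
        return getattr(self, key, default)

r = Record('page', page_number=7)
print(r.page_number)        # 7
print(r.get('missing', 0))  # 0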
Example #28
 def get_grab_config(self):
     warn('Using `grab_config` attribute is deprecated. Override '
          '`create_grab_instance` method instead.')
     return self._grab_config
Example #29
 def setup_grab(self, **kwargs):
     warn('Method `Spider::setup_grab` is deprecated. '
          'Define `Spider::create_grab_instance` or '
          '`Spider::update_grab_instance` methods in your '
          'Spider sub-class.')
     self.grab_config.update(**kwargs)
Example #30
 def _get_response(self):
     warn('The `Grab.response` attribute is deprecated. '
          'Use `Grab.doc` instead.')
     return self.doc
Example #31
 def taskq(self):
     warn('Attribute `Spider::taskq` is deprecated. '
          'Use `Spider::task_queue` attribute.')
     return self.task_queue
Example #32
 def load_cookies(self, path, file_required=None):
     if file_required is not None:
         warn('The option `file_required` is no longer supported')
     self.cookies.load_from_file(path)  # pylint: disable=no-member
Example #33
 def append(self, key, val):
     warn('Method `Stat::append` is deprecated. '
          'Use `Stat::collect` method instead.')
     self.collect(key, val)
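`Stat::append` simply forwards to `Stat::collect`. A minimal sketch of the stat object these shims delegate to, assuming only the counter and collection behavior visible in the examples above (not grab's full Stat class):

class Stat(object):
    def __init__(self):
        self.counters = {}
        self.collections = {}

    def inc(self, key, count=1):
        self.counters[key] = self.counters.get(key, 0) + count

    def collect(self, key, val):
        self.collections.setdefault(key, []).append(val)

stat = Stat()
stat.inc('request-count')
stat.collect('fatal-errors', 'timeout')
print(stat.counters)     # {'request-count': 1}
print(stat.collections)  # {'fatal-errors': ['timeout']}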
Example #34
 def time(self):
     warn('Attribute `Document.time` is deprecated. '
          'Use `Document.total_time` instead.')
     return self.total_time