예제 #1
0
    def test_config_constructor(self):
        # A Config built without arguments exposes no keys.
        empty_cfg = Config()
        key_count = len(empty_cfg.keys())
        self.assertEqual(key_count, 0)

        # A Config built from a dict exposes that dict's items.
        seeded_cfg = Config({'foo': 'bar'})
        key_count = len(seeded_cfg.keys())
        self.assertEqual(key_count, 1)
        self.assertEqual(seeded_cfg['foo'], 'bar')
예제 #2
0
파일: base.py 프로젝트: enchantner/grab
    def __init__(
        self,
        thread_number=3,
        network_try_limit=10,
        task_try_limit=10,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        only_cache=False,
        config=None,
        slave=False,
        max_task_generator_chunk=None,
        # Options added later
        waiting_shutdown_event=None,
        taskq=None,
        result_queue=None,
        network_response_queue=None,
        shutdown_event=None,
        generator_done_event=None,
        ng=False,
    ):
        """
        Set up the initial state of the spider.

        Arguments:
        * thread_number - number of concurrent network streams
        * network_try_limit - how many times a request is sent again
            after a network error; use 0 to disable
        * task_try_limit - limit of tries to execute a task.
            This is not the same as network_try_limit:
            network_try_limit bounds the retries performed
            automatically after a network timeout or some other
            physical error, while task_try_limit bounds the attempts
            scheduled manually in the spider business logic
        * request_pause - deprecated; any value other than the NULL
            sentinel only produces an error log message
        * priority_mode - either "random" or "const"
        * meta - arbitrary user data; a falsy value is replaced
            with a new empty dict
        * only_cache, slave, max_task_generator_chunk - stored
            unchanged on the instance
        * config - configuration object; when None an empty
            grab.util.config.Config is created

        Options added later (all stored unchanged on the instance):
        * waiting_shutdown_event
        * taskq
        * result_queue
        * network_response_queue
        * shutdown_event
        * generator_done_event
        * ng

        Raises SpiderMisuseError when priority_mode is neither
        "random" nor "const".

        NOTE(review): earlier documentation also listed a
        `retry_rebuid_user_agent` option, but this signature does not
        accept it.
        """

        # Externally supplied queues, events and flags are kept as given.
        self.waiting_shutdown_event = waiting_shutdown_event
        self.shutdown_event = shutdown_event
        self.generator_done_event = generator_done_event
        self.taskq = taskq
        self.result_queue = result_queue
        self.network_response_queue = network_response_queue
        self.ng = ng

        self.slave = slave
        self.max_task_generator_chunk = max_task_generator_chunk

        # Timer storage is created before the first timer is started.
        self.timers = {}
        self.time_points = {}
        self.start_timer('total')

        if config is None:
            # Imported here, not at module level, to avoid a circular import
            from grab.util.config import Config
            config = Config()
        self.config = config

        self.meta = meta or {}

        self.task_generator_enabled = False
        self.only_cache = only_cache
        self.thread_number = thread_number
        self.counters = defaultdict(int)
        self.grab_config = {}
        self.items = {}
        self.task_try_limit = task_try_limit
        self.network_try_limit = network_try_limit

        if priority_mode not in ('random', 'const'):
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        self.priority_mode = priority_mode

        # Handler registration is best-effort: ValueError and AttributeError
        # raised while looking up or installing a handler are ignored.
        for signal_name, handler_name in (('SIGUSR1', 'sigusr1_handler'),
                                          ('SIGUSR2', 'sigusr2_handler')):
            try:
                signal.signal(getattr(signal, signal_name),
                              getattr(self, handler_name))
            except (ValueError, AttributeError):
                pass

        # Initial cache-subsystem values
        self.cache_enabled = False
        self.cache = None

        self.work_allowed = True
        if request_pause is not NULL:
            logger.error('Option `request_pause` is deprecated '
                         'and is not supported anymore')

        # Initial proxy-related values
        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False

        # FIXIT: REMOVE
        self.dump_spider_stats = None
예제 #3
0
 def test_update_with_path(self):
     # Load settings through the module name returned by
     # setup_settings_file and compare the resulting config.
     settings_module = setup_settings_file(SOME_DICT)
     cfg = Config()
     cfg.update_with_path(settings_module)
     expected = {'VAR1': 'val1', 'VAR2': 'val2'}
     self.assertEqual(cfg, expected)
예제 #4
0
 def test_update_with_dict_allowed_keys(self):
     # Restricting the update to VAR1 must leave only that key,
     # carrying the updated value.
     cfg = Config({'VAR1': 'ORIGINAL'})
     permitted = ['VAR1']
     cfg.update_with_object(SOME_DICT, allowed_keys=permitted)
     expected = {'VAR1': 'val1'}
     self.assertEqual(cfg, expected)
예제 #5
0
 def test_update_with_dict_new_keys(self):
     # With only_new_keys=True the existing VAR1 value is kept
     # and VAR2 is added.
     cfg = Config({'VAR1': 'ORIGINAL'})
     cfg.update_with_object(SOME_DICT, only_new_keys=True)
     expected = {'VAR1': 'ORIGINAL', 'VAR2': 'val2'}
     self.assertEqual(cfg, expected)
예제 #6
0
 def test_update_with_dict(self):
     # Updating an empty config from SOME_DICT yields both keys.
     cfg = Config()
     cfg.update_with_object(SOME_DICT)
     expected = {'VAR1': 'val1', 'VAR2': 'val2'}
     self.assertEqual(cfg, expected)
예제 #7
0
 def test_update_with_object_allowed_keys(self):
     # Same as the dict variant, but the source is a SomeSettings
     # instance: only the allowed VAR1 key ends up in the config.
     cfg = Config({'VAR1': 'ORIGINAL'})
     settings = SomeSettings()
     permitted = ['VAR1']
     cfg.update_with_object(settings, allowed_keys=permitted)
     expected = {'VAR1': 'val1'}
     self.assertEqual(cfg, expected)
예제 #8
0
 def test_update_with_object_new_keys(self):
     # With only_new_keys=True a SomeSettings source must not
     # overwrite the existing VAR1 value; VAR2 is added.
     cfg = Config({'VAR1': 'ORIGINAL'})
     settings = SomeSettings()
     cfg.update_with_object(settings, only_new_keys=True)
     expected = {'VAR1': 'ORIGINAL', 'VAR2': 'val2'}
     self.assertEqual(cfg, expected)
예제 #9
0
 def test_update_with_object(self):
     # Updating an empty config from a SomeSettings instance
     # yields both keys.
     cfg = Config()
     settings = SomeSettings()
     cfg.update_with_object(settings)
     expected = {'VAR1': 'val1', 'VAR2': 'val2'}
     self.assertEqual(cfg, expected)
예제 #10
0
 def test_clone(self):
     # clone() must return an equal config that is a distinct object.
     source_cfg = Config({'foo': 'bar'})
     cloned_cfg = source_cfg.clone()
     self.assertEqual(source_cfg, cloned_cfg)
     same_object = source_cfg is cloned_cfg
     self.assertFalse(same_object)
예제 #11
0
 def test_update_with_path_allowed_keys(self):
     # Loading by module name with allowed_keys=['VAR1'] must
     # produce a config holding only VAR1.
     settings_module = setup_settings_file(SOME_DICT)
     cfg = Config()
     permitted = ['VAR1']
     cfg.update_with_path(settings_module, allowed_keys=permitted)
     expected = {'VAR1': 'val1'}
     self.assertEqual(cfg, expected)
예제 #12
0
 def test_update_with_path_new_keys(self):
     # Loading by module name with only_new_keys=True keeps the
     # existing VAR1 value and adds VAR2.
     settings_module = setup_settings_file(SOME_DICT)
     cfg = Config({'VAR1': 'ORIGINAL'})
     cfg.update_with_path(settings_module, only_new_keys=True)
     expected = {'VAR1': 'ORIGINAL', 'VAR2': 'val2'}
     self.assertEqual(cfg, expected)
예제 #13
0
파일: base.py 프로젝트: sergithon/grab
    def __init__(
        self,
        thread_number=3,
        network_try_limit=10,
        task_try_limit=10,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        only_cache=False,
        config=None,
        slave=False,
        max_task_generator_chunk=None,
        args=None,
        # New options start here
        waiting_shutdown_event=None,
        taskq=None,
        result_queue=None,
        network_response_queue=None,
        shutdown_event=None,
        generator_done_event=None,
        ng=False,
    ):
        """
        Set up the initial state of the spider.

        Arguments:
        * thread_number - Number of concurrent network streams
        * network_try_limit - How many times to try sending a request
            again if a network error occurred, use 0 to disable
        * task_try_limit - Limit of tries to execute some task.
            This is not the same as network_try_limit:
            network_try_limit limits the number of tries which
            are performed automatically in case of network timeout
            or some other physical error,
            but task_try_limit limits the number of attempts which
            are scheduled manually in the spider business logic
        * request_pause - deprecated; any value other than the NULL
            sentinel only produces an error log message
        * priority_mode - could be "random" or "const"
        * meta - arbitrary user data; a falsy value is replaced
            with a new empty dict
        * only_cache, slave, max_task_generator_chunk - stored
            unchanged on the instance
        * config - configuration object; when None an empty
            grab.util.config.Config is created
        * args - command line arguments parsed with the
            `setup_arg_parser` method; None is replaced with
            an empty dict

        New options (all stored unchanged on the instance):
        * waiting_shutdown_event
        * taskq
        * result_queue
        * network_response_queue
        * shutdown_event
        * generator_done_event
        * ng

        Raises SpiderMisuseError when priority_mode is neither
        "random" nor "const".

        NOTE(review): earlier documentation also listed a
        `retry_rebuild_user_agent` option, but this signature does not
        accept it.
        """

        # New options starts
        self.waiting_shutdown_event = waiting_shutdown_event
        self.taskq = taskq
        self.result_queue = result_queue
        self.shutdown_event = shutdown_event
        self.generator_done_event = generator_done_event
        self.network_response_queue = network_response_queue
        self.ng = ng
        # New options ends

        if args is None:
            self.args = {}
        else:
            self.args = args

        self.slave = slave

        self.max_task_generator_chunk = max_task_generator_chunk
        # Network timers start from zero
        self.timers = {
            'network-name-lookup': 0,
            'network-connect': 0,
            'network-total': 0,
        }
        self.time_points = {}
        self.start_timer('total')
        if config is not None:
            self.config = config
        else:
            # Imported here, not at module level, to avoid a circular import
            from grab.util.config import Config
            self.config = Config()

        if meta:
            self.meta = meta
        else:
            self.meta = {}

        self.task_generator_enabled = False
        self.only_cache = only_cache
        self.thread_number = thread_number
        self.counters = defaultdict(int)
        self._grab_config = {}
        self.items = {}
        self.task_try_limit = task_try_limit
        self.network_try_limit = network_try_limit
        if priority_mode not in ['random', 'const']:
            raise SpiderMisuseError('Value of priority_mode option should be '
                                    '"random" or "const"')
        else:
            self.priority_mode = priority_mode

        # Handler registration is best-effort: ValueError and AttributeError
        # raised while looking up or installing a handler are ignored.
        try:
            signal.signal(signal.SIGUSR1, self.sigusr1_handler)
        except (ValueError, AttributeError):
            pass

        try:
            signal.signal(signal.SIGUSR2, self.sigusr2_handler)
        except (ValueError, AttributeError):
            pass

        # Initial cache-subsystem values
        self.cache_enabled = False
        self.cache = None

        self.work_allowed = True
        if request_pause is not NULL:
            logger.error('Option `request_pause` is deprecated and is not '
                         'supported anymore')

        self.proxylist_enabled = None
        self.proxylist = None
        self.proxy = None
        self.proxy_auto_change = False

        # FIXIT: REMOVE
        self.dump_spider_stats = None

        self.controller = CommandController(self)

        # snapshots contains information about spider's state
        # for each 10 seconds interval
        self.snapshots = {}
        self.last_snapshot_values = {
            'timestamp': 0,
            'download-size': 0,
            'upload-size': 0,
            'download-size-with-cache': 0,
            'request-count': 0,
        }
        self.snapshot_timestamps = []
        # Read the snapshot settings once; both values fall back to defaults
        # when the GRAB_SNAPSHOT_CONFIG key or the individual option is absent.
        snapshot_config = self.config.get('GRAB_SNAPSHOT_CONFIG', {})
        self.snapshot_interval = snapshot_config.get('interval', 10)
        self.snapshot_file = snapshot_config.get('file', None)
        if self.snapshot_file:
            # Opening in 'w' mode creates/truncates the file; the context
            # manager closes the handle instead of leaving it to the
            # garbage collector.
            with open(self.snapshot_file, 'w'):
                pass