def test_config_constructor(self):
    """Check Config construction both without arguments and from a dict."""
    # Built without arguments, the config exposes no keys.
    empty = Config()
    self.assertEqual(len(empty.keys()), 0)

    # Built from a dict, the config exposes that dict's single entry.
    seeded = Config({'foo': 'bar'})
    self.assertEqual(len(seeded.keys()), 1)
    self.assertEqual(seeded['foo'], 'bar')
def __init__(self, thread_number=3,
             network_try_limit=10, task_try_limit=10,
             request_pause=NULL,
             priority_mode='random',
             meta=None,
             only_cache=False,
             config=None,
             slave=False,
             max_task_generator_chunk=None,
             # New options start here
             waiting_shutdown_event=None,
             taskq=None,
             result_queue=None,
             network_response_queue=None,
             shutdown_event=None,
             generator_done_event=None,
             ng=False,
             ):
    """
    Set up the initial state of the spider.

    Arguments:
    * thread_number - number of concurrent network streams
    * network_try_limit - how many times a request is sent again after
        a network error occurred; use 0 to disable
    * task_try_limit - limit of tries to execute some task. This is not
        the same as network_try_limit: network_try_limit bounds the
        retries performed automatically on a network timeout or some
        other physical error, while task_try_limit bounds the attempts
        scheduled manually in the spider business logic
    * request_pause - deprecated; passing any value only logs an error
    * priority_mode - could be "random" or "const"; any other value
        raises SpiderMisuseError
    * meta - arbitrary user data; a falsy value is replaced with a
        new empty dict
    * only_cache, slave, max_task_generator_chunk - stored unchanged
        on the instance
    * config - configuration object; when None a fresh
        grab.util.config.Config is created

    New options (all stored unchanged on the instance):
    * waiting_shutdown_event
    * taskq
    * result_queue
    * network_response_queue
    * shutdown_event
    * generator_done_event
    * ng
    """

    # --- New options ---
    self.waiting_shutdown_event = waiting_shutdown_event
    self.taskq = taskq
    self.result_queue = result_queue
    self.shutdown_event = shutdown_event
    self.generator_done_event = generator_done_event
    self.network_response_queue = network_response_queue
    self.ng = ng
    # --- End of new options ---

    self.slave = slave
    self.max_task_generator_chunk = max_task_generator_chunk

    # Both timer containers must exist before the first timer starts.
    self.timers, self.time_points = {}, {}
    self.start_timer('total')

    if config is None:
        # Imported here, not at module level, to avoid a circular import.
        from grab.util.config import Config
        config = Config()
    self.config = config

    self.meta = meta or {}

    self.task_generator_enabled = False
    self.only_cache = only_cache
    self.thread_number = thread_number
    self.counters = defaultdict(int)
    self.grab_config = {}
    self.items = {}
    self.task_try_limit = task_try_limit
    self.network_try_limit = network_try_limit

    if priority_mode not in ('random', 'const'):
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    self.priority_mode = priority_mode

    # Best-effort registration of the user-signal handlers. Both attribute
    # lookups stay inside the try so that a missing signal constant or a
    # missing handler is silently skipped, as is a ValueError raised by
    # signal.signal itself.
    for signal_name, handler_name in (('SIGUSR1', 'sigusr1_handler'),
                                      ('SIGUSR2', 'sigusr2_handler')):
        try:
            signal.signal(getattr(signal, signal_name),
                          getattr(self, handler_name))
        except (ValueError, AttributeError):
            pass

    # Initial cache-subsystem values
    self.cache_enabled = False
    self.cache = None

    self.work_allowed = True

    if request_pause is not NULL:
        logger.error('Option `request_pause` is deprecated and is not '
                     'supported anymore')

    # Proxy settings start out disabled/unset.
    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False

    # FIXIT: REMOVE
    self.dump_spider_stats = None
def test_update_with_path(self):
    """update_with_path on an empty config should yield both settings."""
    settings_path = setup_settings_file(SOME_DICT)
    expected = {'VAR1': 'val1', 'VAR2': 'val2'}

    cfg = Config()
    cfg.update_with_path(settings_path)

    self.assertEqual(cfg, expected)
def test_update_with_dict_allowed_keys(self):
    """With allowed_keys=['VAR1'] only VAR1 is expected to be updated."""
    cfg = Config({'VAR1': 'ORIGINAL'})
    permitted = ['VAR1']

    cfg.update_with_object(SOME_DICT, allowed_keys=permitted)

    expected = {'VAR1': 'val1'}
    self.assertEqual(cfg, expected)
def test_update_with_dict_new_keys(self):
    """With only_new_keys=True the existing VAR1 value must survive."""
    cfg = Config({'VAR1': 'ORIGINAL'})

    cfg.update_with_object(SOME_DICT, only_new_keys=True)

    expected = {'VAR1': 'ORIGINAL', 'VAR2': 'val2'}
    self.assertEqual(cfg, expected)
def test_update_with_dict(self):
    """Updating an empty config from SOME_DICT should yield both settings."""
    expected = {'VAR1': 'val1', 'VAR2': 'val2'}

    cfg = Config()
    cfg.update_with_object(SOME_DICT)

    self.assertEqual(cfg, expected)
def test_update_with_object_allowed_keys(self):
    """With allowed_keys=['VAR1'] only VAR1 is taken from the object."""
    cfg = Config({'VAR1': 'ORIGINAL'})
    settings = SomeSettings()
    permitted = ['VAR1']

    cfg.update_with_object(settings, allowed_keys=permitted)

    expected = {'VAR1': 'val1'}
    self.assertEqual(cfg, expected)
def test_update_with_object_new_keys(self):
    """With only_new_keys=True the existing VAR1 value must survive."""
    cfg = Config({'VAR1': 'ORIGINAL'})
    settings = SomeSettings()

    cfg.update_with_object(settings, only_new_keys=True)

    expected = {'VAR1': 'ORIGINAL', 'VAR2': 'val2'}
    self.assertEqual(cfg, expected)
def test_update_with_object(self):
    """Updating an empty config from a SomeSettings instance yields both settings."""
    expected = {'VAR1': 'val1', 'VAR2': 'val2'}
    settings = SomeSettings()

    cfg = Config()
    cfg.update_with_object(settings)

    self.assertEqual(cfg, expected)
def test_clone(self):
    """clone() should return an equal config that is a distinct object."""
    original = Config({'foo': 'bar'})
    duplicate = original.clone()

    # Same content...
    self.assertEqual(original, duplicate)
    # ...but not the very same object.
    same_object = original is duplicate
    self.assertFalse(same_object)
def test_update_with_path_allowed_keys(self):
    """With allowed_keys=['VAR1'] only VAR1 is loaded via update_with_path."""
    settings_path = setup_settings_file(SOME_DICT)
    permitted = ['VAR1']

    cfg = Config()
    cfg.update_with_path(settings_path, allowed_keys=permitted)

    expected = {'VAR1': 'val1'}
    self.assertEqual(cfg, expected)
def test_update_with_path_new_keys(self):
    """With only_new_keys=True update_with_path must keep the existing VAR1."""
    settings_path = setup_settings_file(SOME_DICT)
    cfg = Config({'VAR1': 'ORIGINAL'})

    cfg.update_with_path(settings_path, only_new_keys=True)

    expected = {'VAR1': 'ORIGINAL', 'VAR2': 'val2'}
    self.assertEqual(cfg, expected)
def __init__(
        self,
        thread_number=3,
        network_try_limit=10, task_try_limit=10,
        request_pause=NULL,
        priority_mode='random',
        meta=None,
        only_cache=False,
        config=None,
        slave=False,
        max_task_generator_chunk=None,
        args=None,
        # New options start here
        waiting_shutdown_event=None,
        taskq=None,
        result_queue=None,
        network_response_queue=None,
        shutdown_event=None,
        generator_done_event=None,
        ng=False,
        ):
    """
    Set up the initial state of the spider.

    Arguments:
    * thread_number - Number of concurrent network streams
    * network_try_limit - How many times try to send request
        again if network error was occurred, use 0 to disable
    * task_try_limit - Limit of tries to execute some task
        this is not the same as network_try_limit
        network try limit limits the number of tries which
        are performed automatically in case of network timeout
        of some other physical error
        but task_try_limit limits the number of attempts which
        are scheduled manually in the spider business logic
    * request_pause - deprecated; passing any value only logs an error
    * priority_mode - could be "random" or "const"; any other value
        raises SpiderMisuseError
    * meta - arbitrary user data; a falsy value is replaced with a
        new empty dict
    * only_cache, slave, max_task_generator_chunk - stored unchanged
        on the instance
    * config - configuration object; when None a fresh
        grab.util.config.Config is created. Its optional
        'GRAB_SNAPSHOT_CONFIG' mapping supplies the snapshot
        'interval' (default 10) and 'file' (default None)
    * args - command line arguments parsed with `setup_arg_parser`
        method; None is replaced with an empty dict

    New options (all stored unchanged on the instance):
    * waiting_shutdown_event=None,
    * taskq=None,
    * result_queue=None,
    * network_response_queue=None,
    * shutdown_event=None,
    * generator_done_event=None,
    * ng=False

    Side effects: tries to install SIGUSR1/SIGUSR2 handlers, and
    truncates the snapshot file when one is configured.
    """

    # New options starts
    self.waiting_shutdown_event = waiting_shutdown_event
    self.taskq = taskq
    self.result_queue = result_queue
    self.shutdown_event = shutdown_event
    self.generator_done_event = generator_done_event
    self.network_response_queue = network_response_queue
    self.ng = ng
    # New options ends

    self.args = {} if args is None else args

    self.slave = slave
    self.max_task_generator_chunk = max_task_generator_chunk

    # Network timers are pre-seeded with zero; the containers must exist
    # before the 'total' timer is started.
    self.timers = {
        'network-name-lookup': 0,
        'network-connect': 0,
        'network-total': 0,
    }
    self.time_points = {}
    self.start_timer('total')

    if config is not None:
        self.config = config
    else:
        # Fix circular import error
        from grab.util.config import Config
        self.config = Config()

    if meta:
        self.meta = meta
    else:
        self.meta = {}

    self.task_generator_enabled = False
    self.only_cache = only_cache
    self.thread_number = thread_number
    self.counters = defaultdict(int)
    self._grab_config = {}
    self.items = {}
    self.task_try_limit = task_try_limit
    self.network_try_limit = network_try_limit

    if priority_mode not in ['random', 'const']:
        raise SpiderMisuseError('Value of priority_mode option should be '
                                '"random" or "const"')
    self.priority_mode = priority_mode

    # Handler registration is deliberately best-effort: signal.signal
    # raises ValueError outside the main thread, and the SIGUSR constants
    # are missing (AttributeError) on platforms without them.
    try:
        signal.signal(signal.SIGUSR1, self.sigusr1_handler)
    except (ValueError, AttributeError):
        pass

    try:
        signal.signal(signal.SIGUSR2, self.sigusr2_handler)
    except (ValueError, AttributeError):
        pass

    # Initial cache-subsystem values
    self.cache_enabled = False
    self.cache = None

    self.work_allowed = True
    if request_pause is not NULL:
        logger.error('Option `request_pause` is deprecated and is not '
                     'supported anymore')

    self.proxylist_enabled = None
    self.proxylist = None
    self.proxy = None
    self.proxy_auto_change = False

    # FIXIT: REMOVE
    self.dump_spider_stats = None

    self.controller = CommandController(self)

    # snapshots contains information about spider's state
    # for each 10 seconds interval
    self.snapshots = {}
    self.last_snapshot_values = {
        'timestamp': 0,
        'download-size': 0,
        'upload-size': 0,
        'download-size-with-cache': 0,
        'request-count': 0,
    }
    self.snapshot_timestamps = []

    # Read the snapshot settings once instead of once per option.
    snapshot_config = self.config.get('GRAB_SNAPSHOT_CONFIG', {})
    self.snapshot_interval = snapshot_config.get('interval', 10)
    self.snapshot_file = snapshot_config.get('file', None)
    if self.snapshot_file:
        # Truncate the snapshot file; the context manager closes the handle
        # deterministically instead of leaving it to garbage collection.
        with open(self.snapshot_file, 'w') as snapshot_out:
            snapshot_out.write('')