def test_without_none_values(self):
    """Check ``without_none_values`` against list, tuple and dict inputs."""
    # Each pair is (input, expected result): sequences lose their ``None``
    # element, the dict loses the key whose value is ``None``.
    cases = (
        ([1, None, 3, 4], [1, 3, 4]),
        ((1, None, 3, 4), (1, 3, 4)),
        (
            {'one': 1, 'none': None, 'three': 3, 'four': 4},
            {'one': 1, 'three': 3, 'four': 4},
        ),
    )
    for given, expected in cases:
        self.assertEqual(without_none_values(given), expected)
def build_component_list(compdict, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    A list or tuple is returned as the same sequence type with ``convert``
    applied to each entry. Any other input is treated as a mapping: its keys
    are passed through ``convert``, entries whose value is ``None`` are
    dropped by ``without_none_values`` and the remaining keys are returned as
    a list sorted by their value.

    Raises ``ValueError`` when distinct paths convert to the same object (for
    ``BaseSettings`` input, only when they also share the same priority).
    """
    collision_msg = ('Some paths in {!r} convert to the same object, '
                     'please update your settings')

    def _check_components(paths):
        # Fewer unique converted paths than inputs means two of them collided.
        if len(set(convert(path) for path in paths)) != len(paths):
            raise ValueError(collision_msg.format(paths))

    def _map_keys(components):
        if not isinstance(components, BaseSettings):
            _check_components(components)
            return dict((convert(key), value)
                        for key, value in six.iteritems(components))

        # BaseSettings input: rebuild it key by key, keeping each priority.
        converted = BaseSettings()
        for key, value in six.iteritems(components):
            priority = components.getpriority(key)
            if converted.getpriority(convert(key)) == priority:
                raise ValueError(collision_msg.format(list(components.keys())))
            converted.set(convert(key), value, priority=priority)
        return converted

    if isinstance(compdict, (list, tuple)):
        _check_components(compdict)
        return type(compdict)(convert(path) for path in compdict)

    enabled = without_none_values(_map_keys(compdict))
    ordered = sorted(six.iteritems(enabled), key=itemgetter(1))
    return [path for path, _ in ordered]
def process_options(self, args, opts):
    """Post-process parsed command-line options.

    Delegates to ``ScrapyCommand.process_options``, turns ``opts.spargs`` into
    a dict via ``arglist_to_dict`` and, when ``opts.output`` is set, stores
    ``FEED_URI`` and ``FEED_FORMAT`` in the settings at ``cmdline`` priority.

    Raises ``UsageError`` for a malformed ``-a`` value or an output format
    that is not a key of the ``FEED_EXPORTERS`` setting.
    """
    ScrapyCommand.process_options(self, args, opts)
    try:
        opts.spargs = arglist_to_dict(opts.spargs)
    except ValueError:
        raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    if not opts.output:
        return

    # A lone dash is stored as the 'stdout:' feed URI.
    feed_uri = 'stdout:' if opts.output == '-' else opts.output
    self.settings.set('FEED_URI', feed_uri, priority='cmdline')

    exporters = without_none_values(self.settings.getwithbase('FEED_EXPORTERS'))
    known_formats = exporters.keys()
    if not opts.output_format:
        # No explicit format: fall back to the output's file extension.
        extension = os.path.splitext(opts.output)[1]
        opts.output_format = extension.replace(".", "")
    if opts.output_format not in known_formats:
        raise UsageError(
            "Unrecognized output format '%s', set one"
            " using the '-t' switch or as a file extension"
            " from the supported list %s"
            % (opts.output_format, tuple(known_formats)))
    self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
def build_component_list(compdict, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    Lists and tuples come back as the same type with every entry passed
    through ``convert``. Mappings have their keys converted, lose the entries
    whose value is ``None`` (via ``without_none_values``) and are flattened
    into a list of keys ordered by value.

    Raises ``ValueError`` if converting makes two paths collide (for
    ``BaseSettings`` input, only when the colliding entries share a priority).
    """
    def _check_components(candidates):
        distinct = {convert(candidate) for candidate in candidates}
        if len(distinct) != len(candidates):
            raise ValueError('Some paths in {!r} convert to the same object, '
                             'please update your settings'.format(candidates))

    def _map_keys(source):
        if not isinstance(source, BaseSettings):
            _check_components(source)
            return {convert(name): order
                    for name, order in six.iteritems(source)}

        # Settings objects carry a priority per key; preserve it on the copy.
        mapped = BaseSettings()
        for name, order in six.iteritems(source):
            prio = source.getpriority(name)
            if mapped.getpriority(convert(name)) == prio:
                raise ValueError('Some paths in {!r} convert to the same '
                                 'object, please update your settings'
                                 ''.format(list(source.keys())))
            mapped.set(convert(name), order, priority=prio)
        return mapped

    if isinstance(compdict, (list, tuple)):
        _check_components(compdict)
        sequence_type = type(compdict)
        return sequence_type(convert(entry) for entry in compdict)

    remaining = without_none_values(_map_keys(compdict))
    by_order = sorted(six.iteritems(remaining), key=itemgetter(1))
    return [name for name, _ in by_order]
def test_without_none_values(self):
    """Verify ``without_none_values`` for list, tuple and dict inputs."""
    # A list input is expected back as a list without the ``None`` element.
    from_list = without_none_values([1, None, 3, 4])
    self.assertEqual(from_list, [1, 3, 4])

    # A tuple input is expected back as a tuple.
    from_tuple = without_none_values((1, None, 3, 4))
    self.assertEqual(from_tuple, (1, 3, 4))

    # A dict input loses the key whose value is ``None``.
    mapping = {"one": 1, "none": None, "three": 3, "four": 4}
    self.assertEqual(
        without_none_values(mapping),
        {"one": 1, "three": 3, "four": 4},
    )
def __init__(self, crawler):
    """Register the download handler configured for each URL scheme.

    The handler used for a download is chosen by the scheme of the resource;
    http and https are presumably the most common ones (per the original
    author's note).
    """
    self._crawler = crawler
    # scheme -> handler class path, kept so the handler can be instantiated
    self._schemes = {}  # stores acceptable schemes on instancing
    # scheme -> handler object
    self._handlers = {}  # stores instanced handlers for schemes
    self._notconfigured = {}  # remembers failed handlers

    # Read the DOWNLOAD_HANDLERS setting through ``getwithbase`` (presumably
    # merged with DOWNLOAD_HANDLERS_BASE -- confirm against the settings
    # class) and drop the schemes whose handler is ``None``.
    settings = crawler.settings
    configured = without_none_values(settings.getwithbase('DOWNLOAD_HANDLERS'))
    for scheme, handler_path in six.iteritems(configured):
        # Record the class path first, then load the handler for this scheme
        # right away; loaded handlers presumably end up in ``self._handlers``.
        self._schemes[scheme] = handler_path
        self._load_handler(scheme, skip_lazy=True)

    crawler.signals.connect(self._close, signals.engine_stopped)
def _load_components(self, setting_prefix):
    """Load the objects referenced by the ``setting_prefix`` setting.

    Returns a dict that maps each key of the setting (``None`` values
    removed) to the result of ``load_object`` on its value. Entries whose
    loading raises ``NotConfigured`` are left out.
    """
    paths = without_none_values(self.settings.getwithbase(setting_prefix))
    loaded = {}
    for name, path in paths.items():
        try:
            component = load_object(path)
        except NotConfigured:
            # Deliberate best effort: an unconfigured component is skipped.
            continue
        loaded[name] = component
    return loaded
def __init__(self, crawler):
    """Record the handler class path configured for each scheme.

    Nothing is instantiated here; only the scheme -> class path mapping is
    stored and ``self._close`` is connected to ``signals.engine_stopped``.
    """
    self._crawler = crawler
    self._schemes = {}  # stores acceptable schemes on instancing
    self._handlers = {}  # stores instanced handlers for schemes
    self._notconfigured = {}  # remembers failed handlers

    # Schemes whose configured handler is ``None`` are dropped.
    settings = crawler.settings
    configured = without_none_values(settings._getcomposite('DOWNLOAD_HANDLERS'))
    self._schemes.update(six.iteritems(configured))

    crawler.signals.connect(self._close, signals.engine_stopped)
def __init__(self, crawler):
    """Store the scheme -> handler class path mapping taken from the settings.

    Handlers are not instantiated here. ``self._close`` is connected to the
    ``engine_stopped`` signal of the crawler.
    """
    self._crawler = crawler
    self._schemes = {}  # stores acceptable schemes on instancing
    self._handlers = {}  # stores instanced handlers for schemes
    self._notconfigured = {}  # remembers failed handlers

    # Schemes mapped to ``None`` are removed before registration.
    raw_handlers = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
    for scheme, handler_path in six.iteritems(without_none_values(raw_handlers)):
        self._schemes[scheme] = handler_path

    crawler.signals.connect(self._close, signals.engine_stopped)
def _build_component_list(self, compdict, custom=None, convert=update_classpath):
    """
    Compose a component list from a { class: order|[orders] } dictionary.

    Adapted from scrapy.utils.conf.build_component_list

    :param compdict: mapping of component path to an order, a list of orders
        or ``None``. It is updated in place with ``custom`` when ``custom``
        is a mapping.
    :param custom: backward-compatibility argument. A list or tuple is
        returned as the same type with ``convert`` applied to each entry;
        any other non-``None`` value is merged into ``compdict``.
    :param convert: callable applied to every component path.
    :returns: component paths sorted by order.
    :raises ValueError: if two paths convert to the same object, or if an
        order is neither ``None`` nor a real number.
    """
    def _check_components(complist):
        # Fewer unique converted paths than inputs means two of them collided.
        if len({convert(c) for c in complist}) != len(complist):
            raise ValueError('Some paths in {!r} convert to the same object, '
                             'please update your settings'.format(complist))

    def _map_keys(compdict):
        if isinstance(compdict, BaseSettings):
            # Rebuild the settings key by key so each priority is preserved.
            compbs = BaseSettings()
            for k, v in six.iteritems(compdict):
                prio = compdict.getpriority(k)
                if compbs.getpriority(convert(k)) == prio:
                    raise ValueError('Some paths in {!r} convert to the same '
                                     'object, please update your settings'
                                     ''.format(list(compdict.keys())))
                else:
                    compbs.set(convert(k), v, priority=prio)
            return compbs
        else:
            _check_components(compdict)
            return {convert(k): v for k, v in six.iteritems(compdict)}

    def _validate_values(compdict):
        """Fail if a value in the components dict is not a real number or a list of them or None."""
        for name, value in six.iteritems(compdict):
            try:
                vals = iter(value)
            except TypeError:
                # ``iter`` raises TypeError for a non-iterable (a single order
                # or None); treat it as a one-element list. Catching only
                # TypeError keeps KeyboardInterrupt/SystemExit propagating.
                vals = [value]
            for val in vals:
                if val is not None and not isinstance(val, numbers.Real):
                    raise ValueError('Invalid value {} for component {}, please provide '
                                     'a real number or None instead'.format(val, name))

    # BEGIN Backward compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _check_components(custom)
        return type(custom)(convert(c) for c in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backward compatibility

    _validate_values(compdict)
    compdict = without_none_values(_map_keys(compdict))
    # ``dol2lot`` presumably expands {key: [orders]} into (key, order) tuples
    # -- confirm against its definition.
    comptuples = dol2lot(compdict)
    return [k for k, v in sorted(comptuples, key=itemgetter(1))]
def feed_process_params_from_cli(settings, output, output_format=None):
    """
    Receives feed export params (from the 'crawl' or 'runspider' commands),
    checks for inconsistencies in their quantities and returns a dictionary
    suitable to be used as the FEEDS setting.

    ``output`` is a sequence of ``<URI>`` or ``<URI>:<FORMAT>`` strings.
    ``output_format`` (the deprecated ``-t`` option) is only accepted
    together with exactly one output. Raises ``UsageError`` for an unknown
    format or for ``-t`` combined with a number of outputs other than one.
    """
    exporters = without_none_values(settings.getwithbase("FEED_EXPORTERS"))
    valid_output_formats = exporters.keys()

    def check_valid_format(output_format):
        if output_format in valid_output_formats:
            return
        raise UsageError(
            "Unrecognized output format '%s', set one after a"
            " colon using the -o option (i.e. -o <URI>:<FORMAT>)"
            " or as a file extension, from the supported list %s"
            % (output_format, tuple(valid_output_formats)))

    if output_format:
        if len(output) != 1:
            raise UsageError(
                "The -t command line option cannot be used if multiple"
                " output files are specified with the -o option")
        check_valid_format(output_format)
        warnings.warn(
            "The -t command line option is deprecated in favor"
            " of specifying the output format within the -o"
            " option, please check the -o option docs for more details",
            category=ScrapyDeprecationWarning,
            stacklevel=2,
        )
        return {output[0]: {"format": output_format}}

    feeds = {}
    for element in output:
        pieces = element.rsplit(":", 1)
        if len(pieces) == 2:
            # "<URI>:<FORMAT>" form; a lone dash as URI means stdout.
            feed_uri, feed_format = pieces
            if feed_uri == "-":
                feed_uri = "stdout:"
        else:
            # No colon at all: derive the format from the file extension.
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace(".", "")
        check_valid_format(feed_format)
        feeds[feed_uri] = {"format": feed_format}

    # FEEDS setting should take precedence over the -o and -t CLI options
    feeds.update(settings.getdict("FEEDS"))
    return feeds
def __init__(self, crawler):
    """Record the handler class path configured for each scheme.

    No handler is instantiated here; only the class paths are stored.
    """
    self._crawler = crawler
    self._schemes = {}  # scheme -> class path, used later for instantiation
    self._handlers = {}  # scheme -> download handler
    self._notconfigured = {}  # remembers failed handlers

    # Build the handler table from the DOWNLOAD_HANDLERS setting.
    # Note: it is read through ``getwithbase``, which presumably also takes
    # the DOWNLOAD_HANDLERS_BASE setting into account -- confirm.
    settings = crawler.settings
    configured = without_none_values(settings.getwithbase('DOWNLOAD_HANDLERS'))

    # Store the class path for each scheme, used later for instantiation.
    self._schemes.update(six.iteritems(configured))

    crawler.signals.connect(self._close, signals.engine_stopped)
def __init__(self, crawler):
    """
    Manage the download handler associated with each kind of resource.

    The matching handler is selected when a network request is actually
    made. NOTE(review): the original note says handlers are only
    instantiated at that point, yet ``_load_handler`` is called here for
    every scheme with ``skip_lazy=True`` -- confirm the intended behaviour.
    """
    self._crawler = crawler
    self._schemes = {}  # stores acceptable schemes on instancing
    self._handlers = {}  # stores instanced handlers for schemes
    self._notconfigured = {}  # remembers failed handlers

    # Build the handler table from the configured handlers (see the
    # DOWNLOAD_HANDLERS_BASE setting); ``None`` entries are dropped.
    raw_handlers = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
    active_handlers = without_none_values(raw_handlers)

    # Keep the class path of every handler for the later instantiation.
    for scheme, handler_path in six.iteritems(active_handlers):
        self._schemes[scheme] = handler_path
        self._load_handler(scheme, skip_lazy=True)

    crawler.signals.connect(self._close, signals.engine_stopped)
def __init__(self, crawler):
    """Register and load the download handler configured for each scheme."""
    self._crawler = crawler
    self._schemes = {}  # stores acceptable schemes on instancing
    self._handlers = {}  # stores instanced handlers for schemes
    self._notconfigured = {}  # remembers failed handlers

    # Keep only the handler paths that are not ``None``.
    raw_handlers = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
    active_handlers = without_none_values(raw_handlers)

    # Register the class path and load the handler of every scheme.
    for scheme, handler_path in active_handlers.items():
        self._schemes[scheme] = handler_path
        self._load_handler(scheme, skip_lazy=True)

    # TODO: ``self._close`` is hooked to the ``engine_stopped`` signal; its
    # exact purpose is not documented here yet.
    crawler.signals.connect(self._close, signals.engine_stopped)
def __init__(self, crawler):
    """Register and load the download handler configured for each scheme."""
    self._crawler = crawler
    # scheme (e.g. 'http') -> handler class path
    self._schemes = {}  # stores acceptable schemes on instancing
    # scheme -> handler object (presumably the loaded objects -- confirm)
    self._handlers = {}  # stores instanced handlers for schemes
    self._notconfigured = {}  # remembers failed handlers

    # Read the handler setting (including the BASE setting, per the original
    # note) and drop the ``None`` entries.
    settings = crawler.settings
    configured = without_none_values(settings.getwithbase('DOWNLOAD_HANDLERS'))

    # e.g. 'http' and 'https' map to
    # 'scrapy.core.downloader.handlers.http.HTTPDownloadHandler'
    for scheme, handler_path in six.iteritems(configured):
        self._schemes[scheme] = handler_path
        self._load_handler(scheme, skip_lazy=True)

    crawler.signals.connect(self._close, signals.engine_stopped)
def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    ``compdict`` maps component paths to an order (a real number) or
    ``None``. Keys are passed through ``convert``, ``None`` entries are
    dropped and the remaining keys are returned sorted by order.

    ``custom`` exists for the old (base, custom) call signature: a list or
    tuple is returned as the same type with ``convert`` applied to each
    entry; any other non-``None`` value is merged into ``compdict`` in place.

    Raises ``ValueError`` on colliding paths or non-numeric orders.
    """
    def _check_components(paths):
        # Fewer unique converted paths than inputs means two of them collided.
        unique = {convert(path) for path in paths}
        if len(unique) != len(paths):
            raise ValueError(f'Some paths in {paths!r} convert to the same object, '
                             'please update your settings')

    def _map_keys(components):
        if not isinstance(components, BaseSettings):
            _check_components(components)
            return {convert(key): value for key, value in components.items()}

        # Settings objects carry a priority per key; keep it on the copy.
        converted = BaseSettings()
        for key, value in components.items():
            priority = components.getpriority(key)
            if converted.getpriority(convert(key)) == priority:
                raise ValueError(f'Some paths in {list(components.keys())!r} '
                                 'convert to the same '
                                 'object, please update your settings'
                                 )
            converted.set(convert(key), value, priority=priority)
        return converted

    def _validate_values(components):
        """Fail if a value in the components dict is not a real number or None."""
        for name, value in components.items():
            if value is None or isinstance(value, numbers.Real):
                continue
            raise ValueError(f'Invalid value {value} for component {name}, '
                             'please provide a real number or None instead')

    # BEGIN Backward compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _check_components(custom)
        return type(custom)(convert(path) for path in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backward compatibility

    _validate_values(compdict)
    enabled = without_none_values(_map_keys(compdict))
    # Everything above is validation; the key step is sorting by the order.
    ordered = sorted(enabled.items(), key=itemgetter(1))
    return [path for path, _ in ordered]
def __init__(self, crawler):
    """Register and load the download handler configured for each scheme."""
    self._crawler = crawler
    # Schemes accepted at instantiation time.
    self._schemes = {}  # stores acceptable schemes on instancing
    # Handler associated with each scheme.
    self._handlers = {}  # stores instanced handlers for schemes
    self._notconfigured = {}  # remembers failed handlers

    # Build the handler table from the DOWNLOAD_HANDLERS setting.
    # Note: it is read through ``getwithbase``, which presumably also takes
    # the DOWNLOAD_HANDLERS_BASE setting into account -- confirm.
    raw_handlers = crawler.settings.getwithbase('DOWNLOAD_HANDLERS')
    active_handlers = without_none_values(raw_handlers)

    # Store the class path of each scheme (used for instantiation) and load
    # its handler.
    for scheme, handler_path in six.iteritems(active_handlers):
        self._schemes[scheme] = handler_path
        self._load_handler(scheme, skip_lazy=True)

    crawler.signals.connect(self._close, signals.engine_stopped)
def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    Keys of ``compdict`` are passed through ``convert``, entries whose order
    is ``None`` are dropped and the remaining keys are returned as a list
    sorted by order.

    ``custom`` supports the old (base, custom) call signature: a list or
    tuple is returned as the same type with ``convert`` applied to each
    entry; any other non-``None`` value is merged into ``compdict`` in place.

    Raises ``ValueError`` on colliding paths or orders that are neither a
    real number nor ``None``.
    """
    collision_msg = ('Some paths in {!r} convert to the same object, '
                     'please update your settings')

    def _check_components(paths):
        # Fewer unique converted paths than inputs means two of them collided.
        if len(set(convert(path) for path in paths)) != len(paths):
            raise ValueError(collision_msg.format(paths))

    def _map_keys(components):
        if not isinstance(components, BaseSettings):
            _check_components(components)
            return dict((convert(key), value)
                        for key, value in six.iteritems(components))

        # BaseSettings input: rebuild it key by key, keeping each priority.
        converted = BaseSettings()
        for key, value in six.iteritems(components):
            priority = components.getpriority(key)
            if converted.getpriority(convert(key)) == priority:
                raise ValueError(collision_msg.format(list(components.keys())))
            converted.set(convert(key), value, priority=priority)
        return converted

    def _validate_values(components):
        """Fail if a value in the components dict is not a real number or None."""
        for name, value in six.iteritems(components):
            if value is None or isinstance(value, numbers.Real):
                continue
            raise ValueError('Invalid value {} for component {}, please provide '
                             'a real number or None instead'.format(value, name))

    # BEGIN Backwards compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _check_components(custom)
        return type(custom)(convert(path) for path in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backwards compatibility

    _validate_values(compdict)
    enabled = without_none_values(_map_keys(compdict))
    ordered = sorted(six.iteritems(enabled), key=itemgetter(1))
    return [path for path, _ in ordered]
def build_component_list(compdict, custom=None, convert=update_classpath):
    """Compose a component list from a { class: order } dictionary.

    Keys of ``compdict`` go through ``convert``; entries with a ``None``
    order are removed and the remaining keys are returned sorted by order.

    ``custom`` is kept for the old (base, custom) call signature: a list or
    tuple short-circuits and is returned converted, as the same type; any
    other non-``None`` value is merged into ``compdict`` in place.

    Raises ``ValueError`` on colliding paths or orders that are neither a
    real number nor ``None``.
    """
    def _check_components(candidates):
        distinct = {convert(candidate) for candidate in candidates}
        if len(distinct) != len(candidates):
            raise ValueError("Some paths in {!r} convert to the same object, "
                             "please update your settings".format(candidates))

    def _map_keys(source):
        if not isinstance(source, BaseSettings):
            _check_components(source)
            return {convert(name): order for name, order in source.items()}

        # Settings objects carry a priority per key; preserve it on the copy.
        mapped = BaseSettings()
        for name, order in source.items():
            prio = source.getpriority(name)
            if mapped.getpriority(convert(name)) == prio:
                raise ValueError("Some paths in {!r} convert to the same "
                                 "object, please update your settings"
                                 "".format(list(source.keys())))
            mapped.set(convert(name), order, priority=prio)
        return mapped

    def _validate_values(source):
        """Fail if a value in the components dict is not a real number or None."""
        for name, order in source.items():
            acceptable = order is None or isinstance(order, numbers.Real)
            if not acceptable:
                raise ValueError(
                    "Invalid value {} for component {}, please provide "
                    "a real number or None instead".format(order, name))

    # BEGIN Backward compatibility for old (base, custom) call signature
    if isinstance(custom, (list, tuple)):
        _check_components(custom)
        sequence_type = type(custom)
        return sequence_type(convert(entry) for entry in custom)

    if custom is not None:
        compdict.update(custom)
    # END Backward compatibility

    _validate_values(compdict)
    remaining = without_none_values(_map_keys(compdict))
    by_order = sorted(remaining.items(), key=itemgetter(1))
    return [name for name, _ in by_order]
def process_options(self, args, opts):
    """Post-process parsed command-line options.

    Delegates to ``ScrapyCommand.process_options``, converts ``opts.spargs``
    with ``arglist_to_dict`` and, when ``opts.output`` is set, writes
    ``FEED_URI`` and ``FEED_FORMAT`` to the settings at ``cmdline`` priority.

    Raises ``UsageError`` for a malformed ``-a`` value or an output format
    that is not a key of the ``FEED_EXPORTERS`` setting.
    """
    ScrapyCommand.process_options(self, args, opts)
    try:
        opts.spargs = arglist_to_dict(opts.spargs)
    except ValueError:
        raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)

    if not opts.output:
        return

    # A lone dash is stored as the 'stdout:' feed URI.
    feed_uri = 'stdout:' if opts.output == '-' else opts.output
    self.settings.set('FEED_URI', feed_uri, priority='cmdline')

    exporters = without_none_values(self.settings._getcomposite('FEED_EXPORTERS'))
    known_formats = exporters.keys()
    if not opts.output_format:
        # No explicit format: fall back to the output's file extension.
        extension = os.path.splitext(opts.output)[1]
        opts.output_format = extension.replace(".", "")
    if opts.output_format not in known_formats:
        raise UsageError("Unrecognized output format '%s', set one"
                         " using the '-t' switch or as a file extension"
                         " from the supported list %s"
                         % (opts.output_format, tuple(known_formats)))
    self.settings.set('FEED_FORMAT', opts.output_format, priority='cmdline')
def test_without_none_values(self):
    """``without_none_values`` drops ``None`` entries from lists, tuples and dicts."""
    # (input, expected) pairs, checked in order.
    expectations = [
        ([1, None, 3, 4], [1, 3, 4]),
        ((1, None, 3, 4), (1, 3, 4)),
        (
            {"one": 1, "none": None, "three": 3, "four": 4},
            {"one": 1, "three": 3, "four": 4},
        ),
    ]
    for source, wanted in expectations:
        self.assertEqual(without_none_values(source), wanted)
def test_without_none_values(self):
    """Exercise ``without_none_values`` with a list, a tuple and a dict."""
    # Sequence inputs: the ``None`` element disappears, the type is kept.
    list_result = without_none_values([1, None, 3, 4])
    self.assertEqual(list_result, [1, 3, 4])

    tuple_result = without_none_values((1, None, 3, 4))
    self.assertEqual(tuple_result, (1, 3, 4))

    # Mapping input: the key whose value is ``None`` disappears.
    source = {'one': 1, 'none': None, 'three': 3, 'four': 4}
    expected = {'one': 1, 'three': 3, 'four': 4}
    self.assertEqual(without_none_values(source), expected)
def feed_process_params_from_cli(settings, output, output_format=None,
                                 overwrite_output=None):
    """
    Receives feed export params (from the 'crawl' or 'runspider' commands),
    checks for inconsistencies in their quantities and returns a dictionary
    suitable to be used as the FEEDS setting.

    :param settings: settings object providing ``getwithbase`` and ``getdict``.
    :param output: sequence of ``<URI>`` or ``<URI>:<FORMAT>`` strings (-o).
    :param output_format: value of the deprecated ``-t`` option, accepted
        only together with exactly one output URI.
    :param overwrite_output: same as ``output`` but every resulting feed is
        marked with ``'overwrite': True`` (-O). Mutually exclusive with
        ``output``.
    :returns: dict mapping feed URI to its options, updated with the
        ``FEEDS`` setting so that the setting wins over the CLI options.
    :raises UsageError: on an unknown format, on -o combined with -O, or on
        -t combined with a number of output URIs other than one.
    """
    valid_output_formats = without_none_values(
        settings.getwithbase('FEED_EXPORTERS')
    ).keys()

    def check_valid_format(output_format):
        if output_format not in valid_output_formats:
            raise UsageError(
                f"Unrecognized output format '{output_format}'. "
                f"Set a supported one ({tuple(valid_output_formats)}) "
                "after a colon at the end of the output URI (i.e. -o/-O "
                "<URI>:<FORMAT>) or as a file extension."
            )

    overwrite = False
    if overwrite_output:
        if output:
            raise UsageError(
                "Please use only one of -o/--output and -O/--overwrite-output"
            )
        output = overwrite_output
        overwrite = True

    if output_format:
        if len(output) == 1:
            check_valid_format(output_format)
            # Must be a plain string: a trailing comma inside the parentheses
            # would make this a 1-tuple and the warning would show its repr.
            message = (
                'The -t command line option is deprecated in favor of '
                'specifying the output format within the output URI. See the '
                'documentation of the -o and -O options for more information.'
            )
            warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
            return {output[0]: {'format': output_format}}
        else:
            raise UsageError(
                'The -t command-line option cannot be used if multiple output '
                'URIs are specified'
            )

    result = {}
    for element in output:
        try:
            feed_uri, feed_format = element.rsplit(':', 1)
        except ValueError:
            # No colon at all: derive the format from the file extension.
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace('.', '')
        else:
            # "<URI>:<FORMAT>" form; a lone dash as URI means stdout.
            if feed_uri == '-':
                feed_uri = 'stdout:'
        check_valid_format(feed_format)
        result[feed_uri] = {'format': feed_format}
        if overwrite:
            result[feed_uri]['overwrite'] = True

    # FEEDS setting should take precedence over the matching CLI options
    result.update(settings.getdict('FEEDS'))
    return result
def from_crawler(cls, crawler):
    """Build an instance of ``cls`` from the crawler's settings.

    Reads ``DEFAULT_REQUEST_HEADERS``, removes the entries whose value is
    ``None`` and passes the remaining items to ``cls``.
    """
    configured = crawler.settings['DEFAULT_REQUEST_HEADERS']
    return cls(without_none_values(configured).items())
def from_crawler(cls, crawler):
    """Create ``cls`` from the ``DEFAULT_REQUEST_HEADERS`` setting.

    Headers set to ``None`` are filtered out by ``without_none_values``
    before the (name, value) items are handed to the constructor.
    """
    settings = crawler.settings
    active_headers = without_none_values(settings['DEFAULT_REQUEST_HEADERS'])
    header_items = active_headers.items()
    return cls(header_items)