def setUp(self):
    crawler = CrawlerProcess(settings)
    crawler.install()  # what does this do?
    inside_project()
    self.items = []
    self.crawl_cmd = scrapy.commands.crawl.Command()
    self.crawl_cmd.set_crawler(crawler)
    self.parser = optparse.OptionParser()
    self.crawl_cmd.add_options(self.parser)
    dispatcher.connect(self._item_passed, signals.item_passed)
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    _check_deprecated_scrapy_ctl(argv, inproject)  # TODO: remove for Scrapy 0.11
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    # cmds holds every command found under the scrapy.commands package
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    # This is the real entry point that creates the CrawlerProcess
    cmd.crawler_process = CrawlerProcess(settings)
    # Run the CrawlerProcess
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def get_project_root():
    """Return the absolute path of the project root, or raise an exception
    if the current directory is not inside a Scrapy project."""
    if inside_project():
        return os.path.dirname(closest_scrapy_cfg())
    raise Exception("%s does not belong to a Scrapy project" % os.getcwd())
def execute(argv=None, settings=None):
    print("in cmdline line 98 : i am in eclipse")
    print("in cmdline line 98 argv %s" % argv)
    print("in cmdline line 99 settings %s" % settings)
    if argv is None:
        argv = sys.argv
    print("in cmdline line 101 argv %s" % argv)
    print("in cmdline line 102 settings %s" % settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        settings = get_project_settings()
    print("in cmdline line 115 settings %s" % settings)
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)  # everything kicks off from this step
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def get_scrapy_data_path(createdir=True):
    """Return a path to a folder where Scrapy is storing data.
    Usually that's a .scrapy folder inside the project.
    """
    # This code is extracted from the scrapy.utils.project.data_path function,
    # which does too many things.
    path = project_data_dir() if inside_project() else ".scrapy"
    if createdir:
        os.makedirs(path, exist_ok=True)
    return path
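A minimal usage sketch, assuming get_scrapy_data_path() above is importable; the 'httpcache' subfolder is only an illustrative, made-up name:

import os

# Hypothetical call site: keep a component-specific cache inside the
# shared .scrapy data folder returned by get_scrapy_data_path().
cache_dir = os.path.join(get_scrapy_data_path(), "httpcache")
os.makedirs(cache_dir, exist_ok=True)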
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    # If argv was not passed in, read the arguments from the command line
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # Check whether settings was supplied and whether 'scrapy.conf' is among the loaded modules
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        # hasattr checks whether the object has an attribute with the given name
        # (implemented by calling getattr(object, name) and catching the exception).
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    # If settings is still None, fall back to the project settings
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def main():
    opts, args = parse_opts()
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)
    install_opener(build_opener(HTTPRedirectHandler))

    if opts.list_targets:
        for name, target in _get_targets().items():
            print("%-20s %s" % (name, target['url']))
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urlopen(req)
        projects = json.loads(f.read())['projects']
        print(os.linesep.join(projects))
        return

    tmpdir = None
    if opts.build_egg:  # build egg only
        egg, tmpdir = _build_egg()
        dest = opts.build_egg
        if os.path.isdir(dest):
            dest = os.path.join(dest, str(int(time.time())) + '.egg')
        _log("Writing egg to %s" % dest)
        shutil.copyfile(egg, dest)
    elif opts.deploy_all_targets:
        version = None
        for name, target in _get_targets().items():
            if version is None:
                version = _get_version(target, opts)
            _build_egg_and_deploy_target(target, version, opts)
    else:  # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        version = _get_version(target, opts)
        exitcode, tmpdir = _build_egg_and_deploy_target(target, version, opts)

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)
    sys.exit(exitcode)
def execute(argv=None, settings=None):
    # If no arguments were passed in another way, fall back to the command-line arguments
    if argv is None:
        argv = sys.argv
        # The first element of sys.argv is always the program name, whether or not further
        # arguments follow, so a simple non-empty check would not work here.
    if settings is None:
        # No settings given, so use the default loader: read settings from scrapy.cfg,
        # then merge Scrapy-related settings from the environment into the Settings object
        settings = get_project_settings()
    # set EDITOR from environment if available (used for editing files)
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    # Check whether we are inside a project (first try to import the settings module,
    # otherwise fall back to looking for scrapy.cfg)
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)  # all commands available in the current context
    cmdname = _pop_command_name(argv)  # which command the command line points at
    parser = optparse.OptionParser(
        formatter=optparse.TitledHelpFormatter(),  # optparse parser (superseded by argparse these days)
        conflict_handler='resolve')
    if not cmdname:  # no command could be resolved
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:  # the parsed name is not among the available commands
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]  # fetch the command object

    ## Parse the command
    parser.usage = f"scrapy {cmdname} {cmd.syntax()}"
    parser.description = cmd.long_desc()
    # Merge the command's default settings into the Settings object
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    ## End of command parsing

    # Run the command
    _run_print_help(parser, cmd.process_options, args, opts)  # let the command process the parsed options
    # Create the CrawlerProcess
    cmd.crawler_process = CrawlerProcess(settings)
    # Run the command against crawler_process
    _run_print_help(parser, _run_command, cmd, args, opts)  # calls the command's run() with these arguments
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and "scrapy.conf" in sys.modules:
        from scrapy import conf
        if hasattr(conf, "settings"):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler="resolve")
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def main():
    opts, args = parse_opts()
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)
    urllib2.install_opener(urllib2.build_opener(HTTPRedirectHandler))

    if opts.list_targets:
        for name, target in _get_targets().items():
            print "%-20s %s" % (name, target['url'])
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = urllib2.Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urllib2.urlopen(req)
        projects = json.loads(f.read())['projects']
        print os.linesep.join(projects)
        return

    tmpdir = None
    if opts.build_egg:  # build egg only
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % opts.build_egg)
        shutil.copyfile(egg, opts.build_egg)
    else:  # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        project = _get_project(target, opts)
        version = _get_version(target, opts)
        if opts.egg:
            _log("Using egg: %s" % opts.egg)
            egg = opts.egg
        else:
            _log("Packing version %s" % version)
            egg, tmpdir = _build_egg()
        if not _upload_egg(target, egg, project, version):
            exitcode = 1

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)
    sys.exit(exitcode)
def deploy():
    opts, args = parse_opts()
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)
    _delete_old_package()
    urllib2.install_opener(urllib2.build_opener(HTTPRedirectHandler))

    if opts.list_targets:
        for name, target in _get_targets().items():
            print "%-20s %s" % (name, target['url'])
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = urllib2.Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urllib2.urlopen(req)
        projects = json.loads(f.read())['projects']
        print os.linesep.join(projects)
        return

    tmpdir = None
    # build egg only
    if opts.build_egg:
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % opts.build_egg)
        shutil.copyfile(egg, opts.build_egg)
    elif opts.deploy_all_targets:
        version = None
        for name, target in _get_targets().items():
            if version is None:
                version = _get_version(target, opts)
            _build_egg_and_deploy_target(target, version, opts)
    else:  # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        version = _get_version(target, opts)
        exitcode, tmpdir = _build_egg_and_deploy_target(target, version, opts)

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)
    _delete_old_package()
def execute(argv=None, settings=None):
    """Parse the command line and build the crawler via CrawlerProcess."""
    # 1. Initialize the execution environment: load user-defined settings,
    #    load the defaults, and check for deprecated settings
    if argv is None:
        argv = sys.argv
    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check for deprecated settings

    # 2. Check whether we are running inside a project, i.e. whether a scrapy.cfg
    #    can be found in (or near) the directory the scrapy command was run from
    inproject = inside_project()

    # 3. Load all commands Scrapy supports (the commands package) and parse the command name
    cmds = _get_commands_dict(settings, inproject)  # load all commands
    cmdname = _pop_command_name(argv)  # parse the command name
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # 4. Get the command instance, apply its defaults at 'command' priority,
    #    and add its option-parsing rules
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])  # parse the command line
    _run_print_help(parser, cmd.process_options, args, opts)

    # 5. Initialize CrawlerProcess and execute the command instance's run()
    #    (e.g. scrapy/commands/crawl.py for `scrapy crawl`)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)  # run the matching command's run() method
    sys.exit(cmd.exitcode)
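Step 5 above hands control to CrawlerProcess. For comparison, a minimal sketch of driving CrawlerProcess directly, which is roughly what the crawl command's run() sets in motion; the spider name 'example' is a placeholder:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Roughly what `scrapy crawl example` ends up doing, without the CLI layer.
process = CrawlerProcess(get_project_settings())
process.crawl("example")  # 'example' is a placeholder spider name
process.start()           # blocks until all crawlers have finished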
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    from scrapy import conf
    conf.settings = settings
    # ------------------------------------------------------------------
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def __init__(self, parser):
    self.parser = parser
    self.args = parser.parse_args()
    if not inside_project():
        self.error("No active Scrapy project")
    self.command = self.args.command
    self.spider = sanitize_module_name(self.args.spider)
    self.callback = self.args.callback
    self.fixture = self.args.fixture
    self.project_dir = get_project_dir()
    sys.path.append(self.project_dir)
    self.settings = get_project_settings()
    base_path = self.settings.get(
        'AUTOUNIT_BASE_PATH',
        default=os.path.join(self.project_dir, 'autounit'))
    self.tests_dir = os.path.join(base_path, 'tests')
    self.spider_dir = os.path.join(self.tests_dir, self.spider)
    if not os.path.isdir(self.spider_dir):
        self.error(
            "No recorded data found "
            "for spider '{}'".format(self.spider))
    extra_path = self.settings.get('AUTOUNIT_EXTRA_PATH') or ''
    self.callback_dir = os.path.join(
        self.spider_dir, extra_path, self.callback)
    if not os.path.isdir(self.callback_dir):
        self.error(
            "No recorded data found for callback "
            "'{}' from '{}' spider".format(self.callback, self.spider))
    if self.fixture:
        self.fixture_path = os.path.join(
            self.callback_dir, self.parse_fixture_arg())
        if not os.path.isfile(self.fixture_path):
            self.error("Fixture '{}' not found".format(self.fixture_path))
def __init__(self, parser):
    self.parser = parser
    self.args = parser.parse_args()
    if not inside_project():
        self._error("No active Scrapy project")
    self.command = self.args.command
    self.spider = self.args.spider
    self.callback = self.args.callback
    self.fixture = self.args.fixture
    self.project_dir = get_project_dir()
    sys.path.append(self.project_dir)
    self.settings = get_project_settings()
    base_path = get_base_path(self.settings)
    self.tests_dir = os.path.join(base_path, 'tests')
    if self.spider:
        self.spider = sanitize_module_name(self.spider)
        self.callbacks_dir = self._get_callbacks_dir(self.spider)
        if not os.path.isdir(self.callbacks_dir):
            self._error("No recorded data found for spider '{}'".format(
                self.spider))
    if self.callback:
        self.callback_dir = os.path.join(self.callbacks_dir, self.callback)
        if not os.path.isdir(self.callback_dir):
            self._error("No recorded data found for callback "
                        "'{}' from '{}' spider".format(
                            self.callback, self.spider))
    if self.fixture:
        self.fixture_path = os.path.join(self.callback_dir,
                                         self.parse_fixture_arg())
        if not os.path.isfile(self.fixture_path):
            self._error("Fixture '{}' not found".format(
                self.fixture_path))
def cli(target, project, version, list_targets, debug, egg, build_egg):
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    if list_targets:
        for name, target in _get_targets().items():
            click.echo(name)
        return

    tmpdir = None
    if build_egg:  # build egg only
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % build_egg)
        shutil.copyfile(egg, build_egg)
    else:  # build egg and deploy
        target = _get_target(target)
        project = _get_project(target, project)
        version = _get_version(target, version)
        if egg:
            _log("Using egg: %s" % egg)
            egg = egg
        else:
            _log("Packing version %s" % version)
            egg, tmpdir = _build_egg()
        if _upload_egg(target, egg, project, version):
            click.echo(
                "Run your spiders at: https://dash.scrapinghub.com/p/%s/"
                % project)
        else:
            exitcode = 1

    if tmpdir:
        if debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)
    sys.exit(exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check for deprecated settings

    # Check whether we are inside a project; the available commands differ
    # depending on the execution environment
    inproject = inside_project()
    # Get all commands as a dict keyed by command name, with the command object as the value
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]  # the command instance for the requested name
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    # Merge cmd.default_settings (empty here) into the Settings object's attributes
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings  # attach the Settings object to the command
    cmd.add_options(parser)  # add the command's option-parsing rules
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    # Create a CrawlerProcess instance and attach it to the current command
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    check_deprecated_settings(settings)
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def cli(target, project, version, list_targets, debug, egg, build_egg):
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)
    if list_targets:
        for name, target in _get_targets().items():
            click.echo(name)
        return
    tmpdir = None
    if build_egg:
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % build_egg)
        shutil.copyfile(egg, build_egg)
    else:
        target = _get_target(target)
        project = _get_project(target, project)
        version = _get_version(target, version)
        if egg:
            _log("Using egg: %s" % egg)
            egg = egg
        else:
            _log("Packing version %s" % version)
            egg, tmpdir = _build_egg()
        if _upload_egg(target, egg, project, version):
            click.echo("Run your spiders at: https://dash.scrapinghub.com/p/%s/" % project)
        else:
            exitcode = 1
    if tmpdir:
        if debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)
    sys.exit(exitcode)
def cli(target, project, version, list_targets, debug, egg, build_egg):
    if not inside_project():
        log("Error: no Scrapy project found in this location")
        sys.exit(1)

    if list_targets:
        for name, target in scrapycfg.get_targets().items():
            click.echo(name)
        return

    tmpdir = None
    try:
        if build_egg:
            egg, tmpdir = _build_egg()
            log("Writing egg to %s" % build_egg)
            shutil.copyfile(egg, build_egg)
        else:
            target = scrapycfg.get_target(target)
            project = scrapycfg.get_project(target, project)
            version = scrapycfg.get_version(target, version)
            apikey = target.get('username') or find_api_key()
            auth = (apikey, '')
            if egg:
                log("Using egg: %s" % egg)
                egg = egg
            else:
                log("Packing version %s" % version)
                egg, tmpdir = _build_egg()
            _upload_egg(target, egg, project, version, auth)
            click.echo("Run your spiders at: https://dash.scrapinghub.com/p/%s/" % project)
    finally:
        if tmpdir:
            if debug:
                log("Output dir not removed: %s" % tmpdir)
            else:
                shutil.rmtree(tmpdir, ignore_errors=True)
def execute(argv=None, settings=None):
    ## Assume this function is called as execute(['scrapy', 'crawl', 'spidername'])
    if argv is None:
        argv = sys.argv

    # --- backward compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        ## Load the project settings: initialize the environment from environment
        ## variables and scrapy.cfg, ending up with a Settings instance
        settings = get_project_settings()

    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor

    ## Check for deprecated settings
    check_deprecated_settings(settings)

    # --- backward compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    ## Whether we are running inside a project, mainly by checking for a scrapy.cfg file
    inproject = inside_project()

    ## Read the commands package and turn every command class into a
    ## {cmd_name: cmd_instance, ...} dict
    cmds = _get_commands_dict(settings, inproject)

    ## Work out which sub-command is being executed from the command-line arguments.
    ## For example, if the command line is `scrapy crawl xxx`, cmdname is 'crawl'
    cmdname = _pop_command_name(argv)

    ## The optparse module makes it easy to produce clear, easy-to-use help output
    ## in the standard Unix command style
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')

    ## If cmdname is empty, print the help for all commands and exit the program
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    ## If cmdname is set but is not a key of the cmds dict, print an
    ## unknown-command error and exit with a non-zero status
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    ## Look up the command instance by name
    cmd = cmds[cmdname]
    ## cmd.syntax() returns the command's usage string
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    ## cmd.long_desc() returns the command's description
    parser.description = cmd.long_desc()
    ## Apply the command's default settings at 'command' priority
    settings.setdict(cmd.default_settings, priority='command')
    ## Attach the settings to the command instance
    cmd.settings = settings
    ## Add the command's option-parsing rules
    cmd.add_options(parser)

    ## Parse the command arguments and hand them to the Scrapy command instance.
    ## For `scrapy crawl xxx`, argv[1:] is ['crawl', 'xxx'],
    ## opts = {'logfile': xxx, 'loglevel': xxx, ...} and args = ['xxx']
    opts, args = parser.parse_args(args=argv[1:])

    ## Run the command's process_options(); if it fails, print the help text before exiting.
    ## Before the command runs, command-line options can update the matching settings
    _run_print_help(parser, cmd.process_options, args, opts)

    ## Create a CrawlerProcess instance from the settings and assign it to the
    ## command instance's crawler_process attribute
    cmd.crawler_process = CrawlerProcess(settings)

    ## Execute the command instance's run() method.
    ## For `scrapy crawl <spider_name>` this is the run() of commands/crawl.py
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
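The opening comment above assumes the call execute(['scrapy', 'crawl', 'spidername']); a minimal sketch of that programmatic invocation, where 'spidername' is a placeholder spider name:

from scrapy.cmdline import execute

# Programmatic equivalent of running `scrapy crawl spidername` from a shell.
# Note that execute() does not return normally: it ends with sys.exit(cmd.exitcode).
execute(["scrapy", "crawl", "spidername"])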
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-s', '--spider',
        help='The spider where to look fixtures for')
    parser.add_argument(
        '-c', '--callback',
        help='The callback where to look fixtures for (requires spider)')
    parser.add_argument(
        '-f', '--fixture',
        help=('The fixture number to inspect (requires spider and callback). '
              'It can be an integer indicating the fixture number or a string '
              'indicating the fixture name'))
    parser.add_argument(
        '-p', '--path',
        help='The full path for the fixture to inspect')
    args = parser.parse_args()

    if args.path:
        retcode = handle_path(args.path)
        sys.exit(retcode)

    if not inside_project():
        print('No active Scrapy project')
        sys.exit(1)

    if not args.spider:
        print('Must specify a spider')
        parser.print_help()
        sys.exit(1)

    if not args.callback:
        print('Must specify a callback')
        parser.print_help()
        sys.exit(1)

    if not args.fixture:
        print('Must specify a fixture')
        parser.print_help()
        sys.exit(1)

    settings = get_project_settings()
    base_path = settings.get(
        'AUTOUNIT_BASE_PATH',
        default=os.path.join(get_project_dir(), 'autounit'))
    tests_dir = os.path.join(base_path, 'tests')
    if not os.path.isdir(tests_dir):
        print('Autounit tests directory not found\n')
        sys.exit(1)

    args.fixture = parse_fixture_arg(args.fixture)
    extra_path = settings.get('AUTOUNIT_EXTRA_PATH') or ''
    path = os.path.join(
        tests_dir, args.spider, extra_path, args.callback, args.fixture)
    retcode = handle_path(path)
    sys.exit(retcode)
def __init__(self, parser):
    self.parser = parser
    self.args = parser.parse_args()
    if not inside_project():
        self.error("No active Scrapy project")
    self.command = self.args.command
    self.spider = sanitize_module_name(self.args.spider) if \
        self.args.spider else None
    try:
        self.callback = self.args.callback
    except AttributeError:
        self.callback = None
    try:
        self.fixture = self.args.fixture
    except AttributeError:
        self.fixture = None
    if self.command == 'update':
        try:
            self.new = self.args.new
        except AttributeError:
            self.new = None
        try:
            self.dynamic = self.args.dynamic
        except AttributeError:
            self.dynamic = None
    if self.command == 'clear':
        self.fixtures = self.args.fixtures.split(',')
    if self.fixture and not self.callback:
        self.error("Can't specify a fixture without a callback")
    self.project_dir, self.project_name = get_project_dirs()
    sys.path.append(self.project_dir)
    self.settings = get_project_settings()
    if self.command == "parse":
        url_list = [url.strip() for url in self.args.urls.split('|')]
        for url in url_list:
            if not is_url(url):
                self.error("Something went wrong with your urls arg! "
                           "Note that as of version 1.0, the character for separating "
                           "multiple urls is '|', as opposed to ','")
        self.args = process_options(self.args)
        crawler_process = CrawlerProcess(self.settings)
        run_command(crawler_process, url_list, self.args)
    else:
        self.base_path = self.settings.get(
            'TESTMASTER_BASE_PATH',
            default=os.path.join(self.project_dir, 'testmaster'))
        self.tests_dir = os.path.join(self.base_path, 'tests')
        self.spider_dir = os.path.join(self.tests_dir, self.spider)
        if not os.path.isdir(self.spider_dir) and self.command != "establish":
            self.error(
                "No recorded data found "
                "for spider '{}'".format(self.spider))
        self.extra_path = self.settings.get('TESTMASTER_EXTRA_PATH') or ''
        if self.callback:
            self.callback_dir = os.path.join(
                self.spider_dir, self.extra_path, self.callback)
            if self.command == 'establish':
                if os.path.isdir(self.callback_dir):
                    self.error(
                        "Can't use 'establish' with callback arg "
                        "if callback dir for spider '{}' "
                        "exists already".format(self.spider))
            else:
                if self.command == 'inspect':
                    self.error(
                        "No recorded data found for callback "
                        "'{}' from '{}' spider".format(self.callback, self.spider))
                if self.fixture:
                    self.fixture_path = os.path.join(self.callback_dir,
                                                     self.parse_fixture_arg())
                    if not os.path.isfile(self.fixture_path):
                        self.error("Fixture '{}' not found".format(self.fixture_path))
def execute(argv=None, settings=None):
    # Get the command-line arguments
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # Kept for backward compatibility with the scrapy.conf singleton; this configuration
    # style is effectively deprecated and only warns nowadays
    if settings is None and 'scrapy.conf' in sys.modules:
        from . import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    # Load the project settings
    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    # Check for deprecated settings (warns about options that are no longer
    # supported; presumably added as a later patch)
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings  # expose the project settings on the legacy module
    # ------------------------------------------------------------------
    # Decide whether we are inside a project by looking for a scrapy.cfg file; this is
    # later compared with each command's requires_project attribute to decide
    # whether the command is available
    inproject = inside_project()
    # Get the command dict (key = command name, value = command instance)
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)  # parse and return the current command name
    # Option parser for the command
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        # No command given: print the overview (much the same output as -version)
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)  # a reminder not to type random commands: the program just exits =.=
    # Get the command object
    cmd = cmds[cmdname]
    # Command syntax
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    # Command description
    parser.description = cmd.long_desc()
    # Store the command's settings; duplicate keys override the earlier global values
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings  # after the overrides, assign the merged settings back onto the command
    cmd.add_options(parser)  # add the command's own options
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)  # process the common command options
    # Create the crawler process object that starts and runs the spiders
    # (note: this step can fail at runtime; reinstalling cryptography may fix it)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    '''
    Run a Scrapy command. The original function exits the program when crawling
    finishes, so we modified it.
    :param argv: run_command
    :param settings: setting
    :return:
    '''
    if argv is None:
        argv = sys.argv
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------
    if settings is None:
        settings = get_project_settings()
    # set EDITOR from environment if available
    try:
        editor = os.environ['EDITOR']
    except KeyError:
        pass
    else:
        settings['EDITOR'] = editor
    check_deprecated_settings(settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------
    inproject = inside_project()
    cmds = cmdline._get_commands_dict(settings, inproject)
    cmdname = cmdline._pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        cmdline._print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        cmdline._print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    cmdline._run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    try:
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
    except Exception as e:
        print('Done')
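Because this variant drops the closing sys.exit() and swallows the final exception, the calling script can keep running after the crawl. A hypothetical call site ('myspider' is a placeholder spider name):

# Hypothetical caller that continues after the crawl, relying on the
# modified execute() above not terminating the interpreter.
execute(argv=["scrapy", "crawl", "myspider"])
print("Crawl finished, continuing with post-processing...")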