def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return:
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = list(map(lambda r: process_request(r), requests))
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd)
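# Usage sketch for get_start_requests (the project path and spider name are
# hypothetical; process_request is the request serializer this module is
# assumed to define elsewhere):
if __name__ == '__main__':
    start = get_start_requests('/path/to/example_project', 'example_spider')
    if start['finished']:
        for request in start['requests']:
            print(request)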
def execute(url, project, spider, callback, **kwargs):
    argv = sys.argv
    print(argv)
    argv.append(url)
    if spider:
        argv.append('--spider')
        argv.append(spider)
    if callback:
        argv.append('--callback')
        argv.append(callback)
    print(argv)
    work_cwd = os.getcwd()
    print(work_cwd)
    try:
        os.chdir(project)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        cmd = Parser()
        parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                       conflict_handler='resolve')
        settings.setdict(cmd.default_settings, priority='command')
        cmd.settings = settings
        cmd.add_options(parser)
        opts, args = parser.parse_args(args=argv[1:])
        print('opt, args', opts, args)
        cmd.process_options(args, opts)
        cmd.crawler_process = CrawlerProcess(settings)
        cmd.run(args, opts)
    finally:
        os.chdir(work_cwd)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # cmds holds every command found under the scrapy.commands package
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    # the real entry point: the CrawlerProcess is built here
    cmd.crawler_process = CrawlerProcess(settings)
    # run the CrawlerProcess
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    print("in cmdline line 98 : i am in Eclipse")
    print("in cmdline line 98 argv %s" % argv)
    print("in cmdline line 99 settings %s" % settings)
    if argv is None:
        argv = sys.argv
    print("in cmdline line 101 argv %s" % argv)
    print("in cmdline line 102 settings %s" % settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    print("in cmdline line 115 settings %s" % settings)
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    # startup begins from this step...
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    # if the argv parameter is None, read the arguments from the command line
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # check whether settings was passed in and whether 'scrapy.conf' is among
    # the loaded modules
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        # check whether the object has an attribute with the given name
        # (hasattr works by calling getattr(object, name) and checking
        # whether it raises)
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    # if settings is still None
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and "scrapy.conf" in sys.modules:
        from scrapy import conf
        if hasattr(conf, "settings"):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler="resolve")
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(url, project_path, spider_name, callback, result, *arg, **kwargs):
    """
    execute parsing
    :param url: url
    :param project_path: project path
    :param spider_name: spider name
    :param callback: callback
    :param result: results generated by multiprocessing
    :return:
    """
    argv = sys.argv
    argv.append(url)
    if spider_name:
        argv.append('--spider')
        argv.append(spider_name)
    if callback:
        argv.append('--callback')
        argv.append(callback)
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        print('Move to ', project_path)
        # get settings of project
        settings = get_project_settings()
        check_deprecated_settings(settings)
        # get args by optparse
        parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                       conflict_handler='resolve')
        # init SpiderParser
        spider_parser = SpiderParser()
        settings.setdict(spider_parser.default_settings, priority='command')
        spider_parser.settings = settings
        spider_parser.add_options(parser)
        opts, _ = parser.parse_args(args=argv[1:])
        args = [url]
        spider_parser.process_options(args, opts)
        # use CrawlerRunner instead of CrawlerProcess
        spider_parser.crawler_process = CrawlerRunner(settings)
        # spider_parser.crawler_process = CrawlerProcess(settings)
        spider_parser.run(args, opts)
        # get follow requests, items, response
        requests = spider_parser.get_requests()
        items = spider_parser.get_items()
        response = spider_parser.get_response()
        result['requests'] = requests
        result['items'] = items
        result['response'] = response
    finally:
        os.chdir(work_cwd)
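# Usage sketch: since execute fills a dict shared via multiprocessing (per its
# docstring), a caller might run it in a separate process. Names and paths
# here are hypothetical:
if __name__ == '__main__':
    import multiprocessing
    manager = multiprocessing.Manager()
    result = manager.dict()
    p = multiprocessing.Process(
        target=execute,
        args=('http://example.com', '/path/to/example_project',
              'example_spider', 'parse', result))
    p.start()
    p.join()
    print(result.get('items'))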
def execute(argv=None, settings=None):
    """
    Parse the command line; build a crawler via CrawlerProcess.
    """
    # 1. Initialize the execution environment: read user-defined settings,
    #    fall back to the defaults, and check for deprecated settings
    if argv is None:
        argv = sys.argv
    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check deprecated settings

    # 2. Check whether we are running inside a project, i.e. whether a
    #    scrapy.cfg file exists near the directory the scrapy command runs in
    inproject = inside_project()

    # 3. Load every command Scrapy supports (the commands package) and parse
    #    the command name
    cmds = _get_commands_dict(settings, inproject)  # load all commands
    cmdname = _pop_command_name(argv)  # parse the command name
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # 4. Get the command instance, apply its default settings at 'command'
    #    priority, and add its parsing rules
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])  # parse the command line
    _run_print_help(parser, cmd.process_options, args, opts)

    # 5. Initialize the CrawlerProcess and execute the command instance's run
    #    method (e.g. scrapy/commands/crawl.py)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)  # run the command's run method
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    from scrapy import conf
    conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follow requests and items
    :param project_path:
    :param spider_name:
    :param args:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check deprecated settings
    # check whether we run inside a project; the available commands differ by environment
    inproject = inside_project()
    # get all commands as a dict keyed by command name, valued by command object
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]  # get the instance of the requested command
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    # merge cmd.default_settings (empty here) into the settings object
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings  # attach the settings object to the command
    cmd.add_options(parser)  # add parsing rules
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    # initialize a CrawlerProcess instance and attach it to the current command
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv
    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def detect_project_spiders(project_path):
    """
    Detect every spider in a Scrapy project and the script file it lives in.
    :param project_path:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # easy_spiders = runner.spiders
        spiders = dict()
        spider_dict = runner.spider_loader._spiders
        for spider_name, spider_cls in spider_dict.items():
            spiders[spider_name] = spider_cls.__module__ + ".py"
    finally:
        ENVVAR = 'SCRAPY_SETTINGS_MODULE'
        # drop the settings module from the environment so a later call can
        # target a different project; pop avoids a KeyError if it is unset
        os.environ.pop(ENVVAR, None)
        os.chdir(work_cwd)
    return spiders
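# Usage sketch for detect_project_spiders (the project path is hypothetical);
# the returned dict maps each spider name to its dotted module path with a
# ".py" suffix appended:
if __name__ == '__main__':
    for name, script in detect_project_spiders('/path/to/example_project').items():
        print(name, '->', script)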
def run():
    # read the project settings
    setting = get_project_settings()
    check_deprecated_settings(setting)
def execute(argv=None, settings=None):
    ## assume this function is called as execute(['scrapy', 'crawl', 'spidername'])
    if argv is None:
        argv = sys.argv

    # --- backward compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        ## get the project settings
        ## initialize the environment from environment variables and
        ## scrapy.cfg, ultimately producing a Settings instance
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    ## check deprecated settings
    check_deprecated_settings(settings)

    # --- backward compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    ## whether we run inside a project, mainly by checking that scrapy.cfg exists
    inproject = inside_project()
    ## scan the commands package and turn every command class into a
    ## {cmd_name: cmd_instance, ...} dict
    cmds = _get_commands_dict(settings, inproject)
    ## parse which subcommand is being executed from the command-line arguments
    ## e.g. for `scrapy crawl xxx`, cmdname is 'crawl'
    cmdname = _pop_command_name(argv)
    ## the optparse module makes it easy to produce clear, easy-to-use,
    ## standard Unix-style command help
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    ## if cmdname is empty, print help for all commands and exit the program
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    ## if cmdname is set but not a key of the cmds dict, print an
    ## unknown-command error and exit with a nonzero code
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    ## look up the command instance by name
    cmd = cmds[cmdname]
    ## cmd.syntax returns the command's usage
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    ## cmd.long_desc returns the command's description
    parser.description = cmd.long_desc()
    ## apply the command instance's default settings at 'command' priority
    settings.setdict(cmd.default_settings, priority='command')
    ## attach the settings to the command instance
    cmd.settings = settings
    ## add the command instance's parsing rules
    cmd.add_options(parser)
    ## parse the command arguments and hand them to the Scrapy command instance
    ## for `scrapy crawl xxx`, argv[1:] is ['crawl', 'xxx']
    ## opts = {'logfile': xxx, 'loglevel': xxx, ...}
    ## args = ['xxx']
    opts, args = parser.parse_args(args=argv[1:])
    ## run the command instance's process_options method; on error, print the
    ## relevant help before exiting
    ## options given on the command line can update matching settings before
    ## the command runs
    _run_print_help(parser, cmd.process_options, args, opts)
    ## initialize a CrawlerProcess (crawler process) instance from the settings
    ## and assign it to the command instance's crawler_process attribute
    cmd.crawler_process = CrawlerProcess(settings)
    ## run the command instance's run method
    ## for `scrapy crawl <spider_name>`, that is the run method in
    ## commands/crawl.py
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
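# As the walkthrough above assumes, a typical programmatic entry point is a
# small script that forwards a synthetic argv ('spidername' is a placeholder):
if __name__ == '__main__':
    execute(['scrapy', 'crawl', 'spidername'])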
def execute(argv=None, settings=None):
    # get the command-line arguments
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # kept for the legacy scrapy.conf singleton; this path effectively warns
    # that the old configuration style is no longer supported
    if settings is None and 'scrapy.conf' in sys.modules:
        from . import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    # get the project settings
    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    # check deprecated settings (warns about which settings are obsolete)
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings  # keep the legacy singleton in sync
    # ------------------------------------------------------------------

    # determine whether we are inside a project by looking for scrapy.cfg;
    # later checked against each command's requires_project attribute to
    # decide whether the command is available
    inproject = inside_project()
    # get the command dict (key = command name, value = command instance)
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)  # parse out the current command name
    # option parser for the command
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        # no command given: print the available commands and exit
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        # unknown command: print an error and exit with a nonzero code
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # get the command object
    cmd = cmds[cmdname]
    # command syntax
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    # command description
    parser.description = cmd.long_desc()
    # merge the command's default settings; duplicate keys override the
    # earlier global settings
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings  # reassign the merged settings to the command
    cmd.add_options(parser)  # add the command's own options to the parser
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)  # process the shared options
    # create the crawler process object that starts and runs the spiders
    # (note: this step can fail at import time; reinstalling cryptography may fix it)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    '''
    Run a Scrapy command. The original implementation exits the process when
    crawling finishes, so this version is modified to return instead.
    :param argv: run_command
    :param settings: settings
    :return:
    '''
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = cmdline._get_commands_dict(settings, inproject)
    cmdname = cmdline._pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        cmdline._print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        cmdline._print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    cmdline._run_print_help(parser, cmd.process_options, args, opts)
    cmd.crawler_process = CrawlerProcess(settings)
    try:
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
    except Exception:
        print('Done')
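# Because this variant never reaches a final sys.exit after crawling, control
# returns to the caller once the crawl ends; a minimal sketch, assuming a
# placeholder spider name 'example_spider':
if __name__ == '__main__':
    execute(['scrapy', 'crawl', 'example_spider'])
    print('crawl finished, process is still alive')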