def run(self, args, opts):
    url = args[0] if args else None
    if url:
        # first argument may be a local file
        url = guess_scheme(url)

    spider_loader = self.crawler_process.spider_loader

    spidercls = DefaultSpider
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    elif url:
        spidercls = spidercls_for_request(spider_loader, Request(url),
                                          spidercls, log_multiple=True)

    # The crawler is created this way since the Shell manually handles the
    # crawling engine, so the set up in the crawl method won't work
    crawler = self.crawler_process._create_crawler(spidercls)
    # The Shell class needs a persistent engine in the crawler
    crawler.engine = crawler._create_engine()
    crawler.engine.start()

    self._start_crawler_thread()

    shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
    shell.start(url=url, redirect=not opts.no_redirect)

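# Context, not part of the snippet above: this run() backs the "scrapy shell"
# command. Typical invocations that reach it look roughly like:
#
#   scrapy shell "https://example.com"
#   scrapy shell path/to/page.html   # guess_scheme() turns a local path into a file:// URL
#   scrapy shell "https://example.com" --spider=myspider --no-redirect
#
# --spider and --no-redirect map to opts.spider and opts.no_redirect above;
# "myspider" is a placeholder spider name.
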
def run(self, args, opts):
    url = args[0] if args else None
    spiders = self.crawler_process.spiders

    spidercls = DefaultSpider
    if opts.spider:
        spidercls = spiders.load(opts.spider)
    elif url:
        spidercls = spidercls_for_request(spiders, Request(url),
                                          spidercls, log_multiple=True)

    # The crawler is created this way since the Shell manually handles the
    # crawling engine, so the set up in the crawl method won't work
    crawler = self.crawler_process._create_logged_crawler(spidercls)
    # The Shell class needs a persistent engine in the crawler
    crawler.engine = crawler._create_engine()
    crawler.engine.start()

    self.crawler_process.start(start_reactor=False)
    self._start_crawler_thread()

    shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
    shell.start(url=url)

def run(self, args, opts):
    # Validate arguments
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()

    # Callback that prints the downloaded response
    cb = lambda x: self._print_response(x, opts)

    # Build the Request object
    request = Request(args[0], callback=cb, dont_filter=True)

    # by default, let the framework handle redirects,
    # i.e. command handles all codes except 3xx
    # Without --no-redirect, the list of handled status codes contains
    # everything except the 300-399 range, so redirects stay with the
    # redirect middleware
    if not opts.no_redirect:
        request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
    else:
        # Otherwise the command handles every status code itself,
        # so redirects are not followed
        request.meta['handle_httpstatus_all'] = True

    # Fall back to the bundled minimal spider
    spidercls = DefaultSpider
    # Initialise the spider loader
    spider_loader = self.crawler_process.spider_loader

    # If --spider was given, crawl with that spider; otherwise pick the
    # spider that matches the request URL
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)

    # By default the bundled minimal spider (scrapy.utils.spider.DefaultSpider)
    # fetches the given URL; it only needs start_requests to be supplied
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    # Start crawling
    self.crawler_process.start()

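# For reference, the DefaultSpider fallback used above lives in
# scrapy.utils.spider and is essentially just a named Spider subclass with no
# parsing logic of its own, which is why the command only has to inject
# start_requests. A minimal sketch, not part of the snippet itself:
from scrapy import Spider

class DefaultSpider(Spider):
    name = 'default'
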
def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    request = Request(
        args[0],
        callback=self._print_response,
        cb_kwargs={"opts": opts},
        dont_filter=True,
    )
    # by default, let the framework handle redirects,
    # i.e. command handles all codes except 3xx
    if not opts.no_redirect:
        request.meta["handle_httpstatus_list"] = SequenceExclude(range(300, 400))
    else:
        request.meta["handle_httpstatus_all"] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)

    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()

def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spiders = self.crawler_process.spiders
    if opts.spider:
        spidercls = spiders.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spiders, request, spidercls)

    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()

def set_spidercls(self, url, opts):
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        try:
            self.spidercls = spider_loader.load(opts.spider)
        except KeyError:
            logger.error('Unable to find spider: %(spider)s',
                         {'spider': opts.spider})
    else:
        self.spidercls = spidercls_for_request(spider_loader, Request(url))
        if not self.spidercls:
            logger.error('Unable to find spider for: %(url)s', {'url': url})

    def _start_requests(spider):
        yield self.prepare_request(spider, Request(url), opts)

    self.spidercls.start_requests = _start_requests

def set_spidercls(self, url, opts):
    spiders = self.crawler_process.spiders
    if opts.spider:
        try:
            self.spidercls = spiders.load(opts.spider)
        except KeyError:
            log.msg(format='Unable to find spider: %(spider)s',
                    level=log.ERROR, spider=opts.spider)
    else:
        self.spidercls = spidercls_for_request(spiders, Request(url))
        if not self.spidercls:
            log.msg(format='Unable to find spider for: %(url)s',
                    level=log.ERROR, url=url)

    request = Request(url, opts.callback)
    _start_requests = lambda s: [self.prepare_request(s, request, opts)]
    self.spidercls.start_requests = _start_requests

def set_spidercls(self, url, opts):
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        try:
            self.spidercls = spider_loader.load(opts.spider)
        except KeyError:
            logger.error('Unable to find spider: %(spider)s',
                         {'spider': opts.spider})
    else:
        self.spidercls = spidercls_for_request(spider_loader, Request(url))
        if not self.spidercls:
            logger.error('Unable to find spider for: %(url)s', {'url': url})

    request = Request(url, opts.callback)
    _start_requests = lambda s: [self.prepare_request(s, request, opts)]
    self.spidercls.start_requests = _start_requests

def set_spidercls(self, url, opts):
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        try:
            self.spidercls = spider_loader.load(opts.spider)
        except KeyError:
            log.msg(format='Unable to find spider: %(spider)s',
                    level=log.ERROR, spider=opts.spider)
    else:
        self.spidercls = spidercls_for_request(spider_loader, Request(url))
        if not self.spidercls:
            log.msg(format='Unable to find spider for: %(url)s',
                    level=log.ERROR, url=url)

    request = Request(url, opts.callback)
    _start_requests = lambda s: [self.prepare_request(s, request, opts)]
    self.spidercls.start_requests = _start_requests

def set_spidercls(self, url, opts):
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        try:
            self.spidercls = spider_loader.load(opts.spider)
        except KeyError:
            logger.error('Unable to find spider: %(spider)s',
                         {'spider': opts.spider})
    else:
        self.spidercls = spidercls_for_request(spider_loader, Request(url))
        if not self.spidercls:
            logger.error('Unable to find spider for: %(url)s', {'url': url})

    # Request requires callback argument as callable or None, not string
    request = Request(url, None)
    _start_requests = lambda s: [self.prepare_request(s, request, opts)]
    self.spidercls.start_requests = _start_requests

def run(self, args, opts):
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)

    # by default, let the framework handle redirects,
    # i.e. command handles all codes except 3xx
    if not opts.no_redirect:
        request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
    else:
        request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)

    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()

def run(self, args, opts):
    ## NLP parser settings
    self.parser_server = self.settings['PARSER_RPC_SERVER']
    # For standalone rpc server (msgpack-rpc)
    self.parser_msgpack_host = self.settings['PARSER_MSGPACK_HOST']
    self.parser_msgpack_port = self.settings['PARSER_MSGPACK_PORT']
    # For distributed rpc server (json-rpc)
    self.parser_jsonrpc_url = self.settings['PARSER_PROXY_URL']

    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        spidercls = spider_loader.load(opts.spider)
    else:
        spidercls = spidercls_for_request(spider_loader, request, spidercls)

    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()

def set_spidercls(url_list, args):
    global crawler_process, spidercls
    spider_loader = crawler_process.spider_loader
    if args.spider:
        try:
            spidercls = spider_loader.load(args.spider)
        except KeyError:
            logger.error('Unable to find spider: %(spider)s',
                         {'spider': args.spider})
    else:
        spidercls = spidercls_for_request(spider_loader, Request(url_list[0]))
        if not spidercls:
            logger.error('Unable to find spider for: %(url)s',
                         {'url': url_list[0]})

    # Request requires callback argument as callable or None, not string
    request_list = []
    for url in url_list:
        request_list.append(Request(url, None))

    _start_requests = lambda s: [prepare_request(s, request, args)
                                 for request in request_list]
    spidercls.start_requests = _start_requests

# Fragment: pick the parse callback from the spider's CrawlSpider rules
for rule in spider.rules:
    if rule.link_extractor.matches(response.url):
        return rule.callback or "parse"
else:
    logger.error('No CrawlSpider rules found in spider %(spider)r, '
                 'please specify a callback to use for parsing',
                 {'spider': spider.name})

def set_spidercls(self, url, opts):
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        try:
            self.spidercls = spider_loader.load(opts.spider)
        except KeyError:
            logger.error('Unable to find spider: %(spider)s',
                         {'spider': opts.spider})
    else:
        self.spidercls = spidercls_for_request(spider_loader, Request(url))
        if not self.spidercls:
            logger.error('Unable to find spider for: %(url)s', {'url': url})

    # Request requires callback argument as callable or None, not string
    request = Request(url, None)
    _start_requests = lambda s: [self.prepare_request(s, request, opts)]
    self.spidercls.start_requests = _start_requests

def start_parsing(self, url, opts):
    self.crawler_process.crawl(self.spidercls, **opts.spargs)
    self.pcrawler = list(self.crawler_process.crawlers)[0]
    self.crawler_process.start()

    if not self.first_response:
        logger.error('No response downloaded for: %(url)s', {'url': url})