Example #1
    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader,
                                              Request(url),
                                              spidercls,
                                              log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)
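
The spider-class resolution used above (spidercls_for_request with a DefaultSpider fallback) is worth unpacking. The sketch below is a simplified approximation of the helper in scrapy.utils.spider, not the upstream code verbatim; it assumes the spider loader exposes find_by_request() and load(), which are the lookup calls these commands rely on.

import logging

logger = logging.getLogger(__name__)

def spidercls_for_request(spider_loader, request, default_spidercls=None,
                          log_none=True, log_multiple=False):
    """Pick the spider class that should handle ``request`` (simplified sketch)."""
    # Ask the loader which registered spiders claim this request's URL/domain.
    snames = spider_loader.find_by_request(request)
    if len(snames) == 1:
        return spider_loader.load(snames[0])
    if len(snames) > 1 and log_multiple:
        logger.error('More than one spider can handle: %(request)s - %(snames)s',
                     {'request': request, 'snames': ', '.join(snames)})
    if not snames and log_none:
        logger.error('Unable to find spider that handles: %(request)s',
                     {'request': request})
    # Fall back to whatever the caller supplied (DefaultSpider above, or None).
    return default_spidercls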
Example #2
    def run(self, args, opts):
        url = args[0] if args else None
        spiders = self.crawler_process.spiders

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spiders.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spiders,
                                              Request(url),
                                              spidercls,
                                              log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
        crawler = self.crawler_process._create_logged_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self.crawler_process.start(start_reactor=False)
        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url)
Example #3
    def run(self, args, opts):
        # Validate arguments: exactly one URL is required
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        # Callback that prints the downloaded response
        cb = lambda x: self._print_response(x, opts)
        # Build the Request object
        request = Request(args[0], callback=cb, dont_filter=True)
        # by default, let the framework handle redirects,
        # i.e. the command handles all codes except 3xx
        if not opts.no_redirect:
            # without --no-redirect, handle every status except 300-399 so the
            # redirect middleware can follow redirects
            request.meta['handle_httpstatus_list'] = SequenceExclude(
                range(300, 400))
        else:
            # with --no-redirect, the command handles every status itself,
            # so redirects are not followed
            request.meta['handle_httpstatus_all'] = True
        # Fall back to the built-in minimal spider
        spidercls = DefaultSpider
        # Get the spider loader
        spider_loader = self.crawler_process.spider_loader
        # If --spider was given, load that spider; otherwise pick the spider
        # that matches the request URL
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request,
                                              spidercls)
        # By default the built-in minimal spider (scrapy.utils.spider.DefaultSpider)
        # fetches the given URL; only start_requests needs to be supplied
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        # Start crawling
        self.crawler_process.start()
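
The comments in Example #3 lean on two details that are easy to miss: DefaultSpider is essentially a spider with nothing but a name, and passing start_requests=lambda: [request] through crawl() works because spider keyword arguments are copied onto the instance and shadow the start_requests method. A minimal sketch of that mechanism, assuming the stock scrapy.Spider constructor behaviour:

import scrapy

# Roughly what scrapy.utils.spider.DefaultSpider amounts to: a bare spider
# used when no project spider matches the requested URL.
class DefaultSpider(scrapy.Spider):
    name = 'default'

# Spider.__init__ copies extra keyword arguments onto the instance, so a
# start_requests callable passed via crawl(...) shadows the method:
spider = DefaultSpider(start_requests=lambda: [scrapy.Request('http://example.com')])
requests = list(spider.start_requests())  # invokes the lambda, not the base method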
Example #4
File: shell.py Project: elacuesta/scrapy
    def run(self, args, opts):
        url = args[0] if args else None
        if url:
            # first argument may be a local file
            url = guess_scheme(url)

        spider_loader = self.crawler_process.spider_loader

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spider_loader, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
        crawler = self.crawler_process._create_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url, redirect=not opts.no_redirect)
Example #5
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        request = Request(
            args[0],
            callback=self._print_response,
            cb_kwargs={"opts": opts},
            dont_filter=True,
        )
        # by default, let the framework handle redirects,
        # i.e. command handles all codes except 3xx
        if not opts.no_redirect:
            request.meta["handle_httpstatus_list"] = SequenceExclude(
                range(300, 400))
        else:
            request.meta["handle_httpstatus_all"] = True

        spidercls = DefaultSpider
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request,
                                              spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
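
The redirect handling above only needs handle_httpstatus_list to support membership tests, which is what SequenceExclude (from scrapy.utils.datatypes) provides by inverting them. A minimal sketch of the idea, not the library class itself:

class SequenceExclude:
    """Sequence-like wrapper whose membership test is inverted (sketch)."""

    def __init__(self, seq):
        self.seq = seq

    def __contains__(self, item):
        # "x in SequenceExclude(seq)" is True for everything NOT in seq, so
        # handle_httpstatus_list above matches every status except 3xx.
        return item not in self.seq

# 200 is treated as handled by the command; 301 stays with the redirect middleware:
statuses = SequenceExclude(range(300, 400))
assert 200 in statuses
assert 301 not in statuses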
Example #6
File: fetch.py Project: jtwaleson/scrapy
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spiders = self.crawler_process.spiders
        if opts.spider:
            spidercls = spiders.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spiders, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Example #7
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spiders = self.crawler_process.spiders
        if opts.spider:
            spidercls = spiders.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spiders, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Example #8
    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})

        def _start_requests(spider):
            yield self.prepare_request(spider, Request(url), opts)
        self.spidercls.start_requests = _start_requests
Example #9
File: parse.py Project: zhongxig/scrapy
    def set_spidercls(self, url, opts):
        spiders = self.crawler_process.spiders
        if opts.spider:
            try:
                self.spidercls = spiders.load(opts.spider)
            except KeyError:
                log.msg(format='Unable to find spider: %(spider)s',
                        level=log.ERROR, spider=opts.spider)
        else:
            self.spidercls = spidercls_for_request(spiders, Request(url))
            if not self.spidercls:
                log.msg(format='Unable to find spider for: %(url)s',
                        level=log.ERROR, url=url)

        request = Request(url, opts.callback)
        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
        self.spidercls.start_requests = _start_requests
Example #10
    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s',
                             {'url': url})

        request = Request(url, opts.callback)
        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
        self.spidercls.start_requests = _start_requests
Example #11
File: parse.py Project: AlekseyEf/scrapy
    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                log.msg(format='Unable to find spider: %(spider)s',
                        level=log.ERROR, spider=opts.spider)
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                log.msg(format='Unable to find spider for: %(url)s',
                        level=log.ERROR, url=url)

        request = Request(url, opts.callback)
        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
        self.spidercls.start_requests = _start_requests
Example #12
    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s',
                             {'url': url})

        # Request requires callback argument as callable or None, not string
        request = Request(url, None)
        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
        self.spidercls.start_requests = _start_requests
Example #13
    def run(self, args, opts):
        if len(args) != 1 or not is_url(args[0]):
            raise UsageError()
        cb = lambda x: self._print_response(x, opts)
        request = Request(args[0], callback=cb, dont_filter=True)
        # by default, let the framework handle redirects,
        # i.e. command handles all codes except 3xx
        if not opts.no_redirect:
            request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))
        else:
            request.meta['handle_httpstatus_all'] = True

        spidercls = DefaultSpider
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request, spidercls)
        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Example #14
File: shell.py Project: Bia-lx/scrapy
    def run(self, args, opts):
        url = args[0] if args else None
        spiders = self.crawler_process.spiders

        spidercls = DefaultSpider
        if opts.spider:
            spidercls = spiders.load(opts.spider)
        elif url:
            spidercls = spidercls_for_request(spiders, Request(url),
                                              spidercls, log_multiple=True)

        # The crawler is created this way since the Shell manually handles the
        # crawling engine, so the set up in the crawl method won't work
        crawler = self.crawler_process._create_logged_crawler(spidercls)
        # The Shell class needs a persistent engine in the crawler
        crawler.engine = crawler._create_engine()
        crawler.engine.start()

        self.crawler_process.start(start_reactor=False)
        self._start_crawler_thread()

        shell = Shell(crawler, update_vars=self.update_vars, code=opts.code)
        shell.start(url=url)
Example #15
  def run(self, args, opts):
    ## NLP parser settings
    self.parser_server = self.settings['PARSER_RPC_SERVER']
    # For standalone rpc server (msgpack-rpc)
    self.parser_msgpack_host = self.settings['PARSER_MSGPACK_HOST']
    self.parser_msgpack_port = self.settings['PARSER_MSGPACK_PORT']
    # For distributed rpc server (json-rpc)
    self.parser_jsonrpc_url  = self.settings['PARSER_PROXY_URL'] 
    
    if len(args) != 1 or not is_url(args[0]):
      raise UsageError()
    
    cb = lambda x: self._print_response(x, opts)
    request = Request(args[0], callback=cb, dont_filter=True)
    request.meta['handle_httpstatus_all'] = True

    spidercls = DefaultSpider
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
      spidercls = spider_loader.load(opts.spider)
    else:
      spidercls = spidercls_for_request(spider_loader, request, spidercls)
    self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
    self.crawler_process.start()
Example #16
def set_spidercls(url_list, args):
    global crawler_process, spidercls
    spider_loader = crawler_process.spider_loader
    if args.spider:
        try:
            spidercls = spider_loader.load(args.spider)
        except KeyError:
            logger.error('Unable to find spider: %(spider)s',
                         {'spider': args.spider})
    else:
        spidercls = spidercls_for_request(spider_loader, Request(url_list[0]))
        if not spidercls:
            logger.error('Unable to find spider for: %(url)s',
                         {'url': url_list[0]})

    # Request requires callback argument as callable or None, not string
    request_list = []
    for url in url_list:
        request_list.append(Request(url, None))

    _start_requests = lambda s: [
        prepare_request(s, request, args) for request in request_list
    ]
    spidercls.start_requests = _start_requests
Example #17
File: parse.py Project: Kunal614/scrapy
            for rule in spider.rules:
                if rule.link_extractor.matches(response.url):
                    return rule.callback or "parse"
        else:
            logger.error('No CrawlSpider rules found in spider %(spider)r, '
                         'please specify a callback to use for parsing',
                         {'spider': spider.name})

    def set_spidercls(self, url, opts):
        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            try:
                self.spidercls = spider_loader.load(opts.spider)
            except KeyError:
                logger.error('Unable to find spider: %(spider)s',
                             {'spider': opts.spider})
        else:
            self.spidercls = spidercls_for_request(spider_loader, Request(url))
            if not self.spidercls:
                logger.error('Unable to find spider for: %(url)s', {'url': url})

        # Request requires callback argument as callable or None, not string
        request = Request(url, None)
        _start_requests = lambda s: [self.prepare_request(s, request, opts)]
        self.spidercls.start_requests = _start_requests

    def start_parsing(self, url, opts):
        self.crawler_process.crawl(self.spidercls, **opts.spargs)
        self.pcrawler = list(self.crawler_process.crawlers)[0]
        self.crawler_process.start()

        if not self.first_response:
            logger.error('No response downloaded for: %(url)s',
                         {'url': url})