Пример #1
0
    def _download_request(self, request, info):
        """Dispatch a HEAD or PUT request aimed at Amazon S3.

        When ``self.s3_spider`` is set (truthy), the request is scheduled
        on that dedicated spider; otherwise it is downloaded using
        ``info.spider``.

        Returns whatever the selected ``scrapyengine`` call returns.
        """
        if not self.s3_spider:
            # No dedicated upload spider configured: fall back to the
            # spider carried by ``info``.
            return scrapyengine.download(request, info.spider)
        # schedule() is used here instead of download() so that the
        # domain of the S3 spider gets opened automatically.
        return scrapyengine.schedule(request, self.s3_spider)
Пример #2
0
 def robot_parser(self, request, spider):
     """Return the robots.txt parser cached for the request's netloc.

     On the first request seen for a netloc, a ``None`` placeholder is
     stored in ``self._parsers``, a download of ``<scheme>://<netloc>/robots.txt``
     is started with ``self._parse_robots`` as its callback, and the
     netloc is recorded in ``self._spider_netlocs[spider]``.

     The value returned is the current ``self._parsers`` entry, which is
     ``None`` until a parser has been stored for that netloc (presumably
     by ``_parse_robots`` -- confirm against that method).
     """
     parsed = urlparse_cached(request)
     host = parsed.netloc
     if host in self._parsers:
         return self._parsers[host]

     # Insert the placeholder before starting the download so later
     # requests for the same netloc do not trigger another fetch.
     self._parsers[host] = None
     robots_location = "%s://%s/robots.txt" % (parsed.scheme, parsed.netloc)
     robots_request = Request(robots_location, priority=self.DOWNLOAD_PRIORITY)
     deferred = scrapyengine.download(robots_request, spider)
     deferred.addCallback(self._parse_robots)
     self._spider_netlocs[spider].add(host)
     # Re-read the cache rather than returning None directly, in case the
     # entry was already replaced.
     return self._parsers[host]
Пример #3
0
    def download(self, request, info):
        """Request the download of a media resource.

        The request's priority is overwritten with
        ``self.DOWNLOAD_PRIORITY`` and the request is then handed to
        ``scrapyengine.download`` together with ``info.spider``.
        Overriding this method should not normally be necessary.

        According to the original documentation, this method is called
        only when the result for the request is not cached, with the
        request fingerprint serving as the cache key.

        Returns the value returned by ``scrapyengine.download``.
        """
        # Mutates the caller's request object in place.
        request.priority = self.DOWNLOAD_PRIORITY
        target_spider = info.spider
        return scrapyengine.download(request, target_spider)