def validate(self):
    """Check that the remote resource is reachable and record its size.

    If the dispatcher configuration contains ``http_url_format_string``,
    ``self.url`` is first replaced by that format string applied to the
    URL-quoted original URL.  The resulting URL is probed with HEAD; when
    the answer is not 200 OK the probe is repeated with a streamed GET.

    On success ``self.size`` is set from the ``Content-Length`` header, or
    to -1 when the header is missing or is not a plain integer.  Problems
    are reported by assigning a message to ``self.errors``; exceptions
    raised by requests while probing are caught and reported the same way.
    """
    super().validate()
    res = None
    try:
        http_cache = self.job.parameters["dispatcher"].get(
            "http_url_format_string", ""
        )
        if http_cache:
            self.logger.info("Using caching service: '%s'", http_cache)
            try:
                self.url = urlparse(http_cache % quote_plus(self.url.geturl()))
            except (TypeError, ValueError) as exc:
                # "%" formatting raises TypeError when the placeholder count
                # does not match and ValueError when a placeholder is
                # malformed (e.g. a trailing "%").  Either way the configured
                # format string cannot be used.
                self.logger.error("Invalid http_url_format_string: '%s'", exc)
                self.errors = "Invalid http_url_format_string: '%s'" % str(exc)
                return

        # Force the non-use of Accept-Encoding: gzip, this will permit to
        # know the final size.
        headers = {"Accept-Encoding": ""}
        if self.params and "headers" in self.params:
            headers.update(self.params["headers"])

        self.logger.debug("Validating that %s exists", self.url.geturl())
        res = requests_retry().head(
            self.url.geturl(),
            allow_redirects=True,
            headers=headers,
            timeout=HTTP_DOWNLOAD_TIMEOUT,
        )
        if res.status_code != requests.codes.OK:
            # Try using (the slower) GET for services with broken redirect
            # support.
            self.logger.debug("Using GET because HEAD is not supported properly")
            res.close()
            # Like for HEAD, we need to get a size, so gzip stays disabled
            # through the same headers.
            res = requests_retry().get(
                self.url.geturl(),
                allow_redirects=True,
                stream=True,
                headers=headers,
                timeout=HTTP_DOWNLOAD_TIMEOUT,
            )
            if res.status_code != requests.codes.OK:
                self.errors = "Resource unavailable at '%s' (%d)" % (
                    self.url.geturl(),
                    res.status_code,
                )
                return

        try:
            self.size = int(res.headers.get("content-length", -1))
        except ValueError:
            # A Content-Length that is not a plain integer is treated like a
            # missing one instead of aborting validation.
            self.logger.debug(
                "Ignoring invalid content-length: '%s'",
                res.headers.get("content-length"),
            )
            self.size = -1
    except requests.Timeout:
        # Must come before RequestException, of which Timeout is a subclass.
        self.logger.error("Request timed out")
        self.errors = "'%s' timed out" % (self.url.geturl())
    except requests.RequestException as exc:
        self.logger.error("Resource not available")
        self.errors = "Unable to get '%s': %s" % (self.url.geturl(), str(exc))
    finally:
        # Also runs on the early returns above, so the connection held by
        # the last response is always released.
        if res is not None:
            res.close()
def reader(self):
    """Yield the content of ``self.url`` chunk by chunk.

    The resource is requested with a streamed GET that follows redirects,
    sending ``self.params["headers"]`` when present, and its body is
    iterated with ``iter_content(HTTP_DOWNLOAD_CHUNK_SIZE)``.

    Raises:
        InfrastructureError: if the answer is not 200 OK, or if requests
            raises while connecting or streaming (the requests exception
            is attached as the cause).
    """
    res = None
    try:
        # FIXME: When requests 3.0 is released, use the enforce_content_length
        # parameter to raise an exception the file is not fully downloaded
        headers = None
        if self.params and "headers" in self.params:
            headers = self.params["headers"]
        res = requests_retry().get(
            self.url.geturl(),
            allow_redirects=True,
            stream=True,
            headers=headers,
            timeout=HTTP_DOWNLOAD_TIMEOUT,
        )
        if res.status_code != requests.codes.OK:
            # This is an Infrastructure error because the validate function
            # checked that the file does exist.
            raise InfrastructureError(
                "Unable to download '%s'" % (self.url.geturl())
            )
        for buff in res.iter_content(HTTP_DOWNLOAD_CHUNK_SIZE):
            yield buff
    except requests.RequestException as exc:
        # Chain explicitly so the underlying requests error is kept as the
        # direct cause of the InfrastructureError.
        raise InfrastructureError(
            "Unable to download '%s': %s" % (self.url.geturl(), str(exc))
        ) from exc
    finally:
        # Runs when the generator finishes, fails or is closed early, so the
        # streamed connection is released in every case.
        if res is not None:
            res.close()