def test_from_content_type(self):
    """Check that each Content-Type value resolves to the expected response class."""
    expectations = (
        ('text/html; charset=UTF-8', HtmlResponse),
        ('text/xml; charset=UTF-8', XmlResponse),
        ('application/xhtml+xml; charset=UTF-8', HtmlResponse),
        ('application/xml; charset=UTF-8', XmlResponse),
        ('application/octet-stream', Response),
    )
    for content_type, expected_cls in expectations:
        resolved_cls = responsetypes.from_content_type(content_type)
        # Identity comparison: the exact class is required, not merely a subclass.
        assert resolved_cls is expected_cls, "%s ==> %s != %s" % (
            content_type,
            resolved_cls,
            expected_cls,
        )
def test_from_content_type(self):
    """Verify the response class selected for a range of Content-Type values.

    Covers HTML and XML variants (including the WAP XHTML type), a generic
    binary type, and a JSON-like type carrying several parameters.
    """
    resolve = responsetypes.from_content_type
    cases = (
        ("text/html; charset=UTF-8", HtmlResponse),
        ("text/xml; charset=UTF-8", XmlResponse),
        ("application/xhtml+xml; charset=UTF-8", HtmlResponse),
        ("application/vnd.wap.xhtml+xml; charset=utf-8", HtmlResponse),
        ("application/xml; charset=UTF-8", XmlResponse),
        ("application/octet-stream", Response),
        ("application/x-json; encoding=UTF8;charset=UTF-8", TextResponse),
    )
    for header_value, wanted in cases:
        got = resolve(header_value)
        # The failure message shows the input, the class obtained and the class wanted.
        assert got is wanted, "%s ==> %s != %s" % (header_value, got, wanted)
def test_from_content_type(self):
    """Assert that ``responsetypes.from_content_type`` returns the expected class.

    Each pair below is a Content-Type value and the response class that must
    be returned for it.
    """
    content_type_cases = [
        ('text/html; charset=UTF-8', HtmlResponse),
        ('text/xml; charset=UTF-8', XmlResponse),
        ('application/xhtml+xml; charset=UTF-8', HtmlResponse),
        ('application/vnd.wap.xhtml+xml; charset=utf-8', HtmlResponse),
        ('application/xml; charset=UTF-8', XmlResponse),
        ('application/octet-stream', Response),
        ('application/x-json; encoding=UTF8;charset=UTF-8', TextResponse),
    ]
    for case in content_type_cases:
        raw_header, expected = case
        actual = responsetypes.from_content_type(raw_header)
        # The message is only formatted if the identity check fails.
        assert actual is expected, "{0!s} ==> {1!s} != {2!s}".format(
            raw_header, actual, expected
        )
def test_from_content_type(self):
    """Check Content-Type to response-class resolution.

    Includes an Amazon UI streaming JSON type, expected to resolve to
    ``TextResponse``.
    """
    expected_class_for = {
        'text/html; charset=UTF-8': HtmlResponse,
        'text/xml; charset=UTF-8': XmlResponse,
        'application/xhtml+xml; charset=UTF-8': HtmlResponse,
        'application/vnd.wap.xhtml+xml; charset=utf-8': HtmlResponse,
        'application/xml; charset=UTF-8': XmlResponse,
        'application/octet-stream': Response,
        'application/x-json; encoding=UTF8;charset=UTF-8': TextResponse,
        'application/json-amazonui-streaming;charset=UTF-8': TextResponse,
    }
    # Dict iteration follows insertion order, so cases run as listed above.
    for header, expected in expected_class_for.items():
        resolved = responsetypes.from_content_type(header)
        assert resolved is expected, f"{header} ==> {resolved} != {expected}"
def test_from_content_type(self):
    """Ensure every listed Content-Type value yields its expected response class."""
    lookup = responsetypes.from_content_type
    table = (
        ("text/html; charset=UTF-8", HtmlResponse),
        ("text/xml; charset=UTF-8", XmlResponse),
        ("application/xhtml+xml; charset=UTF-8", HtmlResponse),
        ("application/vnd.wap.xhtml+xml; charset=utf-8", HtmlResponse),
        ("application/xml; charset=UTF-8", XmlResponse),
        ("application/octet-stream", Response),
        ("application/x-json; encoding=UTF8;charset=UTF-8", TextResponse),
        ("application/json-amazonui-streaming;charset=UTF-8", TextResponse),
    )
    for ctype, want_cls in table:
        have_cls = lookup(ctype)
        # Report input, actual class and expected class when they differ.
        assert have_cls is want_cls, "%s ==> %s != %s" % (ctype, have_cls, want_cls)
def on_headers_received(self, response, request, spider):
    """Decide, from the response headers, whether to cancel the download.

    Args:
        response: Response whose ``meta``, ``status`` and ``headers`` are
            inspected.
        request: Request being downloaded; its ``meta`` may override the
            size limits and its ``url`` is checked for ``robots.txt``.
        spider: Object that may define ``download_maxsize`` and
            ``download_warnsize`` attributes.

    Returns:
        ``True`` if the download should be cancelled (expected size above
        the maximum, or a 2xx non-robots response whose Content-Type does
        not resolve to an ``HtmlResponse`` subclass), ``False`` otherwise.
    """
    # Limits resolve in order: request.meta, then spider attribute, then default.
    maxsize = getattr(spider, 'download_maxsize', self._default_maxsize)
    maxsize = request.meta.get('download_maxsize', maxsize)
    # NOTE(review): the warn-size fallback reads ``_default_maxsize``; this
    # looks like it was meant to be a dedicated warn-size default -- confirm
    # against the enclosing class before changing it.
    warnsize = getattr(spider, 'download_warnsize', self._default_maxsize)
    warnsize = request.meta.get('download_warnsize', warnsize)
    expected_size = response.meta.get('expected_size')

    # ``expected_size`` is None when the key is absent from response.meta.
    # Ordering None against a number raises TypeError on Python 3, so the
    # size checks only run when a size is actually known.
    if expected_size is not None:
        # cancel if expected_size is above maxsize
        if maxsize and expected_size > maxsize:
            logger.info(
                "Cancelling download of %(url)s: expected response "
                "size (%(size)s) larger than "
                "download max size (%(maxsize)s).",
                {'url': request.url, 'size': expected_size, 'maxsize': maxsize},
            )
            return True

        if warnsize and expected_size > warnsize:
            logger.info(
                "Expected response size (%(size)s) larger than "
                "download warn size (%(warnsize)s).",
                {'size': expected_size, 'warnsize': warnsize},
            )

    # don't cancel if non-200 request
    if not (200 <= response.status < 300):
        logger.info('response code not between 200 and 300 %s', response.status)
        return False

    # don't cancel if robots.txt request
    if 'robots.txt' in request.url:
        logger.info('robots.txt request')
        return False

    # cancel if response is not HTML
    if b'Content-Type' in response.headers:
        cls = responsetypes.from_content_type(response.headers[b'Content-Type'])
        return not issubclass(cls, HtmlResponse)

    # else don't cancel
    return False