# URL, Response, decode and utf8_reader are wex helpers assumed to be in
# scope for these snippets; json comes from the standard library.
def test_get_gzip():
    url = 'http://httpbin.org/gzip'
    # decode_content=False asks for the body as received, i.e. still
    # gzip-compressed, so decoding happens explicitly below.
    for i, readable in enumerate(URL(url).get(decode_content=False)):
        response = Response.from_readable(readable)
        data = json.load(utf8_reader(decode(response)))
        assert data.get('gzipped')
    assert i == 0
# iterparse here is lxml.etree.iterparse (the code relies on lxml-only
# APIs such as getparent() and getprevious()); urljoin is from the
# urllib.parse / six.moves family.
def urls_from_urlset_or_sitemapindex(response):
    """
    Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per
    `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.
    """
    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    if not sitemap and 'xml' not in content_subtypes:
        return
    root = None
    for _, elem in iterparse(decode(response)):
        if root is None:
            root = elem.getroottree().getroot()
            if not (root.tag.endswith('}sitemapindex') or
                    root.tag.endswith('}urlset')):
                # root element has wrong tag - give up
                break
        if elem.tag.endswith('}loc') and elem.text is not None:
            text = elem.text.strip()
            if text:
                # http://www.sitemaps.org/protocol.html#locdef
                url = URL(urljoin(response.url, text))
                if elem.getparent().tag.endswith('}sitemap'):
                    # set sitemap=True to help downstream processing
                    url = url.update_fragment_dict(sitemap=True)
                yield "url", url
        if elem.getparent() is root:
            # release memory for previous elements
            while elem.getprevious() is not None:
                del root[0]
def test_get_gzip():
    url = "http://httpbin.org/gzip"
    for i, readable in enumerate(URL(url).get(decode_content=False)):
        response = Response.from_readable(readable)
        # wex adds this header when the raw body carries the gzip magic
        # number, confirming the content really arrived compressed.
        assert response.headers.get("X-wex-has-gzip-magic") == "1"
        data = json.load(utf8_reader(decode(response)))
        assert data.get("gzipped")
    assert i == 0
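# For context, the decompress-then-parse step these tests rely on can be
# sketched with the standard library alone; this is an illustration, not
# wex's actual decode/utf8_reader implementation.
import gzip
import json

raw = gzip.compress(json.dumps({'gzipped': True}).encode('utf-8'))
# Decompress, decode UTF-8, then parse - the same chain the tests
# express as json.load(utf8_reader(decode(response))).
data = json.loads(gzip.decompress(raw).decode('utf-8'))
assert data.get('gzipped')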
def get(url, **kw):
    # Collect the status code of every response yielded for the URL
    # (redirect chains can yield more than one).
    codes = []
    for readable in URL(url).get(**kw):
        response = Response.from_readable(readable)
        codes.append(response.code)
        if response.code == 200:
            data = json.load(utf8_reader(decode(response)))
            assert 'headers' in data
    return codes
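# A hedged usage sketch for the helper above; the test name is
# illustrative, and the expected codes assume httpbin.org answers 200
# with a JSON body containing 'headers' and that a non-redirected
# request yields exactly one response (network access required).
def test_get_ok():
    codes = get('http://httpbin.org/get')
    assert codes == [200]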
def urls_from_urlset_or_sitemapindex(response):
    """
    Yields URLs from ``<urlset>`` or ``<sitemapindex>`` elements as per
    `sitemaps.org <http://www.sitemaps.org/protocol.html>`_.
    """
    sitemap = URL(response.url).fragment_dict.get('sitemap')
    content_subtypes = response.headers.get_content_subtype().split('+')
    if not sitemap and 'xml' not in content_subtypes:
        return
    root = None
    try:
        for _, elem in iterparse(decode(response)):
            if root is None:
                root = elem.getroottree().getroot()
                if not (root.tag.endswith('}sitemapindex') or
                        root.tag.endswith('}urlset')):
                    # root element has wrong tag - give up
                    break
            if elem.tag.endswith('}loc') and elem.text is not None:
                text = elem.text.strip()
                if text:
                    # http://www.sitemaps.org/protocol.html#locdef
                    url = URL(urljoin(response.url, text))
                    if elem.getparent().tag.endswith('}sitemap'):
                        # set sitemap=True to help downstream processing
                        url = url.update_fragment_dict(sitemap=True)
                    yield "url", url
            if elem.getparent() is root:
                # release memory for previous elements
                while elem.getprevious() is not None:
                    del root[0]
    except XMLSyntaxError:
        # 'log' is a module-level logger, e.g. logging.getLogger(__name__)
        log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code)
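# A self-contained sketch of the same streaming iterparse pattern with the
# wex-specific helpers stripped out; the sample XML, the iter_locs name and
# the base URL are illustrative only.
import io
from urllib.parse import urljoin
from lxml.etree import iterparse

SITEMAP_XML = b"""<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>http://example.com/a</loc></url>
  <url><loc>/b</loc></url>
</urlset>"""

def iter_locs(fileobj, base_url):
    # Stream <loc> text without holding the whole tree in memory.
    root = None
    for _, elem in iterparse(fileobj):
        if root is None:
            root = elem.getroottree().getroot()
        if elem.tag.endswith('}loc') and elem.text and elem.text.strip():
            # Relative locations resolve against the sitemap's own URL.
            yield urljoin(base_url, elem.text.strip())
        if elem.getparent() is root:
            # Drop already-processed siblings to bound memory use.
            while elem.getprevious() is not None:
                del root[0]

assert list(iter_locs(io.BytesIO(SITEMAP_XML), 'http://example.com/')) == \
    ['http://example.com/a', 'http://example.com/b']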