def test_chained_does_seek_response():
    """Check that ``chained`` lets every extractor read the response from the start.

    The same first-line extractor is chained twice over one response built
    from the ``fixtures/robots_txt`` resource; both runs are expected to
    yield the identical first line.
    """
    readable = resource_stream(__name__, 'fixtures/robots_txt')
    try:
        response = Response.from_readable(readable)
        # use the same extractor twice
        extract = chained(extract_first_line, extract_first_line)
        # materialise the results while the underlying stream is still open
        values = list(extract(response))
    finally:
        # always release the fixture stream, even if extraction fails
        readable.close()
    # and we get the same first line because chained re-seeks to 0
    assert values == [(b'# /robots.txt\n', ), (b'# /robots.txt\n', )]
def test_chained_does_seek_response():
    """Verify that a chained extractor re-reads the response from position 0.

    ``extract_first_line`` is chained with itself and applied to a response
    created from the ``fixtures/robots_txt`` resource stream; the expected
    output is the same first line twice.
    """
    readable = resource_stream(__name__, 'fixtures/robots_txt')
    try:
        response = Response.from_readable(readable)
        # use the same extractor twice
        extract = chained(extract_first_line, extract_first_line)
        # consume the generator fully before the stream is closed below
        values = list(extract(response))
    finally:
        # close the fixture stream on both the success and the failure path
        readable.close()
    # and we get the same first line because chained re-seeks to 0
    assert values == [(b'# /robots.txt\n',), (b'# /robots.txt\n',)]
def test_labelled_chained():
    """Bug test: ``labelled`` wrapped around a ``chained`` extractor.

    An identity labeller is combined with ``chained(extract_arg0)`` and the
    result for the input ``"foo"`` must be the single tuple ``("foo", "foo")``.
    """
    def identity(value):
        # labeller that hands back its argument unchanged
        return value

    pipeline = labelled(identity, chained(extract_arg0))
    produced = list(pipeline("foo"))
    assert produced == [("foo", "foo")]
def test_chained_extractor_raises():
    """Check what ``chained`` emits when wrapping ``extract_with_error``.

    Running the chained extractor on ``'foo'`` must produce exactly one item:
    a one-element tuple holding ``my_error``.
    """
    expected = [(my_error,)]
    pipeline = chained(extract_with_error)
    produced = list(pipeline('foo'))
    assert produced == expected
if root is None: root = elem.getroottree().getroot() if not (root.tag.endswith('}sitemapindex') or root.tag.endswith('}urlset')): # root element has wrong tag - give up break if elem.tag.endswith('}loc') and elem.text is not None: text = elem.text.strip() if text: # http://www.sitemaps.org/protocol.html#locdef url = URL(urljoin(response.url, text)) if elem.getparent().tag.endswith('}sitemap'): # set sitemap=True to help downstream processing url = url.update_fragment_dict(sitemap=True) yield "url", url if elem.getparent() is root: # release memory for previous elements while elem.getprevious() is not None: del root[0] except XMLSyntaxError: log.debug("XMLSyntaxError in '%s' (%d)", response.url, response.code) #: Extractor that combines :func:`.urls_from_robots_txt` and #: :func:`.urls_from_urlset_or_sitemapindex`. urls_from_sitemaps = chained(urls_from_robots_txt, urls_from_urlset_or_sitemapindex)
def test_chained_extractor_raises():
    """Assert the output of ``chained(extract_with_error)`` for input ``'foo'``.

    The collected items must equal a list containing a single 1-tuple whose
    only member is ``my_error``.
    """
    # collect everything the chained extractor yields for the input
    collected = [entry for entry in chained(extract_with_error)('foo')]
    assert collected == [(my_error, )]