def _parse_sitemap(self, response):
    """Handle a fetched robots.txt or sitemap response.

    URLs of further sitemaps are appended to ``self._sitemap_urls`` and
    page URLs to ``self._site_urls``.  Returns a list of follow-up
    requests (always empty here; the accumulated URL lists drive the
    crawl instead).
    """
    requests = []

    # robots.txt responses only contribute Sitemap: directives.
    if response.url.endswith('/robots.txt'):
        self._sitemap_urls.extend(iter_urls_from_robots(response.body))
        return requests

    sitemap_body = get_sitemap_body(response)
    if sitemap_body is None:
        log.msg(format='Invalid sitemap %(url)s',
                level=log.WARNING, url=response.url)
        return []

    sitemap_type = get_sitemap_type(sitemap_body)
    if sitemap_type == 'sitemapindex':
        # An index file: its entries are themselves sitemaps.
        log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                level=log.DEBUG, url=response.url)
        self._sitemap_urls.extend(iter_urls_from_sitemap(sitemap_body))
    elif sitemap_type == 'urlset':
        # A leaf sitemap: its entries are site pages.
        log.msg(format='Sitemap %(url)s is of type <urlset>',
                level=log.DEBUG, url=response.url)
        self._site_urls.extend(iter_urls_from_sitemap(sitemap_body))
    else:
        log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                level=log.WARNING, url=response.url, stype=sitemap_type)

    return requests
def _parse_sitemap(self, response):
    """Process a robots.txt or sitemap response, recording discovered URLs.

    Sitemap-index entries accumulate on ``self._sitemap_urls``, page
    entries on ``self._site_urls``.  The returned request list is
    always empty.
    """
    follow_ups = []
    if not response.url.endswith('/robots.txt'):
        body = get_sitemap_body(response)
        if body is None:
            # Could not extract a usable sitemap document.
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING, url=response.url)
            return []
        stype = get_sitemap_type(body)
        if stype == 'sitemapindex':
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG, url=response.url)
            # Entries of an index are further sitemaps to fetch.
            self._sitemap_urls.extend(iter_urls_from_sitemap(body))
        elif stype == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG, url=response.url)
            # Entries of a urlset are actual site pages.
            self._site_urls.extend(iter_urls_from_sitemap(body))
        else:
            log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                    level=log.WARNING, url=response.url, stype=stype)
    else:
        # robots.txt: harvest any Sitemap: directives it declares.
        self._sitemap_urls.extend(iter_urls_from_robots(response.body))
    return follow_ups
def test_iter_urls_from_robots(self):
    """Only Sitemap: directives are extracted, in file order."""
    # NOTE(review): the literal's line breaks were reconstructed from a
    # whitespace-mangled source — one directive per line, as robots.txt
    # requires.
    robots = '''User-agent: *
Disallow: /aff/
Disallow: /wl/
# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags
# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
# Forums
Disallow: /forum/search/
Disallow: /forum/active/
'''
    expected = ['http://example.com/sitemap.xml',
                'http://example.com/sitemap-product-index.xml']
    self.assertListEqual(list(iter_urls_from_robots(robots)), expected)
def test_iter_urls_from_robots(self):
    """Extracting sitemap URLs ignores every non-Sitemap directive."""
    # NOTE(review): line breaks in this literal were reconstructed from a
    # whitespace-mangled source; robots.txt is a line-oriented format.
    robots = '''User-agent: *
Disallow: /aff/
Disallow: /wl/
# Search and shopping refining
Disallow: /s*/*facet
Disallow: /s*/*tags
# Sitemap files
Sitemap: http://example.com/sitemap.xml
Sitemap: http://example.com/sitemap-product-index.xml
# Forums
Disallow: /forum/search/
Disallow: /forum/active/
'''
    found = list(iter_urls_from_robots(robots))
    self.assertListEqual(found, [
        'http://example.com/sitemap.xml',
        'http://example.com/sitemap-product-index.xml'
    ])