def _parse_sitemap(self, response):
    """Harvest URLs from a robots.txt or sitemap response.

    A response whose URL ends with ``/robots.txt`` has its body handed to
    ``iter_urls_from_robots`` and the resulting URLs appended to
    ``self._sitemap_urls``. Any other response is treated as a sitemap:
    URLs from a ``sitemapindex`` are appended to ``self._sitemap_urls``
    and URLs from a ``urlset`` are appended to ``self._site_urls``.
    A response with no usable sitemap body, or with an unrecognized
    sitemap type, is logged as a warning and contributes nothing.

    The collected URLs are only stored on the instance; the return value
    is always an empty list.
    """
    if response.url.endswith('/robots.txt'):
        self._sitemap_urls.extend(iter_urls_from_robots(response.body))
        return []

    body = get_sitemap_body(response)
    if body is None:
        log.msg(
            format='Invalid sitemap %(url)s',
            level=log.WARNING,
            url=response.url,
        )
        return []

    kind = get_sitemap_type(body)
    if kind == 'sitemapindex':
        # Index files point at further sitemaps.
        log.msg(
            format='Sitemap %(url)s is of type <sitemapindex>',
            level=log.DEBUG,
            url=response.url,
        )
        self._sitemap_urls.extend(iter_urls_from_sitemap(body))
        return []

    if kind == 'urlset':
        # URL sets list the site's own pages.
        log.msg(
            format='Sitemap %(url)s is of type <urlset>',
            level=log.DEBUG,
            url=response.url,
        )
        self._site_urls.extend(iter_urls_from_sitemap(body))
        return []

    log.msg(
        format='Unrecognized type of sitemap %(url)s: %(stype)s',
        level=log.WARNING,
        url=response.url,
        stype=kind,
    )
    return []
def _parse_sitemap(self, response):
    """Harvest URLs from a robots.txt or sitemap response.

    A response whose URL ends with ``/robots.txt`` has its body handed to
    ``iter_urls_from_robots`` and the resulting URLs appended to
    ``self._sitemap_urls``. Any other response is treated as a sitemap:
    URLs from a ``sitemapindex`` are appended to ``self._sitemap_urls``
    and URLs from a ``urlset`` are appended to ``self._site_urls``.
    A response with no usable sitemap body, or with an unrecognized
    sitemap type, is logged as a warning and contributes nothing.

    The collected URLs are only stored on the instance; the return value
    is always an empty list.
    """
    if response.url.endswith('/robots.txt'):
        self._sitemap_urls.extend(iter_urls_from_robots(response.body))
        return []

    body = get_sitemap_body(response)
    if body is None:
        log.msg(
            format='Invalid sitemap %(url)s',
            level=log.WARNING,
            url=response.url,
        )
        return []

    kind = get_sitemap_type(body)
    if kind == 'sitemapindex':
        # Index files point at further sitemaps.
        log.msg(
            format='Sitemap %(url)s is of type <sitemapindex>',
            level=log.DEBUG,
            url=response.url,
        )
        self._sitemap_urls.extend(iter_urls_from_sitemap(body))
        return []

    if kind == 'urlset':
        # URL sets list the site's own pages.
        log.msg(
            format='Sitemap %(url)s is of type <urlset>',
            level=log.DEBUG,
            url=response.url,
        )
        self._site_urls.extend(iter_urls_from_sitemap(body))
        return []

    log.msg(
        format='Unrecognized type of sitemap %(url)s: %(stype)s',
        level=log.WARNING,
        url=response.url,
        stype=kind,
    )
    return []
def test_get_sitemap_type(self):
    # get_sitemap_type must report 'sitemapindex' for the sitemapindex
    # fixture and 'urlset' for the urlset fixture.
    index_kind = get_sitemap_type(sitemapindex)
    self.assertEqual(index_kind, 'sitemapindex')

    urlset_kind = get_sitemap_type(urlset)
    self.assertEqual(urlset_kind, 'urlset')
def test_get_sitemap_type(self):
    # get_sitemap_type must report 'sitemapindex' for the sitemapindex
    # fixture and 'urlset' for the urlset fixture.
    index_kind = get_sitemap_type(sitemapindex)
    self.assertEqual(index_kind, 'sitemapindex')

    urlset_kind = get_sitemap_type(urlset)
    self.assertEqual(urlset_kind, 'urlset')