Exemplo n.º 1
0
    def _parse_sitemap(self, response):
        """Record the URLs advertised by a robots.txt or sitemap response.

        Nothing is scheduled from here: discovered URLs are appended to
        ``self._sitemap_urls`` (robots.txt entries and ``sitemapindex``
        entries) or to ``self._site_urls`` (``urlset`` entries), and the
        returned list is always empty.

        :param response: fetched response; ``response.url`` selects the
            handling path and ``response.body`` is read for robots.txt.
        :return: an empty list on every path.
        """
        if response.url.endswith('/robots.txt'):
            self._sitemap_urls.extend(iter_urls_from_robots(response.body))
            return []

        body = get_sitemap_body(response)
        if body is None:
            # No usable sitemap body could be obtained from the response.
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING,
                    url=response.url)
            return []

        kind = get_sitemap_type(body)

        if kind == 'sitemapindex':
            # Entries of an index are queued alongside the other sitemap URLs.
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG,
                    url=response.url)
            self._sitemap_urls.extend(iter_urls_from_sitemap(body))
            return []

        if kind == 'urlset':
            # Entries of a urlset are collected as site URLs.
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG,
                    url=response.url)
            self._site_urls.extend(iter_urls_from_sitemap(body))
            return []

        # Anything else is reported and otherwise ignored.
        log.msg(
            format='Unrecognized type of sitemap %(url)s: %(stype)s',
            level=log.WARNING,
            url=response.url,
            stype=kind)
        return []
Exemplo n.º 2
0
    def _parse_sitemap(self, response):
        """Collect URLs from a robots.txt or sitemap response.

        URLs found in robots.txt or in a ``sitemapindex`` are appended to
        ``self._sitemap_urls``; URLs found in a ``urlset`` are appended to
        ``self._site_urls``. Invalid or unrecognized sitemaps are only
        logged as warnings.

        :param response: fetched response whose ``url`` (and, for
            robots.txt, ``body``) is inspected.
        :return: a list that is empty on every path.
        """
        found_requests = []

        if response.url.endswith('/robots.txt'):
            self._sitemap_urls.extend(iter_urls_from_robots(response.body))
            return found_requests

        xml_body = get_sitemap_body(response)
        if xml_body is None:
            log.msg(format='Invalid sitemap %(url)s',
                    level=log.WARNING, url=response.url)
            return []

        detected = get_sitemap_type(xml_body)

        # Pick the list that should receive this sitemap's entries.
        if detected == 'sitemapindex':
            log.msg(format='Sitemap %(url)s is of type <sitemapindex>',
                    level=log.DEBUG, url=response.url)
            sink = self._sitemap_urls
        elif detected == 'urlset':
            log.msg(format='Sitemap %(url)s is of type <urlset>',
                    level=log.DEBUG, url=response.url)
            sink = self._site_urls
        else:
            log.msg(format='Unrecognized type of sitemap %(url)s: %(stype)s',
                    level=log.WARNING, url=response.url, stype=detected)
            return found_requests

        sink.extend(iter_urls_from_sitemap(xml_body))
        return found_requests
Exemplo n.º 3
0
 def test_get_sitemap_type(self):
     """get_sitemap_type names the type of each sample document.

     ``sitemapindex`` and ``urlset`` are sample inputs defined outside
     this method.
     """
     cases = (
         (sitemapindex, 'sitemapindex'),
         (urlset, 'urlset'),
     )
     for document, expected_type in cases:
         self.assertEqual(get_sitemap_type(document), expected_type)
Exemplo n.º 4
0
 def test_get_sitemap_type(self):
     """Check the type reported for the index and urlset samples.

     Both samples (``sitemapindex`` and ``urlset``) come from outside
     this method.
     """
     index_type = get_sitemap_type(sitemapindex)
     self.assertEqual(index_type, 'sitemapindex')

     urlset_type = get_sitemap_type(urlset)
     self.assertEqual(urlset_type, 'urlset')