Exemplo n.º 1
0
 def get_sitemaps(self):
     """Fetch the sitemap index and return the product sitemap entries.

     Requests ``self.SITEMAP_INDEX`` through ``crawle.quick_request``
     (called with ``redirects=1``).  If the response status is not 200,
     the status is printed and the process exits with code 1.

     Returns:
         A list of the ``self.LOC_RE.findall`` matches found in the
         response body that contain ``'product'``.
     """
     index_response = crawle.quick_request(self.SITEMAP_INDEX, redirects=1)
     if index_response.response_status != 200:
         # Without the index nothing else can be crawled, so abort.
         print('Could not get index: %d' % index_response.response_status)
         sys.exit(1)

     product_locs = []
     for loc in self.LOC_RE.findall(index_response.response_body):
         # Keep only the entries that refer to product sitemaps.
         if 'product' in loc:
             product_locs.append(loc)
     return product_locs
Exemplo n.º 2
0
 def get_product_ids(self):
     """Collect item ids from every product sitemap into ``self.item_ids``.

     Each entry returned by ``self.get_sitemaps()`` is fetched with
     ``crawle.quick_request`` (called with ``redirects=1``).  The response
     body is gunzipped in memory and every ``self.ITEM_ID_RE.findall``
     match is appended to ``self.item_ids``.

     If any sitemap responds with a status other than 200, the status and
     the request/response URLs are printed and the process exits with
     code 1.

     Returns:
         None.  Results are accumulated in ``self.item_ids``.
     """
     for sitemap_url in self.get_sitemaps():
         response = crawle.quick_request(sitemap_url, redirects=1)
         if response.response_status != 200:
             print('Error fetching sitemap: %d' % response.response_status)
             # Show both URLs so a redirect gone wrong is visible.
             print('%s %s' % (response.request_url, response.response_url))
             sys.exit(1)

         # The body is treated as gzip data and decompressed in memory.
         compressed = StringIO(response.response_body)
         decompressed = gzip.GzipFile(fileobj=compressed).read()
         found_ids = self.ITEM_ID_RE.findall(decompressed)
         self.item_ids.extend(found_ids)