import logging
import traceback
from gzip import GzipFile
from StringIO import StringIO  # Python 2 stdlib; this code also relies on the `unicode` type


def read_sitemaps(sitemaps, max_urls=10000):
    """
    Read one or more sitemaps and return all URLs found.

    sitemaps: a list of sitemap URLs (consumed in place; nested sitemap
              index entries are appended to it by _read_sitemap)
    max_urls: stop processing more sitemaps once max_urls are already found
    """
    urls = set()

    while len(sitemaps) > 0:
        url = sitemaps.pop(0)
        webpage = _fetch_url(url)

        if len(webpage) == 0:
            continue

        # Not every server returns a correct Content-Encoding header, so sniff
        # for the sitemaps XML namespace in the first 1 KB and, failing that,
        # try to decompress the body as gzip before giving up on it.
        if "sitemaps" not in webpage[:1000]:
            try:
                webpage = GzipFile(fileobj=StringIO(webpage)).read()
                if "sitemaps" not in webpage[:1000]:
                    continue
            except Exception:
                logging.debug(traceback.format_exc())
                continue

        # read sitemap
        logging.debug("Reading sitemap: " + url)
        if isinstance(webpage, unicode):
            webpage = webpage.encode("utf-8")

        _read_sitemap(webpage, urls, sitemaps)
        logging.debug("URLs so far: %s" % len(urls))

        if len(urls) > max_urls:
            break

    return list(urls)[:max_urls]
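
# A minimal usage sketch, assuming _fetch_url and _read_sitemap are defined
# elsewhere in this module; the sitemap URL is made up for illustration.
# read_sitemaps consumes the list it is given (sitemaps.pop(0)), so pass a
# copy if the caller still needs the original list.
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    found = read_sitemaps(["http://example.com/sitemap.xml"], max_urls=500)
    print("Found %d URLs" % len(found))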
# Method of a urllib2 handler subclassing urllib2.HTTPCookieProcessor.
# Module-level imports assumed: re, urllib2, GzipFile, StringIO; log(),
# detect_encoding() and the MIMETYPE table are helpers defined elsewhere.
def http_response(self, req, resp):
    urllib2.HTTPCookieProcessor.http_response(self, req, resp)
    data = resp.read()

    if 200 <= resp.code < 300:
        # gzip: decompress the body ourselves if the server flagged it
        if resp.headers.get('Content-Encoding') == 'gzip':
            log('un-gzip')
            data = GzipFile(fileobj=StringIO(data), mode='r').read()

    if 200 <= resp.code < 300 and resp.info().maintype == 'text':
        # <meta> redirect: follow client-side "refresh" redirects in HTML
        if resp.info().type in MIMETYPE['html']:
            match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
            if match:
                new_url = match.groups()[0]
                log('redirect: %s' % new_url)

                # drop body-specific headers before re-issuing the request
                new_headers = dict((k, v) for k, v in req.headers.items()
                                   if k.lower() not in ('content-length', 'content-type'))

                new = urllib2.Request(new_url,
                                      headers=new_headers,
                                      origin_req_host=req.get_origin_req_host(),
                                      unverifiable=True)
                return self.parent.open(new, timeout=req.timeout)

        # encoding: decode to unicode, then re-encode unless self.decode is set
        enc = detect_encoding(data, resp)

        if enc:
            data = data.decode(enc, 'replace')

            if not self.decode:
                data = data.encode(enc)

    # rebuild the response around the (possibly transformed) body, since the
    # original file object has already been consumed by resp.read()
    fp = StringIO(data)
    old_resp = resp
    resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
    resp.msg = old_resp.msg

    return resp
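
# A standalone check of the <meta http-equiv="refresh"> regex used above;
# the snippet and URL are made up for illustration, not taken from real pages.
import re

snippet = '<meta http-equiv="refresh" content="0; url=http://example.com/next">'
match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', snippet)
if match:
    print(match.groups()[0])  # -> http://example.com/next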