Example #1
import logging
import traceback
from gzip import GzipFile
from StringIO import StringIO  # Python 2: the snippet also uses `unicode`


def read_sitemaps(sitemaps, max_urls=10000):
    """
    Read one or more sitemaps and return all URLs found.

    sitemaps: a list of sitemap URLs (consumed in place)
    max_urls: stop processing further sitemaps once max_urls are found
    """
    urls = set()
    while len(sitemaps) > 0:
        url = sitemaps.pop(0)
        webpage = _fetch_url(url)  # helper defined elsewhere in the module
        if len(webpage) == 0:
            continue
        # Not every server returns a correct Content-Encoding header, so if
        # the body does not look like sitemap XML, try treating it as gzip
        # before giving up.
        if "sitemaps" not in webpage[:1000]:
            try:
                webpage = GzipFile(fileobj=StringIO(webpage)).read()
                if "sitemaps" not in webpage[:1000]:
                    continue
            except Exception:
                logging.debug(traceback.format_exc())
                continue
        # Parse the sitemap; the helper collects URLs into `urls` and can
        # append nested sitemap-index entries back onto the work list.
        logging.debug("Reading sitemap: " + url)
        if isinstance(webpage, unicode):
            webpage = webpage.encode("utf-8")
        _read_sitemap(webpage, urls, sitemaps)  # helper defined elsewhere
        logging.debug("URLs so far: %s" % len(urls))
        if len(urls) > max_urls:
            break
    return list(urls)[:max_urls]
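The example relies on decompressing a gzip byte string entirely in memory by wrapping it in a file-like object. Below is a minimal, self-contained round-trip sketch of that pattern using the io and gzip modules; the gunzip helper name is illustrative, not part of the project:

import gzip
import io

def gunzip(data):
    # Wrap the raw bytes in an in-memory file so GzipFile can read them,
    # mirroring GzipFile(fileobj=StringIO(webpage)).read() above.
    return gzip.GzipFile(fileobj=io.BytesIO(data)).read()

# Round trip: compress a tiny payload, then recover it.
buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode='wb') as gz:
    gz.write(b"<urlset>...</urlset>")

assert gunzip(buf.getvalue()) == b"<urlset>...</urlset>"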
Example #2
File: morss.py Project: SamuelMarks/morss
    def http_response(self, req, resp):
        """Post-process a response: un-gzip the body, follow <meta>
        refresh redirects, and decode text content.
        """
        urllib2.HTTPCookieProcessor.http_response(self, req, resp)
        data = resp.read()

        if 200 <= resp.code < 300:
            # Some servers gzip the body; decompress it transparently.
            if resp.headers.get('Content-Encoding') == 'gzip':
                log('un-gzip')
                data = GzipFile(fileobj=StringIO(data), mode='r').read()

        if 200 <= resp.code < 300 and resp.info().maintype == 'text':
            # Follow client-side <meta http-equiv="refresh"> redirects.
            if resp.info().type in MIMETYPE['html']:
                match = re.search(r'(?i)<meta http-equiv=.refresh[^>]*?url=(http.*?)["\']', data)
                if match:
                    new_url = match.groups()[0]
                    log('redirect: %s' % new_url)

                    # Re-issue the request at the new URL, dropping headers
                    # that no longer apply to it.
                    new_headers = dict((k, v) for k, v in req.headers.items()
                                       if k.lower() not in ('content-length', 'content-type'))
                    new = urllib2.Request(new_url,
                                          headers=new_headers,
                                          origin_req_host=req.get_origin_req_host(),
                                          unverifiable=True)

                    return self.parent.open(new, timeout=req.timeout)

            # Decode the body with the detected character encoding.
            enc = detect_encoding(data, resp)

            if enc:
                data = data.decode(enc, 'replace')

                if not self.decode:
                    data = data.encode(enc)

        # Rebuild the response around the (possibly transformed) body so
        # downstream consumers can read it normally.
        fp = StringIO(data)
        old_resp = resp
        resp = urllib2.addinfourl(fp, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg

        return resp
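A handler like this only takes effect once it is chained into an opener. Below is a minimal, self-contained sketch of the same idea, assuming Python 2's urllib2; the GunzipHandler class is illustrative, not morss's actual class:

import urllib2
from gzip import GzipFile
from StringIO import StringIO

class GunzipHandler(urllib2.BaseHandler):
    # Illustrative handler (not morss's code): decompress gzip bodies and
    # rebuild the response the same way http_response() above does.
    def http_response(self, req, resp):
        if resp.headers.get('Content-Encoding') == 'gzip':
            data = GzipFile(fileobj=StringIO(resp.read()), mode='r').read()
            new_resp = urllib2.addinfourl(StringIO(data), resp.headers,
                                          resp.url, resp.code)
            new_resp.msg = resp.msg
            return new_resp
        return resp

opener = urllib2.build_opener(GunzipHandler())
# html = opener.open('http://example.com/').read()  # network call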