Пример #1
0
def opener(url, ienc=None, save=None, **ka):
    """Read a document from a local file or a URL and drop stray ``<``.

    Args:
        url: Location to read. When it starts with the module-level ``FILE``
            prefix, the rest of the string is opened as a local path in
            binary mode; otherwise it is handed to ``url_opener``.
        ienc: Encoding used to decode local-file content. When falsy, the
            encoding is guessed with ``chardet``. Ignored for non-file URLs
            (see the note in the URL branch).
        save: When truthy and ``url`` is not a local file, the raw bytes are
            also written to a file in the current directory named after
            ``url`` with every ``/`` replaced by ``__``.
        **ka: Passed to ``url_opener`` as a single dict argument.

    Returns:
        The content decoded to ``str`` when an encoding is known, otherwise
        the raw ``bytes``.
    """
    if url.startswith(FILE):
        url = url[len(FILE):]
        import io

        # Context manager so the handle is closed even if read() fails.
        with io.open(url, "rb") as src:
            d = src.read()
        if not ienc:
            import chardet  # chardet.feedparser.org, python3-chardet, python-chardet

            # detect() may report None as the encoding; in that case the
            # content is returned undecoded below.
            ienc = chardet.detect(d)["encoding"]  # .confidence
    else:
        # NOTE(review): a caller-supplied ``ienc`` is discarded here, so URL
        # content is always returned as bytes. Kept as-is because callers may
        # rely on it -- confirm whether this is intentional.
        ienc = None
        r = url_opener(url, ka)
        try:
            d = r.read()
        finally:
            # Always release the response, even when read() raises.
            r.close()
        if save:
            with open(url.replace("/", "__"), "wb") as dump:
                dump.write(d)

    # Remove a hanging open "<": a "<" that is followed by another "<"
    # before any ">" is dropped, keeping the text in between.
    d = re.sub(b"<([^>]*?<)", rb"\1", d)

    if ienc:
        d = d.decode(ienc)
    return d
Пример #2
0
 def loadUrl(self, url):
   """Fetch ``url`` and hand the resulting HTML text to ``self.loadHtml``.

   ``self.setUrl(url)`` is called first. The value returned by
   ``url_opener`` is used directly unless it has a ``read`` method, in
   which case its content is read and the object is closed.

   Args:
     url: Address passed to ``self.setUrl`` and ``url_opener``.
   """
   self.setUrl(url)
   # Use PyQuery's URL opener to properly handle content encoding
   html = url_opener(url, {})
   charset = None
   if hasattr(html, 'read'):
     response = html
     # Honour the charset declared by the response when it exposes one.
     headers = getattr(response, 'headers', None)
     if hasattr(headers, 'get_content_charset'):
       charset = headers.get_content_charset()
     try:
       html = response.read()
     finally:
       # Release the response even when read() raises.
       if hasattr(response, 'close'):
         response.close()
   # On Python 3, str() of a bytes object yields its "b'...'" repr rather
   # than the text, so decode first. The second check keeps Python 2
   # (where bytes is str) behaving exactly as before.
   if isinstance(html, bytes) and not isinstance(html, str):
     try:
       html = html.decode(charset or 'utf-8', 'replace')
     except LookupError:
       # Unknown charset name declared by the server.
       html = html.decode('utf-8', 'replace')
   self.loadHtml( str(html) )
Пример #3
0
 def loadUrl(self, url):
     """Fetch ``url`` and hand the resulting HTML text to ``self.loadHtml``.

     ``self.setUrl(url)`` is called first. The value returned by
     ``url_opener`` is used directly unless it has a ``read`` method, in
     which case its content is read and the object is closed.

     Args:
         url: Address passed to ``self.setUrl`` and ``url_opener``.
     """
     self.setUrl(url)
     # Use PyQuery's URL opener to properly handle content encoding
     html = url_opener(url, {})
     charset = None
     if hasattr(html, 'read'):
         response = html
         # Honour the charset declared by the response when it exposes one.
         headers = getattr(response, 'headers', None)
         if hasattr(headers, 'get_content_charset'):
             charset = headers.get_content_charset()
         try:
             html = response.read()
         finally:
             # Release the response even when read() raises.
             if hasattr(response, 'close'):
                 response.close()
     # On Python 3, str() of a bytes object yields its "b'...'" repr rather
     # than the text, so decode first. The second check keeps Python 2
     # (where bytes is str) behaving exactly as before.
     if isinstance(html, bytes) and not isinstance(html, str):
         try:
             html = html.decode(charset or 'utf-8', 'replace')
         except LookupError:
             # Unknown charset name declared by the server.
             html = html.decode('utf-8', 'replace')
     self.loadHtml(str(html))