示例#1
0
def get_content(from_,expect_binary=False):
#{{
    """Return the content found at ``from_``.

    ``from_`` is fetched with ``urllib2.urlopen`` when ``is_remote(from_)``
    is true and read as a local file otherwise.  Remote locations for which
    ``ignore_url(from_)`` is true are not fetched at all; an empty unicode
    string is returned for them.

    When ``expect_binary`` is false the raw data is decoded to ``unicode``
    with the first encoding reported by
    ``feedparser._getCharacterEncoding``; otherwise the raw data is returned
    unchanged.
    """
    if is_remote(from_):
        if ignore_url(from_):
            return u''

        ct = urllib2.urlopen(from_)
        try:
            # grab the headers before closing so the encoding lookup below
            # does not depend on the state of a closed response
            headers = ct.headers
            s = ct.read()
        finally:
            ct.close()
    else:
        f = open(from_)
        try:
            s = f.read()
        finally:
            f.close()
        # local files carry no HTTP headers; the encoding is derived from
        # the data alone
        headers = {}

    if expect_binary:
        return s

    encodings = feedparser._getCharacterEncoding(headers, s)
    return unicode(s, encodings[0])
示例#2
0
    def openPath(self, path, fragment=None):
        """Display the book document at ``path`` and jump to ``fragment``.

        A ``#anchor`` suffix in ``path`` takes precedence over the
        ``fragment`` argument.  The document is only (re)loaded into the view
        when ``path`` differs from the one currently shown.
        """
        if "#" in path:
            path, _sep, fragment = path.partition('#')
        path = QtCore.QUrl.fromPercentEncoding(path)

        if path != self.cur_path:
            self.cur_path = path
            raw = self.epub.getData(path)
            # with no HTTP headers available the encoding is derived from the
            # document data itself
            charset = feedparser._getCharacterEncoding({}, raw)[0]
            markup = raw.decode(charset)
            frame = self.view.page().mainFrame()
            frame.setHtml(markup, QtCore.QUrl("epub://book/" + path))

        if not fragment:
            return
        # NOTE(review): the fragment is interpolated into the script unescaped
        self.javascript('document.location.hash = "%s"' % fragment)
示例#3
0
def get():
    """Convert the page named by the ``url`` query argument to Markdown text.

    Without a ``url`` argument a static HTML page describing the service is
    returned.  Otherwise the URL is fetched, its body is decoded to unicode
    and the output of ``html2text.html2text`` is returned through
    ``text_response``.

    Decoding first uses the encoding reported by ``_getCharacterEncoding``
    (with ``us-ascii`` widened to ``utf-8``).  If that fails, the encoding
    guessed by ``chardet`` is used, and undecodable bytes are replaced rather
    than aborting the request.
    """
    url = request.args.get('url')
    if not url:
        return """
<!doctype html>
<html>
  <head>
    <title>html2text</title>
  </head>
  <body>
    <p><b>This is a private webservice point. Please be fair and don't abuse this service.</b></p>
    <p>In case you want to use this service regular, please consider <a href="https://github.com/aaronsw/html2text-service/">to get the source code</a> and set up your own service.
    <br>This a RESTful web service that converts HTML to
      <a href="http://daringfireball.net/projects/markdown/">Markdown</a>-compatible text using
      <a href="http://www.aaronsw.com/">Aaron Swartz</a>'s <a href="http://www.aaronsw.com/2002/html2text/">html2text.py</a>.</p>
    </p>
    <!--
    <form action="/" method="get">
      <p>URL: <input type="text" name="url" /> <button type="submit">Go</button></p>
    </form>
    

    <form action="/" method="post">
      <p>Or just paste in some HTML:</p>
      <textarea name="html" rows="20" cols="80"></textarea>
      <p><button type="submit">Go</button>
    </form>
    -->
  </body>
</html>
"""
    else:
        # NOTE(review): ``url`` comes straight from the query string and
        # urllib.urlopen also opens local paths / file: URLs -- consider
        # restricting the accepted schemes.
        req = urllib.urlopen(url)
        try:
            text = req.read()
            encoding = _getCharacterEncoding(req.headers, text)[0]
        finally:
            # always release the connection, even if reading fails
            req.close()
        if encoding == 'us-ascii': encoding = 'utf-8'
        try:
            text = text.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the declared charset name is unknown to Python
            guessed = chardet.detect(text)['encoding']
            try:
                # chardet may report no encoding at all; fall back to utf-8
                text = text.decode(guessed or 'utf-8', 'replace')
            except LookupError:
                text = text.decode('utf-8', 'replace')
        output = html2text.html2text(text, url)
        return text_response(output)
示例#4
0
def get():
    """Convert the page named by the ``url`` query argument to Markdown text.

    Without a ``url`` argument a static HTML page describing the service is
    returned.  Otherwise the URL is fetched, its body is decoded to unicode
    and the output of ``html2text.html2text`` is returned through
    ``text_response``.

    Decoding first uses the encoding reported by ``_getCharacterEncoding``
    (with ``us-ascii`` widened to ``utf-8``).  If that fails, the encoding
    guessed by ``chardet`` is used, and undecodable bytes are replaced rather
    than aborting the request.
    """
    url = request.args.get('url')
    if not url:
        return """
<!doctype html>
<html>
  <head>
    <title>html2text</title>
  </head>
  <body>
    <p><b>This is a private webservice point. Please be fair and don't abuse this service.</b></p>
    <p>In case you want to use this service regular, please consider <a href="https://github.com/aaronsw/html2text-service/">to get the source code</a> and set up your own service.
    <br>This a RESTful web service that converts HTML to
      <a href="http://daringfireball.net/projects/markdown/">Markdown</a>-compatible text using
      <a href="http://www.aaronsw.com/">Aaron Swartz</a>'s <a href="http://www.aaronsw.com/2002/html2text/">html2text.py</a>.</p>
    </p>
    <!--
    <form action="/" method="get">
      <p>URL: <input type="text" name="url" /> <button type="submit">Go</button></p>
    </form>
    

    <form action="/" method="post">
      <p>Or just paste in some HTML:</p>
      <textarea name="html" rows="20" cols="80"></textarea>
      <p><button type="submit">Go</button>
    </form>
    -->
  </body>
</html>
"""
    else:
        # NOTE(review): ``url`` comes straight from the query string and
        # urllib.urlopen also opens local paths / file: URLs -- consider
        # restricting the accepted schemes.
        req = urllib.urlopen(url)
        try:
            text = req.read()
            encoding = _getCharacterEncoding(req.headers, text)[0]
        finally:
            # always release the connection, even if reading fails
            req.close()
        if encoding == 'us-ascii': encoding = 'utf-8'
        try:
            text = text.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the declared charset name is unknown to Python
            guessed = chardet.detect(text)['encoding']
            try:
                # chardet may report no encoding at all; fall back to utf-8
                text = text.decode(guessed or 'utf-8', 'replace')
            except LookupError:
                text = text.decode('utf-8', 'replace')
        output = html2text.html2text(text, url)
        return text_response(output)
示例#5
0
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_xml_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers[
                'content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    doctype, data = _stripDoctype(data)  # changed

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
示例#6
0
    def getLink(self):
        """Read the HTML page at ``self.uri`` and run the configured matcher.

        Nothing is matched when none of ``self.children`` occurs in
        ``self.attrs``, when the page cannot be fetched, when the server
        answers 304, when the page is empty, or when a compressed body cannot
        be decompressed.  Otherwise the page is handed to ``match_regex`` if
        ``'regex'`` is in ``self.attrs`` and to ``match_xpath`` if not.

        Fetch and decompression errors are reported on stderr, not raised.
        """
        if not self.children.intersection(self.attrs):
            return      # mandatory child element missing

        self.loadCache()
        f = None
        try:
            try:
                f = feedparser._open_resource(self.uri, self.etag, self.modified,
                    USER_AGENT, None, [], {}
                )
                html = f.read()
            except Exception as e:
                sys.stderr.write('Getting page %s: %s\n' % (self.uri, e))
                return
            html = self._preparePage(f, html)
        finally:
            # release the resource on every path, including early returns
            if f is not None and hasattr(f, 'close'):
                f.close()

        if html is None:
            return

        if 'regex' in self.attrs:
            self.match_regex(html)
        else:
            self.match_xpath(html)

    def _preparePage(self, f, html):
        """Return the body read from ``f`` ready for matching, or None to give up.

        Saves the caching headers, undoes gzip/deflate content encoding,
        resolves relative URIs and decodes the body with the first charset
        that works.  If no charset works the body is returned undecoded.
        """
        if getattr(f, 'status', None) == 304 or not html:
            # not modified or empty page
            return None

        # save HTTP headers
        if hasattr(f, 'info'):
            info = f.info()
            etag = info.getheader('ETag')
            modified = info.getheader('Last-Modified')
            if modified:
                modified = feedparser._parse_date(modified)
            self.saveCache(etag, modified)

            # if the page is compressed, decompress it
            ce = info.getheader('Content-Encoding', '')
            if ce == 'gzip':
                try:
                    import gzip
                    import StringIO
                    html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
                except Exception as e:
                    sys.stderr.write('Unzipping page %s: %s\n' % (self.uri, e))
                    return None
            elif ce == 'deflate':
                try:
                    import zlib
                    html = zlib.decompress(html, -zlib.MAX_WBITS)
                except Exception as e:
                    sys.stderr.write('Inflating page %s: %s\n' % (self.uri, e))
                    return None

        # resolve relative URIs
        html = feedparser._resolveRelativeURIs(html, self.uri, self.encoding, 'text/html')

        if hasattr(f, 'headers'):
            # Only the first four items are charset names; the last item is
            # the acceptable-content-type flag and must never reach decode().
            candidates = feedparser._getCharacterEncoding(f.headers, html)[:4]
            charsets = [c for c in candidates if c]
        else:
            charsets = [self.encoding]
        for charset in charsets:
            try:
                html = html.decode(charset)
                break
            except (UnicodeDecodeError, LookupError):
                # wrong or unknown charset: try the next candidate
                pass
        return html
示例#7
0
文件: gd560.py 项目: lqik2004/EzRead
 def __init__(self, url):
     """Fetch ``url`` and keep the response, its body and detected encoding."""
     super(MainInfo, self).__init__()
     self.url = url   # page URL
     response = urllib.urlopen(url)
     self.req = response
     body = response.read()
     self.text = body
     # first item returned by _getCharacterEncoding is stored as the encoding
     self.encoding = _getCharacterEncoding(response.headers, body)[0]
示例#8
0
文件: xspfparser.py 项目: DxCx/xspf
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_xml_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers['content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)
        
    doctype, data = _stripDoctype(data) # changed

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
示例#9
0
 def __init__(self, url):
     """Download ``url``; store the response, raw body and its encoding."""
     super(MainInfo, self).__init__()
     self.url = url  # URL of the web page
     self.req = urllib.urlopen(url)
     self.text = self.req.read()
     # keep only the first item of the tuple returned by the detector
     detected = _getCharacterEncoding(self.req.headers, self.text)
     self.encoding = detected[0]