def get_content(from_, expect_binary=False):
    """Return the content at *from_* (a URL or a local file path).

    Remote URLs that ``ignore_url`` rejects yield an empty unicode
    string.  Unless *expect_binary* is true, the raw bytes are decoded
    using the charset feedparser detects from the HTTP headers (remote)
    or from the data alone (local file).

    Fix over the original: the HTTP response and the local file handle
    are now always closed (they were previously leaked), and the body
    is read exactly once per source.
    """
    if is_remote(from_):
        if ignore_url(from_):
            return u''
        ct = urllib2.urlopen(from_)
        try:
            s = ct.read()
        finally:
            ct.close()  # response was never closed before
        if expect_binary:
            return s
        # headers stay accessible after close(); feedparser picks the charset
        encodings = feedparser._getCharacterEncoding(ct.headers, s)
        return unicode(s, encodings[0])
    else:
        f = open(from_)
        try:
            s = f.read()
        finally:
            f.close()  # file handle was never closed before
        if expect_binary:
            return s
        encodings = feedparser._getCharacterEncoding({}, s)
        return unicode(s, encodings[0])
def openPath(self, path, fragment=None):
    """Display *path* from the EPUB in the web view, then jump to *fragment*.

    A fragment embedded in *path* (after "#") overrides the *fragment*
    argument.  The page is only (re)loaded when the path actually changed;
    the fragment jump happens either way.
    """
    if "#" in path:
        path, fragment = path.split('#', 1)
    path = QtCore.QUrl.fromPercentEncoding(path)
    if path != self.cur_path:
        self.cur_path = path
        raw = self.epub.getData(path)
        # feedparser returns a tuple; its first element is the charset.
        charset = feedparser._getCharacterEncoding({}, raw)[0]
        self.view.page().mainFrame().setHtml(
            raw.decode(charset), QtCore.QUrl("epub://book/" + path))
    if fragment:
        self.javascript('document.location.hash = "%s"' % fragment)
def get():
    """Handle GET: convert the HTML at ``?url=...`` to Markdown text.

    Without a ``url`` query parameter, returns a static description
    page.  Otherwise fetches the URL, decodes the body using the
    declared (or, failing that, chardet-sniffed) character encoding,
    and returns the html2text conversion.

    Fixes over the original: the urlopen response is now closed, the
    info page's stray unmatched ``</p>`` was removed, and the typo
    "This a RESTful" was corrected.
    """
    url = request.args.get('url')
    if not url:
        # No URL given: show the service description page.
        return """
<!doctype html>
<html>
<head>
<title>html2text</title>
</head>
<body>
<p><b>This is a private webservice point. Please be fair and don't abuse this service.</b></p>
<p>In case you want to use this service regular, please consider <a href="https://github.com/aaronsw/html2text-service/">to get the source code</a> and set up your own service.
<br>This is a RESTful web service that converts HTML to <a href="http://daringfireball.net/projects/markdown/">Markdown</a>-compatible text using <a href="http://www.aaronsw.com/">Aaron Swartz</a>'s <a href="http://www.aaronsw.com/2002/html2text/">html2text.py</a>.</p>
<!--
<form action="/" method="get">
<p>URL: <input type="text" name="url" /> <button type="submit">Go</button></p>
</form>
<form action="/" method="post">
<p>Or just paste in some HTML:</p>
<textarea name="html" rows="20" cols="80"></textarea>
<p><button type="submit">Go</button>
</form>
-->
</body>
</html>
"""
    req = urllib.urlopen(url)
    try:
        text = req.read()
    finally:
        req.close()  # response was previously leaked
    # headers remain readable after close(); feedparser picks the charset
    encoding = _getCharacterEncoding(req.headers, text)[0]
    # us-ascii is feedparser's fallback guess; real pages are usually UTF-8
    if encoding == 'us-ascii':
        encoding = 'utf-8'
    try:
        text = text.decode(encoding)
    except UnicodeDecodeError:
        # declared encoding was wrong -- sniff the real one
        text = text.decode(chardet.detect(text)['encoding'])
    output = html2text.html2text(text, url)
    return text_response(output)
# NOTE(review): fragment of a larger function -- `f`, `result`, `data`,
# `_getCharacterEncoding`, `_stripDoctype` and `NonXMLContentType` are
# defined outside this view.

# Default the HTTP status to 200, then prefer whatever the opener reported.
result['status'] = 200
if hasattr(f, 'status'):
    result['status'] = f.status
if hasattr(f, 'headers'):
    # .dict is the Python 2 mimetools/httplib header mapping
    result['headers'] = f.headers.dict
if hasattr(f, 'close'):
    f.close()
# there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration
# - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
http_headers = result.get('headers', {})
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
    _getCharacterEncoding(http_headers, data)
# A non-XML Content-Type is tolerated but flagged as "bozo" per RFC 3023.
if http_headers and (not acceptable_content_type):
    if http_headers.has_key('content-type'):
        bozo_message = '%s is not an XML media type' % http_headers['content-type']
    else:
        bozo_message = 'no Content-type specified'
    result['bozo'] = 1
    result['bozo_exception'] = NonXMLContentType(bozo_message)
# Strip the DOCTYPE (and remember it) before parsing.
doctype, data = _stripDoctype(data)  # changed
# Content-Location / Content-Language override the request URI defaults.
baseuri = http_headers.get('content-location', result.get('href'))
baselang = http_headers.get('content-language', None)
# if server sent 304, we're done
def getLink(self):
    """Reads the HTML page and extracts the link, title and body.

    Fetches ``self.uri`` through feedparser's conditional-GET helper,
    handles gzip/deflate transfer encodings, resolves relative URIs,
    decodes the body, and finally dispatches to ``match_regex`` or
    ``match_xpath`` depending on ``self.attrs``.
    """
    if not self.children.intersection(self.attrs):
        return  # mandatory child element missing
    self.loadCache()
    try:
        # Conditional GET: ETag/Last-Modified come from the cache.
        f = feedparser._open_resource(self.uri, self.etag, self.modified,
                                      USER_AGENT, None, [], {})
        html = f.read()
    except Exception as e:
        sys.stderr.write('Getting page %s: %s\n' % (self.uri, e))
        return
    if getattr(f, 'status', None) == 304 or not html:
        # not modified or empty page
        return
    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        etag = info.getheader('ETag')
        modified = info.getheader('Last-Modified')
        if modified:
            modified = feedparser._parse_date(modified)
        self.saveCache(etag, modified)
        # if the page is compressed, decompress it
        ce = info.getheader('Content-Encoding', '')
        if ce == 'gzip':
            try:
                import gzip
                import StringIO  # Python 2 module
                html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
            except Exception as e:
                sys.stderr.write('Unzipping page %s: %s\n' % (self.uri, e))
                return
        elif ce == 'deflate':
            try:
                import zlib
                # negative wbits: raw deflate stream, no zlib header
                html = zlib.decompress(html, -zlib.MAX_WBITS)
            except Exception as e:
                sys.stderr.write('Inflating page %s: %s\n' % (self.uri, e))
                return
    # resolve relative URIs
    html = feedparser._resolveRelativeURIs(html, self.uri, self.encoding,
                                           'text/html')
    # Build the list of candidate charsets (header-declared first),
    # falling back to the configured encoding when there are no headers.
    if hasattr(f, 'headers'):
        charsets = [c for c in feedparser._getCharacterEncoding(f.headers,
                                                                html) if c]
    else:
        charsets = [self.encoding]
    # Try each candidate until one decodes cleanly; unknown names
    # (LookupError) and wrong guesses (UnicodeDecodeError) are skipped.
    for charset in charsets:
        try:
            html = html.decode(charset)
            break
        except UnicodeDecodeError:
            pass
        except LookupError:
            pass
    if 'regex' in self.attrs:
        self.match_regex(html)
    else:
        self.match_xpath(html)
def __init__(self, url):
    """Fetch *url* and record its raw body and detected character encoding.

    NOTE(review): the response object is stored on ``self.req`` and left
    open -- confirm whether callers rely on it, or whether it should be
    closed after ``read()``.
    """
    super(MainInfo, self).__init__()
    self.url = url  # URL of the web page (comment translated from Chinese)
    self.req = urllib.urlopen(url)  # Python 2 urllib response
    self.text = self.req.read()  # raw, undecoded page bytes
    # feedparser returns a tuple; its first element is the chosen encoding
    self.encoding = _getCharacterEncoding(self.req.headers, self.text)[0]
# NOTE(review): fragment of a larger function -- `f`, `result`, `data`,
# `_getCharacterEncoding`, `_stripDoctype` and `NonXMLContentType` are
# defined outside this view, and the final `if` body continues past it.

# Default the HTTP status to 200, then prefer whatever the opener reported.
result['status'] = 200
if hasattr(f, 'status'):
    result['status'] = f.status
if hasattr(f, 'headers'):
    # .dict is the Python 2 mimetools/httplib header mapping
    result['headers'] = f.headers.dict
if hasattr(f, 'close'):
    f.close()
# there are four encodings to keep track of:
# - http_encoding is the encoding declared in the Content-Type HTTP header
# - xml_encoding is the encoding declared in the <?xml declaration
# - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
http_headers = result.get('headers', {})
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
    _getCharacterEncoding(http_headers, data)
# A non-XML Content-Type is tolerated but flagged as "bozo" per RFC 3023.
if http_headers and (not acceptable_content_type):
    if http_headers.has_key('content-type'):
        bozo_message = '%s is not an XML media type' % http_headers['content-type']
    else:
        bozo_message = 'no Content-type specified'
    result['bozo'] = 1
    result['bozo_exception'] = NonXMLContentType(bozo_message)
# Strip the DOCTYPE (and remember it) before parsing.
doctype, data = _stripDoctype(data)  # changed
# Content-Location / Content-Language override the request URI defaults.
baseuri = http_headers.get('content-location', result.get('href'))
baselang = http_headers.get('content-language', None)
# if server sent 304, we're done
if result.get('status', 0) == 304: