def replace_entities(ustring, placeholder=" "):
    """Replaces HTML special characters by readable characters.

    As taken from Leif K-Brooks algorithm on:
    http://groups-beta.google.com/group/comp.lang.python
    """
    def _repl_func(match):
        try:
            if match.group(1):  # Numeric character reference
                return unichr(int(match.group(2)))
            else:
                try:
                    return cp1252[unichr(int(match.group(3)))].strip()
                except:
                    return unichr(name2codepoint[match.group(3)])
        except:
            return placeholder

    # Force to Unicode.
    if not isinstance(ustring, unicode):
        ustring = UnicodeDammit(ustring).unicode

    # Don't want some weird unicode character here
    # that truncate_spaces() doesn't know of:
    ustring = ustring.replace("&nbsp;", " ")

    # The ^> makes sure nothing inside a tag (i.e. href with query arguments)
    # gets processed.
    _entity_re = re.compile(r'&(?:(#)(\d+)|([^;^> ]+));')
    return _entity_re.sub(_repl_func, ustring)

def make_tree(html):
    """
    Returns an lxml tree for the given HTML string (either Unicode or
    bytestring).

    This is better than lxml.html.document_fromstring because this takes care
    of a few known issues.
    """
    # Normalize newlines. Otherwise, "\r" gets converted to an HTML entity
    # by lxml.
    html = re.sub('\r\n', '\n', html)

    # Remove <?xml> declaration in Unicode objects, because it causes an error:
    # "ValueError: Unicode strings with encoding declaration are not supported."
    # Note that the error only occurs if the <?xml> tag has an "encoding"
    # attribute, but we remove it in all cases, as there's no downside to
    # removing it.
    if isinstance(html, unicode):
        html = re.sub(r'^\s*<\?xml\s+.*?\?>', '', html)
    else:
        html = UnicodeDammit(html, isHTML=True).unicode
    html = html.strip()
    if html:
        try:
            return document_fromstring(html)
        except:
            # Fall back to using the (slow) BeautifulSoup parser.
            return lxml.html.soupparser.fromstring(html)
    else:
        root = Element('body')
        root.text = u''
        return ElementTree(root)

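# A minimal usage sketch for make_tree() above, not part of the original
# module: it assumes the imports the function relies on (re, UnicodeDammit,
# lxml.html's document_fromstring and soupparser, Element, ElementTree) are in
# place. The byte string is an invented, latin-1-style example.
from lxml.html import tostring

tree = make_tree('<p>caf\xe9 &amp; bar</p>')
print tostring(tree)  # serialized document, non-ASCII escaped as char refs
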
def _prep_query(query):
    '''Prepare query for Wikipedia. Queries must capitalize (most) words,
    must be in unicode, and must not contain characters with accents (ü).
    For example, the query http://en.wikipedia.org/wiki/Olga_Kurylenko works,
    but the query http://en.wikipedia.org/wiki/olga_kurylenko does not.

    Args:
        query (str) : Original query
    Returns:
        Wikipedia-formatted query

    '''
    # Ensure unicode
    query = UnicodeDammit(query).unicode

    # Replace accents (ü -> u)
    query = geotools.strip_accents(query)

    # Split and capitalize query terms
    terms = map(
        lambda s: s.capitalize() if s not in _no_cap else s,
        query.lower().split(' ')
    )

    # Join query terms
    query = ' '.join(terms)

    # Return completed query
    return query

def _solve_encoding(self, encoding, text):
    result = text
    if encoding:
        if encoding in ["guess", "detect", "unicodedammit"]:
            dammit = UnicodeDammit(text)
            encoding = dammit.originalEncoding
            logger.debug("Detected content encoding as %s "
                         "(using 'unicodedammit' detection)" % encoding)
            result = dammit.unicode
        else:
            if encoding in ["chardet"]:
                chardet_result = chardet.detect(text)
                encoding = chardet_result['encoding']
                logger.debug("Detected content encoding as %s "
                             "(using 'chardet' detection)" % encoding)
            try:
                result = text.decode(encoding, self.encoding_errors)
            except UnicodeDecodeError:
                if self.encoding_abort:
                    raise Exception(
                        "Error decoding unicode with encoding '%s' on data: %r"
                        % (encoding, text))
                logger.warn(
                    "Error decoding unicode with encoding '%s' on data: %r"
                    % (encoding, text))
                result = text.decode("latin-1")
    return result

def decodeText(txt, headers=None):
    """
    Takes a HTTP response body (=text) and the corresponding HTTP headers
    (a dict or dict-like object; httplib.HTTPResponse will do; see
    parseHttpHeaders() if you have a string); outputs the text as a unicode
    string.

    The encoding is guessed using BeautifulSoup.UnicodeDammit (which in turn
    uses chardet if installed), enhanced by the HTTP-suggested encoding.

    Raises MimeTypeError (subclass of ValueError) if headers do not indicate
    a text/* mime-type.
    """
    from BeautifulSoup import UnicodeDammit
    # guess the charset suggested by HTTP headers
    httpCharset = []
    if headers:
        contentType = headers.get('content-type', '')
        if not contentType.startswith('text/'):
            raise MimeTypeError(
                "Can only decode text documents (mime type text/*; got %s)"
                % contentType)
        m = re.search('charset=([\w0-9\-]+)', contentType)
        if m:
            httpCharset = [m.group(1).replace('windows-', 'cp')]
    ud = UnicodeDammit(txt, isHTML=True, overrideEncodings=httpCharset)
    # overrideEncodings is not enforced by UnicodeDammit, it's just tried
    return ud.unicode

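# Hedged usage sketch for decodeText(), not from the original module: the body
# and headers are invented stand-ins for an HTTP response whose charset is
# announced in the Content-Type header.
body = 'r\xe9sum\xe9'  # ISO-8859-1 bytes
headers = {'content-type': 'text/plain; charset=ISO-8859-1'}
print repr(decodeText(body, headers))  # u'r\xe9sum\xe9'
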
def _sniff_encoding(self, resource_info):
    with open(resource_info.filename) as f:
        data = f.read()
    proposed = ["utf-8", "latin1"]
    converted = UnicodeDammit(data, proposed, isHTML=True)
    del data
    return converted.originalEncoding

def decode_html(html, charset='ascii'):
    "Decode html_string to unicode"
    try:
        body = unicode(html, charset)
    except (UnicodeDecodeError, LookupError):
        body = UnicodeDammit(html, isHTML=True).unicode
    return body

def _decoder(data):
    """Simple helper to enforce a decent charset handling."""
    converted = UnicodeDammit(data, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.triedEncodings))
    return converted.unicode

def force_unicode(raw):
    '''
    Uses BeautifulSoup.UnicodeDammit to try to force to unicode, and if that
    fails, it assumes utf8 and just ignores all errors.
    '''
    converted = UnicodeDammit(raw, isHTML=True)
    if not converted.unicode:
        converted.unicode = unicode(raw, 'utf8', errors='ignore')

    encoding_m = encoding_re.match(converted.unicode)
    if encoding_m:
        converted.unicode = \
            encoding_m.group('start_xml') + \
            encoding_m.group('remainder')

    return converted.unicode

def decode_html(html_string):
    # See http://stackoverflow.com/a/16427392/82216
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.triedEncodings))
    return converted.unicode

def parse(filename, window_width=1000):
    logger.info('Got HTML to parse: %s' % filename)
    try:
        with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as f:
            copy_fname = f.name
        with tempfile.NamedTemporaryFile(delete=False) as f:
            conv_fname = f.name
        shutil.copy2(filename, copy_fname)

        # try to determine encoding and decode
        with open(copy_fname, 'rb') as f:
            converted = UnicodeDammit(f.read(), isHTML=True)
        if converted.unicode:
            with open(copy_fname, 'wb') as f:
                f.write(converted.unicode.encode('utf8'))

        args = ['wkhtmltopdf', '--encoding', 'utf-8', copy_fname, conv_fname]
        env = {'DISPLAY': ':99'}
        logger.debug('Calling wkhtmltopdf with arguments %r' % args)
        subprocess.check_call(args, env=env)
        logger.debug('Wkhtmltopdf has done the job')
        return pdf.parse(conv_fname, window_width)
    except subprocess.CalledProcessError as err:
        logger.error('wkhtmltopdf failed to convert "%s" because of %s\n%s'
                     % (filename, err, traceback.format_exc()))
        raise PreprocError()
    finally:
        if copy_fname and os.path.exists(copy_fname):
            os.remove(copy_fname)
        if conv_fname and os.path.exists(conv_fname):
            os.remove(conv_fname)

def decode_html(html_string):
    """Convert a string into the UTF-8 encoding"""
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.triedEncodings))
    return converted.unicode

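# Hedged usage sketch for decode_html() above, not from the original source;
# the byte string is an invented UTF-8 document, which the detection should
# settle on without a declared charset.
page = '<html><body><p>\xc3\xa9l\xc3\xa8ve</p></body></html>'
text = decode_html(page)
print type(text), repr(text[:30])
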
def parse(raw_content, base_href=None, notify=lambda *args: None):
    try:
        content = UnicodeDammit(raw_content, isHTML=True).markup
        cleaned = _remove_crufty_html(content)
        return create_doc(cleaned, base_href)
    except HTMLParseError, e:
        notify("parsing failed:", e)

def handle_text(self):
    '''
    Takes care of converting body text to unicode, if it's text at all.
    Sets self.original_encoding to original char encoding, and converts body
    to unicode if possible. Must come after handle_compression, and after
    self.mediaType is valid.
    '''
    self.encoding = None
    self.text = None

    # if the body is text
    if (self.mediaType and
            (self.mediaType.type == 'text' or
             (self.mediaType.type == 'application' and
              'xml' in self.mediaType.subtype))):

        # if there was a charset parameter in HTTP header, store it
        if 'charset' in self.mediaType.params:
            override_encodings = [self.mediaType.params['charset']]
        else:
            override_encodings = []

        # if there even is data (otherwise, dammit.originalEncoding might be None)
        if self.body != '':
            if UnicodeDammit:
                # honestly, I don't mind not abiding by RFC 2023. UnicodeDammit
                # just does what makes sense, and if the content is remotely
                # standards-compliant, it will do the right thing.
                dammit = UnicodeDammit(self.body, override_encodings)
                # if unicode was found
                if dammit.unicode:
                    self.text = dammit.unicode
                    self.originalEncoding = dammit.originalEncoding
                else:
                    # unicode could not be decoded, at all
                    # HAR can't write data, but body might still be useful as-is
                    pass
            else:
                # try the braindead version, just guess content-type or utf-8
                u = None
                # try our list of encodings + utf8 with strict errors
                for e in override_encodings + ['utf8', 'iso-8859-1']:
                    try:
                        u = self.body.decode(e, 'strict')
                        self.originalEncoding = e
                        break  # if ^^ didn't throw, we're done
                    except UnicodeError:
                        logging.warning("Error decoding unicode response.")
                        pass
                # if none of those worked, try utf8 with 'replace' error mode
                if not u:
                    # unicode has failed
                    u = self.body.decode('utf8', 'replace')
                    self.originalEncoding = None  # ???
                self.text = u or None
    else:
        # body is not text
        self.encoding = "base64"
        self.text = base64.b64encode(self.body)
        # BLAZE - Removing body for now, to preserve memory
        self.text = None

def parse(raw_content, base_href=None, notify=lambda x: None):
    try:
        content = UnicodeDammit(raw_content, isHTML=True).markup
        cleaned = _remove_crufty_html(content)
        debug("Cleaned content: %s" % (cleaned, ))
        return create_doc(cleaned, base_href)
    except HTMLParseError, e:
        notify("parsing (%s) failed: %s" % (parse_method.__name__, e))

def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    """
    The main reading method.
    """
    url_book = self.mirror
    lid = len(str(self.book_id))
    fullbid = str(self.book_id)
    # sometimes the id to access a file has a variation,
    # ex fullbid=14285-8 for the book 14285
    rootbid = fullbid
    print type(lid)
    stopit = 0
    for i in range(lid - 1):
        if (fullbid[i + 1] != "-") and (stopit == 0):
            url_book += '/' + fullbid[i]
        else:
            stopit = 1
            rootbid = fullbid[0:i]
    url_book += '/' + rootbid + '/' + fullbid + '.txt'
    print url_book

    response = url2.urlopen(url_book)
    raw = response.read()  # .decode('utf8')
    converted = UnicodeDammit(raw)
    raw = converted.unicode

    start_book = raw.find("START OF")
    end_book = raw.rfind('END OF')
    preamb = raw[:start_book]

    author = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
              if i.find('Author') != -1][0]
    title = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
             if i.find('Title') != -1][0]
    date = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Release Date') != -1][0]

    book_paraph = raw[start_book:end_book].split("\r\n\r\n")
    print "Book length %s" % len(raw)
    print "N paragraphs:", len(book_paraph)

    for id_p, p in enumerate(book_paraph):
        yield {'id': id_p, 'author': author, 'title': title, 'text': p}

def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None,
             options=None, logger=None):
    self.text = text
    self.url = url
    self.verbose = verbose
    self.maxpage = maxpage
    self.logger = logger
    self.checker = checker
    self.options = options

    # The parsing of the page is done in the __init__() routine in
    # order to initialize the list of names the file
    # contains. Stored the parser in an instance variable. Passed
    # the URL to MyHTMLParser().
    size = len(self.text)
    if self.maxpage and size > self.maxpage:
        self.logger.info("%s Skip huge file (%.0f Kbytes)"
                         % (self.url, (size * 0.001)))
        self.parser = None
        return
    if options:
        text = self.reformat(text, url)
    self.logger.debug("Parsing %s (%d bytes)" % (self.url, size))
    # text = clean_html(text)
    try:
        converted = UnicodeDammit(text, isHTML=True)
        if not converted.unicode:
            raise UnicodeDecodeError(
                "Failed to detect encoding, tried [%s]",
                ', '.join(converted.triedEncodings))
        # print converted.originalEncoding
        self.parser = lxml.html.fromstring(converted.unicode)
        # self.parser = lxml.html.soupparser.fromstring(text)
        self.parser.resolve_base_href()
        self._html = tostring(self.parser, encoding=unicode, method="html",
                              pretty_print=True)
        assert self._html is not None
        return
    except (UnicodeDecodeError, HTMLParseError):
        self.logger.error("HTMLParseError %s" % url)
        pass

def markdown_to_html(text, markdown_extensions):
    """
    When the input is Markdown, convert it to HTML so we can parse that.
    """
    logger.info("Converting Markdown to HTML using extensions: %s.",
                ", ".join(sorted(markdown_extensions)))
    # We import the markdown module here so that the markdown module is not
    # required to use html2vimdoc when the input is HTML.
    from markdown import markdown
    # The Python Markdown module only accepts Unicode and ASCII strings, but we
    # don't know what the encoding of the Markdown text is. BeautifulSoup comes
    # to the rescue with the aptly named UnicodeDammit class :-).
    return markdown(UnicodeDammit(text).unicode, extensions=markdown_extensions)

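# Illustrative call only, not from html2vimdoc itself: it assumes the
# module-level `logger` the function uses is configured, and 'fenced_code' is
# just one example of a Python-Markdown extension name.
html = markdown_to_html("# Title\n\nSome *emphasis*.\n", ['fenced_code'])
print html  # the generated HTML fragment, e.g. <h1>/<p>/<em> tags
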
def text_from_html(html):
    """Remove ALL tags and return all plain text.
    """
    text = preprocess_to_string(html, drop_tags=_html_droptags,
                                drop_trees=_html_droptrees)
    if not text:
        # Maybe there was something there but not really HTML.
        if html and not isinstance(html, unicode):
            text = UnicodeDammit(html, isHTML=False).unicode.strip()
        else:
            text = u''
    text = convert_entities(text)
    return text

def get_url(self, url):
    """ fetch url, return it as an lxml.html doc """
    content = urllib2.urlopen(url).read()
    # content = re.sub("""<?xml version="1.0" encoding="(.*?)"?>""", '', content)
    # """<?xml version="1.0" encoding="ISO-8859-1"?>"""
    converted = UnicodeDammit(content, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]",
                                 ', '.join(converted.triedEncodings))
    doc = fromstring(converted.unicode)
    doc.make_links_absolute(url)
    return doc

def format_results(results):
    from BeautifulSoup import UnicodeDammit
    new_results = []
    for line in results:
        new_line = []
        for elem in line:
            if elem is None:
                new_line.append('')
            else:
                new_line.append(UnicodeDammit(elem).unicode)
        new_results.append('\t'.join(new_line))
    return '\n'.join(new_results)

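# Hedged usage sketch for format_results(); the rows are invented stand-ins for
# query results that mix byte strings, unicode and NULLs.
rows = [
    ('1', u'alpha', 'caf\xc3\xa9'),  # UTF-8 bytes in the last column
    ('2', u'beta', None),            # NULL becomes an empty cell
]
print format_results(rows).encode('utf-8')
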
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    """
    The main reading method.
    """
    url_book = self.mirror
    lid = len(str(self.book_id))
    bid = str(self.book_id)
    print type(lid)
    for i in range(lid - 1):
        url_book += '/' + bid[i]
    url_book += '/' + bid + '/' + bid + '.txt'
    print url_book

    response = url2.urlopen(url_book)
    raw = response.read()  # .decode('utf8')
    converted = UnicodeDammit(raw)
    raw = converted.unicode

    start_book = raw.find("START OF")
    end_book = raw.rfind('END OF')
    preamb = raw[:start_book]

    author = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
              if i.find('Author') != -1][0]
    title = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
             if i.find('Title') != -1][0]
    date = [i.split(':')[1].strip() for i in preamb.split("\r\n\r\n")
            if i.find('Release Date') != -1][0]

    book_paraph = raw[start_book:end_book].split("\r\n\r\n")
    print "Book length %s" % len(raw)
    print "N paragraphs:", len(book_paraph)

    for id_p, p in enumerate(book_paraph):
        yield {'id': id_p, 'author': author, 'title': title, 'text': p}

def get_value(self, value):
    #
    # This is called when XML is being rendered.
    # If a `transform` callable was passed into the constructor, it will
    # be used to modify the passed value.
    #
    value = self.transform(value) if self.transform else value

    #
    # Ugh - BeerXMLv1 is ASCII (ISO-8859-1), so we need to coerce
    # accented and other international characters to normalized ASCII
    # equivalents as best we can.
    #
    if isinstance(value, basestring):
        value = unicodedata.normalize(
            'NFKD', UnicodeDammit(value).unicode).encode('ascii', 'ignore')

    return {self.name: value}

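# Standalone sketch of the ASCII-coercion trick used above (NFKD decomposition,
# then an ASCII encode that drops the combining marks); the input byte string
# is an invented example, not BeerXML data.
import unicodedata
from BeautifulSoup import UnicodeDammit

value = 'M\xc3\xbcnchner Helles'  # UTF-8 bytes
ascii_value = unicodedata.normalize(
    'NFKD', UnicodeDammit(value).unicode).encode('ascii', 'ignore')
print ascii_value  # Munchner Helles
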
def from_string(self, string, isHTML=False, encoding=None,
                remove_blank_text=False):
    if string is None:
        return None
    if encoding is None:
        ud = UnicodeDammit(str(string), isHTML=isHTML)
        markup = ud.markup.encode('utf-8')
    else:
        markup = str(string).encode(encoding)
    if isHTML:
        try:
            return html.fromstring(markup, parser=html_parser)
        except:
            self._core.log_exception(
                'Error parsing with lxml, falling back to soupparser')
            return soupparser.fromstring(string)
    else:
        return etree.fromstring(
            markup, parser=(xml_parser if remove_blank_text else None))

def sanitize_xml(data, log=None):
    u"""Take a string of bytes or unicode representing XML data and turn it
    into a UTF-8 string with characters that are invalid in that version of
    XML removed.

    >>> sanitize_xml("<?xml encoding='UTF-8'?><hello>hi</hello>")
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml(u"<?xml encoding='UTF-8'?><hello>hi</hello>")
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml("<?xml encoding='UTF-16'?><hello>hi</hello>")
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml encoding="UTF-16"?><hello>hi</hello>')
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml encoding="blah"?><hello>hi</hello>')
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml encoding="blah" ?><hello>hi</hello>')
    '<?xml encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml version="1.0" encoding="blah" ?><hello>hi</hello>')
    '<?xml version="1.0" encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<?xml version="1.1" encoding="blah" ?><hello>hi</hello>')
    '<?xml version="1.1" encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml('<hello>hi</hello>')
    '<?xml version="1.0" encoding="UTF-8"?><hello>hi</hello>'
    >>> sanitize_xml(u'\u2026')
    '<?xml version="1.0" encoding="UTF-8"?>\\xe2\\x80\\xa6'
    >>> sanitize_xml('hello\\x00world')
    '<?xml version="1.0" encoding="UTF-8"?>helloworld'
    >>> def log(msg): print msg
    ...
    >>> sanitize_xml('hello\\0world', log)
    Found first disallowed character u'\\x00' at position 44
    '<?xml version="1.0" encoding="UTF-8"?>helloworld'

    \x7f is allowed in XML 1.0, but not in XML 1.1

    >>> sanitize_xml(
    ...     '<?xml version="1.0" encoding="UTF-8"?><hello>\\x7f</hello>')
    '<?xml version="1.0" encoding="UTF-8"?><hello>\\x7f</hello>'
    >>> sanitize_xml('<?xml version="1.1"?><hello>\\x7f</hello>', log)
    Found first disallowed character u'\\x7f' at position 46
    '<?xml version="1.1" encoding="UTF-8"?><hello></hello>'

    The \x80 in the following makes UnicodeDammit interpret the string using
    the windows-1252 encoding, so it gets translated into a Euro symbol.

    >>> sanitize_xml('hello\\x80world', log)
    '<?xml version="1.0" encoding="UTF-8"?>hello\\xe2\\x82\\xacworld'

    If we pass in a unicode string instead so that UnicodeDammit is bypassed
    then it gets properly ignored...

    >>> sanitize_xml(u'hello\u0080world', log).decode('utf_8')
    u'<?xml version="1.0" encoding="UTF-8"?>hello\\x80world'

    unless we use XML 1.1 where it is properly disallowed and so stripped:

    >>> sanitize_xml(u'<?xml version="1.1" ?>hello\u0080world', log)
    Found first disallowed character u'\\x80' at position 44
    '<?xml version="1.1" encoding="UTF-8"?>helloworld'

    >>> sanitize_xml('<hello>&#01;</hello>', log)
    Found first disallowed character reference &#01; at position 46
    '<?xml version="1.0" encoding="UTF-8"?><hello></hello>'
    >>> sanitize_xml(u'<?xml version="1.1"?><hello>&#01;</hello>', log)
    '<?xml version="1.1" encoding="UTF-8"?><hello>&#01;</hello>'
    >>> sanitize_xml('<hello>&#xFFFF;</hello>', log)
    Found first disallowed character reference &#xFFFF; at position 46
    '<?xml version="1.0" encoding="UTF-8"?><hello></hello>'
    >>> sanitize_xml('<hello>&#x0a;&#13;&#xFFFF;blah </hello>', log)
    Found first disallowed character reference &#xFFFF; at position 57
    '<?xml version="1.0" encoding="UTF-8"?><hello>&#x0a;&#13;blah </hello>'
    """
    if isinstance(data, unicode):
        u = data
    else:
        u = UnicodeDammit(data, smartQuotesTo=None).unicode

    # The text may have a prolog that specifies a character encoding, but we're
    # going to re-encode it as UTF-8 so make sure the prolog reflects that.
    m = re.match("""^<\?xml[\s]*([^\?]*?)[\s]*\?>""", u)
    if not m:
        # no prolog found, so add one of our own
        u = '<?xml version="1.0" encoding="UTF-8"?>' + u
        version = 0
    else:
        new_encoding = 'encoding="UTF-8"'
        attr = m.group(1)
        encoding_m = re.search("""encoding[\s]*=[\s]*['"].*?['"]""", attr)
        if encoding_m:
            # replace the encoding
            attr = \
                attr[:encoding_m.start()] + \
                new_encoding + \
                attr[encoding_m.end():]
        else:
            # or add it if there wasn't one in the prolog already
            attr = attr + ' ' + new_encoding
        u = '<?xml ' + attr + '?>' + u[m.end():]

        # see if the prolog has a version number too
        m2 = re.search("""[\s]*version[\s]*=[\s]*['"](.*?)['"]""", attr)
        if m2:
            if m2.group(1) == u'1.0':
                version = 0
            else:
                # anything unknown is going to be >1.1, so assume the 1.1
                # invalid character rules
                version = 1
        else:
            # version number is optional for XML 1.0
            version = 0

    allowed = u'\x09\x0a\x0d\x20-\x7e\xa0-\ud7ff\ue000-\ufffd'
    if version == 0:
        allowed = allowed + u'\x7f-\x9f'
    else:
        allowed = allowed + u'\x85'
    allowed_as_references = allowed
    if version != 0:
        allowed_as_references = allowed_as_references + u'\x01-\x1f\x7f-\x9f'

    everything_but = '[^%s]'
    disallowed = re.compile(everything_but % allowed)
    disallowed_as_references = re.compile(everything_but % allowed_as_references)

    logged_first = False
    skip_replacement = False
    if log:
        m = disallowed.search(u)
        if m:
            log('Found first disallowed character %s at position %d' % (
                repr(m.group(0)), m.start() + 1))
            logged_first = True
        else:
            # no point searching again in a moment
            skip_replacement = True
    if not skip_replacement:
        u = disallowed.sub('', u)

    reference = re.compile('&#(x)?0*([0-9a-fA-F]+);')
    search_pos = 0
    while True:
        m = reference.search(u, search_pos)
        if not m:
            break
        c = unichr(int(m.group(2), 16 if m.group(1) == 'x' else 10))
        if disallowed_as_references.match(c):
            if log and not logged_first:
                log(('Found first disallowed character reference %s ' +
                     'at position %d') % (m.group(0), m.start() + 1))
                logged_first = True
            u = u[:m.start()] + u[m.end():]
            search_pos = m.start()
        else:
            search_pos = m.end()

    return u.encode('utf_8')

# return no encoding.
# Uses most used encodings for each national suffix
if u'.ru' in ref.link or u'.su' in ref.link:
    # see http://www.sci.aha.ru/ATL/ra13a.htm : no server
    # encoding, no page encoding
    enc = enc + ['koi8-r', 'windows-1251']
elif u'.jp' in ref.link:
    enc.append("shift jis 2004")
    enc.append("cp932")
elif u'.kr' in ref.link:
    enc.append("euc-kr")
    enc.append("cp949")
elif u'.zh' in ref.link:
    enc.append("gbk")

u = UnicodeDammit(linkedpagetext, overrideEncodings=enc)

if not u.unicode:
    # Some page have utf-8 AND windows-1252 characters,
    # Can't easily parse them. (~1 on 1000)
    repl = ref.refLink()
    new_text = new_text.replace(match.group(), repl)
    pywikibot.output('%s : Hybrid encoding...' % ref.link)
    continue

# Retrieves the first non empty string inside <title> tags
for m in self.TITLE.finditer(u.unicode):
    t = m.group()
    if t:
        ref.title = t
        ref.transform()

def lhget(*args, **kwargs):
    r = requests.get(*args, **kwargs)
    html = UnicodeDammit(r.content).unicode
    tree = lh.fromstring(html)
    return tree

def decode_html(html_string):
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        raise UnicodeDecodeError(
            ', '.join(converted.triedEncodings))
    return converted.unicode

def _computeLinks(self):
    self._computeRelpaths()
    htmls = self.resources['mimes']['text/html']
    total = len(htmls)
    i = 1
    for url in htmls:
        if self.cancel:
            return
        if self.client:
            self.client.call(
                'eXe.app.getController("Toolbar").updateImportProgressWindow',
                _(u'Analyzing HTML file labels %d of %d: %s')
                % (i, total, str(url)))
        content = open(url.path).read()
        encoding = detect(content)['encoding']
        ucontent = unicode(content, encoding)
        soup = BeautifulSoup(ucontent, fromEncoding=encoding)
        declaredHTMLEncoding = getattr(soup, 'declaredHTMLEncoding')
        if declaredHTMLEncoding:
            ucontent = UnicodeDammit(content, [declaredHTMLEncoding]).unicode
            encoding = declaredHTMLEncoding
        else:
            pass
        url.setContent(ucontent, encoding)
        url.setSoup(soup)
        for tag in soup.findAll():
            if self.cancel:
                return
            if not tag.attrs:
                continue
            matches = []
            for key, value in tag.attrs:
                if value == "":
                    continue
                unq_value = unquote(value)
                unq_low_value = unquote(value.lower())
                for l, rl in self.resources['urls'][url.parentpath].relpaths:
                    low_rl = rl.lower()
                    if rl in unq_value:
                        L = Link(self.resources['urls'][l], rl, url, tag, key, rl)
                        matches.append(L)
                    elif low_rl in unq_value:
                        L = Link(self.resources['urls'][l], rl, url, tag, key, low_rl)
                        matches.append(L)
                    elif l in unq_value:
                        L = Link(self.resources['urls'][l], rl, url, tag, key, l)
                        matches.append(L)
            matches_final = []
            for l1 in matches:
                matches_ = [m for m in matches if m != l1]
                found = False
                for l2 in matches_:
                    if re.search(re.escape(l1.relative), l2.relative):
                        found = True
                if not found:
                    matches_final.append(l1)
            if matches_final:
                for match in matches_final:
                    url.addLink(match)
                    url.addRLink(str(match.url))
        i += 1

    csss = self.resources['mimes']['text/css'] \
        if 'text/css' in self.resources['mimes'].keys() else None
    csss_and_htmls = csss + htmls if csss else htmls
    total = len(csss_and_htmls)
    i = 1
    for url in csss_and_htmls:
        if self.cancel:
            return
        if url.mime == 'text/css':
            tipo = 'CSS'
        else:
            tipo = 'HTML'
        content = url.getContent()
        if not content:
            content = open(url.path).read()
            encoding = detect(content)['encoding']
            content = unicode(content, encoding)
            url.setContent(content, encoding)
        if self.client:
            self.client.call(
                'eXe.app.getController("Toolbar").updateImportProgressWindow',
                _(u'Exhaustively analyzed file %s %d of %d: %s')
                % (tipo, i, total, str(url)))
        matches = []
        for l, rl in self.resources['urls'][url.parentpath].relpaths:
            low_rl = rl.lower()
            if rl in content:
                L = Link(self.resources['urls'][l], rl, url, match=rl)
                matches.append(L)
            elif low_rl in content:
                L = Link(self.resources['urls'][l], rl, url, match=low_rl)
                matches.append(L)
        matches_final = []
        for l1 in matches:
            matches_ = [m for m in matches if m != l1]
            found = False
            for l2 in matches_:
                if re.search(re.escape(l1.relative), l2.relative):
                    found = True
            if not found:
                matches_final.append(l1)
        if matches_final:
            for match in matches_final:
                if not [link for link in url.links
                        if link.relative == match.relative]:
                    url.addLink(match)
                    url.addRLink(str(match.url))
        i += 1

def message(self, user, message, length=380):
    message = message.replace('\n', '').replace('\r', '')
    message = UnicodeDammit(message)
    self.send_message(user, message.unicode.encode("utf-8"), length)

    - text - after bte
    - keywords
    - description
    - author
    - title
    """
    result = {}
    try:
        conn = urllib2.urlopen(url)
        webfile = conn.read()
    except Exception, e:
        logger.info("Cannot download URL:%s\t%s", url, e)
    else:
        if not webfile:
            return result
        converted = UnicodeDammit(webfile)  # , isHTML=True)
        if not converted.unicode:
            logger.info("UnicodeDammit failed to detect encoding, tried [%s]",
                        ', '.join(converted.triedEncodings))
            return result
        logger.debug("UnicodeDammit: originalEncoding:%s, triedEncodings:%s",
                     converted.originalEncoding,
                     ', '.join(converted.triedEncodings))
        result['raw'] = converted.unicode
        result['text'] = bte.html2text(converted.unicode)
        root = None
        try:
            root = lxml.html.fromstring(webfile)
        except lxml.etree.ParserError, e:
            logger.info("Can not parse URL:%s\t%s", url, e)
            return dict()
        find = {'description': "./head/meta[@name=\"description\"]/@content",

def make_clean_html_super(raw, stream_item=None, log_dir_path=None):
    '''
    Treat 'raw' as though it is HTML, even if we have no idea what it
    really is, and attempt to get a properly formatted HTML document
    with all HTML-escaped characters converted to their unicode.
    '''
    ## attempt to get HTML and force it to unicode
    fixed_html = None

    ## count the number of attempts, so can get progressively more
    ## aggressive with forcing the character set
    attempt = 0

    ## keep all the tracebacks, so we can read them if we want to
    ## analyze a particular document
    all_exc = []

    ## the last attempt sets this to True to end the looping
    no_more_attempts = False
    while not no_more_attempts:
        attempt += 1

        try:
            ## default attempt uses vanilla lxml.html
            root = lxml.html.fromstring(raw)
            ## if that worked, then we will be able to generate a
            ## valid HTML string
            fixed_html = lxml.html.tostring(root, encoding='unicode')

        except UnicodeDecodeError, exc:
            ## most common failure is a bogus encoding
            all_exc.append(exc)
            try:
                converted = UnicodeDammit(raw, isHTML=True)
                if not converted.unicode:
                    raise Exception(
                        'UnicodeDammit failed, appeared to be %r tried [%s]' % (
                            converted.originalEncoding,
                            ', '.join(converted.triedEncodings)))

                encoding_m = encoding_re.match(converted.unicode)
                if encoding_m:
                    converted.unicode = \
                        encoding_m.group('start_xml') + \
                        encoding_m.group('remainder')

                root = lxml.html.fromstring(converted.unicode)
                ## if that worked, then we will be able to generate a
                ## valid HTML string
                fixed_html = lxml.html.tostring(root, encoding='unicode')

                ## hack in a logging step here so we can manually inspect
                ## this fallback stage.
                if log_dir_path and stream_item:
                    stream_item.body.clean_html = fixed_html.encode('utf8')
                    stream_item.body.logs.append(make_traceback_log(all_exc))

            except Exception, exc:
                ## UnicodeDammit failed
                all_exc.append(exc)
                fixed_html = None

def unicode_cleansed(content, base_href):
    content = UnicodeDammit(content, isHTML=True).markup
    cleaned = _remove_crufty_html(content)
    debug("Cleaned content: %s" % (cleaned, ))
    return beautiful_soup(cleaned, base_href)