def as_unicode(line):
    """
    Return a unicode text line from a text line.

    If `line` is already unicode text it is returned unchanged. Otherwise it
    is decoded as UTF-8 and, when that fails, as Latin-1.

    Latin-1 maps every possible byte value to a code point, so the second
    attempt cannot raise UnicodeDecodeError: any byte string yields a result,
    though bytes in some other encoding will come out as the wrong characters.

    Raises TypeError if `line` is neither unicode text nor a byte string or
    buffer that can be decoded.

    TODO: Add file/magic detection, UnicodeDammit (BeautifulSoup 3/4)
    """
    # Spelled type(u'') rather than `unicode` so that the same code runs on
    # Python 2 (where it is `unicode`) and on Python 3 (where it is `str`).
    text_type = type(u'')

    # Asking to decode text that is already unicode raises TypeError, so pass
    # it through untouched.
    if isinstance(line, text_type):
        return line

    try:
        return text_type(line, 'UTF-8')
    except UnicodeDecodeError:
        # Last resort. No further fallback (transliteration, encoding
        # detection, ASCII strings extraction) is attempted after this one
        # because a Latin-1 decode cannot fail on a byte string.
        return text_type(line, 'LATIN-1')
def as_unicode(line):
    """
    Return a unicode text line from a text line.

    If `line` is already unicode text it is returned unchanged. Otherwise it
    is decoded as UTF-8 and, when that fails, as Latin-1.

    Latin-1 maps every possible byte value to a code point, so the second
    attempt cannot raise UnicodeDecodeError: any byte string yields a result,
    though bytes in some other encoding will come out as the wrong characters.

    Raises AttributeError if `line` is neither unicode text nor an object
    with a `decode` method.

    TODO: Add file/magic detection, UnicodeDammit (BeautifulSoup 3/4)
    """
    # Spelled type(u'') rather than `unicode` so that the same code runs on
    # Python 2 (where it is `unicode`) and on Python 3 (where it is `str`).
    text_type = type(u'')

    # Already decoded: nothing to do.
    if isinstance(line, text_type):
        return line

    try:
        return line.decode('UTF-8')
    except UnicodeDecodeError:
        # Last resort. No further fallback (transliteration, encoding
        # detection, ASCII strings extraction) is attempted after this one
        # because a Latin-1 decode cannot fail on a byte string.
        return line.decode('LATIN-1')