def string_value(value, encoding_default='utf-8', encoding=None):
    """Brute-force convert a given object to a string.

    This will attempt an increasingly mean set of conversions to make a
    given object into a unicode string. It returns either a string or
    None, if the value is None or the converted text is empty.

    :param value: the object to convert.
    :param encoding_default: encoding used for byte strings when
        ``encoding`` is not given and detection yields nothing.
    :param encoding: explicit encoding for byte strings; when given, no
        detection is performed.
    :returns: the converted text, or None when ``value`` is None or the
        text contains nothing but whitespace. The returned text is not
        stripped.
    """
    if value is None:
        return None

    if not isinstance(value, six.text_type):
        if isinstance(value, (date, datetime)):
            return value.isoformat()
        if isinstance(value, (float, Decimal)):
            return Decimal(value).to_eng_string()
        # ``six.binary_type`` (not ``six.string_types``): on Python 3 the
        # latter is ``(str,)`` and can never match here, which sent bytes
        # through ``six.text_type()`` and produced their repr.
        if isinstance(value, six.binary_type):
            if encoding is None:
                # Detect from the data itself, not from the default's name.
                # NOTE(review): guess_encoding may already apply a default
                # of its own; the ``or`` only matters if it can return a
                # falsy result -- confirm against its definition.
                encoding = guess_encoding(value) or encoding_default
            value = value.decode(encoding, 'replace')
            # Drop every character in a Unicode "C" (control/format/...)
            # category. This also removes tabs, newlines and U+FEFF.
            value = ''.join(ch for ch in value if category(ch)[0] != 'C')
            # Remove the two-character sequence U+00FE U+00FF, i.e. what a
            # UTF-16 BE byte order mark looks like when decoded as Latin-1.
            value = value.replace(u'\xfe\xff', '')
        else:
            value = six.text_type(value)

    if not value.strip():
        return None
    return value
def decode_string(self, text, encoding=DEFAULT_ENCODING):
    """Turn ``text`` into a string, trying progressively laxer decodings.

    Values that are not ``bytes`` are handed to ``stringify()`` unchanged.
    Byte strings are decoded in three attempts:

    1. strictly, with the normalised ``encoding``;
    2. strictly, with whatever ``guess_encoding()`` detects;
    3. with the normalised ``encoding`` again, replacing undecodable
       bytes, so that a result is produced whenever the codec exists.
    """
    if not isinstance(text, bytes):
        return stringify(text)

    preferred = normalize_encoding(encoding)

    # Attempt 1: the caller's encoding, no tolerance for bad bytes.
    try:
        return text.decode(preferred, 'strict')
    except Exception:
        pass

    # Attempt 2: detection; attempt 3: lossy decode with the caller's
    # encoding if detection or strict decoding fails in any way.
    try:
        return text.decode(guess_encoding(text), 'strict')
    except Exception:
        return text.decode(preferred, 'replace')
def read_file_decoded(self, entity, file_path):
    """Read ``file_path`` and return its content decoded to text.

    If ``entity`` has no ``encoding`` value yet, one is detected from the
    file's bytes with ``guess_encoding()`` and stored on the entity. Every
    encoding returned by ``entity.get('encoding')`` is then tried in order
    and the first successful decoding is returned.

    :param entity: object exposing ``has()``, ``get()`` and ``set()`` for
        the ``encoding`` property.
    :param file_path: path of the file to read.
    :returns: the decoded text, or None if the entity yields no encoding
        candidates at all.
    :raises ProcessingException: if none of the candidate encodings can
        decode the file; the first failure is reported and chained.
    """
    with open(file_path, 'rb') as fh:
        body = fh.read()

    if not entity.has('encoding'):
        entity.set('encoding', guess_encoding(body))

    # Remember the first failure only: it belongs to the preferred
    # candidate, and with a single candidate this matches the previous
    # behaviour exactly.
    first_failure = None
    for encoding in entity.get('encoding'):
        try:
            text = body.decode(encoding)
        except UnicodeDecodeError as ude:
            # Previously this raised immediately, so later candidates were
            # never tried; keep going instead.
            if first_failure is None:
                first_failure = (encoding, ude)
            continue
        if encoding != self.DEFAULT_ENCODING:
            log.info("Decoding [%r] as: %s", entity, encoding)
        return text

    if first_failure is not None:
        encoding, ude = first_failure
        raise ProcessingException('Error decoding file as %s: %s' %
                                  (encoding, ude)) from ude
    return None
def __init__(self, file_path, encoding=None, delimiter=None):
    """Open a CSV file, sniff its dialect and consume the header row.

    :param file_path: path of the CSV file.
    :param encoding: text encoding of the file; when None it is detected
        with ``guess_encoding()`` from the first ``SAMPLE_SIZE`` bytes.
    :param delimiter: overrides the delimiter of the sniffed dialect when
        given.
    :raises csv.Error: if the dialect cannot be sniffed.
    :raises StopIteration: if the file has no header row.

    After construction ``self.fh`` is the open text handle, ``self.reader``
    is positioned on the first data row, ``self.headers`` holds the first
    row and ``self.count`` is 0.
    """
    if encoding is None:
        # Sample raw bytes: opening in text mode here would decode with the
        # locale's encoding before detection even runs.
        with io.open(file_path, 'rb') as fh:
            data = fh.read(SAMPLE_SIZE)
        encoding = guess_encoding(data)

    self.fh = io.open(file_path, 'r', encoding=encoding)
    try:
        data = self.fh.read(SAMPLE_SIZE)
        dialect = csv.Sniffer().sniff(data)
        if delimiter is not None:
            dialect.delimiter = delimiter
        # Rewind so the reader starts at the header, not after the sample.
        self.fh.seek(0)
        self.reader = iter(csv.reader(self.fh, dialect=dialect))
        self.headers = next(self.reader)
    except Exception:
        # Do not leak the handle when construction fails half-way.
        self.fh.close()
        raise
    self.count = 0
def rows(self):
    """Yield the rows of ``self.file_name`` as produced by ``DictReader``.

    The encoding is detected with ``guess_encoding()`` from the first
    40 KiB of the file, and the delimiter is sniffed from that sample.
    Any exception raised while reading is logged and ends the iteration;
    it is not propagated to the caller.

    NOTE(review): ``DictReader`` is called with an ``encoding`` keyword
    and an encoded delimiter, so it is presumably not the standard
    library's ``csv.DictReader`` -- confirm which reader is imported.
    """
    try:
        # Binary mode: the sample is decoded explicitly below, and the
        # reader is told the encoding, so it must be fed raw bytes.
        with open(self.file_name, 'rb') as fh:
            sample = fh.read(4096 * 10)
            encoding = guess_encoding(sample)
            if encoding != 'utf-8':
                log.info("Decode [%s]: %s", self.file_name, encoding)
            sample = sample.decode(encoding, 'replace')
            dialect = Sniffer().sniff(sample)
            # Rewind so the reader also sees the sampled part of the file.
            fh.seek(0)
            reader = DictReader(
                fh,
                encoding=encoding,
                delimiter=dialect.delimiter.encode(encoding),
            )
            for row in reader:
                yield row
    except Exception as exc:
        log.error('Failed reading file [%s]: %s', self.file_name, exc)
def open_csv(file_path, encoding=None, delimiter=None):
    """Open a CSV file as text and report its delimiter and header row.

    :param file_path: path of the CSV file.
    :param encoding: text encoding; when None it is detected with
        ``guess_encoding()`` from the first ``SAMPLE_SIZE`` bytes.
    :param delimiter: field delimiter; when None it is sniffed from the
        first ``SAMPLE_SIZE`` characters.
    :returns: a tuple ``(fh, delimiter, headers)`` where ``fh`` is the open
        text handle rewound to the start of the file and ``headers`` is
        the first row (an empty list for an empty file). The caller is
        responsible for closing ``fh``.
    :raises csv.Error: if the delimiter cannot be sniffed.
    """
    if encoding is None:
        with io.open(file_path, 'rb') as fh:
            data = fh.read(SAMPLE_SIZE)
        encoding = guess_encoding(data)

    fh = io.open(file_path, 'r', encoding=encoding)
    try:
        if delimiter is None:
            data = fh.read(SAMPLE_SIZE)
            dialect = csv.Sniffer().sniff(data)
            delimiter = dialect.delimiter
            fh.seek(0)
        # Only the first row is needed; fall back to [] for an empty file.
        headers = next(csv.reader(fh, delimiter=delimiter), [])
        fh.seek(0)
    except Exception:
        # The handle is only handed to the caller on success, so close it
        # here rather than leaking it when sniffing or decoding fails.
        fh.close()
        raise
    return fh, delimiter, headers
def _getStringStream(self, filename):
    """Gets a string representation of the requested filename.

    Checks for both Unicode and ASCII representations and returns a
    value if possible. The stream named ``filename + '001F'`` is tried
    first via ``windowsUnicode()``. If that yields None, the stream
    ``filename + '001E'`` is decoded with ``decode_utf7()`` and, should
    that raise, with the encoding reported by ``guess_encoding()``
    (undecodable bytes replaced).

    ``filename`` may be a list of path components, which are joined with
    slashes. Returns the text passed through ``remove_unsafe_chars()``, or
    None when neither stream provides a non-empty value.
    """
    if isinstance(filename, list):
        # Join with slashes to make it easier to append the type
        filename = "/".join(filename)

    value = windowsUnicode(self._getStream(filename + '001F'))
    if value is None:
        raw = self._getStream(filename + '001E')
        if raw is None:
            # Neither representation exists; without this guard the
            # fallback below would end in ``raw.decode`` on None.
            return None
        try:
            value = decode_utf7(raw)
        except Exception:
            encoding = guess_encoding(raw)
            value = raw.decode(encoding, 'replace')

    if value is not None and len(value):
        return remove_unsafe_chars(value)
    return None
def unpack_members(self, pack, temp_dir):
    """Extract every member of the archive ``pack`` below ``temp_dir``.

    Some archives come with non-Unicode file names; this attempts to
    avoid that issue by naming the destination explicitly. Member names
    that are byte strings are decoded with an encoding detected from all
    such names together (undecodable bytes are dropped).

    Members are skipped when their destination already exists, when it
    would fall outside ``temp_dir``, or when it is a directory. A failure
    to extract a single member is logged at debug level and does not stop
    the remaining members from being processed.

    :param pack: archive object providing ``namelist()`` and ``open()``.
    :param temp_dir: directory the members are written into.
    """
    members = pack.namelist()
    raw_names = [n for n in members if isinstance(n, six.binary_type)]
    # Join with a bytes separator: the names are byte strings, and joining
    # them with a text separator raises TypeError on Python 3.
    encoding = guess_encoding(b'\n'.join(raw_names))
    log.debug('Detected filename encoding: %s', encoding)

    # Containment is checked on normalised absolute paths with a trailing
    # separator, so neither "../" components nor a sibling directory that
    # merely shares the prefix (e.g. "/tmp/x" vs "/tmp/x-evil") pass.
    root = os.path.abspath(temp_dir)
    root_prefix = root if root.endswith(os.sep) else root + os.sep

    for name in members:
        file_name = name
        if isinstance(name, six.binary_type):
            file_name = name.decode(encoding, 'ignore')

        out_path = join_path(temp_dir, file_name)
        if os.path.exists(out_path):
            continue
        if not os.path.abspath(out_path).startswith(root_prefix):
            continue

        out_dir = os.path.dirname(out_path)
        make_directory(out_dir)
        if os.path.isdir(out_path):
            continue

        try:
            in_fh = pack.open(name)
            try:
                log.debug("Unpack: %s -> %s", self.result, file_name)
                # Member data is binary; text mode would corrupt it or
                # reject the bytes outright.
                with open(out_path, 'wb') as out_fh:
                    shutil.copyfileobj(in_fh, out_fh)
            finally:
                in_fh.close()
        except Exception as ex:
            # TODO: should this be a fatal error?
            log.debug("Failed to unpack [%s]: %s", file_name, ex)
def test_guess_encoding(self):
    """guess_encoding() identifies Cyrillic text encoded as ISO-8859-5."""
    payload = u'Порошенко Петро Олексійович'.encode('iso-8859-5')
    detected = guess_encoding(payload)
    self.assertEqual('iso-8859-5', detected)
def test_guess_encoding(self):
    """Detection of an ISO-8859-5 byte string reports that encoding."""
    sample = u"Порошенко Петро Олексійович"
    # Encode first, then hand only the raw bytes to the detector.
    raw = sample.encode("iso-8859-5")
    self.assertEqual("iso-8859-5", guess_encoding(raw))