示例#1
0
def string_value(value, encoding_default='utf-8', encoding=None):
    """Brute-force convert a given object to a unicode string.

    Conversions are attempted in this order:

    * ``None`` is returned as ``None``.
    * Text is kept as it is.
    * ``date`` / ``datetime`` values are rendered with ``isoformat()``.
    * ``float`` / ``Decimal`` values are rendered through
      ``Decimal(value).to_eng_string()``.
    * Byte strings are decoded (undecodable bytes are replaced), then the
      byte order mark and every character whose Unicode category starts
      with ``C`` are removed.
    * Anything else is passed to ``six.text_type``.

    :param value: the object to convert.
    :param encoding_default: encoding used for byte strings when no
        ``encoding`` is given and detection yields nothing.
    :param encoding: explicit encoding for byte strings; when ``None`` the
        encoding is detected from the bytes with ``guess_encoding``.
    :returns: a unicode string, or ``None`` if the value is ``None`` or the
        resulting text is empty / whitespace only.
    """
    if value is None:
        return None

    if not isinstance(value, six.text_type):
        if isinstance(value, (date, datetime)):
            return value.isoformat()

        if isinstance(value, (float, Decimal)):
            return Decimal(value).to_eng_string()

        # ``six.binary_type`` rather than ``six.string_types``: on Python 3
        # the latter is ``(str,)``, which can never match here because text
        # was already excluded above, so bytes were never decoded.
        if isinstance(value, six.binary_type):
            if encoding is None:
                # Detect from the data itself (not from the name of the
                # default encoding); fall back to the default when the
                # detection yields nothing.
                encoding = guess_encoding(value) or encoding_default
            value = value.decode(encoding, 'replace')
            # The byte order mark is the single code point U+FEFF once the
            # bytes have been decoded.
            value = value.replace(u'\ufeff', u'')
            value = u''.join(ch for ch in value if category(ch)[0] != 'C')
        else:
            value = six.text_type(value)

    if not value.strip():
        return None
    return value
示例#2
0
文件: encoding.py 项目: wdsn/aleph
 def decode_string(self, text, encoding=DEFAULT_ENCODING):
     """Decode ``text``, trying progressively more lenient strategies.

     Input that is not ``bytes`` is passed to ``stringify`` and that result
     is returned. Bytes are decoded strictly with the normalized
     ``encoding`` first, then strictly with the encoding reported by
     ``guess_encoding``; if both attempts fail, the normalized ``encoding``
     is used once more with the ``'replace'`` error handler.
     """
     if not isinstance(text, bytes):
         return stringify(text)

     codec = normalize_encoding(encoding)
     try:
         return text.decode(codec, 'strict')
     except Exception:
         # The declared encoding did not work; move on to detection.
         pass

     try:
         return text.decode(guess_encoding(text), 'strict')
     except Exception:
         return text.decode(codec, 'replace')
示例#3
0
    def read_file_decoded(self, entity, file_path):
        """Read ``file_path`` and decode it with one of the entity's encodings.

        When the entity has no ``encoding`` value yet, one is detected from
        the file content with ``guess_encoding`` and stored on the entity.
        Every encoding returned by ``entity.get('encoding')`` is then tried
        in order and the first successful decoding is returned.

        :param entity: object exposing ``has``, ``set`` and ``get`` for the
            ``encoding`` property.
        :param file_path: path of the file to read.
        :returns: the decoded text, or ``None`` when the entity yields no
            encodings at all.
        :raises ProcessingException: if none of the encodings can decode the
            content; the first failure is reported and chained as the cause.
        """
        with open(file_path, 'rb') as fh:
            body = fh.read()

        if not entity.has('encoding'):
            entity.set('encoding', guess_encoding(body))

        first_failure = None
        for encoding in entity.get('encoding'):
            try:
                text = body.decode(encoding)
            except UnicodeDecodeError as ude:
                # Keep going: a later candidate may still decode the file.
                # Only the first failure is remembered for the error report.
                if first_failure is None:
                    first_failure = (encoding, ude)
                continue
            if encoding != self.DEFAULT_ENCODING:
                log.info("Decoding [%r] as: %s", entity, encoding)
            return text

        if first_failure is not None:
            encoding, ude = first_failure
            raise ProcessingException('Error decoding file as %s: %s' %
                                      (encoding, ude)) from ude
        return None
示例#4
0
    def __init__(self, file_path, encoding=None, delimiter=None):
        """Open a CSV file and position the reader just after its header row.

        :param file_path: path of the CSV file.
        :param encoding: text encoding of the file; when ``None`` it is
            detected with ``guess_encoding`` from the first ``SAMPLE_SIZE``
            bytes.
        :param delimiter: field delimiter; when given it overrides the
            delimiter of the sniffed dialect.

        After construction ``self.fh`` is the open text handle,
        ``self.reader`` the row iterator, ``self.headers`` the first row and
        ``self.count`` is ``0``. If sniffing the dialect or reading the
        header row raises, the handle is closed and the error re-raised.
        """
        if encoding is None:
            # Sample in binary mode: detection has to see the undecoded
            # bytes, and a text-mode read could fail before detection runs.
            with open(file_path, 'rb') as fh:
                encoding = guess_encoding(fh.read(SAMPLE_SIZE))

        self.fh = io.open(file_path, 'r', encoding=encoding)
        try:
            sample = self.fh.read(SAMPLE_SIZE)
            dialect = csv.Sniffer().sniff(sample)
            if delimiter is not None:
                dialect.delimiter = delimiter
            # Rewind so the reader starts at the header row again.
            self.fh.seek(0)

            self.reader = iter(csv.reader(self.fh, dialect=dialect))
            self.headers = next(self.reader)
        except Exception:
            # Do not leak the handle when construction fails half-way.
            self.fh.close()
            raise
        self.count = 0
示例#5
0
 def rows(self):
     """Yield the rows of ``self.file_name`` as produced by ``DictReader``.

     The first ``4096 * 10`` bytes are sampled to detect the encoding with
     ``guess_encoding`` and to sniff the CSV dialect; the file is then
     rewound and read from the start using the detected encoding and the
     sniffed delimiter.

     Any exception raised while reading is logged and ends the iteration;
     it is not propagated to the caller.
     """
     try:
         # Binary mode: the sample is decoded by hand below and the reader
         # is given the encoding, so the handle must deliver raw bytes.
         with open(self.file_name, 'rb') as fh:
             sample = fh.read(4096 * 10)
             encoding = guess_encoding(sample)
             if encoding != 'utf-8':
                 log.info("Decode [%s]: %s", self.file_name, encoding)
             sample = sample.decode(encoding, 'replace')
             dialect = Sniffer().sniff(sample)
             # Rewind so the sampled bytes are parsed as rows as well.
             fh.seek(0)
             for row in DictReader(
                     fh,
                     encoding=encoding,
                     delimiter=dialect.delimiter.encode(encoding)):
                 yield row
     except Exception as exc:
         log.error('Failed reading file [%s]: %s', self.file_name, exc)
示例#6
0
文件: csv.py 项目: uhhhuh/pgcsv
def open_csv(file_path, encoding=None, delimiter=None):
    """Open a CSV file as text and report its delimiter and header row.

    :param file_path: path of the CSV file.
    :param encoding: text encoding of the file; when ``None`` it is detected
        with ``guess_encoding`` from the first ``SAMPLE_SIZE`` bytes.
    :param delimiter: field delimiter; when ``None`` it is sniffed from the
        first ``SAMPLE_SIZE`` characters.
    :returns: a tuple ``(fh, delimiter, headers)`` where ``fh`` is the open
        text handle rewound to the start of the file (the caller is
        responsible for closing it) and ``headers`` is the first row, or an
        empty list if the file has no rows.

    If sniffing or reading the header row raises, the handle is closed
    before the exception propagates.
    """
    if encoding is None:
        with io.open(file_path, 'rb') as sample_fh:
            encoding = guess_encoding(sample_fh.read(SAMPLE_SIZE))

    fh = io.open(file_path, 'r', encoding=encoding)
    try:
        if delimiter is None:
            sample = fh.read(SAMPLE_SIZE)
            delimiter = csv.Sniffer().sniff(sample).delimiter
            fh.seek(0)

        # Only the first row is needed; an empty file yields no header.
        headers = next(csv.reader(fh, delimiter=delimiter), [])
        fh.seek(0)
    except Exception:
        # Do not leak the handle when inspection of the file fails.
        fh.close()
        raise
    return fh, delimiter, headers
示例#7
0
    def _getStringStream(self, filename):
        """Return the string stored under ``filename``, or ``None``.

        The stream with the ``001F`` suffix is read first and converted with
        ``windowsUnicode``. If that gives nothing, the stream with the
        ``001E`` suffix is read instead and decoded with ``decode_utf7``; if
        that raises, the encoding is detected with ``guess_encoding`` and
        the bytes are decoded with undecodable bytes replaced.

        :param filename: stream name without the type suffix, either as a
            string or as a list of path parts (joined with ``/``).
        :returns: the value passed through ``remove_unsafe_chars``, or
            ``None`` if neither stream yields a non-empty value.
        """
        if isinstance(filename, list):
            # Join with slashes to make it easier to append the type
            filename = "/".join(filename)

        value = windowsUnicode(self._getStream(filename + '001F'))
        if value is None:
            raw = self._getStream(filename + '001E')
            # A missing stream cannot be decoded; leave ``value`` as None
            # instead of failing on ``raw.decode`` in the fallback below.
            if raw is not None:
                try:
                    value = decode_utf7(raw)
                except Exception:
                    encoding = guess_encoding(raw)
                    value = raw.decode(encoding, 'replace')

        if value is not None and len(value):
            return remove_unsafe_chars(value)
        return None
示例#8
0
    def unpack_members(self, pack, temp_dir):
        """Extract the members of the archive ``pack`` below ``temp_dir``.

        :param pack: archive object providing ``namelist()`` and
            ``open(name)``.
        :param temp_dir: directory the members are written to.

        Member names that are byte strings are decoded with an encoding
        detected from all such names. A member is skipped when its
        destination already exists, does not start with ``temp_dir``, or is
        a directory. A failure to extract a single member is logged at
        debug level and does not stop the remaining members.
        """
        # Some archives come with non-Unicode file names, this
        # attempts to avoid that issue by naming the destination
        # explicitly.
        members = list(pack.namelist())
        raw_names = [n for n in members if isinstance(n, six.binary_type)]
        # Join with a bytes separator: the names are byte strings, and
        # mixing them with a text separator fails on Python 3.
        encoding = guess_encoding(b'\n'.join(raw_names))
        log.debug('Detected filename encoding: %s', encoding)

        for name in members:
            file_name = name
            if isinstance(name, six.binary_type):
                file_name = name.decode(encoding, 'ignore')

            out_path = join_path(temp_dir, file_name)
            if os.path.exists(out_path):
                continue
            # NOTE(review): this prefix test only prevents writing outside
            # temp_dir if join_path normalises ".." segments, and it also
            # accepts sibling directories sharing the prefix - confirm.
            if not out_path.startswith(temp_dir):
                continue

            out_dir = os.path.dirname(out_path)
            make_directory(out_dir)
            if os.path.isdir(out_path):
                continue

            try:
                in_fh = pack.open(name)
                try:
                    log.debug("Unpack: %s -> %s", self.result, file_name)
                    # Binary mode: archive members are raw bytes and must be
                    # written unchanged.
                    with open(out_path, 'wb') as out_fh:
                        shutil.copyfileobj(in_fh, out_fh)
                finally:
                    in_fh.close()
            except Exception as ex:
                # TODO: should this be a fatal error?
                log.debug("Failed to unpack [%s]: %s", file_name, ex)
示例#9
0
 def test_guess_encoding(self):
     """Cyrillic text encoded as ISO-8859-5 must be detected as such."""
     payload = u'Порошенко Петро Олексійович'.encode('iso-8859-5')
     detected = guess_encoding(payload)
     self.assertEqual('iso-8859-5', detected)
示例#10
0
 def test_guess_encoding(self):
     """Check that ``guess_encoding`` recognises ISO-8859-5 encoded bytes."""
     raw = u"Порошенко Петро Олексійович".encode("iso-8859-5")
     self.assertEqual("iso-8859-5", guess_encoding(raw))