示例#1
0
def _make_gz(args):
    try:
        columns, from_file, to_file, sep = args
        if os.path.exists(to_file):
            return
        with open(from_file, 'rb') as fin, gzip.open(to_file, 'wb') as fout:
            log.debug('making gz %s => %s' % (from_file, to_file))
            for i, line in enumerate(fin):
                line = line.strip(b'\n').split(sep)
                #if i % 10000 == 0:
                #    log.debug('making gz %s => %s [%d]' % (from_file, to_file, i))
                if len(line) != len(columns):
                    log.error('columns not match at %s, got %d, expect %d' %
                              (from_file, len(line), len(columns)))
                    continue
                features = {}
                for l, c in zip(line, columns):
                    features[c.name] = c.raw_to_proto(l)
                example = example_pb2.Example(features=feature_pb2.Features(
                    feature=features))
                serialized = example.SerializeToString()
                l = len(serialized)
                data = struct.pack('i%ds' % l, l, serialized)
                fout.write(data)
            log.debug('done making gz %s => %s' % (from_file, to_file))
    except Exception as e:
        log.exception(e)
        raise e
示例#2
0
 def _parse_gz(record_str):
     """Decode one serialized ``Example`` record into per-column instances.

     ``record_str`` is the serialized proto as a Python string.  The result
     is a list holding, for each column in ``self._columns`` (in that order),
     the value of ``proto_to_instance`` applied to the feature stored under
     the column's ``name``.
     """
     example = example_pb2.Example()
     example.ParseFromString(record_str)
     feature_map = example.features.feature
     return [
         column.proto_to_instance(feature_map[column.name])
         for column in self._columns
     ]