def _make_gz(args):
    """Convert one delimited binary text file into a gzip file of examples.

    Each input line is stripped of newline bytes, split on ``sep`` and paired
    positionally with ``columns``; every column turns its raw field into a
    proto feature via ``raw_to_proto``. The resulting ``Example`` is
    serialized and appended to the output as one record: a native C ``int``
    length prefix (struct format ``i``) followed by that many payload bytes.

    The output is first written to a per-process temporary file next to
    ``to_file`` and only moved into place once the whole input has been
    converted. Without this, a failure half-way through would leave a
    truncated ``to_file`` behind, and the existence check below would treat
    it as a finished conversion on every later run.

    Args:
        args: a 4-item sequence ``(columns, from_file, to_file, sep)``.
            It is a single argument rather than four, presumably so the
            function can be used with ``map``-style worker pools -- confirm
            against the caller. ``sep`` must be ``bytes`` because the input
            is read in binary mode. ``to_file`` is assumed to be a ``str``
            (or ``str``-convertible) path.

    Returns:
        None. Returns immediately, without reading the input, when
        ``to_file`` already exists.

    Raises:
        Exception: whatever went wrong is logged with its traceback and then
            re-raised unchanged.
    """
    try:
        columns, from_file, to_file, sep = args
        if os.path.exists(to_file):
            return

        # The pid suffix keeps concurrent workers from sharing a temp file.
        tmp_file = '%s.tmp.%d' % (to_file, os.getpid())
        try:
            with open(from_file, 'rb') as fin, \
                    gzip.open(tmp_file, 'wb') as fout:
                log.debug('making gz %s => %s' % (from_file, to_file))
                for line in fin:
                    fields = line.strip(b'\n').split(sep)
                    if len(fields) != len(columns):
                        # Malformed rows are reported and skipped, not fatal.
                        log.error(
                            'columns not match at %s, got %d, expect %d' %
                            (from_file, len(fields), len(columns)))
                        continue
                    features = {
                        column.name: column.raw_to_proto(raw)
                        for raw, column in zip(fields, columns)
                    }
                    example = example_pb2.Example(
                        features=feature_pb2.Features(feature=features))
                    serialized = example.SerializeToString()
                    size = len(serialized)
                    fout.write(
                        struct.pack('i%ds' % size, size, serialized))
            # Publish only after the gzip stream has been closed cleanly.
            os.replace(tmp_file, to_file)
        except BaseException:
            # Never leave a partial file behind; the temp file may not exist
            # yet if opening the input itself failed.
            try:
                os.remove(tmp_file)
            except OSError:
                pass
            raise
        log.debug('done making gz %s => %s' % (from_file, to_file))
    except Exception as e:
        log.exception(e)
        # Bare raise keeps the original traceback intact.
        raise
def _parse_gz(record_str):
    """Decode one serialized ``Example`` record into per-column instances.

    Takes a Python string/bytes object as input (not a tensor).

    NOTE(review): ``self`` is not a parameter; it is resolved from the
    surrounding scope, so this is presumably a closure defined inside a
    method -- confirm against the enclosing code.

    Args:
        record_str: the serialized ``Example`` message.

    Returns:
        A list with one entry per column in ``self._columns``, in that
        order, each produced by the column's ``proto_to_instance`` from the
        feature stored under the column's name.
    """
    example = example_pb2.Example()
    example.ParseFromString(record_str)
    feature_map = example.features.feature
    return [
        column.proto_to_instance(feature_map[column.name])
        for column in self._columns
    ]