def convert_to_recordio_files(data_frame, dir_name, records_per_shard):
    """
    Convert a pandas DataFrame to recordio files.

    Each row is turned into a serialized `tf.train.Example` and appended to
    a shard file named `data-<5-digit shard index>` inside `dir_name`. A new
    shard is started every `records_per_shard` rows.

    Args:
        data_frame: A pandas DataFrame to convert_to_recordio_files.
        dir_name: A directory to put the generated recordio files. It is
            created (including parents) if it does not exist.
        records_per_shard: The record number per shard.
    """
    pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)
    writer = None
    try:
        for row_num, (_, row) in enumerate(data_frame.iterrows()):
            if row_num % records_per_shard == 0:
                # Shard boundary: finish the current shard and open the next.
                if writer is not None:
                    writer.close()
                    # Reset so a failing Writer() below cannot cause a
                    # second close() of the same writer in `finally`.
                    writer = None
                shard = row_num // records_per_shard
                file_path_name = os.path.join(dir_name, "data-%05d" % shard)
                writer = recordio.Writer(file_path_name)

            feature = convert_series_to_tf_feature(
                row, data_frame.columns, data_frame.dtypes
            )
            result_string = tf.train.Example(
                features=tf.train.Features(feature=feature)
            ).SerializeToString()
            writer.write(result_string)
    finally:
        # Close the open shard even if converting or writing a row raised.
        if writer is not None:
            writer.close()

    print("Finish data conversion in {}".format(dir_name))
def convert(x, y, args, subdir):
    """Convert pairs of image and label in NumPy arrays into a set of
    RecordIO files.

    Records `0 .. x.shape[0] * args.fraction` are written as serialized
    `tf.train.Example` protos with a float "image" feature and an int64
    "label" feature (both flattened). Shards are named
    `data-<5-digit index>` under `<args.dir>/<args.dataset>/<subdir>`.

    Args:
        x: Array of images, indexed by record along the first axis.
        y: Array of labels, indexed the same way as `x`.
        args: Object providing `dir`, `dataset`, `fraction` and
            `records_per_shard` attributes.
        subdir: Name of the sub-directory that receives the shard files.
    """
    logger = logging.getLogger("image_label::convert")
    logger.setLevel("INFO")

    # The output directory does not depend on the shard, so compute it once.
    dn = os.path.join(args.dir, args.dataset, subdir)
    row = 0
    shard = 0
    w = None
    try:
        while row < x.shape[0] * args.fraction:
            if row % args.records_per_shard == 0:
                if w is not None:
                    w.close()
                    # Reset so a failure below cannot double-close it.
                    w = None
                fn = os.path.join(dn, "data-%05d" % (shard))
                # exist_ok avoids the check-then-create race of
                # `if not os.path.exists(...)`.
                os.makedirs(dn, exist_ok=True)
                logger.info("Writing {} ...".format(fn))
                w = recordio.Writer(fn)
                shard = shard + 1

            w.write(
                tf.train.Example(features=tf.train.Features(
                    feature={
                        "image": tf.train.Feature(
                            float_list=tf.train.FloatList(
                                value=x[row].flatten())),
                        "label": tf.train.Feature(
                            int64_list=tf.train.Int64List(
                                value=y[row].flatten())),
                    })).SerializeToString())
            row = row + 1
    finally:
        # `w` stays None when no record was selected (empty input or a zero
        # fraction); the guard also closes the shard if a write raised.
        if w is not None:
            w.close()

    logger.info("Wrote {} of total {} records into {} files".format(
        row, x.shape[0], shard))
def convert_to_recordio_files(file_path, dir_name, records_per_shard):
    """
    Convert a text data file to recordio files, one record per line.

    Each line is converted with `convert_series_to_tf_feature` (using
    `DAC_COLUMNS` and `DAC_DTYPES`), wrapped in a `tf.train.Example`,
    serialized, and appended to a shard file named
    `data-<5-digit shard index>` inside `dir_name`.

    Args:
        file_path: A path of the data file
        dir_name: A directory to put the generated recordio files. It is
            created (including parents) if it does not exist.
        records_per_shard: The record number per shard.
    """
    pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)
    writer = None
    try:
        with open(file_path, "r") as f:
            for index, row in enumerate(f):
                if index % records_per_shard == 0:
                    # Shard boundary: finish the current shard, open the next.
                    if writer is not None:
                        writer.close()
                        # Reset so a failing Writer() below cannot lead to
                        # a second close() in `finally`.
                        writer = None
                    shard = index // records_per_shard
                    file_path_name = os.path.join(
                        dir_name, "data-%05d" % shard
                    )
                    writer = recordio.Writer(file_path_name)

                feature = convert_series_to_tf_feature(
                    row, DAC_COLUMNS, DAC_DTYPES
                )
                result_string = tf.train.Example(
                    features=tf.train.Features(feature=feature)
                ).SerializeToString()
                writer.write(result_string)
    finally:
        # Close the open shard even if converting or writing a line raised.
        if writer is not None:
            writer.close()

    print("Finish data conversion in {}".format(dir_name))
def convert_to_recordio_files(file_path, dir_name, records_per_shard=10240):
    """
    Convert a CSV file to recordio files.

    Every line of the file is passed to `convert_data_to_tf_example`
    together with `COLUMNS`, and the returned value is written as one
    record. Shard files are named `data-<5-digit shard index>` inside
    `dir_name`.

    Args:
        file_path: A path of the CSV file
        dir_name: A directory to put the generated recordio files. It is
            created (including parents) if it does not exist.
        records_per_shard: The record number per shard.
    """
    pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)
    writer = None
    try:
        with open(file_path, "r") as f:
            for index, row in enumerate(f):
                if index % records_per_shard == 0:
                    # Shard boundary: finish the current shard, open the next.
                    if writer is not None:
                        writer.close()
                        # Reset so a failing Writer() below cannot lead to
                        # a second close() in `finally`.
                        writer = None
                    shard = index // records_per_shard
                    file_path_name = os.path.join(
                        dir_name, "data-%05d" % shard
                    )
                    writer = recordio.Writer(file_path_name)

                example = convert_data_to_tf_example(row, COLUMNS)
                writer.write(example)
    finally:
        # Close the open shard even if converting or writing a line raised.
        if writer is not None:
            writer.close()
def create_imagenet_recordio_file(size, shape):
    """Create a temporary RecordIO file of random JPEG images with labels.

    Each record is a serialized `tf.train.Example` holding a bytes feature
    "image" (a JPEG-encoded random uint8 array of the given shape) and an
    int64 feature "label" drawn from 1..10.

    Args:
        size: Number of records to write.
        shape: Shape of each random image array; it must be accepted by
            `tf.image.encode_jpeg` (presumably height, width, channels --
            confirm against callers).

    Returns:
        The name of the temporary file. The file is created with
        `delete=False`, so removing it is up to the caller.
    """
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with closing(recordio.Writer(temp_file.name)) as f:
        for _ in range(size):
            # Draw pixels over the full uint8 range. Casting
            # `np.random.rand(...)` (values in [0, 1)) to uint8 truncates
            # every pixel to 0 and yields all-black images.
            image = np.random.randint(0, 256, size=shape, dtype=np.uint8)
            # uint8 array -> tensor -> JPEG bytes
            image = tf.image.encode_jpeg(tf.convert_to_tensor(value=image))
            image = image.numpy()
            label = np.ndarray([1], dtype=np.int64)
            label[0] = np.random.randint(1, 11)
            example_dict = {
                "image": tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[image])
                ),
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])
                ),
            }
            example = tf.train.Example(
                features=tf.train.Features(feature=example_dict)
            )
            f.write(example.SerializeToString())
    return temp_file.name
def create_recordio_file(size):
    """Write `size` random (x, y) samples to a temporary RecordIO file.

    Every record is a serialized `tf.train.Example` with two float
    features: "x", a single random float32 value, and "y", computed as
    `2 * x + 1`.

    Args:
        size: Number of records to write.

    Returns:
        The name of the temporary file (created with `delete=False`).
    """

    def _float_feature(values):
        # Wrap an array of floats in a tf.train.Feature.
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with closing(recordio.Writer(temp_file.name)) as writer:
        for _ in range(size):
            x_value = np.random.rand(1).astype(np.float32)
            y_value = 2 * x_value + 1
            features = tf.train.Features(
                feature={
                    "x": _float_feature(x_value),
                    "y": _float_feature(y_value),
                }
            )
            serialized = tf.train.Example(
                features=features).SerializeToString()
            writer.write(serialized)
    return temp_file.name
def write_recordio_shards_from_iterator(records_iter, features_list,
                                        output_dir, records_per_shard):
    """Writes RecordIO files from Python iterator of numpy arrays.

    The first item of `records_iter` is inspected to decide whether each
    item is a single record or a batch of records (a batch is assumed when
    any element of the first item is a `list`). Records are converted with
    `_parse_row_to_example` and written to shard files named
    `data-<5-digit shard index>` in `output_dir`, starting a new shard
    every `records_per_shard` records.

    Args:
        records_iter: Iterator yielding records or batches of records.
        features_list: Passed through to `_parse_row_to_example`.
        output_dir: Directory that receives the shard files.
        records_per_shard: Number of records per shard file.

    An exhausted `records_iter` results in no files being written.
    """
    # Take the first record batch to check whether it contains multiple
    # items. An empty iterator means there is nothing to write; return
    # instead of leaking StopIteration to the caller.
    try:
        first_record_batch = next(records_iter)
    except StopIteration:
        return

    is_multi_items_per_batch = any(
        isinstance(i, list) for i in first_record_batch)

    # Find the features of different types that will be used
    # in `_parse_row_to_example()` later
    record = (first_record_batch[0]
              if is_multi_items_per_batch else first_record_batch)
    feature_indices = _find_feature_indices_from_record(record)

    writer = None
    rows_written = 0
    shards_written = 0
    record_batch = first_record_batch
    try:
        while True:
            if not is_multi_items_per_batch:
                record_batch = [record_batch]

            # Write each record in the batch to a RecordIO shard
            for record in record_batch:
                # Initialize the writer for the new shard
                if rows_written % records_per_shard == 0:
                    if writer is not None:
                        writer.close()
                        # Reset so a failing Writer() below cannot lead to
                        # a second close() in `finally`.
                        writer = None
                    shard_file_path = os.path.join(
                        output_dir, "data-%05d" % shards_written)
                    writer = recordio.Writer(shard_file_path)
                    shards_written += 1

                writer.write(
                    _parse_row_to_example(
                        record, features_list,
                        feature_indices).SerializeToString())
                rows_written += 1

            # Only the fetch of the next batch may end the loop; a
            # StopIteration raised while parsing or writing must not be
            # mistaken for the end of the input.
            try:
                record_batch = next(records_iter)
            except StopIteration:
                break
    finally:
        # Close the open shard even if parsing or writing raised.
        if writer is not None:
            writer.close()
def create_frappe_recordio_file(size, shape, input_dim):
    """Write `size` random feature/label records to a temporary RecordIO file.

    Every record is a serialized `tf.train.Example` with two int64
    features: "feature", `shape` random integers below `input_dim`, and
    "label", a random 0/1 value.

    Args:
        size: Number of records to write.
        shape: Number of integer ids in each "feature" vector.
        input_dim: Exclusive upper bound for the random feature ids.

    Returns:
        The name of the temporary file (created with `delete=False`).
    """

    def _int64_feature(values):
        # Wrap integer values in a tf.train.Feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with closing(recordio.Writer(temp_file.name)) as writer:
        for _ in range(size):
            feature_ids = np.random.randint(input_dim, size=(shape, ))
            label = np.random.randint(2, size=(1, ))
            features = tf.train.Features(
                feature={
                    "feature": _int64_feature(feature_ids),
                    "label": _int64_feature([label]),
                }
            )
            serialized = tf.train.Example(
                features=features).SerializeToString()
            writer.write(serialized)
    return temp_file.name
def create_recordio_file(size, shape):
    """Write `size` random image/label records to a temporary RecordIO file.

    Every record is a serialized `tf.train.Example` with a float feature
    "image" (a flat vector of random float32 values whose length is the
    product of the dimensions in `shape`) and an int64 feature "label"
    drawn from 0..9.

    Args:
        size: Number of records to write.
        shape: Iterable of image dimensions.

    Returns:
        The name of the temporary file (created with `delete=False`).
    """
    # Total number of values per image = product of all dimensions.
    num_values = 1
    for dim in shape:
        num_values *= dim

    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with closing(recordio.Writer(temp_file.name)) as writer:
        for _ in range(size):
            pixels = np.random.rand(num_values).astype(np.float32)
            label = np.ndarray([1], dtype=np.int64)
            label[0] = np.random.randint(0, 10)

            image_feature = tf.train.Feature(
                float_list=tf.train.FloatList(value=pixels))
            label_feature = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[label]))
            features = tf.train.Features(
                feature={"image": image_feature, "label": label_feature})
            writer.write(
                tf.train.Example(features=features).SerializeToString())
    return temp_file.name
def create_recordio_file(size, dataset_name, shape, temp_dir=None):
    """Creates a temporary `recordio` file filled with random records.

    The layout of each record is selected by `dataset_name`; an
    unrecognized name raises `ValueError` when the first record is built.

    Args:
        size: The number of records in the temporary file.
        dataset_name: A dataset name from `DatasetName`.
        shape: The shape of records to be created.
        temp_dir: The storage path of the temporary file.

    Returns:
        A python string indicating the temporary file name.
    """

    def _random_features():
        # Build the feature dict of one random record for `dataset_name`.
        if dataset_name == DatasetName.IMAGENET:
            # Random uint8 pixels encoded as JPEG bytes.
            pixels = np.random.randint(255, size=shape, dtype=np.uint8)
            encoded = tf.image.encode_jpeg(
                tf.convert_to_tensor(value=pixels))
            encoded = encoded.numpy()
            label = np.ndarray([1], dtype=np.int64)
            label[0] = np.random.randint(1, 11)
            return {
                "image": tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[encoded])),
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])),
            }

        if dataset_name == DatasetName.FRAPPE:
            feature_ids = np.random.randint(5383, size=(shape, ))
            label = np.random.randint(2, size=(1, ))
            return {
                "feature": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=feature_ids)),
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])),
            }

        if dataset_name == DatasetName.TEST_MODULE:
            x_values = np.random.rand(shape).astype(np.float32)
            y_values = 2 * x_values + 1
            return {
                "x": tf.train.Feature(
                    float_list=tf.train.FloatList(value=x_values)),
                "y": tf.train.Feature(
                    float_list=tf.train.FloatList(value=y_values)),
            }

        if dataset_name == DatasetName.IMAGE_DEFAULT:
            # Flat float32 image plus a label in 0..9.
            pixels = np.random.rand(np.prod(shape)).astype(np.float32)
            label = np.ndarray([1], dtype=np.int64)
            label[0] = np.random.randint(0, 10)
            return {
                "image": tf.train.Feature(
                    float_list=tf.train.FloatList(value=pixels)),
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])),
            }

        raise ValueError("Unknown dataset name %s." % dataset_name)

    temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
    with closing(recordio.Writer(temp_file.name)) as writer:
        for _ in range(size):
            features = tf.train.Features(feature=_random_features())
            writer.write(
                tf.train.Example(features=features).SerializeToString())
    return temp_file.name
def create_recordio_file(size, dataset_name, shape, temp_dir=None):
    """Creates a temporary `recordio` file filled with generated records.

    The layout of each record is selected by `dataset_name`; an
    unrecognized name raises `ValueError` when the first record is built.
    For `DatasetName.CENSUS` the categorical fields are fixed byte strings
    and only the numeric fields and the label are random.

    Args:
        size: The number of records in the temporary file.
        dataset_name: A dataset name from `DatasetName`.
        shape: The shape of records to be created.
        temp_dir: The storage path of the temporary file.

    Returns:
        A python string indicating the temporary file name.
    """

    # Small wrappers that turn a sequence of values into a tf.train.Feature.
    def _bytes_feature(values):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))

    def _float_feature(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))

    def _int64_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))

    temp_file = tempfile.NamedTemporaryFile(delete=False, dir=temp_dir)
    with closing(recordio.Writer(temp_file.name)) as writer:
        for _ in range(size):
            if dataset_name == DatasetName.IMAGENET:
                # Random uint8 pixels encoded as JPEG bytes.
                pixels = np.random.randint(255, size=shape, dtype=np.uint8)
                jpeg = tf.image.encode_jpeg(
                    tf.convert_to_tensor(value=pixels))
                jpeg = jpeg.numpy()
                label = np.ndarray([1], dtype=np.int64)
                label[0] = np.random.randint(1, 11)
                record = {
                    "image": _bytes_feature([jpeg]),
                    "label": _int64_feature([label]),
                }
            elif dataset_name == DatasetName.FRAPPE:
                feature_ids = np.random.randint(5383, size=(shape, ))
                label = np.random.randint(2, size=(1, ))
                record = {
                    "feature": _int64_feature(feature_ids),
                    "label": _int64_feature([label]),
                }
            elif dataset_name == DatasetName.TEST_MODULE:
                x_values = np.random.rand(shape).astype(np.float32)
                y_values = 2 * x_values + 1
                record = {
                    "x": _float_feature(x_values),
                    "y": _float_feature(y_values),
                }
            elif dataset_name == DatasetName.IMAGE_DEFAULT:
                # Flat float32 image plus a label in 0..9.
                pixels = np.random.rand(np.prod(shape)).astype(np.float32)
                label = np.ndarray([1], dtype=np.int64)
                label[0] = np.random.randint(0, 10)
                record = {
                    "image": _float_feature(pixels),
                    "label": _int64_feature([label]),
                }
            elif dataset_name == DatasetName.CENSUS:
                record = {
                    "workclass": _bytes_feature([b"Private"]),
                    "education": _bytes_feature([b"HS-grad"]),
                    "marital-status": _bytes_feature([b"Widowed"]),
                    "occupation": _bytes_feature([b"Exec-managerial"]),
                    "relationship": _bytes_feature([b"Not-in-family"]),
                    "race": _bytes_feature([b"White"]),
                    "sex": _bytes_feature([b"Female"]),
                    "native-country": _bytes_feature([b"United-States"]),
                    "age": _float_feature([np.random.randint(10, 100)]),
                    "capital-gain": _float_feature(
                        [np.random.randint(100, 4000)]),
                    "capital-loss": _float_feature(
                        [np.random.randint(2000, 7000)]),
                    "hours-per-week": _float_feature(
                        [np.random.randint(10, 70)]),
                    "label": _int64_feature([np.random.randint(0, 2)]),
                }
            else:
                raise ValueError("Unknown dataset name %s." % dataset_name)

            serialized = tf.train.Example(
                features=tf.train.Features(feature=record)
            ).SerializeToString()
            writer.write(serialized)
    return temp_file.name
def write_to_recordio(filename, data_list):
    """Write every item of `data_list` as one record to a RecordIO file.

    Args:
        filename: Path of the RecordIO file to create.
        data_list: Iterable of records; each item is passed unchanged to
            the writer's `write` method.
    """
    # The message needs a %s placeholder for the lazily formatted argument.
    # Without it, logging fails to format the record and reports an error
    # instead of emitting the message.
    logger.info("Writing to file: %s", filename)
    with closing(recordio.Writer(filename)) as f:
        for d in data_list:
            f.write(d)