def load_encoder_dataset(sentences, oracle=None):
    """
    Load semi-dataset used for the encoder.
    (Sentence ID, Word embeddings, Sequence length)
    Needs to be paired with target style vectors queried from a previously
    trained synthesizer.
    """
    dataset = TFRecordDataset(
        [path.join(TFREDIR, sentence + '.tfr') for sentence in sentences]
    ).map(
        lambda record: tf.parse_single_example(
            record,
            features={
                's': tf.FixedLenFeature([], tf.string),
                'e': tf.FixedLenSequenceFeature([NE], tf.float32, allow_missing=True),
                'n': tf.FixedLenFeature([], tf.int64)
            }
        )
    )
    if oracle is None:
        return dataset.map(lambda feature: (feature['e'], feature['n']))
    else:
        indices = {s: n for n, s in enumerate(sentences)}
        return dataset.map(
            lambda feature: (
                feature['e'],
                feature['n'],
                tf.py_func(
                    lambda s: oracle[indices[s.decode('ascii')], :].reshape(NC),
                    [feature['s']],
                    tf.float32
                )
            )
        )
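# A minimal usage sketch (not part of the original module), assuming TFREDIR
# and NE are defined and the listed sentence IDs have matching .tfr files:
# because 'e' is a variable-length sequence, padded_batch is used instead of
# a plain batch.
def _example_encoder_input(sentences, batch_size=32):
    dataset = load_encoder_dataset(sentences)
    # Pad each embedding sequence to the longest sentence in the batch.
    dataset = dataset.padded_batch(batch_size, padded_shapes=([None, NE], []))
    iterator = dataset.make_one_shot_iterator()
    embeddings, lengths = iterator.get_next()
    return embeddings, lengths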
class InputManager():

    def __init__(self, datafile, batch_size, repeat):
        # NOTE: the `repeat` argument is accepted but not used below;
        # repeat() with no count loops over the data indefinitely.
        self.data_path = datafile
        self.data = None
        self.dataset = TFRecordDataset(datafile, "GZIP")
        self.dataset = self.dataset.map(self._parse_sample)
        self.dataset = self.dataset.repeat()
        self.dataset = self.dataset.shuffle(buffer_size=batch_size * 3)
        self.data = self.dataset.batch(batch_size)

    def _parse_sample(self, example):
        # `size` (the fixed feature length) is expected to be defined at
        # module level.
        feature_map = {
            'b1': tf.FixedLenFeature(shape=[size], dtype=tf.float32),
            'b2': tf.FixedLenFeature(shape=[size], dtype=tf.float32),
        }
        parsed = tf.parse_single_example(example, feature_map)
        return parsed['b1'], parsed['b2']

    def iterator(self):
        return self.data.make_initializable_iterator()
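# A hypothetical usage sketch for InputManager (the file name below is
# illustrative), assuming a gzip-compressed TFRecord file and a TF 1.x
# session; the iterator is initializable, so it must be initialized before
# the first run.
def _example_input_manager(datafile='pairs.tfrecords.gz'):
    manager = InputManager(datafile, batch_size=64, repeat=None)
    iterator = manager.iterator()
    b1, b2 = iterator.get_next()
    with tf.Session() as sess:
        sess.run(iterator.initializer)
        first_b1, first_b2 = sess.run([b1, b2])
    return first_b1.shape, first_b2.shape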
def get_data_info(self):
    """
    Returns shape of data, number of labels, and steps per epoch for
    training, validation and test.
    """
    dataset = TFRecordDataset(self.train_filenames)
    dataset = dataset.map(self.parser)
    dataset = dataset.take(4)
    iterator = dataset.make_one_shot_iterator()
    sample_data = K.get_session().run(iterator.get_next())

    train_spe = int(np.ceil(
        self.count_samples(self.train_filenames) * 1.0 / self.batch_size))
    validation_spe = int(np.ceil(
        self.count_samples(self.validation_filenames) * 1.0 / self.batch_size))
    test_spe = int(np.ceil(
        self.count_samples(self.test_filenames) * 1.0 / self.batch_size))

    logging.info("Shape of input data: {}".format(sample_data[0].shape))
    logging.info("Number of labels in input data: {}".format(sample_data[1].size))
    logging.info("Steps per epoch - Train: {}, Validation: {}, Test: {}".format(
        train_spe, validation_spe, test_spe))

    return (sample_data[0].shape, sample_data[1].size,
            train_spe, validation_spe, test_spe)
def load_encoder_dataset2(sentences, oracle=None):
    """
    Variant of load_encoder_dataset that returns the raw word strings ('w')
    instead of the precomputed word embeddings.
    """
    dataset = TFRecordDataset(
        [path.join(TFREDIR, sentence + '.tfr') for sentence in sentences]
    ).map(
        lambda record: tf.parse_single_example(
            record,
            features={
                's': tf.FixedLenFeature([], tf.string),
                'w': tf.FixedLenSequenceFeature([], tf.string, allow_missing=True),
                'n': tf.FixedLenFeature([], tf.int64)
            }
        )
    )
    if oracle is None:
        return dataset.map(lambda feature: (feature['w'], feature['n']))
    else:
        indices = {s: n for n, s in enumerate(sentences)}
        return dataset.map(
            lambda feature: (
                feature['w'],
                feature['n'],
                tf.py_func(
                    lambda s: oracle[indices[s.decode('ascii')], :].reshape(NC),
                    [feature['s']],
                    tf.float32
                )
            )
        )
def tf_create_iterator(dataset, batch_size):
    """
    Build a reinitializable iterator over the training and validation
    TFRecord files of the given dataset.
    """
    dataset_prefix = os.path.join(SNPX_DATASET_ROOT, dataset, dataset)
    train_rec_file = dataset_prefix + "_train.tfrecords"
    val_rec_file = dataset_prefix + "_val.tfrecords"

    # Create the training dataset object
    train_set = TFRecordDataset(train_rec_file)
    train_set = train_set.map(tf_parse_record, num_threads=4, output_buffer_size=1000)
    train_set = train_set.shuffle(buffer_size=50000)
    train_set = train_set.batch(batch_size)

    # Create the validation dataset object
    val_set = TFRecordDataset(val_rec_file)
    val_set = val_set.map(tf_parse_record)
    val_set = val_set.batch(batch_size)

    # Create a reinitializable iterator from both datasets
    iterator = Iterator.from_structure(train_set.output_types, train_set.output_shapes)
    train_init_op = iterator.make_initializer(train_set)
    val_init_op = iterator.make_initializer(val_set)
    iter_op = iterator.get_next()

    return train_init_op, val_init_op, iter_op
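# A hypothetical usage sketch for tf_create_iterator, assuming a TF 1.x
# session, that SNPX_DATASET_ROOT holds the <dataset>_train/_val .tfrecords
# files, and that tf_parse_record yields (image, label) pairs: the two
# initializer ops switch the shared iterator between the splits.
def _example_train_val_pass(dataset_name, batch_size=128):
    train_init_op, val_init_op, iter_op = tf_create_iterator(dataset_name, batch_size)
    with tf.Session() as sess:
        for init_op in (train_init_op, val_init_op):
            sess.run(init_op)  # point the iterator at the next split
            while True:
                try:
                    images, labels = sess.run(iter_op)
                    # ... consume the batch here ...
                except tf.errors.OutOfRangeError:
                    break  # this split is exhausted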
def get_datasets(args):
    train_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[0]))
    validate_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[1]))
    test_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[2]))

    train_set = train_set.map(CelebDataset.parse_tfrecord(args))
    validate_set = validate_set.map(CelebDataset.parse_tfrecord(args))
    test_set = test_set.map(CelebDataset.parse_tfrecord(args))

    return {'train': train_set, 'validate': validate_set, 'test': test_set}
def get_complete_data(self, filename):
    dataset = TFRecordDataset(filename)
    dataset = dataset.map(self.parser)
    dataset = dataset.shuffle(buffer_size=1024, seed=42)
    iterator = dataset.make_one_shot_iterator()
    xt, yt = iterator.get_next()

    xs, ys = [], []
    while True:
        try:
            x, y = K.get_session().run([xt, yt])
            xs.append(x)
            ys.append(y)
        except tf.errors.OutOfRangeError:
            # The one-shot iterator is exhausted once the whole file is read.
            break
    return np.array(xs), np.array(ys)
def get_datasets(args):
    train_fn = os.path.join(args.dataset_dir, _output_files[0])
    train_set = TFRecordDataset(train_fn)
    validate_fn = os.path.join(args.dataset_dir, _output_files[1])
    validate_set = TFRecordDataset(validate_fn)
    test_fn = os.path.join(args.dataset_dir, _output_files[2])
    test_set = TFRecordDataset(test_fn)

    train_set = train_set.map(FloorplanDataset.parse_tfrecord(args))
    validate_set = validate_set.map(FloorplanDataset.parse_tfrecord(args))
    test_set = test_set.map(FloorplanDataset.parse_tfrecord(args))

    return {'train': (train_set, train_fn),
            'validate': (validate_set, validate_fn),
            'test': (test_set, test_fn)}
def get_datasets(args):
    def ignore_incomplete_depthmaps(x, y, *args):
        # Keep only examples whose depth map contains no pixel equal to
        # exactly 0 or 1 (i.e. no missing or saturated depth values).
        return tf.logical_not(
            tf.reduce_any(
                tf.logical_or(tf.equal(y, tf.ones_like(y)),
                              tf.equal(y, tf.zeros_like(y)))))

    datasets = {}
    for k, v in _dataset_files.items():
        fn = os.path.join(args.dataset_dir, v)
        dataset = TFRecordDataset(fn)
        dataset = dataset.map(NYUv2Dataset.parse_tfrecord(args),
                              num_threads=args.n_threads)
        dataset = dataset.filter(ignore_incomplete_depthmaps)
        datasets[k] = (dataset, fn)
    return datasets
def input_fn():
    ds = TFRecordDataset(tfrecords_filenames)

    def parse_feats(exp):
        feature_def_dict = {
            'img_id': tf.FixedLenFeature([], tf.string),
            # 'raw_img': tf.FixedLenFeature([], tf.string),
            'img_feats': tf.FixedLenFeature([], tf.string),
            'raw_caps': tf.FixedLenFeature([5], tf.string),
            'cap_idx': tf.FixedLenFeature([5], tf.string),
        }
        features = tf.parse_single_example(
            exp,
            # Defaults are not specified since both keys are required.
            features=feature_def_dict)
        feats_tensor = tf.reshape(
            tf.decode_raw(features['img_feats'], tf.float32),
            [bin_size * bin_size, 1536])
        return feats_tensor

    def parse_caps(exp):
        features = tf.parse_single_example(
            exp,
            # Defaults are not specified since both keys are required.
            features={
                'img_id': tf.FixedLenFeature([], tf.string),
                # 'raw_img': tf.FixedLenFeature([], tf.string),
                'img_feats': tf.FixedLenFeature([], tf.string),
                'raw_caps': tf.FixedLenFeature([5], tf.string),
                'cap_idx': tf.FixedLenFeature([5], tf.string),
            })
        # NOTE: random.choice runs in Python while the map function is traced,
        # so one caption slot is chosen once at graph-construction time and
        # reused for every example rather than re-sampled per example.
        cap_tensor = tf.decode_raw(
            random.choice(tf.unstack(features['cap_idx'])), tf.int32)
        return cap_tensor

    return ds.map(parse_feats), ds.map(parse_caps)
def input_fn():
    ds = TFRecordDataset(tfrecords_filenames)

    def parse_feats(exp):
        feature_def_dict = {
            'img_id': tf.FixedLenFeature([], tf.string),
            'img_feats': tf.FixedLenFeature([], tf.string),
        }
        features = tf.parse_single_example(
            exp,
            # Defaults are not specified since both keys are required.
            features=feature_def_dict)
        feats_tensor = tf.reshape(
            tf.decode_raw(features['img_feats'], tf.float32),
            [bin_size * bin_size, 1536])
        return features['img_id'], feats_tensor

    return ds.map(parse_feats)
def get_datasets(args):
    test_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[0]))
    train_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[1]))

    test_set = test_set.map(MNISTDataset.parse_tfrecord)
    train_set = train_set.map(MNISTDataset.parse_tfrecord)

    return {'train': train_set, 'validate': None, 'test': test_set}
def prepare_dataset(self, filename):
    """
    Dataset transformation pipeline.
    :param filename: TFRecord filename
    :return: iterator of x, y batch nodes where x: feature, y: label
    """
    dataset = TFRecordDataset(filename)
    dataset = dataset.map(self.parser, num_threads=4, output_buffer_size=2048)
    # dataset = dataset.shuffle(buffer_size=1024, seed=42)
    dataset = dataset.repeat(count=-1)
    # dataset = dataset.batch(self.batch_size)
    iterator = dataset.make_one_shot_iterator()
    x, y = iterator.get_next()
    x, y = tf.train.shuffle_batch(
        tensors=[x, y],
        shapes=[list(self.shape), [self.num_labels]],
        batch_size=self.batch_size,
        capacity=2048,
        min_after_dequeue=1024,
        enqueue_many=False,
        num_threads=self.num_threads)
    return x, y
def trainyuyu():
    """
    Minimal CIFAR-10 training loop driven by a TFRecord dataset pipeline.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    batch_size = 200
    dataset_prefix = os.path.join(SNPX_DATASET_ROOT, "CIFAR-10", "CIFAR-10")
    train_rec_file = dataset_prefix + "_train.tfrecords"

    with tf.Graph().as_default():
        # Create the training dataset object
        train_set = TFRecordDataset(train_rec_file)
        train_set = train_set.map(tf_parse_record, num_threads=4, output_buffer_size=1000)
        train_set = train_set.shuffle(buffer_size=10000)
        train_set = train_set.batch(batch_size)

        # Create a one-shot iterator over the training set
        iterator = train_set.make_one_shot_iterator()
        images, labels = iterator.get_next()
        onehot_labels = tf.one_hot(labels, 10)
        predictions = snpx_net_create(10, images)

        # Get the optimizer
        opt = tf.train.AdamOptimizer()
        global_step = tf.train.get_or_create_global_step()

        # Compute the loss and the train_op
        loss = tf.losses.softmax_cross_entropy(onehot_labels, predictions)
        total_loss = tf.losses.get_total_loss()
        train_op = opt.minimize(total_loss, global_step=global_step)
        op = [train_op, total_loss]

        tf_sess = tf.Session()
        tf_sess.run(tf.group(tf.global_variables_initializer(),
                             tf.local_variables_initializer()))
        # Queue runners are not required for the tf.data pipeline but are
        # started here anyway.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=tf_sess, coord=coord)

        count = 0
        last_log_tick = time()
        last_log_batch = 0
        while True:
            try:
                # `loss` here is the pair [train_op result, total_loss value].
                loss, step = tf_sess.run([op, global_step])
                count += 1
                print(count, step)
                if (count - last_log_batch) >= 10:
                    elapsed = time() - last_log_tick
                    freq = ((count - last_log_batch) * batch_size / elapsed)
                    last_log_batch = count
                    last_log_tick = time()
                    print(count, loss, freq)
            except tf.errors.OutOfRangeError:
                break
def get_datasets(args):
    train_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[0]))
    validate_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[1]))
    test_set = TFRecordDataset(os.path.join(args.dataset_dir, _output_files[2]))

    train_set = train_set.map(COCODataset.parse_tfrecord(args),
                              num_threads=args.n_threads)
    validate_set = validate_set.map(COCODataset.parse_tfrecord(args),
                                    num_threads=args.n_threads)
    test_set = test_set.map(COCODataset.parse_tfrecord(args),
                            num_threads=args.n_threads)

    return {
        'train': (train_set, os.path.join(args.dataset_dir, 'coco.train.tfrecords')),
        'validate': (validate_set, os.path.join(args.dataset_dir, 'coco.validate.tfrecords')),
        'test': (test_set, os.path.join(args.dataset_dir, 'coco.test.tfrecords'))
    }
def load_synthesizer_dataset(sentences):
    """
    Load dataset used for the synthesizer.
    (Linguistic features, Sentence ID) -> Acoustic features
    """
    return TFRecordDataset(
        [path.join(TFRSDIR, sentence + '.tfr') for sentence in sentences]
    ).map(
        lambda record: tf.parse_single_example(
            record,
            features={
                's': tf.FixedLenFeature([], tf.string),
                'l': tf.FixedLenFeature([NL + 9], tf.float32),
                'a': tf.FixedLenFeature([NA], tf.float32)
            }
        )
    ).map(
        lambda feature: (feature['l'], feature['s'], feature['a'])
    )
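# A minimal usage sketch for load_synthesizer_dataset (not part of the
# original module), assuming TFRSDIR, NL and NA are defined: every feature is
# fixed-length, so a plain batch() is sufficient.
def _example_synthesizer_input(sentences, batch_size=32):
    dataset = load_synthesizer_dataset(sentences)
    dataset = dataset.shuffle(buffer_size=len(sentences)).batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    linguistic, sentence_id, acoustic = iterator.get_next()
    return linguistic, sentence_id, acoustic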
def _get_dataset(file_name, parser):
    """Helper function. Intended to be used by get_datasets."""
    dataset = TFRecordDataset(file_name).map(parser)
    dataset_size = sum(1 for r in tf.python_io.tf_record_iterator(file_name))
    return dataset, dataset_size
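# A hypothetical usage sketch for _get_dataset: the record count returned
# alongside the dataset lets the caller derive steps per epoch (the parser
# argument is whatever per-record parse function the project uses).
def _example_steps_per_epoch(file_name, parser, batch_size=32):
    dataset, dataset_size = _get_dataset(file_name, parser)
    steps_per_epoch = int(np.ceil(dataset_size / float(batch_size)))
    return dataset.batch(batch_size), steps_per_epoch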
def dataset_batch(self, pre_process_func=None, batch_size=32, shuffle=True,
                  buffer_size=10000, num_threads_map=1, num_epochs=None,
                  cols=None):
    """Return one batch of data using the Dataset API (tf.__version__ >= 1.3).

    When writing the dataset.map function, note that it must return a tuple
    rather than a list, and since a SparseTensor cannot be returned directly,
    each SparseTensor is split into 3 dense Tensors. This part is implemented
    by referring to the read_batch_features function.

    :param pre_process_func: preprocessing function
    :param batch_size:
    :param shuffle: whether to shuffle the data while reading
    :param buffer_size: buffer size for map and shuffle
    :param num_threads_map: number of threads used by the map transformation
    :param num_epochs: how many epochs of data to fetch; None means unlimited
    :param cols: which features of the TFRecord to return; a subset of the
        values returned by get_keys()
    :return: one batch of data
    """
    filenames = tf.gfile.Glob(self.pattern)
    dataset = TFRecordDataset(filenames)
    if cols is None:
        cols = self.get_keys()
    # --- Using two separate maps takes longer ---
    # The map function cannot return a list; it must return a tuple
    # dataset = dataset.map(self._parse_example_proto, num_threads=num_threads_map, output_buffer_size=buffer_size)
    # Reason: is_sequence(nest) inside nest.flatten(ret) in the source returns False for a list-typed nest
    # dataset = dataset.map(lambda feature_dict: tuple(feature_dict[col] for col in cols),
    #                       num_threads=num_threads_map,
    #                       output_buffer_size=buffer_size)
    # Use a closure to merge the two maps into one
    keys = self.get_keys()
    types = self.get_types()
    cols_types = [types[keys.index(col)] for col in cols]  # types corresponding to cols
    if shuffle:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(_parse_example_proto_cols_closure(cols_types, cols),
                          num_threads=num_threads_map,
                          output_buffer_size=buffer_size)
    sparse_bool_list = []
    if pre_process_func is not None:  # whether to apply preprocessing
        dataset = dataset.map(
            _preprocess_sparsetensor(pre_process_func, cols_types, sparse_bool_list))
    iterator = dataset.make_one_shot_iterator()
    output = iterator.get_next()
    # 3 tensors -> SparseTensor
    output_sparse = []
    index = 0
    if pre_process_func and sparse_bool_list:  # the user supplied a preprocess function
        for sparse_bool in sparse_bool_list:
            if sparse_bool:
                output_sparse.append(
                    sparse_tensor.SparseTensor(indices=output[index],
                                               values=output[index + 1],
                                               dense_shape=output[index + 2]))
                index += 3
            else:
                output_sparse.append(output[index])
                index += 1
    else:  # no user preprocess function
        for col_type in cols_types:
            if col_type.get_isfix():
                output_sparse.append(output[index])
                index += 1
            else:
                output_sparse.append(
                    sparse_tensor.SparseTensor(indices=output[index],
                                               values=output[index + 1],
                                               dense_shape=output[index + 2]))
                index += 3
    return output_sparse
def get_dataset(args):
    if args.dataset == 'floorplans':
        fn = 'data/floorplans.train.tfrecords'
        d = TFRecordDataset(fn)
        d = d.map(parse_floorplans)
    elif args.dataset == 'cifar':
        fn = 'data/cifar.32.train.tfrecords'
        d = TFRecordDataset(fn)
        d = d.map(parse_cifar)

    # d = tf.train.shuffle_batch(d, batch_size=args.batch_size, capacity=10000, num_threads=4, min_after_dequeue=20)
    # x = d
    # d = d.cache('tmp/')
    d = d.cache(os.path.join(args.cache_dir, 'cache')) if args.cache_dir else d.cache()
    d = d.repeat()
    d = d.shuffle(buffer_size=args.buffer_size)
    d = d.batch(args.batch_size * args.n_gpus)
    iterator = d.make_initializable_iterator()
    x = iterator.get_next()

    # determine size of dataset
    c = sum([1 for r in tf.python_io.tf_record_iterator(fn)])

    # return d, int(c)
    return x, iterator.initializer, int(c)
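# A hypothetical usage sketch for get_dataset, assuming a TF 1.x session and
# an args namespace with the fields the function reads (dataset, cache_dir,
# buffer_size, batch_size, n_gpus): the initializer must run before the batch
# tensor can be evaluated.
def _example_epoch(args):
    x, init_op, n_examples = get_dataset(args)
    steps_per_epoch = n_examples // (args.batch_size * args.n_gpus)
    with tf.Session() as sess:
        sess.run(init_op)
        for _ in range(steps_per_epoch):
            batch = sess.run(x)  # one batch of parsed examples
    return steps_per_epoch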