Example #1
def load_encoder_dataset(sentences, oracle=None):
    """
    Load semi-dataset used for the encoder.
    (Sentence ID, Word embeddings, Sequence length)
    Needds to be paired with target style vectors queried from a previously trained synthesizer.
    """
    dataset = TFRecordDataset([path.join(TFREDIR, sentence+'.tfr')
                               for sentence in sentences])\
        .map(
            lambda record: \
                tf.parse_single_example(
                    record,
                    features={
                        's': tf.FixedLenFeature([], tf.string),
                        'e': tf.FixedLenSequenceFeature([NE],
                                                        tf.float32,
                                                        allow_missing=True),
                        'n': tf.FixedLenFeature([], tf.int64)
                    }
                )
        )

    if oracle is None:
        return dataset.map(lambda feature: (feature['e'], feature['n']))
    else:
        indices = {s: n for n, s in enumerate(sentences)}
        return dataset.map(lambda feature: \
            (feature['e'],
             feature['n'],
             tf.py_func(
                lambda s: oracle[indices[s.decode('ascii')],:].reshape(NC),
                [feature['s']],
                tf.float32
             ))
        )
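
The dataset above yields variable-length embedding sequences, so a consumer would normally pad-batch it before building an iterator. A minimal consumption sketch (not part of the original), assuming the same NE constant and the no-oracle branch:

ds = load_encoder_dataset(sentences)                       # (embeddings, length) pairs
ds = ds.padded_batch(32, padded_shapes=([None, NE], []))   # pad to the longest sentence in each batch
embeddings, lengths = ds.make_one_shot_iterator().get_next()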
Example #2
class InputManager():
    def __init__(self, datafile, batch_size, repeat):

        self.data_path = datafile

        self.data = None
        self.dataset = TFRecordDataset(datafile, "GZIP")
        self.dataset = self.dataset.map(self._parse_sample)
        self.dataset = self.dataset.repeat(repeat)  # honor the caller's repeat count (None repeats indefinitely)
        self.dataset = self.dataset.shuffle(buffer_size=batch_size * 3)

        self.data = self.dataset.batch(batch_size)

    def _parse_sample(self, example):

        feature_map = {
            'b1': tf.FixedLenFeature(shape=[size], dtype=tf.float32),
            'b2': tf.FixedLenFeature(shape=[size], dtype=tf.float32),
        }

        parsed = tf.parse_single_example(example, feature_map)

        return parsed['b1'], parsed['b2']

    def iterator(self):

        return self.data.make_initializable_iterator()
    def get_data_info(self):
        """
        Returns shape of data, number of labels, steps per epoch of training, validation and test
        """
        dataset = TFRecordDataset(self.train_filenames)
        dataset = dataset.map(self.parser)
        dataset = dataset.take(4)
        iterator = dataset.make_one_shot_iterator()
        sample_data = K.get_session().run(iterator.get_next())

        train_spe = int(
            np.ceil(
                self.count_samples(self.train_filenames) * 1.0 /
                self.batch_size))
        validation_spe = int(
            np.ceil(
                self.count_samples(self.validation_filenames) * 1.0 /
                self.batch_size))
        test_spe = int(
            np.ceil(
                self.count_samples(self.test_filenames) * 1.0 /
                self.batch_size))

        logging.info("Shape of input data: {}".format(sample_data[0].shape))
        logging.info("Number of labels in input data: {}".format(
            sample_data[1].size))
        logging.info(
            "Steps per epoch - Train: {}, Validation: {}, Test: {}".format(
                train_spe, validation_spe, test_spe))

        return (sample_data[0].shape, sample_data[1].size,
                train_spe, validation_spe, test_spe)
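
InputManager hands back an initializable iterator, so the caller must run its initializer before pulling batches. A minimal usage sketch; the file name and batch size here are illustrative, not from the original:

manager = InputManager('samples.tfrecord.gz', batch_size=64, repeat=None)
iterator = manager.iterator()
b1, b2 = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)          # required for an initializable iterator
    first_b1, first_b2 = sess.run([b1, b2])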
Example #4
def load_encoder_dataset2(sentences, oracle=None):
    """
    Variant of load_encoder_dataset that yields the raw word strings ('w')
    instead of precomputed word embeddings.
    """
    dataset = TFRecordDataset([path.join(TFREDIR, sentence+'.tfr')
                               for sentence in sentences])\
        .map(
            lambda record: \
                tf.parse_single_example(
                    record,
                    features={
                        's': tf.FixedLenFeature([], tf.string),
                        'w': tf.FixedLenSequenceFeature([], tf.string,
                                                        allow_missing=True),
                        'n': tf.FixedLenFeature([], tf.int64)
                    }
                )
        )

    if oracle is None:
        return dataset.map(lambda feature: (feature['w'], feature['n']))
    else:
        indices = {s: n for n, s in enumerate(sentences)}
        return dataset.map(lambda feature: \
            (feature['w'],
             feature['n'],
             tf.py_func(
                lambda s: oracle[indices[s.decode('ascii')],:].reshape(NC),
                [feature['s']],
                tf.float32
             ))
        )
Example #5
def tf_create_iterator(dataset, batch_size):
    """ """
    dataset_prefix = os.path.join(SNPX_DATASET_ROOT, dataset, dataset)
    train_rec_file = dataset_prefix + "_train.tfrecords"
    val_rec_file = dataset_prefix + "_val.tfrecords"

    # Create the training dataset object
    train_set = TFRecordDataset(train_rec_file)
    train_set = train_set.map(tf_parse_record,
                              num_threads=4,
                              output_buffer_size=1000)
    train_set = train_set.shuffle(buffer_size=50000)
    train_set = train_set.batch(batch_size)

    # Create the validation dataset object
    val_set = TFRecordDataset(val_rec_file)
    val_set = val_set.map(tf_parse_record)
    val_set = val_set.batch(batch_size)

    # Create a reinitializable iterator from both datasets
    iterator = Iterator.from_structure(train_set.output_types,
                                       train_set.output_shapes)
    train_init_op = iterator.make_initializer(train_set)
    val_init_op = iterator.make_initializer(val_set)
    iter_op = iterator.get_next()
    return train_init_op, val_init_op, iter_op
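
The reinitializable iterator above is driven by running one of the two init ops: train_init_op before a training pass, val_init_op before validation. A minimal driver sketch, assuming tf_parse_record yields (image, label) pairs as in Example #15 (the dataset name and batch size are illustrative):

train_init_op, val_init_op, iter_op = tf_create_iterator('CIFAR-10', batch_size=128)

with tf.Session() as sess:
    sess.run(train_init_op)                 # point the shared iterator at the training set
    while True:
        try:
            images, labels = sess.run(iter_op)
        except tf.errors.OutOfRangeError:
            break                           # training epoch finished
    sess.run(val_init_op)                   # switch the same iterator to the validation set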
Example #7
 def get_datasets(args):
     train_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[0]))
     validate_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[1]))
     test_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[2]))
     train_set = train_set.map(CelebDataset.parse_tfrecord(args))
     validate_set = validate_set.map(CelebDataset.parse_tfrecord(args))
     test_set = test_set.map(CelebDataset.parse_tfrecord(args))
     return {'train': train_set, 'validate': validate_set, 'test': test_set}
 def get_complete_data(self, filename):
     dataset = TFRecordDataset(filename)
     dataset = dataset.map(self.parser)
     dataset = dataset.shuffle(buffer_size=1024, seed=42)
     iterator = dataset.make_one_shot_iterator()
     xt, yt = iterator.get_next()
     xs, ys = [], []
     while True:
         try:
             x, y = K.get_session().run([xt, yt])
             xs.append(x)
             ys.append(y)
         except tf.errors.OutOfRangeError:  # stop once the one-shot iterator is exhausted
             break
     return np.array(xs), np.array(ys)
Example #9
    def get_datasets(args):
        train_fn = os.path.join(args.dataset_dir, _output_files[0])
        train_set = TFRecordDataset(train_fn)
        validate_fn = os.path.join(args.dataset_dir, _output_files[1])
        validate_set = TFRecordDataset(validate_fn)
        test_fn = os.path.join(args.dataset_dir, _output_files[2])
        test_set = TFRecordDataset(test_fn)

        train_set = train_set.map(FloorplanDataset.parse_tfrecord(args))
        validate_set = validate_set.map(FloorplanDataset.parse_tfrecord(args))
        test_set = test_set.map(FloorplanDataset.parse_tfrecord(args))
        return {'train': (train_set, train_fn),
                'validate': (validate_set, validate_fn),
                'test': (test_set, test_fn)}
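
Several of these snippets call a parse_tfrecord(args) factory that is not shown. The usual pattern is a closure that captures args and returns the per-record parse function handed to dataset.map. A hypothetical sketch of that shape; the feature names and sizes are illustrative, not taken from the original datasets:

def parse_tfrecord(args):
    def _parse(example_proto):
        features = tf.parse_single_example(
            example_proto,
            features={
                'image': tf.FixedLenFeature([], tf.string),
                'label': tf.FixedLenFeature([], tf.int64),
            })
        image = tf.decode_raw(features['image'], tf.uint8)
        image = tf.reshape(image, [args.image_size, args.image_size, 3])
        return tf.cast(image, tf.float32) / 255.0, features['label']
    return _parse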
Example #10
    def get_datasets(args):
        def ignore_incomplete_depthmaps(x, y, *args):
            return tf.logical_not(
                tf.reduce_any(
                    tf.logical_or(tf.equal(y, tf.ones_like(y)),
                                  tf.equal(y, tf.zeros_like(y)))))

        datasets = {}
        for k, v in _dataset_files.items():
            fn = os.path.join(args.dataset_dir, v)
            dataset = TFRecordDataset(fn)
            dataset = dataset.map(NYUv2Dataset.parse_tfrecord(args),
                                  num_threads=args.n_threads)
            dataset = dataset.filter(ignore_incomplete_depthmaps)
            datasets[k] = (dataset, fn)
        return datasets
Example #11
        def input_fn():
            ds = TFRecordDataset(tfrecords_filenames)

            def parse_feats(exp):
                feature_def_dict = {
                    'img_id': tf.FixedLenFeature([], tf.string),
                    # 'raw_img': tf.FixedLenFeature([], tf.string),
                    'img_feats': tf.FixedLenFeature([], tf.string),
                    'raw_caps': tf.FixedLenFeature([5], tf.string),
                    'cap_idx': tf.FixedLenFeature([5], tf.string),
                }

                features = tf.parse_single_example(
                    exp,
                    # Defaults are not specified since both keys are required.
                    features=feature_def_dict)
                feats_tensor = tf.reshape(
                    tf.decode_raw(features['img_feats'], tf.float32),
                    [bin_size * bin_size, 1536])
                return feats_tensor

            def parse_caps(exp):
                features = tf.parse_single_example(
                    exp,
                    # Defaults are not specified since both keys are required.
                    features={
                        'img_id': tf.FixedLenFeature([], tf.string),
                        # 'raw_img': tf.FixedLenFeature([], tf.string),
                        'img_feats': tf.FixedLenFeature([], tf.string),
                        'raw_caps': tf.FixedLenFeature([5], tf.string),
                        'cap_idx': tf.FixedLenFeature([5], tf.string),
                    })

                # pick one of the 5 serialized captions at random in the graph;
                # Python's random.choice would fix a single choice when the map fn is traced
                cap_tensor = tf.decode_raw(
                    tf.random_shuffle(features['cap_idx'])[0], tf.int32)

                return cap_tensor

            return ds.map(parse_feats), ds.map(parse_caps)
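
input_fn returns two datasets over the same records, one yielding image features and one yielding a caption. A consumer would typically zip them back together so each element carries both; a minimal sketch under that assumption:

feats_ds, caps_ds = input_fn()
pairs = tf.data.Dataset.zip((feats_ds, caps_ds))
img_feats, caption_idx = pairs.make_one_shot_iterator().get_next()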
Example #12
        def input_fn():
            ds = TFRecordDataset(tfrecords_filenames)

            def parse_feats(exp):
                feature_def_dict = {
                    'img_id': tf.FixedLenFeature([], tf.string),
                    'img_feats': tf.FixedLenFeature([], tf.string),
                }

                features = tf.parse_single_example(
                    exp,
                    # Defaults are not specified since both keys are required.
                    features=feature_def_dict)
                feats_tensor = tf.reshape(
                    tf.decode_raw(features['img_feats'], tf.float32),
                    [bin_size * bin_size, 1536])
                return features['img_id'], feats_tensor

            return ds.map(parse_feats)
Example #13
 def get_datasets(args):
     test_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[0]))
     train_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[1]))
     test_set = test_set.map(MNISTDataset.parse_tfrecord)
     train_set = train_set.map(MNISTDataset.parse_tfrecord)
     return {'train': train_set, 'validate': None, 'test': test_set}
 def prepare_dataset(self, filename):
     """
     Datset transformation pipeline
     :param filename: TfRecord filename
     :return: iterator of x, y batch nodes where x: feature, y: label
     """
     dataset = TFRecordDataset(filename)
     dataset = dataset.map(self.parser,
                           num_threads=4,
                           output_buffer_size=2048)
     # dataset = dataset.shuffle(buffer_size=1024, seed=42)
     dataset = dataset.repeat(count=-1)
     # dataset = dataset.batch(self.batch_size)
     iterator = dataset.make_one_shot_iterator()
     x, y = iterator.get_next()
     x, y = tf.train.shuffle_batch(
         tensors=[x, y],
         shapes=[list(self.shape), [self.num_labels]],
         batch_size=self.batch_size,
         capacity=2048,
         min_after_dequeue=1024,
         enqueue_many=False,
         num_threads=self.num_threads)
     return x, y
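
Because prepare_dataset routes the iterator output through tf.train.shuffle_batch, the session must start queue runners before fetching batches. A minimal consumption sketch; the object and file name are illustrative:

x, y = dataset_obj.prepare_dataset('train.tfrecords')

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    batch_x, batch_y = sess.run([x, y])     # one shuffled batch of (features, labels)
    coord.request_stop()
    coord.join(threads)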
Example #15
def trainyuyu():
    """ """
    tf.logging.set_verbosity(tf.logging.INFO)
    batch_size = 200
    dataset_prefix = os.path.join(SNPX_DATASET_ROOT, "CIFAR-10", "CIFAR-10")
    train_rec_file = dataset_prefix + "_train.tfrecords"

    with tf.Graph().as_default():
        # Create the training dataset object
        train_set = TFRecordDataset(train_rec_file)
        train_set = train_set.map(tf_parse_record,
                                  num_threads=4,
                                  output_buffer_size=1000)
        train_set = train_set.shuffle(buffer_size=10000)
        train_set = train_set.batch(batch_size)

        # Create a reinitializable iterator from both datasets
        iterator = train_set.make_one_shot_iterator()
        images, labels = iterator.get_next()
        onehot_labels = tf.one_hot(labels, 10)
        predictions = snpx_net_create(10, images)

        # Get the optimizer
        opt = tf.train.AdamOptimizer()
        global_step = tf.train.get_or_create_global_step()

        # Compute the loss and the train_op
        loss = tf.losses.softmax_cross_entropy(onehot_labels, predictions)
        total_loss = tf.losses.get_total_loss()
        train_op = opt.minimize(total_loss, global_step=global_step)
        op = [train_op, total_loss]
        tf_sess = tf.Session()
        tf_sess.run(
            tf.group(tf.global_variables_initializer(),
                     tf.local_variables_initializer()))
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=tf_sess, coord=coord)

        count = 0
        last_log_tick = time()
        last_log_batch = 0
        while True:
            try:
                (_, loss_val), step = tf_sess.run([op, global_step])  # the train_op fetch is None; keep the loss value
                count += 1
                print(count, step)
                if (count - last_log_batch) >= 10:
                    elapsed = time() - last_log_tick
                    freq = ((count - last_log_batch) * batch_size / elapsed)
                    last_log_batch = count
                    last_log_tick = time()
                    print(count, loss_val, freq)
            except tf.errors.OutOfRangeError:
                break
Example #16
 def get_datasets(args):
     train_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[0]))
     validate_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[1]))
     test_set = TFRecordDataset(
         os.path.join(args.dataset_dir, _output_files[2]))
     train_set = train_set.map(COCODataset.parse_tfrecord(args),
                               num_threads=args.n_threads)
     validate_set = validate_set.map(COCODataset.parse_tfrecord(args),
                                     num_threads=args.n_threads)
     test_set = test_set.map(COCODataset.parse_tfrecord(args),
                             num_threads=args.n_threads)
     return {
         'train': (train_set,
                   os.path.join(args.dataset_dir, 'coco.train.tfrecords')),
         'validate': (validate_set,
                      os.path.join(args.dataset_dir,
                                   'coco.validate.tfrecords')),
         'test':
         (test_set, os.path.join(args.dataset_dir, 'coco.test.tfrecords'))
     }
Example #17
def load_synthesizer_dataset(sentences):
    """
    Load dataset used for the synthesizer.
    (Linguistic features, Sentence ID) -> Acoustic features
    """
    return TFRecordDataset([path.join(TFRSDIR, sentence+'.tfr')
                            for sentence in sentences])\
        .map(
            lambda record: \
                tf.parse_single_example(
                    record,
                    features={
                        's': tf.FixedLenFeature([], tf.string),
                        'l': tf.FixedLenFeature([NL+9], tf.float32),
                        'a': tf.FixedLenFeature([NA], tf.float32)
                    }
                )
        )\
        .map(
            lambda feature: (feature['l'], feature['s'], feature['a'])
        )
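
Per the docstring, the synthesizer maps (linguistic features, sentence ID) to acoustic features, so a consumer would typically batch the dataset and split each element into inputs and targets. A minimal sketch reusing the NL/NA constants assumed above:

ds = load_synthesizer_dataset(sentences).shuffle(10000).batch(64)
linguistic, sentence_ids, acoustic = ds.make_one_shot_iterator().get_next()
# linguistic: [batch, NL+9] inputs, acoustic: [batch, NA] targets; sentence_ids kept for bookkeeping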
Example #18
 def _get_dataset(file_name, parser):
     """Helper function. Intended to be used by get_datasets."""
     dataset = TFRecordDataset(file_name).map(parser)
     dataset_size = sum(
         [1 for r in tf.python_io.tf_record_iterator(file_name)])
     return dataset, dataset_size
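
A typical caller uses the returned record count to derive steps per epoch before batching. A minimal sketch; the file name, parse_fn and batch_size are illustrative:

train_set, train_size = _get_dataset('train.tfrecords', parse_fn)
steps_per_epoch = int(np.ceil(train_size / float(batch_size)))
train_batches = train_set.shuffle(1000).repeat().batch(batch_size)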
Example #19
    def dataset_batch(self,
                      pre_process_func=None,
                      batch_size=32,
                      shuffle=True,
                      buffer_size=10000,
                      num_threads_map=1,
                      num_epochs=None,
                      cols=None):
        """使用dataset返回一个batch数据(tf.__version__ >=1.3)。写dataset.map函数时需要注意要返回tuple而不是list,并且由于不能
        返回SparseTensor,所以将SpareTensor拆分成3个Tensor。这部分通过参考read_batch_features函数来实现。

        :param pre_process_func: 预处理函数
        :param batch_size:
        :param shuffle: 读取时是否打乱顺序
        :param buffer_size: map和shuffle的buffer大小
        :param num_threads_map: map转换使用的线程数
        :param num_epochs: 获取多少epoch数据,None表示无限
        :param cols:  要返回TFRecord中的哪些feature。get_keys()函数返回值得子集。
        :return: 一个batch数据
        """
        filenames = tf.gfile.Glob(self.pattern)
        dataset = TFRecordDataset(filenames)
        if cols is None:
            cols = self.get_keys()
        # --- using two separate maps takes longer ---
        # The function passed to map cannot return a list; it must return a tuple.
        # dataset = dataset.map(self._parse_example_proto, num_threads=num_threads_map, output_buffer_size=buffer_size)
        # Reason: in the TF source, is_sequence(nest) inside nest.flatten(ret) returns False for a list-typed nest.
        # dataset = dataset.map(lambda feature_dict: tuple(feature_dict[col] for col in cols),
        #                       num_threads=num_threads_map,
        #                       output_buffer_size=buffer_size)

        # Use a closure to merge the two maps into one.
        keys = self.get_keys()
        types = self.get_types()
        cols_types = [types[keys.index(col)] for col in cols]  # types corresponding to cols
        if shuffle:
            dataset = dataset.shuffle(buffer_size=buffer_size)
        dataset = dataset.repeat(num_epochs)
        dataset = dataset.batch(batch_size)
        dataset = dataset.map(_parse_example_proto_cols_closure(
            cols_types, cols),
                              num_threads=num_threads_map,
                              output_buffer_size=buffer_size)
        sparse_bool_list = []
        if pre_process_func is not None:  # apply preprocessing if requested
            dataset = dataset.map(
                _preprocess_sparsetensor(pre_process_func, cols_types,
                                         sparse_bool_list))
        iterator = dataset.make_one_shot_iterator()
        output = iterator.get_next()
        # reassemble each (indices, values, dense_shape) triple into a SparseTensor
        output_sparse = []
        index = 0
        if pre_process_func and sparse_bool_list:  # the user supplied a preprocess function
            for sparse_bool in sparse_bool_list:
                if sparse_bool:
                    output_sparse.append(
                        sparse_tensor.SparseTensor(indices=output[index],
                                                   values=output[index + 1],
                                                   dense_shape=output[index +
                                                                      2]))
                    index += 3
                else:
                    output_sparse.append(output[index])
                    index += 1
        else:  # no user preprocess function
            for col_type in cols_types:
                if col_type.get_isfix():
                    output_sparse.append(output[index])
                    index += 1
                else:
                    output_sparse.append(
                        sparse_tensor.SparseTensor(indices=output[index],
                                                   values=output[index + 1],
                                                   dense_shape=output[index +
                                                                      2]))
                    index += 3
        return output_sparse
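
A minimal call sketch for dataset_batch, assuming a reader object whose pattern, get_keys() and get_types() are already configured; the class and feature names are hypothetical:

reader = TFRecordHelper(pattern='data/*.tfrecords')
batch = reader.dataset_batch(batch_size=64, shuffle=True, num_epochs=1,
                             cols=['feature_a', 'label'])   # must be a subset of get_keys()

with tf.Session() as sess:
    feature_a, label = sess.run(batch)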
Example #20
def get_dataset(args):
    if args.dataset == 'floorplans':
        fn = 'data/floorplans.train.tfrecords'
        d = TFRecordDataset(fn)
        d = d.map(parse_floorplans)
    elif args.dataset == 'cifar':
        fn = 'data/cifar.32.train.tfrecords'
        d = TFRecordDataset(fn)
        d = d.map(parse_cifar)
    else:
        # fail fast rather than hitting a NameError on fn/d below
        raise ValueError('unknown dataset: {}'.format(args.dataset))

    # d = tf.train.shuffle_batch(d, batch_size=args.batch_size, capacity=10000, num_threads=4, min_after_dequeue=20)
    # x = d
    # d = d.cache('tmp/')
    d = d.cache(os.path.join(args.cache_dir,
                             'cache')) if args.cache_dir else d.cache()
    d = d.repeat()
    d = d.shuffle(buffer_size=args.buffer_size)
    d = d.batch(args.batch_size * args.n_gpus)

    iterator = d.make_initializable_iterator()
    x = iterator.get_next()

    # determine size of dataset
    c = sum([1 for r in tf.python_io.tf_record_iterator(fn)])

    # return d, int(c)

    return x, iterator.initializer, int(c)
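
A minimal driver sketch for get_dataset, assuming args carries the fields used above (dataset, cache_dir, buffer_size, batch_size, n_gpus):

x, init_op, n_examples = get_dataset(args)
steps_per_epoch = n_examples // (args.batch_size * args.n_gpus)

with tf.Session() as sess:
    sess.run(init_op)                       # required before the first get_next()
    for _ in range(steps_per_epoch):        # the dataset repeats, so bound the loop explicitly
        batch = sess.run(x)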