Example #1
  def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
    if self._num_gpus > 1:
      dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
      # Shuffle the input files.
      dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
      dataset = dataset.repeat()

    # Read the data from disk in parallel
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
      dataset = dataset.cache()

    if self.is_training:
      dataset = dataset.shuffle(self._shuffle_buffer_size)
      dataset = dataset.repeat()

    # Parse, pre-process, and batch the data in parallel
    preprocess = self.parse_record
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
      # The batch size of the dataset will be multiplied by the number of
      # replicas automatically when strategy.distribute_datasets_from_function
      # is called, so we use local batch size here.
      dataset = dataset.batch(self.local_batch_size,
                              drop_remainder=self.is_training)
    else:
      dataset = dataset.batch(self.global_batch_size,
                              drop_remainder=self.is_training)

    # Apply Mixup
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
        num_parallel_calls=64)

    # Prefetch overlaps in-feed with training
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
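A side note on the ordering above: sharding must happen before any shuffle, so each Horovod worker keeps a disjoint slice of the files and only permutes within it. A minimal sketch of that interaction, with the worker count and rank hardcoded in place of self._num_gpus and hvd.rank():

import tensorflow as tf

# Toy stand-in for the file-level dataset: 8 "files", 2 hypothetical workers.
files = tf.data.Dataset.range(8)
num_workers, worker_rank = 2, 0  # stands in for self._num_gpus and hvd.rank()

shard = files.shard(num_workers, worker_rank)  # worker 0 keeps 0, 2, 4, 6
shard = shard.shuffle(buffer_size=4)           # permutes only within the shard
print(list(shard.as_numpy_iterator()))         # e.g. [4, 0, 6, 2]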
Example #2
def get_augmented_data(
    dataset: tf.data.Dataset,
    batch_size: int,
    map_func: Callable,
    shuffle_buffer: Optional[int] = None,
    shuffle_seed: Optional[int] = None,
    augment_seed: Optional[int] = None,
    use_stateless_map: bool = False,
) -> RepeatedData:
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer, seed=shuffle_seed)
    dataset = dataset.batch(batch_size)
    steps_per_epoch = tf.keras.backend.get_value(dataset.cardinality())
    # repeat before map so stateless map is different across epochs
    dataset = dataset.repeat()
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    if use_stateless_map:
        dataset = dataset.apply(
            tfrng.data.stateless_map(
                map_func,
                seed=augment_seed,
                num_parallel_calls=AUTOTUNE,
            ))
    else:
        # if map_func has random elements this won't be deterministic
        dataset = dataset.map(map_func, num_parallel_calls=AUTOTUNE)
    dataset = dataset.prefetch(AUTOTUNE)
    return RepeatedData(dataset, steps_per_epoch)
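A hypothetical driver for get_augmented_data, assuming RepeatedData is a plain (dataset, steps_per_epoch) pair; use_stateless_map is left False so the tfrng dependency is not exercised:

import collections
import tensorflow as tf

# Assumed container; the original module presumably defines an equivalent.
RepeatedData = collections.namedtuple('RepeatedData',
                                      ['dataset', 'steps_per_epoch'])

ds = tf.data.Dataset.from_tensor_slices(tf.random.uniform((100, 8)))
repeated = get_augmented_data(
    ds,
    batch_size=10,
    map_func=lambda x: x + tf.random.normal(tf.shape(x), stddev=0.1),
    shuffle_buffer=100,
)
print(repeated.steps_per_epoch)  # 10; pass to model.fit(..., steps_per_epoch=10)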
Example #3
 def train(self, dataset: tf.data.Dataset, nr_records: int):
     # Shuffle individual examples before batching, and repeat last so the
     # shuffle buffer does not blend examples across epoch boundaries.
     dataset = dataset.shuffle(1000)
     dataset = dataset.batch(self.batch_size).map(self.transform_example)
     dataset = dataset.repeat()
     self.model.fit(dataset,
                    epochs=self.epochs,
                    steps_per_epoch=nr_records // self.batch_size)
Example #4
def get_dataloader(dataset: tf.data.Dataset,
                   transform: Callable,
                   train: bool = False,
                   n_samples: Optional[int] = None,
                   epoch_frac: Optional[float] = None,
                   batch_size: int = 4,
                   n_workers: int = 4,
                   random_seed: int = 82):
    n_samples = n_samples or iterable_len(sample_generator(dataset, n_workers))
    n_samples = int(n_samples *
                    epoch_frac) if epoch_frac is not None else n_samples

    if train:
        # shuffle data
        dataset = dataset.shuffle(buffer_size=n_samples,
                                  seed=random_seed,
                                  reshuffle_each_iteration=True)

    sample_generator_ = sample_generator(dataset, n_workers)
    dataset = BaseAutoDLDataset(sample_generator=sample_generator_,
                                n_samples=n_samples,
                                transform=transform)
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=0,
                        pin_memory=torch.cuda.is_available(),
                        drop_last=train)

    return loader
Example #5
def memoize(dataset: tf.data.Dataset) -> tf.data.Dataset:
    data = []
    with tf.Graph().as_default(), tf.Session(
            config=utils.get_config()) as session:
        dataset = dataset.prefetch(16)
        it = dataset.make_one_shot_iterator().get_next()
        try:
            while True:
                data.append(session.run(it))
        except tf.errors.OutOfRangeError:
            pass
    images = np.stack([x['image'] for x in data])
    labels = np.stack([x['label'] for x in data])

    def tf_get(index):
        def get(index):
            return images[index], labels[index]

        image, label = tf.py_func(get, [index], [tf.float32, tf.int64])
        return dict(image=image, label=label)

    dataset = tf.data.Dataset.range(len(data)).repeat()
    dataset = dataset.shuffle(
        len(data) if len(data) < FLAGS.shuffle else FLAGS.shuffle)
    return dataset.map(tf_get)
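memoize is TF1-style code (tf.Session, make_one_shot_iterator, tf.py_func). A rough TF2 sketch of the same idea, assuming the same {'image', 'label'} element structure and dropping the FLAGS dependency:

import numpy as np
import tensorflow as tf

def memoize_v2(dataset: tf.data.Dataset,
               shuffle_buffer: int = 1024) -> tf.data.Dataset:
    # Materialize the dataset once in eager mode, then serve random indices.
    data = list(dataset.as_numpy_iterator())
    images = np.stack([x['image'] for x in data])
    labels = np.stack([x['label'] for x in data])
    ds = tf.data.Dataset.range(len(data)).repeat()
    ds = ds.shuffle(min(len(data), shuffle_buffer))
    # The arrays are captured as graph constants, so this only suits datasets
    # that fit comfortably in memory.
    return ds.map(lambda i: dict(image=tf.gather(images, i),
                                 label=tf.gather(labels, i)))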
Example #6
File: Task.py Project: nitrogenase/TAPE
    def prepare_dataset(self,
                        dataset: tf.data.Dataset,
                        buckets: List[int],
                        batch_sizes: List[int],
                        shuffle: bool = False) -> tf.data.Dataset:
        dataset = dataset.map(self._deserialization_func,
                              num_parallel_calls=128)

        buckets_array = np.array(buckets)
        batch_sizes_array = np.array(batch_sizes)

        if np.any(batch_sizes_array == 0) and shuffle:
            iszero = np.where(batch_sizes_array == 0)[0][0]
            filterlen = buckets_array[iszero - 1]
            print("Filtering sequences of length {}".format(filterlen))
            dataset = dataset.filter(
                lambda example: example['protein_length'] < filterlen)
        else:
            batch_sizes_array[batch_sizes_array <= 0] = 1

        dataset = dataset.shuffle(1024) if shuffle else dataset.prefetch(1024)
        batch_fun = tf.data.experimental.bucket_by_sequence_length(
            operator.itemgetter('protein_length'), buckets_array,
            batch_sizes_array)
        dataset = dataset.apply(batch_fun)
        return dataset
Example #7
def preprocessing(dsData: tf.data.Dataset, window_size, batch_size):
    dsData = dsData.window(window_size + 1, shift=1, drop_remainder=True)
    dsData = dsData.flat_map(lambda w: w.batch(window_size + 1))
    dsData = dsData.map(lambda x: (x[:-1], x[-1]))
    dsData = dsData.shuffle(1000)
    dsData = dsData.batch(batch_size).prefetch(1)
    return dsData
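A concrete run of preprocessing on a toy series, to show what the window/flat_map pair yields: each element becomes a (window_size inputs, next-value label) pair, which is then shuffled and batched:

import tensorflow as tf

series = tf.data.Dataset.range(10)
ds = preprocessing(series, window_size=3, batch_size=2)
for x, y in ds.take(1):
    print(x.numpy(), y.numpy())  # e.g. [[2 3 4] [5 6 7]] [5 8]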
Example #8
    def _prepare_dataset(
        self,
        dataset: tf.data.Dataset,
        shuffle: bool = False,
        augment: bool = False
    ) -> tf.data.Dataset:
        preprocessing_model = self._build_preprocessing()
        dataset = dataset.map(
            map_func=lambda x, y: (preprocessing_model(x, training=False), y),
            num_parallel_calls=tf.data.experimental.AUTOTUNE
        )

        if shuffle:
            dataset = dataset.shuffle(buffer_size=1_000)

        dataset = dataset.batch(batch_size=self.batch_size)

        if augment:
            data_augmentation_model = self._build_data_augmentation()
            # training=True keeps the random augmentation layers active.
            dataset = dataset.map(
                map_func=lambda x, y: (data_augmentation_model(x, training=True), y),
                num_parallel_calls=tf.data.experimental.AUTOTUNE
            )

        return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
Example #9
File: tuner.py Project: prouhard/autoyml
    def run_trial(
        self,
        trial: Trial,
        dataset: tf.data.Dataset,
        n_splits: int = 5,
    ) -> None:
        """Evaluate the current set of hypermarameters with cross-validation.

        Args:
            trial: A Trial instance passed by the tuner, with the hyperparameters.
            dataset: The training data.
            n_splits: The number of folds to use, defaults to 5.

        Returns:
            None

        """
        val_losses = []
        # A fixed seed and reshuffle_each_iteration=False keep the n_splits
        # shards disjoint across iterations; the seed value here is arbitrary.
        shuffled_dataset = dataset.shuffle(buffer_size=len(dataset), seed=0,
                                           reshuffle_each_iteration=False)
        shards = [shuffled_dataset.shard(n_splits, i) for i in range(n_splits)]
        for split in range(n_splits):
            dataset_train, dataset_val = self._cv_concatenate(shards, split)
            model = self.hypermodel.build(trial.hyperparameters, dataset_train)
            print(f"Fitting model (CV {split + 1} / {n_splits})...")
            class_weight = DataPreprocessor.get_class_weight(dataset_train)
            model.fit(dataset_train, class_weight=class_weight)
            print(f"Evaluating model (CV {split + 1} / {n_splits})...")
            val_losses.append(model.evaluate(dataset_val))
        self.oracle.update_trial(trial.trial_id,
                                 {"val_loss": np.mean(val_losses)})
        self.save_model(trial.trial_id, model)
Example #10
    def process(self, dataset: tf.data.Dataset, batch_size: int):
        dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE)

        if self.cache:
            dataset = dataset.cache()

        if self.shuffle:
            dataset = dataset.shuffle(self.buffer_size,
                                      reshuffle_each_iteration=True)

        # PADDED BATCH the dataset
        dataset = dataset.padded_batch(
            batch_size=batch_size,
            padded_shapes=(
                tf.TensorShape([]),
                tf.TensorShape(self.speech_featurizer.shape),
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([]),
            ),
            padding_values=("", 0., 0, self.text_featurizer.blank, 0,
                            self.text_featurizer.blank, 0),
            drop_remainder=self.drop_remainder)

        # PREFETCH to overlap input-pipeline work with model execution
        dataset = dataset.prefetch(AUTOTUNE)
        self.total_steps = get_num_batches(self.total_steps, batch_size)
        return dataset
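For readers unfamiliar with padded_batch, a self-contained miniature of the padding behaviour the snippet relies on, with the element structure simplified to one variable-length tensor:

import tensorflow as tf

ds = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in (2, 3, 5, 4)),
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32))
for batch in ds.padded_batch(2, padded_shapes=[None], padding_values=0):
    print(batch.numpy())  # each row is padded to the longest row in its batch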
Example #11
File: trainer.py Project: pikma/rubik
def train_value_model(model: tf.keras.Model,
                      examples: tf.data.Dataset) -> None:
    '''Takes a compiled model and trains it.

    Trains a value model in a supervised manner: it simply tries to predict the
    value at each state.
    '''

    # Training examples are generated from trajectories, so consecutive
    # examples are strongly correlated. This increases the variance of the
    # gradient. Shuffling the examples reduces the variance and speeds up
    # training significantly.
    examples = examples.shuffle(
        buffer_size=4096).batch(BATCH_SIZE).prefetch(16)

    evaluation_df = pd.DataFrame()
    for epoch in range(NUM_EPOCHS):
        model.fit(
            x=examples,
            epochs=epoch + 1,  # Train for one epoch.
            steps_per_epoch=NUM_STEPS_PER_EPOCH,
            initial_epoch=epoch,
            callbacks=[
                tf.keras.callbacks.TensorBoard(log_dir='/tmp/tensorboard')
            ])
        epoch_evaluation_df = evaluate_model(model)
        print(epoch_evaluation_df)
        epoch_evaluation_df['epoch'] = epoch
        evaluation_df = pd.concat([evaluation_df, epoch_evaluation_df],
                                  ignore_index=True)
Example #12
def make_dataset(
    dataset: tf.data.Dataset,
    train: bool,
    image_size: int = smaller_size,
    fixres: bool = True,
    num_parallel_calls=auto,
):
    if image_size not in [smaller_size, bigger_size]:
        raise ValueError(f"{image_size} resolution is not supported.")

    # Determine which preprocessing function we are using.
    if image_size == smaller_size:
        preprocess_func = preprocess_initial(train, image_size)
    elif not fixres and image_size == bigger_size:
        preprocess_func = preprocess_initial(train, image_size)
    else:
        preprocess_func = preprocess_finetune

    if train:
        dataset = dataset.shuffle(batch_size * 10)

    return (dataset.map(
        lambda x, y: preprocess_func(x, y, train),
        num_parallel_calls=num_parallel_calls,
    ).batch(batch_size).prefetch(num_parallel_calls))
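make_dataset leans on module-level globals that are out of frame here. A hypothetical minimal set, only to make the snippet's context explicit (the values are placeholders, not the source's):

import tensorflow as tf

smaller_size, bigger_size = 160, 224   # the two training resolutions
batch_size = 128
auto = tf.data.AUTOTUNE                # default for num_parallel_calls
# preprocess_initial(train, size) and preprocess_finetune are assumed to be
# defined elsewhere in the same module.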
Example #13
    def get_bucket_iter(self,
                        dataset: tf.data.Dataset,
                        batch_size=32,
                        train=True) -> tf.data.Dataset:
        padded_shapes = self._padded_shapes()
        padding_values = self._padding_values()
        if train:
            bucket_boundaries = self._bucket_boundaries(batch_size)
            bucket_batch_sizes = [batch_size] * (len(bucket_boundaries) + 1)
            dataset = dataset.apply(
                tf.data.experimental.bucket_by_sequence_length(
                    self.element_length_func,
                    bucket_boundaries,
                    bucket_batch_sizes,
                    padded_shapes=padded_shapes,
                    padding_values=padding_values))
            dataset = dataset.shuffle(100)
        else:
            dataset = dataset.padded_batch(batch_size,
                                           padded_shapes=padded_shapes)

        dataset = dataset.map(self._collate_fn,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)
        dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
        return dataset
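A tiny, self-contained illustration of the bucketing step: sequences on either side of a length boundary batch separately, so padding waste stays small. Recent TF also exposes this as a Dataset method, used here for brevity:

import tensorflow as tf

ds = tf.data.Dataset.from_generator(
    lambda: (tf.range(n) for n in (2, 5, 3, 6)),
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32))
ds = ds.bucket_by_sequence_length(
    element_length_func=tf.size,
    bucket_boundaries=[4],      # two buckets: length < 4 and length >= 4
    bucket_batch_sizes=[2, 2])
for batch in ds:
    print(batch.numpy())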
Example #14
def prepare_Dataset(dataset: tf.data.Dataset,
                    shuffle: bool = False,
                    augment: bool = False) -> tf.data.Dataset:
    """Prepare the dataset object with preprocessing and data augmentation.

    Parameters
    ----------
    dataset : tf.data.Dataset
        The dataset object
    shuffle : bool, optional
        Whether to shuffle the dataset, by default False
    augment : bool, optional
        Whether to augment the train dataset, by default False

    Returns
    -------
    tf.data.Dataset
        The prepared dataset
    """
    preprocessing_model = build_preprocessing()
    dataset = dataset.map(map_func=lambda x, y: (preprocessing_model(x), y),
                          num_parallel_calls=AUTOTUNE)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=1_000)

    dataset = dataset.batch(batch_size=BATCH_SIZE)

    if augment:
        data_augmentation_model = build_data_augmentation()
        # Call with training=True so the random augmentation layers are active.
        dataset = dataset.map(map_func=lambda x, y:
                              (data_augmentation_model(x, training=True), y),
                              num_parallel_calls=AUTOTUNE)

    return dataset.prefetch(buffer_size=AUTOTUNE)
Example #15
    def get_tfds_data_loader(data: tf.data.Dataset, data_subset_mode='train',
                             batch_size=32, num_samples=100, num_classes=19,
                             infinite=True, augment=True, seed=2836):

        def encode_example(x, y):
            x = tf.image.convert_image_dtype(x, tf.float32) * 255.0
            y = _encode_label(y, num_classes=num_classes)
            return x, y

        test_d = next(iter(data))
        print(test_d[0].numpy().min())
        print(test_d[0].numpy().max())

        data = data.shuffle(buffer_size=num_samples) \
                   .cache() \
                   .map(encode_example, num_parallel_calls=AUTOTUNE)

        test_d = next(iter(data))
        print(test_d[0].numpy().min())
        print(test_d[0].numpy().max())

        data = data.map(preprocess_input, num_parallel_calls=AUTOTUNE)

        test_d = next(iter(data))
        print(test_d[0].numpy().min())
        print(test_d[0].numpy().max())

        if data_subset_mode == 'train':
            data = data.shuffle(buffer_size=100, seed=seed)
            augmentor = TRAIN_image_augmentor
        elif data_subset_mode == 'val':
            augmentor = VAL_image_augmentor
        elif data_subset_mode == 'test':
            augmentor = TEST_image_augmentor
        else:
            raise ValueError(f"Unknown data_subset_mode: {data_subset_mode}")

        if augment:
            data = augmentor.apply_augmentations(data)

        test_d = next(iter(data))
        print(test_d[0].numpy().min())
        print(test_d[0].numpy().max())

        data = data.batch(batch_size, drop_remainder=True)
        if infinite:
            data = data.repeat()

        return data.prefetch(AUTOTUNE)
Example #16
 def apply(self, dataset: tf.data.Dataset, mode: str = None):
     if mode is not None and self.modes is not None and mode not in self.modes:
         LOGGER.info(f"Not applying {self} (mode={mode})")
         return dataset
     return dataset.shuffle(
         self.buffer_size,
         seed=self.seed,
         reshuffle_each_iteration=self.reshuffle_each_iteration)
Example #17
def processing(dataset: tf.data.Dataset, window_size, batch_size):
    dataset = dataset.map(lambda x: table.lookup(x))
    dataset = dataset.unbatch()
    dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda ds: ds.batch(window_size+1))
    dataset = dataset.map(lambda x: (x[:-1], x[-1]-1))
    dataset = dataset.shuffle(10000)
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset
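processing assumes a module-level lookup table. One plausible shape for it, purely illustrative (the keys and ids here are made up):

import tensorflow as tf

# Hypothetical vocabulary table mapping string tokens to integer ids.
keys = tf.constant(['the', 'cat', 'sat'])
vals = tf.constant([1, 2, 3], dtype=tf.int64)
table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, vals), default_value=0)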
Example #18
 def create_dataset(self, dataset: tf.data.Dataset,
                    input_columns, output_columns,
                    batch_size: int, use_cache: bool):
     dataset = self.add_feature_columns_to_dataset(dataset, input_columns, output_columns)
     if use_cache:
         dataset = dataset.cache("cache").repeat()
     dataset = dataset.shuffle(1000, reshuffle_each_iteration=True)
     dataset = dataset.batch(batch_size, drop_remainder=True)
     return dataset
Example #19
    def train(self, dataset: tf.data.Dataset, nr_records: int):
        dataset = dataset.shuffle(512, reshuffle_each_iteration=True)
        dataset = dataset.batch(self.batch_size)
        history = list()
        for i in range(self.epochs):

            dataset = dataset.shuffle(512)
            step = 0
            epoch_start = time.time()
            for data in dataset:
                users = data['user_id']
                rated = tf.cast(data['x'], tf.float32)
                mask = tf.cast(data['mask'], tf.float32)

                loss_value, grads = train_dtn(self.model, users, rated, mask)
                self.optimizer.apply_gradients(
                    zip(grads, self.model.trainable_variables))
                diff = time.time() - epoch_start
                if step % 20 == 0:
                    predictions = self.predict(rated, users)
                    train_accuracy = Evaluation.tf_calculate_accuracy(
                        predictions, data['x'], data['mask'])
                    eval_x = data['x_test']
                    eval_mask = data['mask_test']
                    eval_accuracy = Evaluation.tf_calculate_accuracy(
                        predictions, eval_x, eval_mask)

                    print(
                        "\rEpoch #{} Loss at step {}: {:.4f}, time: {:.3f}. Train accuracy {:.3f}, Validation accuracy {:.3f}"
                        .format(i, step,
                                tf.reduce_mean(loss_value).numpy(), diff,
                                train_accuracy, eval_accuracy),
                        end='\r')
                else:
                    print("\rEpoch #{} Loss at step {}: {:.4f}, time: {:.3f}".
                          format(i, step,
                                 tf.reduce_mean(loss_value).numpy(), diff),
                          end='\r')

                step += 1

            print()
            self.epochs_trained += 1
Example #20
    def _prepare_dataset(self,
                         dataset: tf.data.Dataset,
                         shuffle: bool = False,
                         augment: bool = False) -> tf.data.Dataset:
        if shuffle:
            dataset = dataset.shuffle(buffer_size=1_000)

        dataset = dataset.batch(batch_size=self.batch_size)

        return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
Example #21
 def train(self, dataset: tf.data.Dataset, nr_records: int):
     # Shuffle individual examples before batching rather than shuffling
     # whole batches.
     dataset = dataset.shuffle(1000)
     dataset = dataset.batch(self.batch_size)
     nr_steps = nr_records // self.batch_size
     for i in range(self.epochs):
         step = 0
         for data in dataset:
             loss_value, grads = grad(self.model, data)
             self.optimizer.apply_gradients(zip(grads, [self.model.U, self.model.P]))
             printProgressBar(step, nr_steps,
                              'Epoch {}, loss:  {:.3f}'.format(i, loss_value),
                              length=80)
             step += 1
Example #22
    def __init__(self, factory: TFToxicDataSetsFactory,
                 dataset: tf.data.Dataset, size: int):
        assert isinstance(factory, TFToxicDataSetsFactory) and isinstance(
            dataset, tf.data.Dataset)
        assert size > 0

        self._factory = factory
        self._dataset = dataset.shuffle(1000).batch(
            self.batch_size).prefetch(1)
        self._size = size
        self._batch_index = 0
Example #23
 def preprocess_fn(dataset: tf.data.Dataset) -> tf.data.Dataset:
   if shuffle_buffer_size > 1:
     dataset = dataset.shuffle(shuffle_buffer_size, seed=debug_seed)
   if preprocess_spec.num_epochs > 1:
     dataset = dataset.repeat(preprocess_spec.num_epochs)
   if preprocess_spec.max_elements is not None:
     dataset = dataset.take(preprocess_spec.max_elements)
   dataset = dataset.batch(preprocess_spec.batch_size, drop_remainder=False)
   return dataset.map(
       mapping_fn,
       num_parallel_calls=num_parallel_calls,
       deterministic=debug_seed is not None)
Example #24
def train_fn(ds: tf.data.Dataset,
             batch_size=1,
             shuffle=10000,
             repeat: int = None):
    '''Create input function for training, prediction, evaluation.'''

    if shuffle:
        ds = ds.shuffle(shuffle)
    ds = ds.batch(batch_size)
    if repeat != 1:
        ds = ds.repeat(repeat)

    return lambda: ds.make_one_shot_iterator().get_next()
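This one is also TF1-era (make_one_shot_iterator). Under TF2-style Estimator input functions, a sketch of the same logic can simply return the dataset itself:

import tensorflow as tf

def train_fn_v2(ds: tf.data.Dataset, batch_size=1, shuffle=10000, repeat=None):
    '''TF2-style sketch: an input_fn may return the tf.data.Dataset directly.'''
    if shuffle:
        ds = ds.shuffle(shuffle)
    ds = ds.batch(batch_size)
    if repeat != 1:
        ds = ds.repeat(repeat)
    return lambda: ds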
Example #25
 def prepare_dataset(self,
                     dataset: tf.data.Dataset,
                     buckets: List[int],
                     batch_sizes: List[int],
                     shuffle: bool = False) -> tf.data.Dataset:
     dataset = dataset.map(self._deserialization_func, 128)
     dataset = dataset.shuffle(1024) if shuffle else dataset.prefetch(1024)
     batch_fun = tf.data.experimental.bucket_by_sequence_length(
         lambda example: tf.maximum(example['first']['protein_length'],
                                    example['second']['protein_length']),
         buckets, batch_sizes)
     dataset = dataset.apply(batch_fun)
     return dataset
Example #26
 def __init__(
     self,
     dataset: tf.data.Dataset,
     patch_shape: list,
     targeted: bool = True,
     patch_initialization: str = "data",
     n_classes: int = 200,
 ):
     self.dataset = dataset.shuffle(buffer_size=100).repeat()
     self.patch_shape = patch_shape
     self.targeted = targeted
     self.n_classes = n_classes
     self.patch_initialization = patch_initialization
     self.sample_counter = -1
Example #27
def train(generator: kr.Model, discriminator: kr.Model,
          train_dataset: tf.data.Dataset, epoch):
    train_dataset = train_dataset.shuffle(10000).batch(
        HP.batch_size).prefetch(1)
    discriminator_losses, generator_losses = [], []

    for data in train_dataset:
        discriminator_loss, generator_loss = _train(generator, discriminator,
                                                    data, epoch)
        discriminator_losses.append(discriminator_loss)
        generator_losses.append(generator_loss)

    return tf.reduce_mean(discriminator_losses), tf.reduce_mean(
        generator_losses)
Example #28
    def _prepare_dataset(self,
                         dataset: tf.data.Dataset,
                         shuffle: bool = False) -> tf.data.Dataset:
        dataset = dataset.map(
            map_func=lambda x, y: (tf.reshape(self.normalization_layer(
                tf.reshape(x, shape=(1, self.num_features)), training=False),
                                              shape=(self.num_features, )), y),
            num_parallel_calls=tf.data.experimental.AUTOTUNE)

        if shuffle:
            dataset = dataset.shuffle(buffer_size=1_000)

        dataset = dataset.batch(batch_size=self.batch_size)

        return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
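The double reshape above exists because the normalization layer expects a leading batch dimension. A standalone sketch of the same pattern with a Keras Normalization layer (the layer and data here are illustrative, not the source's):

import tensorflow as tf

norm = tf.keras.layers.Normalization(axis=-1)
norm.adapt(tf.random.uniform((32, 4)))         # fit mean/variance on data
x = tf.random.uniform((4,))                    # one unbatched example
y = tf.squeeze(norm(tf.expand_dims(x, 0)), 0)  # add batch axis, normalize, drop it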
Example #29
def get_features(generator: kr.Model, condition, real_image_dataset: tf.data.Dataset):
    real_image_dataset = real_image_dataset.shuffle(10000).batch(HP.batch_size).prefetch(1)

    real_features = []
    fake_features = []

    for real_images in real_image_dataset:
        real_features_batch, fake_features_batch = _get_feature_samples(generator, condition, real_images)
        real_features.append(real_features_batch)
        fake_features.append(fake_features_batch)

    real_features = tf.concat(real_features, axis=0)
    fake_features = tf.concat(fake_features, axis=0)

    return real_features, fake_features
Example #30
 def get_data_iter(self,
                   dataset: tf.data.Dataset,
                   batch_size=32,
                   train=True) -> tf.data.Dataset:
     padded_shapes = self._padded_shapes()
     padding_values = self._padding_values()
     if train:
         dataset = dataset.shuffle(batch_size * 100)
     dataset = dataset.padded_batch(batch_size,
                                    padded_shapes=padded_shapes,
                                    padding_values=padding_values)
     dataset = dataset.map(self._collate_fn,
                           num_parallel_calls=tf.data.experimental.AUTOTUNE)
     dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
     return dataset