def pipeline(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
        dataset: A `tf.data.Dataset` that loads raw files.

    Returns:
        A TensorFlow dataset outputting batched images and labels.
    """
    if self._num_gpus > 1:
        dataset = dataset.shard(self._num_gpus, hvd.rank())

    if self.is_training:
        # Shuffle the input files. `Dataset.shuffle` returns a new dataset,
        # so the result must be reassigned.
        dataset = dataset.shuffle(buffer_size=self._file_shuffle_buffer_size)

    if self.is_training and not self._cache:
        dataset = dataset.repeat()

    # Read the data from disk in parallel.
    dataset = dataset.interleave(
        tf.data.TFRecordDataset,
        cycle_length=10,
        block_length=1,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._cache:
        dataset = dataset.cache()

    if self.is_training:
        dataset = dataset.shuffle(self._shuffle_buffer_size)
        dataset = dataset.repeat()

    # Parse, preprocess, and batch the data in parallel.
    preprocess = self.parse_record
    dataset = dataset.map(
        preprocess, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if self._num_gpus > 1:
        # The batch size of the dataset will be multiplied by the number of
        # replicas automatically when strategy.distribute_datasets_from_function
        # is called, so we use the local batch size here.
        dataset = dataset.batch(
            self.local_batch_size, drop_remainder=self.is_training)
    else:
        dataset = dataset.batch(
            self.global_batch_size, drop_remainder=self.is_training)

    # Apply mixup (disabled at evaluation time by setting alpha to 0).
    mixup_alpha = self.mixup_alpha if self.is_training else 0.0
    dataset = dataset.map(
        functools.partial(self.mixup, self.local_batch_size, mixup_alpha),
        num_parallel_calls=64)

    # Prefetch overlaps the input pipeline with training.
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

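# The comment above mentions strategy.distribute_datasets_from_function; a
# minimal sketch of that wiring, where `builder` (an instance of the class
# owning `pipeline`) and `make_file_dataset()` are hypothetical stand-ins:
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()

def dataset_fn(input_context: tf.distribute.InputContext) -> tf.data.Dataset:
    # `pipeline` batches with the local (per-replica) batch size; the strategy
    # assembles the global batch from the per-replica batches.
    return builder.pipeline(make_file_dataset())

dist_dataset = strategy.distribute_datasets_from_function(dataset_fn)
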
def get_augmented_data(
    dataset: tf.data.Dataset,
    batch_size: int,
    map_func: Callable,
    shuffle_buffer: Optional[int] = None,
    shuffle_seed: Optional[int] = None,
    augment_seed: Optional[int] = None,
    use_stateless_map: bool = False,
) -> RepeatedData:
    if shuffle_buffer is not None:
        dataset = dataset.shuffle(shuffle_buffer, seed=shuffle_seed)
    dataset = dataset.batch(batch_size)
    steps_per_epoch = tf.keras.backend.get_value(dataset.cardinality())
    # Repeat before map so the stateless map is different across epochs.
    dataset = dataset.repeat()
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    if use_stateless_map:
        dataset = dataset.apply(
            tfrng.data.stateless_map(
                map_func,
                seed=augment_seed,
                num_parallel_calls=AUTOTUNE,
            ))
    else:
        # If map_func has random elements this won't be deterministic.
        dataset = dataset.map(map_func, num_parallel_calls=AUTOTUNE)
    dataset = dataset.prefetch(AUTOTUNE)
    return RepeatedData(dataset, steps_per_epoch)

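# Because the dataset above repeats indefinitely, the recorded steps_per_epoch
# is what tells Keras where an epoch ends. A minimal usage sketch, assuming
# RepeatedData is a simple (dataset, steps_per_epoch) container as constructed
# above, and that raw_dataset / model are defined elsewhere:
repeated = get_augmented_data(
    raw_dataset,
    batch_size=32,
    map_func=lambda x, y: (x, y),   # placeholder for a real augmentation
    shuffle_buffer=1024,
)
model.fit(repeated.dataset,
          epochs=10,
          steps_per_epoch=repeated.steps_per_epoch)
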
def train(self, dataset: tf.data.Dataset, nr_records: int):
    dataset = dataset.batch(self.batch_size).map(self.transform_example)
    dataset = dataset.repeat()
    # Note: shuffling after batch/repeat shuffles whole batches, not
    # individual examples.
    dataset = dataset.shuffle(1000)
    self.model.fit(dataset,
                   epochs=self.epochs,
                   steps_per_epoch=nr_records // self.batch_size)

def get_dataloader(dataset: tf.data.Dataset,
                   transform: Callable,
                   train: bool = False,
                   n_samples: int = None,
                   epoch_frac: float = None,
                   batch_size: int = 4,
                   n_workers: int = 4,
                   random_seed: int = 82):
    n_samples = n_samples or iterable_len(sample_generator(dataset, n_workers))
    n_samples = int(n_samples * epoch_frac) if epoch_frac is not None else n_samples
    if train:
        # Shuffle the data.
        dataset = dataset.shuffle(buffer_size=n_samples,
                                  seed=random_seed,
                                  reshuffle_each_iteration=True)
    sample_generator_ = sample_generator(dataset, n_workers)
    dataset = BaseAutoDLDataset(sample_generator=sample_generator_,
                                n_samples=n_samples,
                                transform=transform)
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=False,
                        num_workers=0,
                        pin_memory=torch.cuda.is_available(),
                        drop_last=train)
    return loader

def memoize(dataset: tf.data.Dataset) -> tf.data.Dataset:
    """Pull the entire dataset into memory and serve it shuffled (TF1 API)."""
    data = []
    with tf.Graph().as_default(), tf.Session(config=utils.get_config()) as session:
        dataset = dataset.prefetch(16)
        it = dataset.make_one_shot_iterator().get_next()
        try:
            while 1:
                data.append(session.run(it))
        except tf.errors.OutOfRangeError:
            pass
    images = np.stack([x['image'] for x in data])
    labels = np.stack([x['label'] for x in data])

    def tf_get(index):
        def get(index):
            return images[index], labels[index]

        image, label = tf.py_func(get, [index], [tf.float32, tf.int64])
        return dict(image=image, label=label)

    # Serve shuffled indices into the in-memory arrays.
    dataset = tf.data.Dataset.range(len(data)).repeat()
    dataset = dataset.shuffle(len(data) if len(data) < FLAGS.shuffle else FLAGS.shuffle)
    return dataset.map(tf_get)

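# The same memoize-and-shuffle idea written against the TF2 eager API, as a
# sketch only (not the code above): it assumes a dataset of {'image', 'label'}
# dicts that fits in memory.
import numpy as np
import tensorflow as tf

def memoize_v2(dataset: tf.data.Dataset, shuffle_buffer: int) -> tf.data.Dataset:
    records = list(dataset.as_numpy_iterator())   # materialize everything once
    images = np.stack([r['image'] for r in records])
    labels = np.stack([r['label'] for r in records])
    out = tf.data.Dataset.from_tensor_slices({'image': images, 'label': labels})
    return out.repeat().shuffle(min(len(records), shuffle_buffer))
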
def prepare_dataset(self,
                    dataset: tf.data.Dataset,
                    buckets: List[int],
                    batch_sizes: List[int],
                    shuffle: bool = False) -> tf.data.Dataset:
    dataset = dataset.map(self._deserialization_func, num_parallel_calls=128)

    buckets_array = np.array(buckets)
    batch_sizes_array = np.array(batch_sizes)
    if np.any(batch_sizes_array == 0) and shuffle:
        iszero = np.where(batch_sizes_array == 0)[0][0]
        filterlen = buckets_array[iszero - 1]
        print("Filtering sequences of length {}".format(filterlen))
        dataset = dataset.filter(
            lambda example: example['protein_length'] < filterlen)
    else:
        batch_sizes_array[batch_sizes_array <= 0] = 1

    dataset = dataset.shuffle(1024) if shuffle else dataset.prefetch(1024)
    batch_fun = tf.data.experimental.bucket_by_sequence_length(
        operator.itemgetter('protein_length'),
        buckets_array,
        batch_sizes_array)
    dataset = dataset.apply(batch_fun)
    return dataset

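# A minimal, self-contained sketch (toy integer sequences) of how
# tf.data.experimental.bucket_by_sequence_length groups variable-length
# examples: note there is one batch size per bucket, i.e. len(boundaries) + 1.
import tensorflow as tf

lengths = [3, 5, 8, 12, 20]
toy = tf.data.Dataset.from_generator(
    lambda: (tf.ones([n], tf.int32) for n in lengths),
    output_signature=tf.TensorSpec([None], tf.int32))

bucketed = toy.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=lambda x: tf.shape(x)[0],
        bucket_boundaries=[10],          # sequences shorter / longer than 10
        bucket_batch_sizes=[3, 2]))

for batch in bucketed:
    print(batch.shape)   # each batch is padded to its bucket's longest sequence
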
def preprocessing(dsData: tf.data.Dataset, window_size, batch_size):
    # Slide a window of window_size + 1 elements over the series, then split
    # each window into (inputs, label) where the last element is the target.
    dsData = dsData.window(window_size + 1, shift=1, drop_remainder=True)
    dsData = dsData.flat_map(lambda w: w.batch(window_size + 1))
    dsData = dsData.map(lambda x: (x[:-1], x[-1]))
    dsData = dsData.shuffle(1000)
    dsData = dsData.batch(batch_size).prefetch(1)
    return dsData

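# A quick sketch of what the window / flat_map / map chain produces on a toy
# integer series (assumes eager execution):
import tensorflow as tf

series = tf.data.Dataset.range(10)                      # 0, 1, ..., 9
windows = preprocessing(series, window_size=4, batch_size=2)

for x, y in windows.take(1):
    print(x.numpy())   # two windows of 4 consecutive values each
    print(y.numpy())   # the value immediately following each window
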
def _prepare_dataset(
    self,
    dataset: tf.data.Dataset,
    shuffle: bool = False,
    augment: bool = False
) -> tf.data.Dataset:
    preprocessing_model = self._build_preprocessing()
    dataset = dataset.map(
        map_func=lambda x, y: (preprocessing_model(x, training=False), y),
        num_parallel_calls=tf.data.experimental.AUTOTUNE
    )

    if shuffle:
        dataset = dataset.shuffle(buffer_size=1_000)

    dataset = dataset.batch(batch_size=self.batch_size)

    if augment:
        data_augmentation_model = self._build_data_augmentation()
        # Keras random-augmentation layers are only active when called with
        # training=True; with training=False they are a no-op.
        dataset = dataset.map(
            map_func=lambda x, y: (data_augmentation_model(x, training=True), y),
            num_parallel_calls=tf.data.experimental.AUTOTUNE
        )

    return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

def run_trial(
    self,
    trial: Trial,
    dataset: tf.data.Dataset,
    n_splits: int = 5,
) -> None:
    """Evaluate the current set of hyperparameters with cross-validation.

    Args:
        trial: A Trial instance passed by the tuner, with the hyperparameters.
        dataset: The training data.
        n_splits: The number of folds to use, defaults to 5.

    Returns:
        None
    """
    val_losses = []
    shuffled_dataset = dataset.shuffle(buffer_size=len(dataset))
    shards = [shuffled_dataset.shard(n_splits, i) for i in range(n_splits)]
    for split in range(n_splits):
        dataset_train, dataset_val = self._cv_concatenate(shards, split)
        model = self.hypermodel.build(trial.hyperparameters, dataset_train)
        print(f"Fitting model (CV {split + 1} / {n_splits})...")
        class_weight = DataPreprocessor.get_class_weight(dataset_train)
        model.fit(dataset_train, class_weight=class_weight)
        print(f"Evaluating model (CV {split + 1} / {n_splits})...")
        val_losses.append(model.evaluate(dataset_val))
    self.oracle.update_trial(trial.trial_id, {"val_loss": np.mean(val_losses)})
    self.save_model(trial.trial_id, model)

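# The shard-based split above relies on the shuffled dataset yielding the same
# order on every pass, so the folds stay disjoint. A minimal sketch (toy data,
# hypothetical cv_split helper) of the same idea, with an explicit
# reshuffle_each_iteration=False to make that assumption visible:
import tensorflow as tf

def cv_split(shards, val_index):
    train = None
    for i, shard in enumerate(shards):
        if i == val_index:
            continue
        train = shard if train is None else train.concatenate(shard)
    return train, shards[val_index]

toy = tf.data.Dataset.range(10).shuffle(10, reshuffle_each_iteration=False)
shards = [toy.shard(num_shards=5, index=i) for i in range(5)]
train_ds, val_ds = cv_split(shards, val_index=0)
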
def process(self, dataset: tf.data.Dataset, batch_size: int):
    dataset = dataset.map(self.parse, num_parallel_calls=AUTOTUNE)

    if self.cache:
        dataset = dataset.cache()

    if self.shuffle:
        dataset = dataset.shuffle(self.buffer_size, reshuffle_each_iteration=True)

    # Pad each component of the example tuple to the longest element in the batch.
    dataset = dataset.padded_batch(
        batch_size=batch_size,
        padded_shapes=(
            tf.TensorShape([]),
            tf.TensorShape(self.speech_featurizer.shape),
            tf.TensorShape([]),
            tf.TensorShape([None]),
            tf.TensorShape([]),
            tf.TensorShape([None]),
            tf.TensorShape([]),
        ),
        padding_values=("", 0., 0, self.text_featurizer.blank, 0,
                        self.text_featurizer.blank, 0),
        drop_remainder=self.drop_remainder)

    # Prefetch to overlap the input pipeline with training.
    dataset = dataset.prefetch(AUTOTUNE)
    self.total_steps = get_num_batches(self.total_steps, batch_size)
    return dataset

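# A self-contained sketch (toy (path, features, labels) tuples, assumed padding
# values) of how padded_batch pads each tuple component independently to the
# longest element in the batch:
import tensorflow as tf

def gen():
    yield ("a.wav", tf.ones([3], tf.float32), tf.constant([1, 2], tf.int32))
    yield ("b.wav", tf.ones([5], tf.float32), tf.constant([3], tf.int32))

toy = tf.data.Dataset.from_generator(
    gen,
    output_signature=(
        tf.TensorSpec([], tf.string),
        tf.TensorSpec([None], tf.float32),
        tf.TensorSpec([None], tf.int32),
    ))

batched = toy.padded_batch(
    batch_size=2,
    padded_shapes=(tf.TensorShape([]), tf.TensorShape([None]), tf.TensorShape([None])),
    padding_values=("", 0.0, 0))

for path, feats, labels in batched:
    print(feats.shape, labels.shape)   # (2, 5) and (2, 2)
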
def train_value_model(model: tf.keras.Model, examples: tf.data.Dataset) -> None:
    '''Takes a compiled model and trains it.

    Trains a value model in a supervised manner: it simply tries to predict
    the value at each state.
    '''
    # Training examples are generated from trajectories, so consecutive
    # examples are strongly correlated. This increases the variance of the
    # gradient. Shuffling the examples reduces the variance and speeds up
    # training significantly.
    examples = examples.shuffle(buffer_size=4096).batch(BATCH_SIZE).prefetch(16)
    evaluation_df = pd.DataFrame()
    for epoch in range(NUM_EPOCHS):
        model.fit(
            x=examples,
            epochs=epoch + 1,  # Train for one epoch.
            steps_per_epoch=NUM_STEPS_PER_EPOCH,
            initial_epoch=epoch,
            callbacks=[
                tf.keras.callbacks.TensorBoard(log_dir='/tmp/tensorboard')
            ])
        epoch_evaluation_df = evaluate_model(model)
        print(epoch_evaluation_df)
        epoch_evaluation_df['epoch'] = epoch
        # DataFrame.append was removed in pandas 2.0; concatenate instead.
        evaluation_df = pd.concat([evaluation_df, epoch_evaluation_df],
                                  ignore_index=True)

def make_dataset(
    dataset: tf.data.Dataset,
    train: bool,
    image_size: int = smaller_size,
    fixres: bool = True,
    num_parallel_calls=auto,
):
    if image_size not in [smaller_size, bigger_size]:
        raise ValueError(f"{image_size} resolution is not supported.")

    # Determine which preprocessing function we are using.
    if image_size == smaller_size:
        preprocess_func = preprocess_initial(train, image_size)
    elif not fixres and image_size == bigger_size:
        preprocess_func = preprocess_initial(train, image_size)
    else:
        preprocess_func = preprocess_finetune

    if train:
        dataset = dataset.shuffle(batch_size * 10)

    return (dataset
            .map(lambda x, y: preprocess_func(x, y, train),
                 num_parallel_calls=num_parallel_calls)
            .batch(batch_size)
            .prefetch(num_parallel_calls))

def get_bucket_iter(self, dataset: tf.data.Dataset, batch_size=32,
                    train=True) -> tf.data.Dataset:
    padded_shapes = self._padded_shapes()
    padding_values = self._padding_values()
    if train:
        bucket_boundaries = self._bucket_boundaries(batch_size)
        bucket_batch_sizes = [batch_size] * (len(bucket_boundaries) + 1)
        dataset = dataset.apply(
            tf.data.experimental.bucket_by_sequence_length(
                self.element_length_func,
                bucket_boundaries,
                bucket_batch_sizes,
                padded_shapes=padded_shapes,
                padding_values=padding_values))
        # Shuffling after bucketing shuffles whole batches.
        dataset = dataset.shuffle(100)
    else:
        dataset = dataset.padded_batch(batch_size, padded_shapes=padded_shapes)
    dataset = dataset.map(self._collate_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

def prepare_Dataset(dataset: tf.data.Dataset,
                    shuffle: bool = False,
                    augment: bool = False) -> tf.data.Dataset:
    """Prepare the dataset object with preprocessing and data augmentation.

    Parameters
    ----------
    dataset : tf.data.Dataset
        The dataset object
    shuffle : bool, optional
        Whether to shuffle the dataset, by default False
    augment : bool, optional
        Whether to augment the train dataset, by default False

    Returns
    -------
    tf.data.Dataset
        The prepared dataset
    """
    preprocessing_model = build_preprocessing()
    dataset = dataset.map(map_func=lambda x, y: (preprocessing_model(x), y),
                          num_parallel_calls=AUTOTUNE)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=1_000)

    dataset = dataset.batch(batch_size=BATCH_SIZE)

    if augment:
        data_augmentation_model = build_data_augmentation()
        # Pass training=True so random augmentation layers are actually applied.
        dataset = dataset.map(
            map_func=lambda x, y: (data_augmentation_model(x, training=True), y),
            num_parallel_calls=AUTOTUNE)

    return dataset.prefetch(buffer_size=AUTOTUNE)

def get_tfds_data_loader(data: tf.data.Dataset, data_subset_mode='train',
                         batch_size=32, num_samples=100, num_classes=19,
                         infinite=True, augment=True, seed=2836):

    def encode_example(x, y):
        x = tf.image.convert_image_dtype(x, tf.float32) * 255.0
        y = _encode_label(y, num_classes=num_classes)
        return x, y

    # Debug: inspect the value range before encoding.
    test_d = next(iter(data))
    print(test_d[0].numpy().min())
    print(test_d[0].numpy().max())

    data = data.shuffle(buffer_size=num_samples) \
               .cache() \
               .map(encode_example, num_parallel_calls=AUTOTUNE)

    # Debug: inspect the value range after encoding.
    test_d = next(iter(data))
    print(test_d[0].numpy().min())
    print(test_d[0].numpy().max())

    data = data.map(preprocess_input, num_parallel_calls=AUTOTUNE)

    # Debug: inspect the value range after preprocessing.
    test_d = next(iter(data))
    print(test_d[0].numpy().min())
    print(test_d[0].numpy().max())

    if data_subset_mode == 'train':
        data = data.shuffle(buffer_size=100, seed=seed)
        augmentor = TRAIN_image_augmentor
    elif data_subset_mode == 'val':
        augmentor = VAL_image_augmentor
    elif data_subset_mode == 'test':
        augmentor = TEST_image_augmentor

    if augment:
        data = augmentor.apply_augmentations(data)

    # Debug: inspect the value range after augmentation.
    test_d = next(iter(data))
    print(test_d[0].numpy().min())
    print(test_d[0].numpy().max())

    data = data.batch(batch_size, drop_remainder=True)

    if infinite:
        data = data.repeat()

    return data.prefetch(AUTOTUNE)

def apply(self, dataset: tf.data.Dataset, mode: str = None):
    if mode is not None and self.modes is not None and mode not in self.modes:
        LOGGER.info(f"Not applying {self} (mode={mode})")
        return dataset
    return dataset.shuffle(
        self.buffer_size,
        seed=self.seed,
        reshuffle_each_iteration=self.reshuffle_each_iteration)

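# A hypothetical usage sketch, assuming the method above belongs to a transform
# class (called ShuffleTransform here) configured with buffer_size, seed,
# reshuffle_each_iteration, and an optional set of modes:
transform = ShuffleTransform(buffer_size=1024, seed=0,
                             reshuffle_each_iteration=True, modes={"train"})
train_ds = transform.apply(train_ds, mode="train")   # shuffled
eval_ds = transform.apply(eval_ds, mode="eval")      # returned unchanged
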
def processing(dataset: tf.data.Dataset, window_size, batch_size):
    dataset = dataset.map(lambda x: table.lookup(x))
    dataset = dataset.unbatch()
    dataset = dataset.window(window_size + 1, shift=1, drop_remainder=True)
    dataset = dataset.flat_map(lambda ds: ds.batch(window_size + 1))
    dataset = dataset.map(lambda x: (x[:-1], x[-1] - 1))
    dataset = dataset.shuffle(10000)
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset

def create_dataset(self, dataset: tf.data.Dataset, input_columns,
                   output_columns, batch_size: int, use_cache: bool):
    dataset = self.add_feature_columns_to_dataset(dataset, input_columns,
                                                  output_columns)
    if use_cache:
        dataset = dataset.cache("cache").repeat()
    dataset = dataset.shuffle(1000, reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset

def train(self, dataset: tf.data.Dataset, nr_records: int):
    dataset = dataset.shuffle(512, reshuffle_each_iteration=True)
    dataset = dataset.batch(self.batch_size)
    history = list()
    for i in range(self.epochs):
        dataset = dataset.shuffle(512)
        step = 0
        epoch_start = time.time()
        for data in dataset:
            users = data['user_id']
            rated = tf.cast(data['x'], tf.float32)
            mask = tf.cast(data['mask'], tf.float32)
            loss_value, grads = train_dtn(self.model, users, rated, mask)
            self.optimizer.apply_gradients(
                zip(grads, self.model.trainable_variables))
            diff = time.time() - epoch_start
            if step % 20 == 0:
                predictions = self.predict(rated, users)
                train_accuracy = Evaluation.tf_calculate_accuracy(
                    predictions, data['x'], data['mask'])
                eval_x = data['x_test']
                eval_mask = data['mask_test']
                eval_accuracy = Evaluation.tf_calculate_accuracy(
                    predictions, eval_x, eval_mask)
                print("\rEpoch #{} Loss at step {}: {:.4f}, time: {:.3f}. "
                      "Train accuracy {:.3f}, Validation accuracy {:.3f}".format(
                          i, step, tf.reduce_mean(loss_value).numpy(), diff,
                          train_accuracy, eval_accuracy),
                      end='\r')
            else:
                print("\rEpoch #{} Loss at step {}: {:.4f}, time: {:.3f}".format(
                    i, step, tf.reduce_mean(loss_value).numpy(), diff),
                    end='\r')
            step += 1
        print()
        self.epochs_trained += 1

def _prepare_dataset(self, dataset: tf.data.Dataset,
                     shuffle: bool = False,
                     augment: bool = False) -> tf.data.Dataset:
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1_000)
    dataset = dataset.batch(batch_size=self.batch_size)
    return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

def train(self, dataset: tf.data.Dataset, nr_records: int):
    # Note: batching before shuffling means whole batches are shuffled.
    dataset = dataset.batch(self.batch_size)
    dataset = dataset.shuffle(1000)
    nr_steps = nr_records // self.batch_size
    for i in range(self.epochs):
        step = 0
        for data in dataset:
            loss_value, grads = grad(self.model, data)
            self.optimizer.apply_gradients(
                zip(grads, [self.model.U, self.model.P]))
            printProgressBar(step, nr_steps,
                             'Epoch {}, loss: {:.3f}'.format(i, loss_value),
                             length=80)
            step += 1

def __init__(self, factory: TFToxicDataSetsFactory,
             dataset: tf.data.Dataset, size: int):
    assert isinstance(factory, TFToxicDataSetsFactory) and isinstance(
        dataset, tf.data.Dataset)
    assert size > 0
    self._factory = factory
    self._dataset = dataset.shuffle(1000).batch(self.batch_size).prefetch(1)
    self._size = size
    self._batch_index = 0

def preprocess_fn(dataset: tf.data.Dataset) -> tf.data.Dataset:
    if shuffle_buffer_size > 1:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=debug_seed)
    if preprocess_spec.num_epochs > 1:
        dataset = dataset.repeat(preprocess_spec.num_epochs)
    if preprocess_spec.max_elements is not None:
        dataset = dataset.take(preprocess_spec.max_elements)
    dataset = dataset.batch(preprocess_spec.batch_size, drop_remainder=False)
    return dataset.map(mapping_fn,
                       num_parallel_calls=num_parallel_calls,
                       deterministic=debug_seed is not None)

def train_fn(ds: tf.data.Dataset, batch_size=1, shuffle=10000, repeat: int = None):
    '''Create an input function for training, prediction, or evaluation.'''
    if shuffle:
        ds = ds.shuffle(shuffle)
    ds = ds.batch(batch_size)
    if repeat != 1:
        ds = ds.repeat(repeat)
    return lambda: ds.make_one_shot_iterator().get_next()

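# Hypothetical usage with the TF1 Estimator API, which expects a zero-argument
# input_fn returning (features, labels) tensors, which is exactly what the
# returned lambda provides; `estimator`, `ds`, and `eval_ds` are assumed to
# exist elsewhere:
estimator.train(input_fn=train_fn(ds, batch_size=32, shuffle=10000), steps=1000)
estimator.evaluate(input_fn=train_fn(eval_ds, batch_size=32, shuffle=0, repeat=1))
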
def prepare_dataset(self,
                    dataset: tf.data.Dataset,
                    buckets: List[int],
                    batch_sizes: List[int],
                    shuffle: bool = False) -> tf.data.Dataset:
    dataset = dataset.map(self._deserialization_func, 128)
    dataset = dataset.shuffle(1024) if shuffle else dataset.prefetch(1024)
    # Bucket each pair by the longer of the two protein lengths.
    batch_fun = tf.data.experimental.bucket_by_sequence_length(
        lambda example: tf.maximum(example['first']['protein_length'],
                                   example['second']['protein_length']),
        buckets,
        batch_sizes)
    dataset = dataset.apply(batch_fun)
    return dataset

def __init__(
    self,
    dataset: tf.data.Dataset,
    patch_shape: list,
    targeted: bool = True,
    patch_initialization: str = "data",
    n_classes: int = 200,
):
    self.dataset = dataset.shuffle(buffer_size=100).repeat()
    self.patch_shape = patch_shape
    self.targeted = targeted
    self.n_classes = n_classes
    self.patch_initialization = patch_initialization
    self.sample_counter = -1

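# shuffle(...).repeat() yields an endless, reshuffled stream, so samples are
# normally drawn from a single persistent iterator rather than by restarting
# the dataset. A minimal sketch on toy data:
import tensorflow as tf

stream = tf.data.Dataset.range(10).shuffle(buffer_size=100).repeat()
it = iter(stream)
samples = [int(next(it)) for _ in range(15)]   # drawing more than 10 is fine
print(samples)
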
def train(generator: kr.Model, discriminator: kr.Model,
          train_dataset: tf.data.Dataset, epoch):
    train_dataset = train_dataset.shuffle(10000).batch(HP.batch_size).prefetch(1)

    discriminator_losses, generator_losses = [], []
    for data in train_dataset:
        discriminator_loss, generator_loss = _train(generator, discriminator,
                                                    data, epoch)
        discriminator_losses.append(discriminator_loss)
        generator_losses.append(generator_loss)

    return tf.reduce_mean(discriminator_losses), tf.reduce_mean(generator_losses)

def _prepare_dataset(self, dataset: tf.data.Dataset,
                     shuffle: bool = False) -> tf.data.Dataset:
    # The normalization layer expects a (1, num_features) batch, so reshape in
    # and out around it.
    dataset = dataset.map(
        map_func=lambda x, y: (tf.reshape(
            self.normalization_layer(
                tf.reshape(x, shape=(1, self.num_features)), training=False),
            shape=(self.num_features,)), y),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=1_000)

    dataset = dataset.batch(batch_size=self.batch_size)
    return dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

def get_features(generator: kr.Model, condition,
                 real_image_dataset: tf.data.Dataset):
    real_image_dataset = real_image_dataset.shuffle(10000).batch(
        HP.batch_size).prefetch(1)

    real_features = []
    fake_features = []
    for real_images in real_image_dataset:
        real_features_batch, fake_features_batch = _get_feature_samples(
            generator, condition, real_images)
        real_features.append(real_features_batch)
        fake_features.append(fake_features_batch)

    real_features = tf.concat(real_features, axis=0)
    fake_features = tf.concat(fake_features, axis=0)
    return real_features, fake_features

def get_data_iter(self, dataset: tf.data.Dataset, batch_size=32,
                  train=True) -> tf.data.Dataset:
    padded_shapes = self._padded_shapes()
    padding_values = self._padding_values()
    if train:
        dataset = dataset.shuffle(batch_size * 100)
    dataset = dataset.padded_batch(batch_size,
                                   padded_shapes=padded_shapes,
                                   padding_values=padding_values)
    dataset = dataset.map(self._collate_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset