def split_dataset(self, dataset: tf.data.Dataset, validation_data_fraction: float):
    """
    Splits a dataset of type tf.data.Dataset into a training and validation dataset using the given ratio.
    Fractions are rounded to two decimal places.

    @param dataset: the input dataset to split.
    @param validation_data_fraction: the fraction of the validation data as a float between 0 and 1.
    @return: a tuple of two tf.data.Datasets as (training, validation)

    Reference URL: https://stackoverflow.com/questions/59669413/what-is-the-canonical-way-to-split-tf-dataset-into-test-and-validation-subsets
    """
    validation_data_percent = round(validation_data_fraction * 100)
    if not (0 <= validation_data_percent <= 100):
        raise ValueError("validation data fraction must be ∈ [0,1]")

    dataset = dataset.enumerate()
    # Of every 100 consecutive elements, the first `validation_data_percent` go to
    # validation and the rest to training.
    train_dataset = dataset.filter(
        lambda f, data: f % 100 >= validation_data_percent)
    validation_dataset = dataset.filter(
        lambda f, data: f % 100 < validation_data_percent)

    # remove enumeration
    train_dataset = train_dataset.map(lambda f, data: data)
    validation_dataset = validation_dataset.map(lambda f, data: data)

    return train_dataset, validation_dataset

def split_dataset(dataset: tf.data.Dataset, validation_data_fraction: float):
    """
    Splits a dataset of type tf.data.Dataset into a training and validation dataset using the given ratio.
    Fractions are rounded to two decimal places.

    @param dataset: the input dataset to split.
    @param validation_data_fraction: the fraction of the validation data as a float between 0 and 1.
    @return: a tuple of two tf.data.Datasets as (training, validation)
    """
    validation_data_percent = round(validation_data_fraction * 100)
    if not (0 <= validation_data_percent <= 100):
        raise ValueError("validation data fraction must be ∈ [0,1]")

    dataset = dataset.enumerate()
    # Of every 100 consecutive elements, the first `validation_data_percent` go to
    # validation and the rest to training.
    train_dataset = dataset.filter(
        lambda f, data: f % 100 >= validation_data_percent)
    validation_dataset = dataset.filter(
        lambda f, data: f % 100 < validation_data_percent)

    # remove enumeration
    train_dataset = train_dataset.map(lambda f, data: data)
    validation_dataset = validation_dataset.map(lambda f, data: data)

    return train_dataset, validation_dataset

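# --- Usage sketch (added for illustration, not from the original source) ---
# A minimal example of the two-way split_dataset above, assuming TensorFlow 2.x
# eager execution. A 1000-element range dataset splits into roughly 80% training
# and 20% validation (exact here because 1000 is a multiple of 100).
import tensorflow as tf

example_ds = tf.data.Dataset.range(1000)
train_ds, val_ds = split_dataset(example_ds, validation_data_fraction=0.2)
print(sum(1 for _ in train_ds), sum(1 for _ in val_ds))  # -> 800 200
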
def split_dataset_subject(dset: tf.data.Dataset, validation_subjects):
    # Use tf.logical_not rather than Python `not` so the predicate stays a tensor op
    # when the filter function is traced.
    train_dataset = dset.filter(
        lambda data, label, subject: tf.logical_not(tf.reduce_any(
            tf.math.equal(tf.cast(subject, tf.int32), validation_subjects)
        )))
    eval_dataset = dset.filter(lambda data, label, subject: tf.reduce_any(
        tf.math.equal(tf.cast(subject, tf.int32), validation_subjects)))
    return train_dataset, eval_dataset

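# --- Usage sketch (added for illustration, not from the original source) ---
# Holding out subjects 2 and 5 for evaluation. Assumes elements are
# (data, label, subject) tuples, as the filter lambdas above expect, and that
# subject ids are castable to int32.
import tensorflow as tf

features = tf.random.normal((10, 4))
labels = tf.zeros((10,), dtype=tf.int32)
subjects = tf.constant([0, 1, 2, 3, 4, 5, 0, 1, 2, 3])
ds = tf.data.Dataset.from_tensor_slices((features, labels, subjects))
train_ds, eval_ds = split_dataset_subject(ds, validation_subjects=tf.constant([2, 5]))
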
def _filter_dataset(self, dataset: tf.data.Dataset):
    """Filter examples which are empty or too long."""
    # filter empty sequences
    dataset = dataset.filter(lambda x, y: tf.logical_and(tf.size(x) > 0, tf.size(y) > 0))

    # length filter
    x_max_len = self.config.get('x_max_len', -1)
    if x_max_len > 0:
        dataset = dataset.filter(lambda x, y: tf.size(x) <= x_max_len)
    y_max_len = self.config.get('y_max_len', -1)
    if y_max_len > 0:
        dataset = dataset.filter(lambda x, y: tf.size(y) <= y_max_len)
    return dataset

def prepare_dataset(self,
                    dataset: tf.data.Dataset,
                    buckets: List[int],
                    batch_sizes: List[int],
                    shuffle: bool = False) -> tf.data.Dataset:
    dataset = dataset.map(self._deserialization_func, num_parallel_calls=128)

    buckets_array = np.array(buckets)
    batch_sizes_array = np.array(batch_sizes)

    if np.any(batch_sizes_array == 0) and shuffle:
        iszero = np.where(batch_sizes_array == 0)[0][0]
        filterlen = buckets_array[iszero - 1]
        print("Filtering sequences of length {}".format(filterlen))
        dataset = dataset.filter(
            lambda example: example['protein_length'] < filterlen)
    else:
        batch_sizes_array[batch_sizes_array <= 0] = 1

    dataset = dataset.shuffle(1024) if shuffle else dataset.prefetch(1024)
    batch_fun = tf.data.experimental.bucket_by_sequence_length(
        operator.itemgetter('protein_length'),
        buckets_array,
        batch_sizes_array)
    dataset = dataset.apply(batch_fun)
    return dataset

def Split_Dataset(dataset: tf.data.Dataset, validation_data_fraction: float):
    validation_data_percent = round(validation_data_fraction * 100)
    if not (0 <= validation_data_percent <= 100):
        raise ValueError("validation data fraction must be ∈ [0,1]")

    dataset = dataset.enumerate()
    train_dataset = dataset.filter(
        lambda f, data: f % 100 >= validation_data_percent)
    validation_dataset = dataset.filter(
        lambda f, data: f % 100 < validation_data_percent)

    # remove enumeration
    train_dataset = train_dataset.map(lambda f, data: data)
    validation_dataset = validation_dataset.map(lambda f, data: data)

    return train_dataset, validation_dataset

def rm_unlabelled_samples(dataset: tf.data.Dataset):
    """
    Filter all unlabelled data instances (label = -1.0) from the tf.data.Dataset passed in as parameter

    :param dataset: dataset in which the unlabelled instances (label = -1.0) are to be filtered out.
    :return: the dataset with the unlabelled instances removed.
    """
    return dataset.filter(__unlabelled)

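# --- Illustrative sketch (assumption, not from the original source) ---
# One possible shape of the `__unlabelled` predicate referenced above, assuming
# (features, label) elements with float labels where -1.0 marks unlabelled samples.
# Note that dataset.filter keeps elements for which the predicate is True, so the
# predicate must return True for labelled samples despite its name.
import tensorflow as tf

def __unlabelled(features, label):
    # Keep only elements whose label is not the -1.0 "unlabelled" sentinel.
    return tf.not_equal(label, -1.0)
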
def pretext_dataset(dataset: tf.data.Dataset, start_label: int) -> tf.data.Dataset:
    filtered = dataset.filter(lambda data: data['label'] >= start_label)

    def supervised_transform(data):
        image = data['image']
        image = tf.cast(image, tf.float32)
        image = image / 255.0

    def random_transform(image):
        pass

def accumulated_batch(
    dataset: tf.data.Dataset,
    accumulator: Union[Accumulator, Mapping, Iterable],
    **map_kwargs,
):
    accumulator = accumulator_structure(accumulator)

    def initial_map_fn(*args):
        if len(args) == 1:
            (args,) = args
        return args, False

    @tf.function
    def scan_fn(state, el_and_final):
        el, final = el_and_final
        new_state = accumulator.append(state, el)
        valid = tf.reduce_all(accumulator.valid_conditions(new_state))
        invalid = tf.logical_not(valid)
        if invalid:
            new_state = accumulator.append(accumulator.initial_state(), el)
        return new_state, (state, tf.logical_or(invalid, final))

    def filter_fn(state, invalid):
        del state
        return invalid

    def map_fn(state, invalid):
        del invalid
        return accumulator.finalize(state)

    cardinality = tf.data.experimental.cardinality(dataset)
    dataset = dataset.map(initial_map_fn)
    if cardinality != tf.data.experimental.INFINITE_CARDINALITY:
        # append (empty, True) element to ensure final elements are generated
        state_spec = dataset.element_spec[0]
        empty_el = tf.nest.map_structure(
            lambda spec: tf.zeros(
                [1, *(0 if s is None else s for s in spec.shape)], dtype=spec.dtype
            ),
            state_spec,
        )
        true_el = tf.ones((1,), dtype=tf.bool)
        dataset = dataset.concatenate(
            tf.data.Dataset.from_tensor_slices((empty_el, true_el))
        )

    dataset = dataset.apply(
        tf.data.experimental.scan(accumulator.initial_state(), scan_fn)
    )
    dataset = dataset.filter(filter_fn)
    dataset = dataset.map(map_fn, **map_kwargs)
    return dataset

def split_dataset(dataset: tf.data.Dataset, val_split: float, test_split: float):
    # Splits a dataset of type tf.data.Dataset into training, validation and test datasets
    # using the given ratios. Fractions are rounded to two decimal places.
    # Input:
    #   dataset: the input dataset to split.
    #   val_split: the fraction of val data as a float between 0 and 1.
    #   test_split: the fraction of the test data as a float between 0 and 1.
    # Return:
    #   a tuple of three tf.data.Datasets as (training, validation, test)
    # Source: https://stackoverflow.com/questions/59669413/what-is-the-canonical-way-to-split-tf-dataset-into-test-and-validation-subsets
    test_data_percent = round(test_split * 100)
    if not (0 <= test_data_percent <= 100):
        raise ValueError("test data fraction must be ∈ [0,1]")

    val_data_percent = round(val_split * 100)
    if not (0 <= val_data_percent <= 100):
        raise ValueError("val data fraction must be ∈ [0,1]")

    dataset = dataset.enumerate()
    # Of every 100 consecutive elements, the first `test_data_percent` go to test.
    train_val_dataset = dataset.filter(
        lambda f, data: f % 100 >= test_data_percent)
    test_dataset = dataset.filter(lambda f, data: f % 100 < test_data_percent)

    # remove enumeration
    train_val_dataset = train_val_dataset.map(lambda f, data: data)
    test_dataset = test_dataset.map(lambda f, data: data)

    # add validation from training
    train_val_dataset = train_val_dataset.enumerate()
    train_dataset = train_val_dataset.filter(
        lambda f, data: f % 100 >= val_data_percent)
    val_dataset = train_val_dataset.filter(
        lambda f, data: f % 100 < val_data_percent)

    # remove enumeration
    train_dataset = train_dataset.map(lambda f, data: data)
    val_dataset = val_dataset.map(lambda f, data: data)

    return train_dataset, val_dataset, test_dataset

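# --- Usage sketch (added for illustration, not from the original source) ---
# A three-way split of a toy dataset using the split_dataset variant above.
# Note that val_split is applied to the data remaining after the test split, so the
# effective validation fraction is slightly below val_split of the full dataset.
import tensorflow as tf

full_ds = tf.data.Dataset.range(1000)
train_ds, val_ds, test_ds = split_dataset(full_ds, val_split=0.15, test_split=0.15)
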
def supervised_dataset(dataset: tf.data.Dataset, max_label: int) -> tf.data.Dataset:
    filtered = dataset.filter(lambda data: data['label'] < max_label)

    def supervised_transform(data):
        image = data['image']
        image = tf.cast(image, tf.float32)
        image = image / 255.0
        label = data['label']
        label = tf.one_hot(label, max_label)
        return image, label

    return filtered.map(supervised_transform,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

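# --- Usage sketch (added for illustration, not from the original source) ---
# Building a supervised pipeline over the first 5 classes of a toy dict-structured
# dataset. Assumes elements are dicts with 'image' (HxWxC uint8) and 'label' (int)
# keys, as supervised_transform above expects.
import tensorflow as tf

toy_ds = tf.data.Dataset.from_tensor_slices({
    'image': tf.zeros((32, 28, 28, 1), dtype=tf.uint8),
    'label': tf.range(32) % 10,
})
train_ds = supervised_dataset(toy_ds, max_label=5).batch(8)
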
def separate_by_target(ds: tf.data.Dataset,
                       idx: int = 1,
                       thr: float = 0.5
                       ) -> typing.Tuple[tf.data.Dataset, tf.data.Dataset]:
    def _cond0(*args):
        return tf.cast(args[idx], tf.float32) < thr

    def _cond1(*args):
        return tf.cast(args[idx], tf.float32) >= thr

    ds0 = ds.filter(_cond0)
    ds1 = ds.filter(_cond1)
    return ds0, ds1

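# --- Usage sketch (added for illustration, not from the original source) ---
# Splitting (feature, target) pairs into negatives (target < 0.5) and positives
# (target >= 0.5). `idx=1` points at the target position within each element tuple.
import tensorflow as tf

features = tf.random.normal((8, 3))
targets = tf.constant([0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0])
pairs = tf.data.Dataset.from_tensor_slices((features, targets))
negatives, positives = separate_by_target(pairs, idx=1, thr=0.5)
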
def explore_localization_error(model: tf.keras.Model, dataset: tf.data.Dataset,
                               transferred: bool):
    # Building axes
    rows = 5
    cols = 6
    fig, axes = plt.subplots(rows, cols)
    fig.set_tight_layout(tight=0.1)
    axes = axes.ravel()

    # Performs pre-processing here to keep the original unprocessed image as well
    dataset = dataset.filter(lambda image, expected: tf.equal(expected[0], 1))
    preprocess_fnc = dogs.transferred_preprocess if transferred else dogs.preprocess

    # Find the first n images whose predicted box has zero IoU with the GT box
    found = 0
    iterator = iter(dataset)
    while found < rows * cols:
        image, expected = next(iterator)
        predicted = tf.squeeze(
            model(tf.expand_dims(preprocess_fnc(image), axis=0)))
        iou = tf.squeeze(
            dogs.compute_iou(tf.expand_dims(expected[1:], axis=0),
                             tf.expand_dims(predicted[1:], axis=0)))
        if tf.equal(iou, 0):
            # Plots corresponding image
            image_height, image_width, _ = image.shape
            axes[found].imshow(image.numpy())

            # Plots estimated bounding box
            box = coco.BBox(*predicted[1:])
            patch = patches.Rectangle(
                (box.x * image_width, box.y * image_height),
                box.width * image_width,
                box.height * image_height,
                edgecolor='blue',
                facecolor='none',
                lw=2)
            axes[found].add_patch(patch)

            # Plots expected bounding box
            box = coco.BBox(*expected[1:])
            patch = patches.Rectangle(
                (box.x * image_width, box.y * image_height),
                box.width * image_width,
                box.height * image_height,
                edgecolor='red',
                facecolor='none',
                lw=2)
            axes[found].add_patch(patch)

            # Some extra configurations
            axes[found].axis('off')
            axes[found].set_title("{:.2f}".format(predicted[0].numpy()))

            # Updates count and exits if all needed images have been found
            found += 1

    plt.show()

def split_dataset(dataset: tf.data.Dataset,
                  split: Iterable[int],
                  buffer_shuffle: int = None) -> List[tf.data.Dataset]:
    partitions = np.insert(np.cumsum(split), 0, 0)
    if buffer_shuffle is None:
        buffer_shuffle = 100 * partitions[-1]
    dataset = dataset.shuffle(buffer_shuffle, seed=22,
                              reshuffle_each_iteration=False).enumerate()
    partitions = np.stack([partitions[:-1], partitions[1:]], axis=1)
    datasets = map(
        lambda partition: dataset.filter(
            lambda x, y: _between(
                tf.math.floormod(x, partitions[-1, -1]), partition[0], partition[1]
            )
        ).map(lambda x, y: y),
        partitions,
    )
    return list(datasets)

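# --- Illustrative sketch (assumption, not from the original source) ---
# One possible `_between` helper matching the call above: with cumulative partition
# boundaries, keeping indices in the half-open interval [low, high) assigns exactly
# `split[i]` out of every sum(split) consecutive elements to partition i.
import tensorflow as tf

def _between(value, low, high):
    # True when low <= value < high, evaluated on tensors.
    return tf.logical_and(value >= low, value < high)
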
def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
    """Create a dataset with filtering applied.

    Args:
        ds_input: Any dataset that produces dictionaries keyed by strings and
            values with any rank tensors.

    Returns:
        A `tf.data.Dataset` with elements containing the same keys, but with
        potentially fewer elements.
    """
    # Filter and return.
    ds_output = ds_input.filter(self.filter_fn)
    return ds_output

def preprocess_wmt_data(dataset: tf.data.Dataset,
                        shuffle: bool,
                        num_epochs: Optional[int] = 1,
                        pack_examples: bool = True,
                        shuffle_buffer_size: int = 1024,
                        max_length: int = 512,
                        batch_size: int = 256,
                        drop_remainder: bool = True,
                        prefetch_size: int = AUTOTUNE):
    """Shuffle and batch/pack the given dataset."""

    def length_filter(max_len):
        def filter_fn(x):
            source, target = x['inputs'], x['targets']
            l = tf.maximum(tf.shape(source)[0], tf.shape(target)[0])
            return tf.less(l, max_len + 1)
        return filter_fn

    if max_length > 0:
        dataset = dataset.filter(length_filter(max_length))

    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.repeat(num_epochs)

    if pack_examples:
        dataset = pack_dataset(dataset, max_length)
        dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    else:  # simple (static-shape) padded batching
        dataset = dataset.padded_batch(
            batch_size,
            padded_shapes={'inputs': max_length, 'targets': max_length},
            padding_values={'inputs': 0, 'targets': 0},
            drop_remainder=drop_remainder)

    if prefetch_size:
        dataset = dataset.prefetch(prefetch_size)

    return dataset

def filter_sample(self, data: tf.data.Dataset) -> tf.data.Dataset:
    """
    Filter according to data filters

    Parameters
    ----------
    data
        input dataset

    Returns
    -------
    data
        filtered dataset
    """
    def _predicate_fn(inputs):
        return self.data_filter_true(**inputs)

    data = data.filter(_predicate_fn)
    return data

def preprocess_wmt_data(dataset: tf.data.Dataset,
                        data_rng,
                        train: bool,
                        shuffle_buffer_size: int = 1024,
                        max_length: int = 512,
                        per_device_batch_size: int = 256):
    """Shuffle and batch/pack the given dataset."""

    def length_filter(max_len):
        def filter_fn(x):
            source, target = x['inputs'], x['targets']
            l = tf.maximum(tf.shape(source)[0], tf.shape(target)[0])
            return tf.less(l, max_len + 1)
        return filter_fn

    if max_length > 0:
        dataset = dataset.filter(length_filter(max_length))

    if train:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=data_rng[0])
        dataset = dataset.repeat()
        dataset = pack_dataset(dataset, max_length)
        dataset = dataset.batch(per_device_batch_size, drop_remainder=train)
    else:  # simple (static-shape) padded batching
        dataset = dataset.padded_batch(
            per_device_batch_size,
            padded_shapes={'inputs': max_length, 'targets': max_length},
            padding_values={'inputs': 0, 'targets': 0},
            drop_remainder=train)

    dataset = dataset.prefetch(AUTOTUNE)
    return dataset

def apply(self, dataset: tf.data.Dataset, mode: str = None):
    """Apply preprocessors as one filter operation"""
    # Filter preprocessors for this mode
    active_prepros: List[core.Filter] = []
    for prepro in self.preprocessors:
        if mode is not None and prepro.modes is not None and mode not in prepro.modes:
            LOGGER.info(f"Not applying {prepro} (mode={mode})")
            continue
        active_prepros.append(prepro)

    # Apply filtered preprocessors
    if not active_prepros:
        return dataset
    else:
        def _fused_tf_predicate(element):
            pred = None
            for prepro in active_prepros:
                new_pred = prepro.tf_predicate(element)
                pred = new_pred if pred is None else tf.logical_and(pred, new_pred)
            return pred

        return dataset.filter(_fused_tf_predicate)

def filter_preprocessor(dataset: tf.data.Dataset) -> tf.data.Dataset:
    return dataset.filter(predicate)

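# --- Illustrative sketch (assumption, not from the original source) ---
# filter_preprocessor above closes over a `predicate` defined in an enclosing scope.
# One hypothetical way such a closure is built, here with a token-length threshold.
import tensorflow as tf

def make_length_filter(max_tokens: int):
    def predicate(tokens):
        # Keep sequences with at most `max_tokens` tokens.
        return tf.size(tokens) <= max_tokens

    def filter_preprocessor(dataset: tf.data.Dataset) -> tf.data.Dataset:
        return dataset.filter(predicate)

    return filter_preprocessor
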
def apply(self, dataset: tf.data.Dataset, mode: str = None):
    if mode is not None and self.modes is not None and mode not in self.modes:
        LOGGER.info(f"Not applying {self} (mode={mode})")
        return dataset
    return dataset.filter(self.tf_predicate)