def split_dataset(self, dataset: tf.data.Dataset,
                      validation_data_fraction: float):
        """
        Splits a dataset of type tf.data.Dataset into a training and a validation dataset using the given ratio.
        The fraction is rounded to the nearest whole percent.
        @param dataset: the input dataset to split.
        @param validation_data_fraction: the fraction of the validation data as a float between 0 and 1.
        @return: a tuple of two tf.data.Datasets as (training, validation)
        Reference URL:
        https://stackoverflow.com/questions/59669413/
        what-is-the-canonical-way-to-split-tf-dataset-into-test-and-validation-subsets
        """

        validation_data_percent = round(validation_data_fraction * 100)
        if not (0 <= validation_data_percent <= 100):
            raise ValueError("validation data fraction must be ∈ [0,1]")

        dataset = dataset.enumerate()
        # use >= / < so that exactly validation_data_percent of every 100
        # elements land in the validation split
        train_dataset = dataset.filter(
            lambda f, data: f % 100 >= validation_data_percent)
        validation_dataset = dataset.filter(
            lambda f, data: f % 100 < validation_data_percent)

        # remove enumeration
        train_dataset = train_dataset.map(lambda f, data: data)
        validation_dataset = validation_dataset.map(lambda f, data: data)

        return train_dataset, validation_dataset
Example #2
    def split_dataset(dataset: tf.data.Dataset,
                      validation_data_fraction: float):
        """
        Splits a dataset of type tf.data.Dataset into a training and a validation dataset using the given ratio.
        The fraction is rounded to the nearest whole percent.
        @param dataset: the input dataset to split.
        @param validation_data_fraction: the fraction of the validation data as a float between 0 and 1.
        @return: a tuple of two tf.data.Datasets as (training, validation)
        """

        validation_data_percent = round(validation_data_fraction * 100)
        if not (0 <= validation_data_percent <= 100):
            raise ValueError("validation data fraction must be ∈ [0,1]")

        dataset = dataset.enumerate()
        # use >= / < so that exactly validation_data_percent of every 100
        # elements land in the validation split
        train_dataset = dataset.filter(
            lambda f, data: f % 100 >= validation_data_percent)
        validation_dataset = dataset.filter(
            lambda f, data: f % 100 < validation_data_percent)

        # remove enumeration
        train_dataset = train_dataset.map(lambda f, data: data)
        validation_dataset = validation_dataset.map(lambda f, data: data)

        return train_dataset, validation_dataset
Example #3
    def split_dataset_subject(dset: tf.data.Dataset, validation_subjects):
        train_dataset = dset.filter(
            lambda data, label, subject: not tf.reduce_any(
                tf.math.equal(tf.cast(subject, tf.int32), validation_subjects)
            ))
        eval_dataset = dset.filter(lambda data, label, subject: tf.reduce_any(
            tf.math.equal(tf.cast(subject, tf.int32), validation_subjects)))

        return train_dataset, eval_dataset
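
A minimal usage sketch for the subject-based split above, assuming the helper is callable as a plain function and using a hypothetical toy dataset of (data, label, subject) triples:

import tensorflow as tf

# Hypothetical toy dataset: six examples from three subjects.
ds = tf.data.Dataset.from_tensor_slices((
    tf.random.normal([6, 3]),           # data
    tf.constant([0, 1, 0, 1, 0, 1]),    # label
    tf.constant([1, 1, 2, 2, 3, 3]),    # subject id
))

# Hold out subjects 2 and 3 for evaluation.
train_ds, eval_ds = split_dataset_subject(ds, tf.constant([2, 3]))

print(sum(1 for _ in train_ds))  # 2 (subject 1)
print(sum(1 for _ in eval_ds))   # 4 (subjects 2 and 3)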
Example #4
 def _filter_dataset(self, dataset: tf.data.Dataset):
     """Filter examples which are empty or too long."""
     # filter empty sequences
     dataset = dataset.filter(lambda x, y: tf.logical_and(tf.size(x) > 0, tf.size(y) > 0))
     # length filter
     x_max_len = self.config.get('x_max_len', -1)
     if x_max_len > 0:
         dataset = dataset.filter(lambda x, y: tf.size(x) <= x_max_len)
     y_max_len = self.config.get('y_max_len', -1)
     if y_max_len > 0:
         dataset = dataset.filter(lambda x, y: tf.size(y) <= y_max_len)
     return dataset
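
The method above depends on self.config; a standalone sketch of the same empty-sequence and length filtering, applied to a hypothetical toy dataset of variable-length (x, y) pairs:

import tensorflow as tf

pairs = [([1, 2, 3], [4, 5]), ([], [1]), ([1, 2, 3, 4, 5, 6], [7])]
ds = tf.data.Dataset.from_generator(
    lambda: iter(pairs),
    output_signature=(
        tf.TensorSpec(shape=[None], dtype=tf.int32),
        tf.TensorSpec(shape=[None], dtype=tf.int32),
    ),
)

# drop empty sequences, then cap the source length at 5 tokens
ds = ds.filter(lambda x, y: tf.logical_and(tf.size(x) > 0, tf.size(y) > 0))
ds = ds.filter(lambda x, y: tf.size(x) <= 5)

for x, y in ds:
    print(x.numpy(), y.numpy())  # only ([1, 2, 3], [4, 5]) survives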
Example #5
    def prepare_dataset(self,
                        dataset: tf.data.Dataset,
                        buckets: List[int],
                        batch_sizes: List[int],
                        shuffle: bool = False) -> tf.data.Dataset:
        dataset = dataset.map(self._deserialization_func,
                              num_parallel_calls=128)

        buckets_array = np.array(buckets)
        batch_sizes_array = np.array(batch_sizes)

        if np.any(batch_sizes_array == 0) and shuffle:
            iszero = np.where(batch_sizes_array == 0)[0][0]
            filterlen = buckets_array[iszero - 1]
            print("Filtering sequences of length {}".format(filterlen))
            dataset = dataset.filter(
                lambda example: example['protein_length'] < filterlen)
        else:
            batch_sizes_array[batch_sizes_array <= 0] = 1

        dataset = dataset.shuffle(1024) if shuffle else dataset.prefetch(1024)
        batch_fun = tf.data.experimental.bucket_by_sequence_length(
            operator.itemgetter('protein_length'), buckets_array,
            batch_sizes_array)
        dataset = dataset.apply(batch_fun)
        return dataset
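
A self-contained sketch of the bucketing step used above, with made-up lengths and bucket boundaries (the deserialization and shuffling details are omitted):

import tensorflow as tf

# Hypothetical dict-structured dataset exposing only a 'protein_length' feature.
ds = tf.data.Dataset.from_tensor_slices(
    {'protein_length': tf.constant([3, 12, 7, 25, 2, 18])})

# bucket_batch_sizes needs len(bucket_boundaries) + 1 entries:
# one batch size per bucket (<10, 10-19, >=20).
batch_fun = tf.data.experimental.bucket_by_sequence_length(
    lambda example: example['protein_length'],
    bucket_boundaries=[10, 20],
    bucket_batch_sizes=[2, 2, 2])

for batch in ds.apply(batch_fun):
    print(batch['protein_length'].numpy())  # batches of similar lengths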
def Split_Dataset(dataset: tf.data.Dataset, validation_data_fraction: float):

    validation_data_percent = round(validation_data_fraction * 100)
    if not (0 <= validation_data_percent <= 100):
        raise ValueError("validation data fraction must be ∈ [0,1]")

    dataset = dataset.enumerate()
    train_dataset = dataset.filter(
        lambda f, data: f % 100 >= validation_data_percent)
    validation_dataset = dataset.filter(
        lambda f, data: f % 100 < validation_data_percent)

    # remove enumeration
    train_dataset = train_dataset.map(lambda f, data: data)
    validation_dataset = validation_dataset.map(lambda f, data: data)

    return train_dataset, validation_dataset
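
A minimal usage sketch for the index-modulo split above, on a toy range dataset:

import tensorflow as tf

dataset = tf.data.Dataset.range(200)
train_ds, val_ds = Split_Dataset(dataset, validation_data_fraction=0.2)

# every block of 100 indices sends indices 0..19 to validation and 20..99 to training
print(sum(1 for _ in train_ds))  # 160
print(sum(1 for _ in val_ds))    # 40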
Example #7
def rm_unlabelled_samples(dataset: tf.data.Dataset):
    """
    Filter out all unlabelled data instances (label = -1.0) from the tf.data.Dataset passed in as a parameter.

    :param dataset: dataset from which the unlabelled instances (label = -1.0) are to be filtered out.
    :return: the dataset with all unlabelled instances removed.
    """
    return dataset.filter(__unlabelled)
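
The __unlabelled predicate is defined elsewhere; a hypothetical keep-predicate consistent with the docstring (keep examples whose label is not -1.0) might look like this:

import tensorflow as tf

def _keep_labelled(features, label):
    # hypothetical predicate: True for examples that carry a real label
    return tf.not_equal(label, -1.0)

ds = tf.data.Dataset.from_tensor_slices(
    (tf.random.normal([4, 2]), tf.constant([0.0, -1.0, 1.0, -1.0])))
print(sum(1 for _ in ds.filter(_keep_labelled)))  # 2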
Example #8
def pretext_dataset(dataset: tf.data.Dataset, start_label: int) -> tf.data.Dataset:
    filtered = dataset.filter(lambda data: data['label'] >= start_label)

    def supervised_transform(data):
        image = data['image']
        image = tf.cast(image, tf.float32)
        image = image / 255.0
        return image

    def random_transform(image):
        # left unimplemented in the upstream snippet
        pass

    # the upstream snippet is truncated here; returning the normalized images is an
    # assumed minimal completion so the declared tf.data.Dataset return type holds
    return filtered.map(supervised_transform, num_parallel_calls=tf.data.experimental.AUTOTUNE)
Example #9
def accumulated_batch(
    dataset: tf.data.Dataset,
    accumulator: Union[Accumulator, Mapping, Iterable],
    **map_kwargs,
):
    accumulator = accumulator_structure(accumulator)

    def initial_map_fn(*args):
        if len(args) == 1:
            (args,) = args
        return args, False

    @tf.function
    def scan_fn(state, el_and_final):
        el, final = el_and_final
        new_state = accumulator.append(state, el)
        valid = tf.reduce_all(accumulator.valid_conditions(new_state))
        invalid = tf.logical_not(valid)
        if invalid:
            new_state = accumulator.append(accumulator.initial_state(), el)
        return new_state, (state, tf.logical_or(invalid, final))

    def filter_fn(state, invalid):
        del state
        return invalid

    def map_fn(state, invalid):
        del invalid
        return accumulator.finalize(state)

    cardinality = tf.data.experimental.cardinality(dataset)

    dataset = dataset.map(initial_map_fn)
    if cardinality != tf.data.experimental.INFINITE_CARDINALITY:
        # append (empty, True) element to ensure final elements are generated
        state_spec = dataset.element_spec[0]
        empty_el = tf.nest.map_structure(
            lambda spec: tf.zeros(
                [1, *(0 if s is None else s for s in spec.shape)], dtype=spec.dtype
            ),
            state_spec,
        )
        true_el = tf.ones((1,), dtype=tf.bool)
        dataset = dataset.concatenate(
            tf.data.Dataset.from_tensor_slices((empty_el, true_el))
        )

    dataset = dataset.apply(
        tf.data.experimental.scan(accumulator.initial_state(), scan_fn)
    )

    dataset = dataset.filter(filter_fn)
    dataset = dataset.map(map_fn, **map_kwargs)
    return dataset
def split_dataset(dataset: tf.data.Dataset, val_split: float,
                  test_split: float):
    # Splits a dataset of type tf.data.Dataset into training, validation and test datasets
    #   using the given ratios. Fractions are rounded to the nearest whole percent.
    # Input:
    #       dataset: the input dataset to split.
    #       val_split: the fraction of the validation data as a float between 0 and 1.
    #       test_split: the fraction of the test data as a float between 0 and 1.
    # Return:
    #       a tuple of three tf.data.Datasets as (training, validation, test)
    # Source: https://stackoverflow.com/questions/59669413/what-is-the-canonical-way-to-split-tf-dataset-into-test-and-validation-subsets

    test_data_percent = round(test_split * 100)
    if not (0 <= test_data_percent <= 100):
        raise ValueError("test data fraction must be ∈ [0,1]")

    val_data_percent = round(val_split * 100)
    if not (0 <= val_data_percent <= 100):
        raise ValueError("val data fraction must be ∈ [0,1]")

    dataset = dataset.enumerate()
    # use >= / < so that exactly test_data_percent of every 100 elements
    # land in the test split
    train_val_dataset = dataset.filter(
        lambda f, data: f % 100 >= test_data_percent)
    test_dataset = dataset.filter(lambda f, data: f % 100 < test_data_percent)

    # remove enumeration
    train_val_dataset = train_val_dataset.map(lambda f, data: data)
    test_dataset = test_dataset.map(lambda f, data: data)

    # add validation from training
    train_val_dataset = train_val_dataset.enumerate()
    train_dataset = train_val_dataset.filter(
        lambda f, data: f % 100 >= val_data_percent)
    val_dataset = train_val_dataset.filter(
        lambda f, data: f % 100 < val_data_percent)

    # remove enumeration
    train_dataset = train_dataset.map(lambda f, data: data)
    val_dataset = val_dataset.map(lambda f, data: data)

    return train_dataset, val_dataset, test_dataset
Example #11
def supervised_dataset(dataset: tf.data.Dataset, max_label: int) -> tf.data.Dataset:
    filtered = dataset.filter(lambda data: data['label'] < max_label)

    def supervised_transform(data):
        image = data['image']
        image = tf.cast(image, tf.float32)
        image = image / 255.0
        label = data['label']
        label = tf.one_hot(label, max_label)
        return image, label

    return filtered.map(supervised_transform, num_parallel_calls=tf.data.experimental.AUTOTUNE)
Example #12
def separate_by_target(ds: tf.data.Dataset, idx: int = 1, thr: float = 0.5
                       ) -> typing.Tuple[tf.data.Dataset, tf.data.Dataset]:
    def _cond0(*args):
        return tf.cast(args[idx], tf.float32) < thr

    def _cond1(*args):
        return tf.cast(args[idx], tf.float32) >= thr

    ds0 = ds.filter(_cond0)
    ds1 = ds.filter(_cond1)

    return ds0, ds1
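
A minimal usage sketch for separate_by_target on a hypothetical (feature, target) dataset:

import tensorflow as tf

ds = tf.data.Dataset.from_tensor_slices(
    (tf.range(6), tf.constant([0.0, 1.0, 0.2, 0.9, 0.4, 0.7])))

below, above = separate_by_target(ds, idx=1, thr=0.5)
print([x.numpy() for x, _ in below])   # [0, 2, 4]
print([x.numpy() for x, _ in above])   # [1, 3, 5]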
Example #13
def explore_localization_error(model: tf.keras.Model, dataset: tf.data.Dataset,
                               transferred: bool):
    # Building axes
    rows = 5
    cols = 6
    fig, axes = plt.subplots(rows, cols)
    fig.set_tight_layout(tight=0.1)
    axes = axes.ravel()
    # Performs pre-processing here to keep the original unprocessed image as well
    dataset = dataset.filter(lambda image, expected: tf.equal(expected[0], 1))
    preprocess_fnc = dogs.transferred_preprocess if transferred else dogs.preprocess
    # Find the first rows * cols images for which the IoU with the GT box is zero
    found = 0
    iterator = iter(dataset)
    while found < rows * cols:
        image, expected = next(iterator)
        predicted = tf.squeeze(
            model(tf.expand_dims(preprocess_fnc(image), axis=0)))
        iou = tf.squeeze(
            dogs.compute_iou(tf.expand_dims(expected[1:], axis=0),
                             tf.expand_dims(predicted[1:], axis=0)))
        if tf.equal(iou, 0):
            # Plots corresponding image
            image_height, image_width, _ = image.shape
            axes[found].imshow(image.numpy())
            # Plots estimated bounding box
            box = coco.BBox(*predicted[1:])
            patch = patches.Rectangle(
                (box.x * image_width, box.y * image_height),
                box.width * image_width,
                box.height * image_height,
                edgecolor='blue',
                facecolor='none',
                lw=2)
            axes[found].add_patch(patch)
            # Plots expected bounding box
            box = coco.BBox(*expected[1:])
            patch = patches.Rectangle(
                (box.x * image_width, box.y * image_height),
                box.width * image_width,
                box.height * image_height,
                edgecolor='red',
                facecolor='none',
                lw=2)
            axes[found].add_patch(patch)
            # Some extra configurations
            axes[found].axis('off')
            axes[found].set_title("{:.2f}".format(predicted[0].numpy()))
            # Updates count and exits if all needed images have been found
            found += 1
    plt.show()
Example #14
def split_dataset(dataset: tf.data.Dataset, split: Iterable[int], buffer_shuffle: int = None) -> List[tf.data.Dataset]:
    partitions = np.insert(np.cumsum(split), 0, 0)
    if buffer_shuffle is None:
        buffer_shuffle = 100 * partitions[-1]
    dataset = dataset.shuffle(buffer_shuffle, seed=22, reshuffle_each_iteration=False).enumerate()
    partitions = np.stack([partitions[:-1], partitions[1:]], axis=1)
    datasets = map(
        lambda partition: dataset.filter(
            lambda x, y: _between(
                tf.math.floormod(x, partitions[-1, -1]), partition[0], partition[1]
            )
        ).map(lambda x, y: y),
        partitions,
    )
    return list(datasets)
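
_between is not defined in the snippet above; a hypothetical implementation consistent with its use (a half-open interval check on the shuffled enumeration index) could be:

import tensorflow as tf

def _between(value, low, high):
    # hypothetical helper: True when low <= value < high
    return tf.logical_and(value >= low, value < high)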
Example #15
    def transform_dataset(self, ds_input: tf.data.Dataset) -> tf.data.Dataset:
        """Create a dataset with filtering applied.

        Args:
            ds_input: Any dataset that produces dictionaries with string keys and
                tensor values of any rank.

        Returns:
            A `tf.data.Dataset` with elements containing the same keys, but with
            potentially fewer elements.
        """

        # Filter and return.
        ds_output = ds_input.filter(self.filter_fn)

        return ds_output
def preprocess_wmt_data(dataset: tf.data.Dataset,
                        shuffle: bool,
                        num_epochs: Optional[int] = 1,
                        pack_examples: bool = True,
                        shuffle_buffer_size: int = 1024,
                        max_length: int = 512,
                        batch_size: int = 256,
                        drop_remainder: bool = True,
                        prefetch_size: int = AUTOTUNE):
    """Shuffle and batch/pack the given dataset."""
    def length_filter(max_len):
        def filter_fn(x):
            source, target = x['inputs'], x['targets']
            l = tf.maximum(tf.shape(source)[0], tf.shape(target)[0])
            return tf.less(l, max_len + 1)

        return filter_fn

    if max_length > 0:
        dataset = dataset.filter(length_filter(max_length))

    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.repeat(num_epochs)

    if pack_examples:
        dataset = pack_dataset(dataset, max_length)
        dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    else:  # simple (static-shape) padded batching
        dataset = dataset.padded_batch(batch_size,
                                       padded_shapes={
                                           'inputs': max_length,
                                           'targets': max_length
                                       },
                                       padding_values={
                                           'inputs': 0,
                                           'targets': 0
                                       },
                                       drop_remainder=drop_remainder)

    if prefetch_size:
        dataset = dataset.prefetch(prefetch_size)

    return dataset
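
A standalone sketch of the length_filter step above, applied to a hypothetical two-element 'inputs'/'targets' dataset:

import tensorflow as tf

ds = tf.data.Dataset.from_generator(
    lambda: iter([
        {'inputs': [1, 2, 3], 'targets': [4, 5]},
        {'inputs': list(range(600)), 'targets': [1]},   # longer than max_length
    ]),
    output_signature={
        'inputs': tf.TensorSpec([None], tf.int32),
        'targets': tf.TensorSpec([None], tf.int32),
    },
)

def filter_fn(x):
    l = tf.maximum(tf.shape(x['inputs'])[0], tf.shape(x['targets'])[0])
    return tf.less(l, 512 + 1)

print(sum(1 for _ in ds.filter(filter_fn)))  # 1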
Example #17
    def filter_sample(self, data: tf.data.Dataset) -> tf.data.Dataset:
        """
        Filter according to data filters

        Parameters
        ----------
        data
            input dataset

        Returns
        -------
        data
            filtered dataset
        """
        def _predicate_fn(inputs):
            return self.data_filter_true(**inputs)

        data = data.filter(_predicate_fn)
        return data
Example #18
def preprocess_wmt_data(dataset: tf.data.Dataset,
                        data_rng,
                        train: bool,
                        shuffle_buffer_size: int = 1024,
                        max_length: int = 512,
                        per_device_batch_size: int = 256):
    """Shuffle and batch/pack the given dataset."""
    def length_filter(max_len):
        def filter_fn(x):
            source, target = x['inputs'], x['targets']
            l = tf.maximum(tf.shape(source)[0], tf.shape(target)[0])
            return tf.less(l, max_len + 1)

        return filter_fn

    if max_length > 0:
        dataset = dataset.filter(length_filter(max_length))

    if train:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=data_rng[0])
        dataset = dataset.repeat()
        dataset = pack_dataset(dataset, max_length)
        dataset = dataset.batch(per_device_batch_size, drop_remainder=train)
    else:  # simple (static-shape) padded batching
        dataset = dataset.padded_batch(per_device_batch_size,
                                       padded_shapes={
                                           'inputs': max_length,
                                           'targets': max_length
                                       },
                                       padding_values={
                                           'inputs': 0,
                                           'targets': 0
                                       },
                                       drop_remainder=train)

    dataset = dataset.prefetch(AUTOTUNE)
    return dataset
Example #19
    def apply(self, dataset: tf.data.Dataset, mode: str = None):
        """Apply preprocessors as one filter operation"""
        # Filter preprocessors for this mode
        active_prepros: List[core.Filter] = []
        for prepro in self.preprocessors:
            if mode is not None and prepro.modes is not None and mode not in prepro.modes:
                LOGGER.info(f"Not applying {prepro} (mode={mode})")
                continue
            active_prepros.append(prepro)

        # Apply filtered preprocessors
        if not active_prepros:
            return dataset
        else:

            def _fused_tf_predicate(element):
                pred = None
                for prepro in active_prepros:
                    new_pred = prepro.tf_predicate(element)
                    pred = new_pred if pred is None else tf.logical_and(
                        pred, new_pred)
                return pred

            return dataset.filter(_fused_tf_predicate)
Example #20
 def filter_preprocessor(dataset: tf.data.Dataset) -> tf.data.Dataset:
     return dataset.filter(predicate)
Example #21
 def apply(self, dataset: tf.data.Dataset, mode: str = None):
     if mode is not None and self.modes is not None and mode not in self.modes:
         LOGGER.info(f"Not applying {self} (mode={mode})")
         return dataset
     return dataset.filter(self.tf_predicate)