Example No. 1
    def generate(self, count, batch_size=1, rand=None, *args, **kwargs):
        """
        Generator of datasets

        :param rand: random seed, state or None
        :param count: number of datasets to generate
        :param batch_size: number of episodes to generate at each call
        :param args:
        :param kwargs:
        :return: one or a list of Datasets objects
        """
        if not args:
            args = self.args
        if not kwargs:
            kwargs = self.kwargs
        rand = get_rand_state(rand)
        for _ in range(count):
            if batch_size == 1:
                yield self.generate_datasets(rand=rand, *args, **kwargs)
            else:
                yield self.generate_batch(batch_size, rand=rand, *args, **kwargs)
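
This snippet (and the ones below) relies on a `get_rand_state` helper that is not shown here. A minimal sketch of what it presumably does, accepting None, an integer seed or an existing random state (an assumption for illustration, not the library's actual code):

import numpy as np

def get_rand_state(rand=None):
    # sketch only: normalize `rand` into a numpy RandomState
    if isinstance(rand, np.random.RandomState):
        return rand                          # already a random state: use it as-is
    if rand is None or isinstance(rand, (int, np.integer)):
        return np.random.RandomState(rand)   # fresh state, optionally seeded
    raise ValueError("rand must be None, an int seed or a RandomState")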
Example No. 2
    def __init__(self, dataset, batch_size, epochs=None, rnd=None):
        """
        Class for stochastic sampling of data points. It is most useful for feeding examples for the the
        training ops of `ReverseHG` or `ForwardHG`. Most notably, if the number of epochs is specified,
        the class takes track of the examples per mini-batches which is important for the backward pass
        of `ReverseHG` method.

        :param dataset: instance of `Dataset` class
        :param batch_size:
        :param epochs: number of epochs (can be None, in which case examples are
                        fed continuously)
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.epochs = epochs
        # total number of mini-batches: batches per epoch (rounded up), times the number of epochs if given
        self.T = int(np.ceil(dataset.num_examples / batch_size))
        if self.epochs:
            self.T *= self.epochs

        self.rnd = get_rand_state(rnd)

        self.training_schedule = None
        # number of complete mini-batches per epoch (floor division)
        self.iter_per_epoch = int(dataset.num_examples / batch_size)
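
The two counters differ when the dataset size is not a multiple of the batch size. A small worked example with illustrative numbers (not taken from the snippet):

import numpy as np

num_examples, batch_size, epochs = 1000, 32, 5
T_per_epoch = int(np.ceil(num_examples / batch_size))   # 32 mini-batches, the last one partial
T_total = T_per_epoch * epochs                           # 160 steps over all epochs
iter_per_epoch = num_examples // batch_size              # 31 complete mini-batches
print(T_per_epoch, T_total, iter_per_epoch)              # 32 160 31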
Example No. 3
def redivide_data(
    datasets,
    partition_proportions=None,
    shuffle=False,
    filters=None,
    maps=None,
    balance_classes=False,
    rand=None,
):
    """
    Function that redivides datasets. Can be use also to shuffle or filter or map examples.

    :param rand:
    :param balance_classes: # TODO RICCARDO
    :param datasets: original datasets, instances of class Dataset (works with get_data and get_targets for
                        compatibility with mnist datasets
    :param partition_proportions: (optional, default None)  list of fractions that can either sum up to 1 or less
                                    then one, in which case one additional partition is created with
                                    proportion 1 - sum(partition proportions).
                                    If None it will retain the same proportion of samples found in datasets
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters: functions with signature
                        (data, target, index) -> boolean (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with signature
                        (data, target, index) ->  (new_data, new_target) (maps the old sample to a new one,
                        possibly also to more
                        than one sample, for data augmentation)
    :return: a list of datasets of length equal to the (possibly augmented) partition_proportion
    """

    rnd = get_rand_state(rand)

    all_data = vstack([get_data(d) for d in datasets])
    all_labels = stack_or_concat([get_targets(d) for d in datasets])

    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list([partition_proportions] if isinstance(
            partition_proportions, float) else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, (
            "partition proportions must sum up to at most one: %d" %
            sum_proportions)
        if sum_proportions < 1.0:
            partition_proportions += [1.0 - sum_proportions]
    else:
        partition_proportions = [
            1.0 * get_data(d).shape[0] / N for d in datasets
        ]

    if shuffle:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        # if sk_shuffle:  # TODO this does not work!!! find a way to shuffle these matrices while
        # keeping compatibility with tensorflow!
        #     all_data, all_labels, all_infos = sk_shuffle(all_data, all_labels, all_infos)
        # else:
        permutation = np.arange(all_data.shape[0])
        rnd.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        filters = as_list(filters)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for _filter in filters:
            data_triple = [
                xy for i, xy in enumerate(data_triple)
                if _filter(xy[0], xy[1], xy[2], i)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        maps = as_list(maps)
        data_triple = [(x, y, d)
                       for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [
                _map(xy[0], xy[1], xy[2], i)
                for i, xy in enumerate(data_triple)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

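    # cumulative partition boundaries, e.g. N = 100 with proportions [0.25, 0.25, 0.5] gives [0, 25, 50, 100]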
    calculated_partitions = reduce(
        lambda v1, v2: v1 + [v1[-1] + v2],
        [int(N * prp) for prp in partition_proportions],
        [0],
    )
    calculated_partitions[-1] = N

    print(
        "datasets.redivide_data:, computed partitions numbers -",
        calculated_partitions,
        "len all",
        N,
        end=" ",
    )

    # merge the `info` dictionaries of all the original datasets
    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.info}

    if balance_classes:
        new_datasets = []
        forbidden_indices = np.empty(0, dtype=np.int64)
        for d1, d2 in zip(calculated_partitions[:-1],
                          calculated_partitions[1:-1]):
            indices = np.array(
                get_indices_balanced_classes(d2 - d1, all_labels,
                                             forbidden_indices))
            dataset = dl.Dataset(
                data=all_data[indices],
                target=all_labels[indices],
                sample_info=all_infos[indices],
                info=new_general_info_dict,
            )
            new_datasets.append(dataset)
            forbidden_indices = np.append(forbidden_indices, indices)
            test_if_balanced(dataset)
        remaining_indices = np.array(
            list(set(range(N)) - set(forbidden_indices)))
        new_datasets.append(
            dl.Dataset(
                data=all_data[remaining_indices],
                target=all_labels[remaining_indices],
                sample_info=all_infos[remaining_indices],
                info=new_general_info_dict,
            ))
    else:
        new_datasets = [
            dl.Dataset(
                data=all_data[d1:d2],
                target=all_labels[d1:d2],
                sample_info=all_infos[d1:d2],
                info=new_general_info_dict,
            ) for d1, d2 in zip(calculated_partitions,
                                calculated_partitions[1:])
        ]

    print("DONE")

    return new_datasets
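
For the `filters` and `maps` arguments, the expected signatures follow the calls inside the function: four positional arguments (data, target, sample_info, index). Two illustrative examples (the concrete functions are assumptions, not part of the library):

import numpy as np

def drop_class_zero(x, y, info, i):
    # filter: accept the sample only if its one-hot label is not class 0
    return np.argmax(y) != 0

def add_noise(x, y, info, i, sigma=0.01):
    # map: perturb the data, leave target and sample_info unchanged
    return x + sigma * np.random.randn(*np.shape(x)), y, info

print(drop_class_zero(np.ones(5), np.array([0.0, 1.0]), {}, 0))  # True: argmax(y) is 1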
Example No. 4
def balanced_choice_wr(a, num, rand=None):
    """Chooses `num` elements from `a` as evenly as possible: repeated full passes
    over `a` without replacement, plus one final partial pass for the remainder."""
    rand = get_rand_state(rand)
    # sizes of the successive draws: num // len(a) full passes, then the num % len(a) remainder
    lst = [len(a)] * (num // len(a)) + [num % len(a)]
    return np.concatenate(
        [rand.choice(a, size=(d, ), replace=False) for d in lst])
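
A quick illustration (assuming the `get_rand_state` sketch given after Example No. 1, so that an integer seed is accepted):

import numpy as np

sample = balanced_choice_wr(np.arange(3), 7, rand=0)
# 7 // 3 = 2 full passes over {0, 1, 2} plus a final pass of size 7 % 3 = 1,
# so every class appears at least twice and exactly one class appears three times
print(np.bincount(sample))   # each count is 2 or 3, summing to 7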