def generate(self, count, batch_size=1, rand=None, *args, **kwargs):
    """
    Generator of datasets.

    :param count: number of datasets to generate
    :param batch_size: number of episodes to generate at each call
    :param rand: random seed, random state or None
    :param args: positional arguments for dataset generation (defaults to `self.args` if empty)
    :param kwargs: keyword arguments for dataset generation (defaults to `self.kwargs` if empty)
    :return: yields a single `Datasets` object, or a list of them when batch_size > 1
    """
    if not args:
        args = self.args
    if not kwargs:
        kwargs = self.kwargs
    rand = get_rand_state(rand)
    for _ in range(count):
        if batch_size == 1:
            yield self.generate_datasets(*args, rand=rand, **kwargs)
        else:
            yield self.generate_batch(batch_size, *args, rand=rand, **kwargs)
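# Usage sketch (not part of the source; names below are hypothetical). Assuming the
# enclosing generator object exposes `generate_datasets`/`generate_batch` and default
# `args`/`kwargs`, iteration might look like:
#
#     gen = SomeDatasetGenerator(...)                  # hypothetical instance
#     for dts in gen.generate(count=5, rand=0):        # batch_size=1 (default)
#         pass                                         # dts is a single Datasets object
#     for batch in gen.generate(count=5, batch_size=4, rand=0):
#         pass                                         # batch groups 4 episodes per call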
def __init__(self, dataset, batch_size, epochs=None, rnd=None):
    """
    Class for stochastic sampling of data points. It is most useful for feeding
    examples to the training ops of `ReverseHG` or `ForwardHG`. Most notably, if the
    number of epochs is specified, the class keeps track of the examples per
    mini-batch, which is important for the backward pass of the `ReverseHG` method.

    :param dataset: instance of the `Dataset` class
    :param batch_size: number of examples per mini-batch
    :param epochs: number of epochs (can be None, in which case examples are
                   fed continuously)
    :param rnd: random seed, random state or None
    """
    self.dataset = dataset
    self.batch_size = batch_size
    self.epochs = epochs
    self.T = int(np.ceil(dataset.num_examples / batch_size))
    if self.epochs:
        self.T *= self.epochs
    self.rnd = get_rand_state(rnd)
    self.training_schedule = None
    self.iter_per_epoch = int(dataset.num_examples / batch_size)
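# Worked example (not part of the source; the class name is hypothetical). With a
# `Dataset` of 1000 examples and batch_size=128: ceil(1000 / 128) = 8, so with
# epochs=10 the total number of iterations is T = 8 * 10 = 80, while
# iter_per_epoch = int(1000 / 128) = 7 (floor division, as in the code above):
#
#     sampler = ExampleSampler(ds, batch_size=128, epochs=10, rnd=0)
#     # sampler.T == 80, sampler.iter_per_epoch == 7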
def redivide_data(
        datasets,
        partition_proportions=None,
        shuffle=False,
        filters=None,
        maps=None,
        balance_classes=False,
        rand=None,
):
    """
    Redivides datasets. Can also be used to shuffle, filter or map examples.

    :param datasets: original datasets, instances of class Dataset (works with get_data and
                     get_targets for compatibility with mnist datasets)
    :param partition_proportions: (optional, default None) list of fractions that can either sum up
                                  to 1 or to less than one, in which case one additional partition is
                                  created with proportion 1 - sum(partition_proportions).
                                  If None, retains the same proportion of samples found in datasets
    :param shuffle: (optional, default False) if True shuffles the examples
    :param filters: (optional, default None) filter or list of filters: functions with signature
                    (data, target, sample_info, index) -> boolean (accept or reject the sample)
    :param maps: (optional, default None) map or list of maps: functions with signature
                 (data, target, sample_info, index) -> (new_data, new_target, new_sample_info)
                 (maps the old sample to a new one, possibly also to more than one sample,
                 for data augmentation)
    :param balance_classes: # TODO RICCARDO
    :param rand: random seed, random state or None
    :return: a list of datasets of length equal to the (possibly augmented) partition_proportions
    """
    rnd = get_rand_state(rand)

    all_data = vstack([get_data(d) for d in datasets])
    all_labels = stack_or_concat([get_targets(d) for d in datasets])
    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list(
            [partition_proportions] if isinstance(partition_proportions, float)
            else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, (
            "partition proportions must sum up to at most one: %f" % sum_proportions)
        if sum_proportions < 1.0:
            partition_proportions += [1.0 - sum_proportions]
    else:
        partition_proportions = [
            1.0 * get_data(d).shape[0] / N for d in datasets
        ]

    if shuffle:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        # if sk_shuffle:  # TODO this does not work!!! find a way to shuffle these
        #                 # matrices while keeping compatibility with tensorflow!
        #     all_data, all_labels, all_infos = sk_shuffle(all_data, all_labels, all_infos)
        # else:
        permutation = np.arange(all_data.shape[0])
        rnd.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    if filters:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        filters = as_list(filters)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for _filter in filters:
            data_triple = [
                xy for i, xy in enumerate(data_triple)
                if _filter(xy[0], xy[1], xy[2], i)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    if maps:
        if sp and isinstance(all_data, sp.sparse.csr.csr_matrix):
            raise NotImplementedError()
        maps = as_list(maps)
        data_triple = [(x, y, d) for x, y, d in zip(all_data, all_labels, all_infos)]
        for _map in maps:
            data_triple = [
                _map(xy[0], xy[1], xy[2], i) for i, xy in enumerate(data_triple)
            ]
        all_data = np.vstack([e[0] for e in data_triple])
        all_labels = np.vstack([e[1] for e in data_triple])
        all_infos = np.vstack([e[2] for e in data_triple])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

    # cumulative partition boundaries: [0, n0, n0 + n1, ...]
    calculated_partitions = reduce(
        lambda v1, v2: v1 + [v1[-1] + v2],
        [int(N * prp) for prp in partition_proportions],
        [0],
    )
    calculated_partitions[-1] = N

    print(
        "datasets.redivide_data: computed partition numbers -",
        calculated_partitions,
        "len all",
        N,
        end=" ",
    )

    new_general_info_dict = {}
    for data in datasets:
        new_general_info_dict = {**new_general_info_dict, **data.info}

    if balance_classes:
        new_datasets = []
        forbidden_indices = np.empty(0, dtype=np.int64)
        for d1, d2 in zip(calculated_partitions[:-1], calculated_partitions[1:-1]):
            indices = np.array(
                get_indices_balanced_classes(d2 - d1, all_labels, forbidden_indices))
            dataset = dl.Dataset(
                data=all_data[indices],
                target=all_labels[indices],
                sample_info=all_infos[indices],
                info=new_general_info_dict,
            )
            new_datasets.append(dataset)
            forbidden_indices = np.append(forbidden_indices, indices)
            test_if_balanced(dataset)
        remaining_indices = np.array(
            list(set(range(N)) - set(forbidden_indices)))
        new_datasets.append(
            dl.Dataset(
                data=all_data[remaining_indices],
                target=all_labels[remaining_indices],
                sample_info=all_infos[remaining_indices],
                info=new_general_info_dict,
            ))
    else:
        new_datasets = [
            dl.Dataset(
                data=all_data[d1:d2],
                target=all_labels[d1:d2],
                sample_info=all_infos[d1:d2],
                info=new_general_info_dict,
            )
            for d1, d2 in zip(calculated_partitions, calculated_partitions[1:])
        ]

    print("DONE")

    return new_datasets
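# Usage sketch (not part of the source; variable names are hypothetical). Splitting one
# dataset into 50%/25%/25% partitions with shuffling (the last fraction is added
# automatically because the given proportions sum to less than one):
#
#     train, valid, test = redivide_data(
#         [original_dataset],
#         partition_proportions=[0.5, 0.25],
#         shuffle=True,
#         rand=0,
#     )
#
# A filter dropping samples whose target is all zeros, and a map casting data to
# float32, could be passed as:
#
#     filters=lambda x, y, info, i: np.any(y)
#     maps=lambda x, y, info, i: (x.astype(np.float32), y, info)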
def balanced_choice_wr(a, num, rand=None):
    """Draws `num` elements from `a` as evenly as possible: repeated full passes
    without replacement, plus one partial pass for the remainder."""
    rand = get_rand_state(rand)
    lst = [len(a)] * (num // len(a)) + [num % len(a)]
    return np.concatenate(
        [rand.choice(a, size=(d,), replace=False) for d in lst])
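# Worked example (not part of the source): drawing 7 samples from a pool of 3 elements
# gives lst = [3, 3, 1], i.e. two full passes without replacement plus one extra
# element, so every element appears at least twice:
#
#     picks = balanced_choice_wr(np.arange(3), 7, rand=0)
#     # len(picks) == 7; each of 0, 1, 2 occurs 2 or 3 times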