import os
from functools import reduce

import numpy as np
# read_data_sets is assumed here to be the TF 1.x MNIST loader; Dataset, Datasets,
# get_data, get_targets and merge_dicts are defined elsewhere in this package.
from tensorflow.examples.tutorials.mnist.input_data import read_data_sets


def mnist(data_root_folder=None, one_hot=True, partitions=(0.8, .1), shuffle=False):
    """
    Loads (downloading it if necessary) the MNIST dataset and optionally splits it into
    different training, validation and test sets (use the partitions parameter for that).
    """
    data_folder_name = 'mnist'
    if data_root_folder is None:
        data_root_folder = os.path.join(os.getcwd(), 'DATA')
        if not os.path.exists(data_root_folder):
            os.mkdir(data_root_folder)
    data_folder = os.path.join(data_root_folder, data_folder_name)

    datasets = read_data_sets(data_folder, one_hot=one_hot)
    train = Dataset(datasets.train.images, datasets.train.labels, name='MNIST')
    validation = Dataset(datasets.validation.images, datasets.validation.labels, name='MNIST')
    test = Dataset(datasets.test.images, datasets.test.labels, name='MNIST')

    res = [train, validation, test]
    if partitions:
        res = redivide_data(res, partition_proportions=partitions, shuffle=shuffle)
    return Datasets.from_list(res)
def redivide_data(datasets, partition_proportions=None, shuffle=False):
    """
    Redivides datasets. Can also be used to shuffle, filter or map examples.

    :param datasets: original datasets, instances of class Dataset (works with get_data and
                     get_targets for compatibility with mnist datasets)
    :param partition_proportions: (optional, default None) list of fractions that can sum up to 1
                                  or to less than 1, in which case one additional partition is
                                  created with proportion 1 - sum(partition_proportions).
                                  If None, the original proportions of samples in datasets are kept.
    :param shuffle: (optional, default False) if True, shuffles the examples
    :return: a list of datasets whose length equals that of the (possibly extended)
             partition_proportions
    """
    all_data = np.vstack([get_data(d) for d in datasets])
    all_labels = np.vstack([get_targets(d) for d in datasets])
    all_infos = np.concatenate([d.sample_info for d in datasets])

    N = all_data.shape[0]

    if partition_proportions:  # argument check
        partition_proportions = list([partition_proportions] if isinstance(partition_proportions, float)
                                     else partition_proportions)
        sum_proportions = sum(partition_proportions)
        assert sum_proportions <= 1, "partition proportions must sum up to at most one: %s" % sum_proportions
        if sum_proportions < 1.:
            partition_proportions += [1. - sum_proportions]
    else:
        partition_proportions = [1. * get_data(d).shape[0] / N for d in datasets]

    if shuffle:
        permutation = np.arange(all_data.shape[0])
        np.random.shuffle(permutation)

        all_data = all_data[permutation]
        all_labels = np.array(all_labels[permutation])
        all_infos = np.array(all_infos[permutation])

    N = all_data.shape[0]
    assert N == all_labels.shape[0]

    # cumulative partition boundaries: [0, n0, n0 + n1, ...]
    calculated_partitions = reduce(
        lambda v1, v2: v1 + [v1[-1] + v2],
        [int(N * prp) for prp in partition_proportions],
        [0]
    )
    calculated_partitions[-1] = N  # make sure no example is dropped by the int() rounding

    print('datasets.redivide_data: computed partition boundaries -', calculated_partitions,
          '- total examples:', N, end=' ')

    new_general_info_dict = merge_dicts(*[d.info for d in datasets])
    new_datasets = [
        Dataset(data=all_data[d1:d2], target=all_labels[d1:d2], sample_info=all_infos[d1:d2],
                info=new_general_info_dict)
        for d1, d2 in zip(calculated_partitions, calculated_partitions[1:])
    ]

    print('DONE')

    return new_datasets
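# Usage sketch for redivide_data (added for illustration; the helper below is hypothetical and
# not part of the library). It merges three existing Dataset splits, such as the ones built by
# mnist() above, and re-divides them with shuffling. The proportions sum to 0.9, so
# redivide_data appends a third partition holding the remaining ~10% of the examples,
# giving roughly a 70/20/10 split.
def _resplit_70_20_10_example(train, validation, test):
    return redivide_data([train, validation, test],
                         partition_proportions=(.7, .2),
                         shuffle=True)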
def mnist(folder=None, one_hot=True, partitions=None, shuffle=False):
    """
    Loads (downloading it if necessary) the MNIST dataset and optionally splits it into
    different training, validation and test sets (use the partitions parameter for that).

    :param folder: directory where the MNIST data is stored (or downloaded to)
    :param one_hot: if True, returns the labels as one-hot vectors
    :param partitions: optional sequence of fractions passed to redivide_data to re-split the data
    :param shuffle: if True, shuffles the examples before re-splitting
    :return: a Datasets instance built from the (possibly re-partitioned) train, validation
             and test splits
    """
    datasets = read_data_sets(folder, one_hot=one_hot)
    train = Dataset(datasets.train.images, datasets.train.labels, name='MNIST')
    validation = Dataset(datasets.validation.images, datasets.validation.labels, name='MNIST')
    test = Dataset(datasets.test.images, datasets.test.labels, name='MNIST')
    res = [train, validation, test]
    if partitions:
        res = redivide_data(res, partition_proportions=partitions, shuffle=shuffle)
        res += [None] * (3 - len(res))  # pad so that Datasets.from_list always receives three entries
    return Datasets.from_list(res)
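# Minimal demo sketch (an illustrative addition, not part of the original module): running the
# file as a script loads MNIST into an example 'DATA' folder, re-splits it roughly 70/20/10
# with shuffling via redivide_data, and prints the resulting Datasets object.
if __name__ == '__main__':
    mnist_datasets = mnist(folder='DATA', partitions=(.7, .2), shuffle=True)
    print(mnist_datasets)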