def __init__(self, which_set, which_partitions, dictionary, **kwargs):
    if which_set not in ('training', 'heldout'):
        raise ValueError
    if which_set == 'training':
        if not all(partition in range(1, 100)
                   for partition in which_partitions):
            raise ValueError
        files = [find_in_data_path(os.path.join(
            '1-billion-word', 'training-monolingual.tokenized.shuffled',
            'news.en-{:05d}-of-00100'.format(partition)))
            for partition in which_partitions]
    else:
        if not all(partition in range(50)
                   for partition in which_partitions):
            raise ValueError
        files = [find_in_data_path(os.path.join(
            '1-billion-word', 'heldout-monolingual.tokenized.shuffled',
            'news.en.heldout-{:05d}-of-00050'.format(partition)))
            for partition in which_partitions]
    super(OneBillionWord, self).__init__(files, dictionary, **kwargs)
def test_prepare_metadata():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE, TEST_GROUNDTRUTH])
    devkit_path = find_in_data_path(DEVKIT_ARCHIVE)
    test_gt_path = find_in_data_path(TEST_GROUNDTRUTH)
    n_train, v_gt, t_gt, wnid_map = prepare_metadata(devkit_path,
                                                     test_gt_path)
    assert n_train == 1261406
    assert len(v_gt) == 50000
    assert len(t_gt) == 150000
    assert sorted(wnid_map.values()) == list(range(1000))
    assert all(isinstance(k, six.string_types) and len(k) == 9
               for k in wnid_map)
def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True):
    """Get iterator for dataset, split, targets (labels) and scaling
    (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources

    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    datastream = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        datastream.default_transformers = uint8_pixels_to_floatX(
            ('features',))

    train_stream = DataStream.default_stream(
        dataset=datastream,
        iteration_scheme=SequentialExampleScheme(datastream.num_examples))

    it = train_stream.get_epoch_iterator()
    return it
def __init__(self, which_sets, height, width, N, n_iter, **kwargs):
    super(SVHN, self).__init__(
        file_or_path=find_in_data_path('svhn_format_1.hdf5'),
        which_sets=which_sets, **kwargs)
    global N_global, height_global, width_global, n_iter_global
    height_global = height
    width_global = width
    N_global = N
    n_iter_global = n_iter
def test_read_devkit():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE])
    synsets, raw_valid_gt = read_devkit(find_in_data_path(DEVKIT_ARCHIVE))
    # synset sanity tests appear in test_read_metadata_mat_file
    assert raw_valid_gt.min() == 1
    assert raw_valid_gt.max() == 1000
    assert raw_valid_gt.dtype.kind == 'i'
    assert raw_valid_gt.shape == (50000,)
def __init__(self, which_sets, **kwargs):
    try:
        path = find_in_data_path(self._filename)
    except IOError as e:
        msg = str(e) + (". You need to download the dataset and convert "
                        "it to HDF5 first.")
        raise IOError(msg)
    super(Cars196Dataset, self).__init__(
        file_or_path=path, which_sets=which_sets, **kwargs)
def test_prepare_metadata():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE, TEST_IMAGES_TAR])
    devkit_path = find_in_data_path(DEVKIT_ARCHIVE)
    n_train, v_gt, n_test, wnid_map = prepare_metadata(devkit_path)
    assert n_train == 1281167
    assert len(v_gt) == 50000
    assert n_test == 100000
    assert sorted(wnid_map.values()) == list(range(1000))
    assert all(isinstance(k, six.string_types) and len(k) == 9
               for k in wnid_map)
def get_all_data_inorder(filename, batch_size):
    sources = ('features', 'targets')

    dataset_fname = find_in_data_path(filename + '.hdf5')
    data_all = H5PYDataset(dataset_fname,
                           which_sets=['train', 'valid', 'test'],
                           sources=sources)
    data_all.default_transformers = uint8_pixels_to_floatX(('features',))

    main_stream = DataStream.default_stream(
        dataset=data_all,
        iteration_scheme=SequentialScheme(data_all.num_examples,
                                          batch_size))
    color_stream = Colorize(main_stream, which_sources=('features',))

    return data_all.num_examples, color_stream
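# Hedged usage sketch for get_all_data_inorder above; the file basename
# 'celeba_64' is an assumption standing in for whatever hdf5 file lives in
# the configured data path.
def _example_get_all_data_inorder():
    num_examples, stream = get_all_data_inorder('celeba_64', batch_size=100)
    for features, targets in stream.get_epoch_iterator():
        # features arrive scaled to floatX in [0, 1] and passed through the
        # Colorize transformer
        break
    return num_examples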
def skip_if_not_available(modules=None, datasets=None, configurations=None):
    """Raises a SkipTest exception when requirements are not met.

    Parameters
    ----------
    modules : list
        A list of strings of module names. If one of the modules fails to
        import, the test will be skipped.
    datasets : list
        A list of strings of folder names. If the data path is not
        configured, or the folder does not exist, the test is skipped.
    configurations : list
        A list of strings of configuration names. If this configuration
        is not set and does not have a default, the test will be skipped.

    """
    if modules is None:
        modules = []
    if datasets is None:
        datasets = []
    if configurations is None:
        configurations = []
    for module in modules:
        try:
            import_module(module)
        except Exception:
            raise SkipTest
    if datasets and not hasattr(config, 'data_path'):
        raise SkipTest
    for dataset in datasets:
        try:
            find_in_data_path(dataset)
        except IOError:
            raise SkipTest
    for configuration in configurations:
        if not hasattr(config, configuration):
            raise SkipTest
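# Hedged usage sketch for skip_if_not_available above: a test guards on an
# optional module, a data file, and a config key before running.  The names
# 'h5py', 'mnist.hdf5' and 'data_path' are illustrative assumptions, not
# requirements taken from the original snippet.
def test_requires_optional_pieces():
    skip_if_not_available(modules=['h5py'],
                          datasets=['mnist.hdf5'],
                          configurations=['data_path'])
    # ... test body runs only when every requirement above is satisfied ...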
def test_read_metadata_mat_file():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE])
    with tarfile.open(find_in_data_path(DEVKIT_ARCHIVE)) as tar:
        meta_mat = tar.extractfile(DEVKIT_META_PATH)
        synsets = read_metadata_mat_file(meta_mat)
    assert (synsets['ILSVRC2012_ID'] ==
            numpy.arange(1, len(synsets) + 1)).all()
    assert synsets['num_train_images'][1000:].sum() == 0
    assert (synsets['num_train_images'][:1000] > 0).all()
    assert synsets.ndim == 1
    assert synsets['wordnet_height'].min() == 0
    assert synsets['wordnet_height'].max() == 19
    assert synsets['WNID'].dtype == numpy.dtype('S9')
    assert (synsets['num_children'][:1000] == 0).all()
    assert (synsets['children'][:1000] == -1).all()
def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True,
                         label_transforms=False, return_length=False):
    """Get iterator for dataset, split, targets (labels) and scaling
    (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources

    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features',))

    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))

    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream,
                                         chance=0,
                                         which_sources=('targets',))
        datastream = RandomLabelStrip(datastream,
                                      chance=0,
                                      which_sources=('targets',))
        # HACK: allow variable stretch
        datastream = StretchLabels(datastream,
                                   length=128,
                                   which_sources=('targets',))

    it = datastream.get_epoch_iterator()

    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it
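# Hedged usage sketch for get_dataset_iterator above; the dataset name
# 'mnist' is an assumption about which hdf5 files exist in the data path.
it, n_examples = get_dataset_iterator('mnist', 'train',
                                      include_targets=True,
                                      return_length=True)
# One example at a time, since the stream uses SequentialExampleScheme.
features, targets = next(it)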
def __init__(self, split='train', **kwargs):
    path = find_in_data_path(self._filename)
    self.split = split

    self.train = H5PYDataset(file_or_path=path, which_sets=['train'])
    self.train_labels = H5PYDataset(
        file_or_path=path, which_sets=['train'],
        sources=['targets'], load_in_memory=True).data_sources[0].ravel()
    self.test = H5PYDataset(file_or_path=path, which_sets=['test'])
    self.test_labels = H5PYDataset(
        file_or_path=path, which_sets=['test'],
        sources=['targets'], load_in_memory=True).data_sources[0].ravel()

    self.train_handle = self.train.open()
    self.test_handle = self.test.open()
    self.ntest = self.test.num_examples
    self.ntrain = self.train.num_examples
def test_read_metadata_mat_file():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE])
    with tarfile.open(find_in_data_path(DEVKIT_ARCHIVE)) as tar:
        meta_mat = tar.extractfile(DEVKIT_META_PATH)
        synsets, cost_mat = read_metadata_mat_file(meta_mat)
    assert (synsets['ILSVRC2010_ID'] ==
            numpy.arange(1, len(synsets) + 1)).all()
    assert synsets['num_train_images'][1000:].sum() == 0
    assert (synsets['num_train_images'][:1000] > 0).all()
    assert synsets.ndim == 1
    assert synsets['wordnet_height'].min() == 0
    assert synsets['wordnet_height'].max() == 19
    assert synsets['WNID'].dtype == numpy.dtype('S9')
    assert (synsets['num_children'][:1000] == 0).all()
    assert (synsets['children'][:1000] == -1).all()

    # Assert the basics about the cost matrix.
    assert cost_mat.shape == (1000, 1000)
    assert cost_mat.dtype == 'uint8'
    assert cost_mat.min() == 0
    assert cost_mat.max() == 18
    assert (cost_mat == cost_mat.T).all()
    # Assert that the diagonal is 0.
    assert (cost_mat.flat[::1001] == 0).all()
def read_frame(cls, key, *args, **kwargs):
    """Load a set of features from the dataset as a pandas object.

    Args:
        key (str):
            The HDF5 key for required data.  Typically, this will be one of

                - structure: for the raw molecules
                - smiles: for the smiles
                - features/{feat_name}: for the features
                - targets/{targ_name}: for the targets

    Returns:
        pd.Series or pd.DataFrame or pd.Panel
            The data as a dataframe.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        data = pd.read_hdf(find_in_data_path(cls.filename), key,
                           *args, **kwargs)
    if isinstance(data, pd.Panel):
        data = data.transpose(2, 1, 0)
    return data
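# Hedged usage sketch for read_frame above: given some concrete dataset
# class that defines `filename` and exposes read_frame as a classmethod,
# individual HDF5 keys can be pulled out as pandas objects.  Only the key
# patterns come from the docstring; 'features/fingerprints' is an assumed
# feature name used purely for illustration.
def load_smiles_and_features(dataset_cls):
    smiles = dataset_cls.read_frame('smiles')
    features = dataset_cls.read_frame('features/fingerprints')
    return smiles, features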
def get_data(data_name):
    if data_name == 'mnist_transform':
        from fuel.datasets import H5PYDataset
        from fuel.utils import find_in_data_path
        img_size = (60, 60)
        channels = 1
        f_name = find_in_data_path('mnist_transform.hdf5')
        data_train = H5PYDataset(f_name, which_sets=['train'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_valid = H5PYDataset(f_name, which_sets=['valid'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_test = H5PYDataset(f_name, which_sets=['test'],
                                load_in_memory=True,
                                sources=['features', 'targets'])
    elif data_name == 'mnist_multi_translated':
        from fuel.datasets import H5PYDataset
        from fuel.utils import find_in_data_path
        img_size = (28 * 3, 28 * 3)
        channels = 1
        f_name = find_in_data_path('multi_mnist_translated.hdf5')
        data_train = H5PYDataset(f_name, which_sets=['train'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_valid = H5PYDataset(f_name, which_sets=['valid'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_test = H5PYDataset(f_name, which_sets=['test'],
                                load_in_memory=True,
                                sources=['features', 'targets'])
    elif data_name == 'mnist':
        from fuel.datasets import MNIST
        img_size = (28, 28)
        channels = 1
        data_train = MNIST(which_sets=["train"],
                           sources=['features', 'targets'])
        data_valid = MNIST(which_sets=["test"],
                           sources=['features', 'targets'])
        data_test = MNIST(which_sets=["test"],
                          sources=['features', 'targets'])
    elif data_name == 'bmnist':
        from fuel.datasets.binarized_mnist import BinarizedMNIST
        img_size = (28, 28)
        channels = 1
        data_train = BinarizedMNIST(which_sets=['train'],
                                    sources=['features'])
        data_valid = BinarizedMNIST(which_sets=['valid'],
                                    sources=['features'])
        data_test = BinarizedMNIST(which_sets=['test'],
                                   sources=['features'])
    # TODO: make a generic catch-all for
    # loading custom datasets like "colormnist"
    elif data_name == 'colormnist':
        from draw.colormnist import ColorMNIST
        img_size = (28, 28)
        channels = 3
        data_train = ColorMNIST(which_sets=['train'], sources=['features'])
        data_valid = ColorMNIST(which_sets=['test'], sources=['features'])
        data_test = ColorMNIST(which_sets=['test'], sources=['features'])
    elif data_name == 'cifar10':
        from fuel.datasets.cifar10 import CIFAR10
        img_size = (32, 32)
        channels = 3
        data_train = CIFAR10(which_sets=['train'], sources=['features'])
        data_valid = CIFAR10(which_sets=['test'], sources=['features'])
        data_test = CIFAR10(which_sets=['test'], sources=['features'])
    elif data_name == 'svhn2':
        from fuel.datasets.svhn import SVHN
        img_size = (32, 32)
        channels = 3
        data_train = SVHN(which_format=2, which_sets=['train'],
                          sources=['features'])
        data_valid = SVHN(which_format=2, which_sets=['test'],
                          sources=['features'])
        data_test = SVHN(which_format=2, which_sets=['test'],
                         sources=['features'])
    elif data_name == 'silhouettes':
        from fuel.datasets.caltech101_silhouettes import CalTech101Silhouettes
        size = 28
        img_size = (size, size)
        channels = 1
        data_train = CalTech101Silhouettes(which_sets=['train'], size=size,
                                           sources=['features'])
        data_valid = CalTech101Silhouettes(which_sets=['valid'], size=size,
                                           sources=['features'])
        data_test = CalTech101Silhouettes(which_sets=['test'], size=size,
                                          sources=['features'])
    elif data_name == 'tfd':
        from fuel.datasets.toronto_face_database import TorontoFaceDatabase
        img_size = (28, 28)
        channels = 1
        # NOTE: `size` is only bound in the 'silhouettes' branch, never in
        # this one, so this branch raises a NameError as written.
        data_train = TorontoFaceDatabase(which_sets=['unlabeled'], size=size,
                                         sources=['features'])
        data_valid = TorontoFaceDatabase(which_sets=['valid'], size=size,
                                         sources=['features'])
        data_test = TorontoFaceDatabase(which_sets=['test'], size=size,
                                        sources=['features'])
    else:
        raise ValueError("Unknown dataset %s" % data_name)

    return img_size, channels, data_train, data_valid, data_test
def available_sets(cls):
    with h5py.File(find_in_data_path(cls.filename)) as f:
        return cls.get_all_splits(f)
def __init__(self, which_sets, **kwargs):
    kwargs.setdefault('load_in_memory', True)
    super(SubredditTopPhotosFeatures22, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, **kwargs)
def __init__(self, which_sets, **kwargs): kwargs.setdefault("load_in_memory", False) super(SVHN17, self).__init__( file_or_path=find_in_data_path(self.filename), which_sets=which_sets, **kwargs)
def setup_data(p, test_set=False):
    dataset_class = {
        'cifar10': (CIFAR10),
        'jos': (JOS),
        'mnist': (MNIST),
    }[p.dataset]

    training_set_size = p.unlabeled_samples

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(["train"])

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permutate them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    fn = find_in_data_path(train_set.filename)
    # iprint(fn)
    s1 = H5PYDataset(fn, ("train",))
    handle = s1.open()
    in_dim = s1.get_data(handle, slice(0, 1))[0].shape[1:]
    s1.close(handle)
    # in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]

    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=list(i))[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
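# Hedged usage sketch for setup_data above.  The hyper-parameter values are
# assumptions, and AttributeDict is the same attribute-access dict used by
# the snippet (constructed here from keyword arguments).
p = AttributeDict(dataset='mnist', unlabeled_samples=50000, dseed=1,
                  valid_set_size=10000, whiten_zca=0, contrast_norm=0)
in_dim, d, whiten, cnorm = setup_data(p, test_set=False)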
def __init__(self, which_sets, **kwargs):
    super(MNISTCluttered, self).__init__(
        file_or_path=find_in_data_path('mnist_cluttered.hdf5'),
        which_sets=which_sets, **kwargs)
def data_path(self):
    return find_in_data_path(self.filename)
def __init__(self, which_sets, **kwargs):
    kwargs.setdefault('load_in_memory', True)
    super(MNIST, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, **kwargs)
def test_returns_file_path(self):
    assert_equal(find_in_data_path('file_2.txt'),
                 os.path.join(self.tempdir, 'dir2', 'file_2.txt'))
def __init__(self, which_format, which_sets, **kwargs):
    self.which_format = which_format
    super(CelebA, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, **kwargs)
def test_returns_first_file_found(self):
    assert_equal(find_in_data_path('file_1.txt'),
                 os.path.join(self.tempdir, 'dir1', 'file_1.txt'))
def __init__(self, which_sets, load_in_memory=True, **kwargs):
    super(BinarizedMNIST, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, load_in_memory=load_in_memory, **kwargs)
def __init__(self, youtube_id, **kwargs):
    super(YouTubeAudio, self).__init__(
        file_or_path=find_in_data_path('{}.hdf5'.format(youtube_id)),
        which_sets=('train',), **kwargs
    )
def create_custom_streams(filename, training_batch_size,
                          monitoring_batch_size, include_targets=False,
                          color_convert=False, allowed=None, stretch=False,
                          split_names=['train', 'valid', 'test']):
    """Creates data streams from fuel hdf5 file.

    Currently features must be 64x64.

    Parameters
    ----------
    filename : string
        basename to hdf5 file for input
    training_batch_size : int
        Batch size for training.
    monitoring_batch_size : int
        Batch size for monitoring.
    include_targets : bool
        If ``True``, use both features and targets. If ``False``, use
        features only.
    color_convert : bool
        If ``True``, input is assumed to be one-channel, and so will be
        transformed to three-channel by duplication.

    Returns
    -------
    rval : tuple of data streams
        Data streams for the main loop, the training set monitor,
        the validation set monitor and the test set monitor.

    """
    sources = ('features', 'targets') if include_targets else ('features',)

    dataset_fname = find_in_data_path(filename + '.hdf5')
    data_train = H5PYDataset(dataset_fname, which_sets=[split_names[0]],
                             sources=sources)
    data_valid = H5PYDataset(dataset_fname, which_sets=[split_names[1]],
                             sources=sources)
    data_test = H5PYDataset(dataset_fname, which_sets=[split_names[2]],
                            sources=sources)
    data_train.default_transformers = uint8_pixels_to_floatX(('features',))
    data_valid.default_transformers = uint8_pixels_to_floatX(('features',))
    data_test.default_transformers = uint8_pixels_to_floatX(('features',))

    results = create_streams(data_train, data_valid, data_test,
                             training_batch_size, monitoring_batch_size)

    if color_convert:
        results = tuple(map(
            lambda s: Colorize(s, which_sources=('features',)), results))

    # wrap labels in stretcher if requested
    if stretch:
        results = tuple(map(
            lambda s: StretchLabels(s, which_sources=('targets',)), results))

    # wrap labels in scrubber if not all labels are allowed
    if allowed:
        results = tuple(map(
            lambda s: Scrubber(s, allowed=allowed,
                               which_sources=('targets',)), results))

    return results
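# Hedged usage sketch for create_custom_streams above: the returned tuple is
# unpacked following its docstring (main loop stream, then train/valid/test
# monitors).  The basename 'celeba_64' and the batch sizes are assumptions.
main_loop_stream, train_monitor, valid_monitor, test_monitor = \
    create_custom_streams(filename='celeba_64',
                          training_batch_size=128,
                          monitoring_batch_size=500,
                          include_targets=True)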
def __init__(self, **kwargs):
    kwargs.setdefault('load_in_memory', True)
    super(Dataset, self).__init__(
        file_or_path=find_in_data_path(self.filename), **kwargs)
def __init__(self, which_sets, **kwargs):
    super(DogsVsCats, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, **kwargs)
def __init__(self, which_sets, **kwargs):
    kwargs.setdefault('load_in_memory', False)
    super(ILSVRC2010, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, **kwargs)
def get_filepath(filename=None):
    if filename is None:
        filename = MMImdbDataset.filename
    return find_in_data_path(filename)
def dataset_class(which_sets):
    return H5PYDataset(file_or_path=find_in_data_path(fn),
                       which_sets=which_sets, load_in_memory=True)
def create_custom_streams(filename, training_batch_size,
                          monitoring_batch_size, include_targets=False,
                          color_convert=False, allowed=None, stretch=None,
                          random_spread=False, random_label_strip=False,
                          add_label_uncertainty=False, uuid_str=None,
                          split_names=['train', 'valid', 'test']):
    """Creates data streams from fuel hdf5 file.

    Currently features must be 64x64.

    Parameters
    ----------
    filename : string
        basename to hdf5 file for input
    training_batch_size : int
        Batch size for training.
    monitoring_batch_size : int
        Batch size for monitoring.
    include_targets : bool
        If ``True``, use both features and targets. If ``False``, use
        features only.
    color_convert : bool
        If ``True``, input is assumed to be one-channel, and so will be
        transformed to three-channel by duplication.

    Returns
    -------
    rval : tuple of data streams
        Data streams for the main loop, the training set monitor,
        the validation set monitor and the test set monitor.

    """
    sources = ('features', 'targets') if include_targets else ('features',)

    dataset_fname = find_in_data_path(filename + '.hdf5')
    data_train = H5PYDataset(dataset_fname, which_sets=[split_names[0]],
                             sources=sources)
    data_valid = H5PYDataset(dataset_fname, which_sets=[split_names[1]],
                             sources=sources)
    data_test = H5PYDataset(dataset_fname, which_sets=[split_names[2]],
                            sources=sources)
    data_train.default_transformers = uint8_pixels_to_floatX(('features',))
    data_valid.default_transformers = uint8_pixels_to_floatX(('features',))
    data_test.default_transformers = uint8_pixels_to_floatX(('features',))

    results = create_streams(data_train, data_valid, data_test,
                             training_batch_size, monitoring_batch_size)

    if color_convert:
        results = tuple(map(
            lambda s: Colorize(s, which_sources=('features',)), results))

    if add_label_uncertainty:
        results = tuple(map(
            lambda s: AddLabelUncertainty(s, chance=add_label_uncertainty,
                                          which_sources=('targets',)),
            results))

    if random_label_strip:
        results = tuple(map(
            lambda s: RandomLabelStrip(s, chance=random_label_strip,
                                       which_sources=('targets',)),
            results))

    # wrap labels in stretcher if requested
    if stretch is not None:
        results = tuple(map(
            lambda s: StretchLabels(s, which_sources=('targets',),
                                    length=stretch),
            results))

    # wrap labels in scrubber if not all labels are allowed
    if allowed:
        results = tuple(map(
            lambda s: Scrubber(s, allowed=allowed,
                               which_sources=('targets',)),
            results))

    if random_spread:
        results = tuple(map(
            lambda s: RandomLabelOptionalSpreader(s,
                                                  which_sources=('targets',)),
            results))

    if uuid_str is not None:
        results = tuple(map(
            lambda s: UUIDStretch(s, uuid_str=uuid_str,
                                  which_sources=('targets',)),
            results))

    return results
def __init__(self, which_sets, **kwargs):
    kwargs.setdefault('load_in_memory', False)
    super(TinyILSVRC2012, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, **kwargs)
def __init__(self, which_sets, **kwargs):
    kwargs.setdefault('load_in_memory', True)
    super(LFW, self).__init__(
        file_or_path=find_in_data_path(self.filename),
        which_sets=which_sets, **kwargs)