Example No. 1
 def __init__(self, which_set, which_partitions, dictionary, **kwargs):
     if which_set not in ('training', 'heldout'):
         raise ValueError
     if which_set == 'training':
         if not all(partition in range(1, 100)
                    for partition in which_partitions):
             raise ValueError
         files = [
             find_in_data_path(
                 os.path.join('1-billion-word',
                              'training-monolingual.tokenized.shuffled',
                              'news.en-{:05d}-of-00100'.format(partition)))
             for partition in which_partitions
         ]
     else:
         if not all(partition in range(50)
                    for partition in which_partitions):
             raise ValueError
         files = [
             find_in_data_path(
                 os.path.join(
                     '1-billion-word',
                     'heldout-monolingual.tokenized.shuffled',
                     'news.en.heldout-{:05d}-of-00050'.format(partition)))
             for partition in which_partitions
         ]
     super(OneBillionWord, self).__init__(files, dictionary, **kwargs)
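A minimal usage sketch for the constructor above (an editor-added assumption, not part of the original listing): it presumes OneBillionWord is exported from fuel.datasets as in upstream Fuel and that a word-to-index dictionary is available; the tiny vocabulary below is a placeholder.

from fuel.datasets import OneBillionWord
from fuel.streams import DataStream

# Placeholder vocabulary; a real one maps every corpus word to an integer id.
vocabulary = {'<S>': 0, '</S>': 1, '<UNK>': 2}
# Read the first ten training partitions (news.en-00001 ... news.en-00010).
train = OneBillionWord('training', range(1, 11), vocabulary)
stream = DataStream(train)
print(next(stream.get_epoch_iterator()))  # first encoded sentence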
Example No. 2
def test_prepare_metadata():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE, TEST_GROUNDTRUTH])
    devkit_path = find_in_data_path(DEVKIT_ARCHIVE)
    test_gt_path = find_in_data_path(TEST_GROUNDTRUTH)
    n_train, v_gt, t_gt, wnid_map = prepare_metadata(devkit_path,
                                                     test_gt_path)
    assert n_train == 1261406
    assert len(v_gt) == 50000
    assert len(t_gt) == 150000
    assert sorted(wnid_map.values()) == list(range(1000))
    assert all(isinstance(k, six.string_types) and len(k) == 9
               for k in wnid_map)
Example No. 3
def get_dataset_iterator(dataset,
                         split,
                         include_features=True,
                         include_targets=False,
                         unit_scale=True):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split, )

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    datastream = H5PYDataset(dataset_fname, which_sets=splits, sources=sources)
    if unit_scale:
        datastream.default_transformers = uint8_pixels_to_floatX(
            ('features', ))

    train_stream = DataStream.default_stream(
        dataset=datastream,
        iteration_scheme=SequentialExampleScheme(datastream.num_examples))

    it = train_stream.get_epoch_iterator()
    return it
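A hedged usage sketch for the helper above; 'mnist' is a placeholder dataset name and assumes mnist.hdf5 is present on the Fuel data path.

it = get_dataset_iterator('mnist', 'train', include_targets=True)
example = next(it)  # tuple holding the requested sources for one example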
Example No. 4
 def __init__(self, which_sets, height, width, N, n_iter, **kwargs):
     super(SVHN, self).__init__(
         file_or_path=find_in_data_path('svhn_format_1.hdf5'),
         which_sets=which_sets, **kwargs)
     # Expose these arguments as module-level globals (presumably read elsewhere).
     global N_global, height_global, width_global, n_iter_global
     height_global = height
     width_global = width
     N_global = N
     n_iter_global = n_iter
Example No. 5
def test_read_devkit():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE])
    synsets, raw_valid_gt = read_devkit(find_in_data_path(DEVKIT_ARCHIVE))
    # synset sanity tests appear in test_read_metadata_mat_file
    assert raw_valid_gt.min() == 1
    assert raw_valid_gt.max() == 1000
    assert raw_valid_gt.dtype.kind == 'i'
    assert raw_valid_gt.shape == (50000,)
Example No. 6
 def __init__(self, which_sets, **kwargs):
     try:
         path = find_in_data_path(self._filename)
     except IOError as e:
         msg = str(e) + (""".
      You need to download the dataset and convert it to HDF5 first.""")
         raise IOError(msg)
     super(Cars196Dataset, self).__init__(
         file_or_path=path, which_sets=which_sets, **kwargs)
Example No. 7
def test_prepare_metadata():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE, TEST_IMAGES_TAR])
    devkit_path = find_in_data_path(DEVKIT_ARCHIVE)
    n_train, v_gt, n_test, wnid_map = prepare_metadata(devkit_path)
    assert n_train == 1281167
    assert len(v_gt) == 50000
    assert n_test == 100000
    assert sorted(wnid_map.values()) == list(range(1000))
    assert all(isinstance(k, six.string_types) and len(k) == 9
               for k in wnid_map)
Example No. 8
 def __init__(self, which_set, which_partitions, dictionary, **kwargs):
     if which_set not in ('training', 'heldout'):
         raise ValueError
     if which_set == 'training':
         if not all(partition in range(1, 100)
                    for partition in which_partitions):
             raise ValueError
         files = [find_in_data_path(os.path.join(
             '1-billion-word', 'training-monolingual.tokenized.shuffled',
             'news.en-{:05d}-of-00100'.format(partition)))
             for partition in which_partitions]
     else:
         if not all(partition in range(50)
                    for partition in which_partitions):
             raise ValueError
         files = [find_in_data_path(os.path.join(
             '1-billion-word', 'heldout-monolingual.tokenized.shuffled',
             'news.en.heldout-{:05d}-of-00050'.format(partition)))
             for partition in which_partitions]
     super(OneBillionWord, self).__init__(files, dictionary, **kwargs)
Example No. 9
def get_all_data_inorder(filename, batch_size):
    sources = ('features', 'targets')

    dataset_fname = find_in_data_path(filename+'.hdf5')
    data_all = H5PYDataset(dataset_fname, which_sets=['train', 'valid', 'test'],
                             sources=sources)
    data_all.default_transformers = uint8_pixels_to_floatX(('features',))
    main_stream = DataStream.default_stream(
        dataset=data_all,
        iteration_scheme=SequentialScheme(data_all.num_examples, batch_size))
    color_stream = Colorize(main_stream, which_sources=('features',))
    return data_all.num_examples, color_stream
Example No. 10
def get_all_data_inorder(filename, batch_size):
    sources = ('features', 'targets')

    dataset_fname = find_in_data_path(filename + '.hdf5')
    data_all = H5PYDataset(dataset_fname,
                           which_sets=['train', 'valid', 'test'],
                           sources=sources)
    data_all.default_transformers = uint8_pixels_to_floatX(('features', ))
    main_stream = DataStream.default_stream(dataset=data_all,
                                            iteration_scheme=SequentialScheme(
                                                data_all.num_examples,
                                                batch_size))
    color_stream = Colorize(main_stream, which_sources=('features', ))
    return data_all.num_examples, color_stream
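A short usage sketch (assumed, not from the original project); 'my_images' stands for any HDF5 basename on the Fuel data path with train/valid/test splits.

num_examples, stream = get_all_data_inorder('my_images', batch_size=100)
first_batch = next(stream.get_epoch_iterator())  # batch tuple of the requested sources
print(num_examples, first_batch[0].shape)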
Example No. 11
def skip_if_not_available(modules=None, datasets=None, configurations=None):
    """Raises a SkipTest exception when requirements are not met.

    Parameters
    ----------
    modules : list
        A list of strings of module names. If one of the modules fails to
        import, the test will be skipped.
    datasets : list
        A list of strings of folder names. If the data path is not
        configured, or the folder does not exist, the test is skipped.
    configurations : list
        A list of strings of configuration names. If this configuration
        is not set and does not have a default, the test will be skipped.

    """
    if modules is None:
        modules = []
    if datasets is None:
        datasets = []
    if configurations is None:
        configurations = []
    for module in modules:
        try:
            import_module(module)
        except Exception:
            raise SkipTest
    if datasets and not hasattr(config, 'data_path'):
        raise SkipTest
    for dataset in datasets:
        try:
            find_in_data_path(dataset)
        except IOError:
            raise SkipTest
    for configuration in configurations:
        if not hasattr(config, configuration):
            raise SkipTest
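A hedged sketch of the typical call pattern for this guard in a test; the module and dataset names below are placeholders.

def test_something_needing_data():
    # Skipped unless h5py imports and 'my_dataset.hdf5' is on the Fuel data path.
    skip_if_not_available(modules=['h5py'], datasets=['my_dataset.hdf5'])
    path = find_in_data_path('my_dataset.hdf5')
    assert path.endswith('my_dataset.hdf5')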
Example No. 12
def test_read_metadata_mat_file():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE])
    with tarfile.open(find_in_data_path(DEVKIT_ARCHIVE)) as tar:
        meta_mat = tar.extractfile(DEVKIT_META_PATH)
        synsets = read_metadata_mat_file(meta_mat)
    assert (synsets['ILSVRC2012_ID'] ==
            numpy.arange(1, len(synsets) + 1)).all()
    assert synsets['num_train_images'][1000:].sum() == 0
    assert (synsets['num_train_images'][:1000] > 0).all()
    assert synsets.ndim == 1
    assert synsets['wordnet_height'].min() == 0
    assert synsets['wordnet_height'].max() == 19
    assert synsets['WNID'].dtype == numpy.dtype('S9')
    assert (synsets['num_children'][:1000] == 0).all()
    assert (synsets['children'][:1000] == -1).all()
Example No. 13
def get_dataset_iterator(dataset,
                         split,
                         include_features=True,
                         include_targets=False,
                         unit_scale=True,
                         label_transforms=False,
                         return_length=False):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split, )

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits, sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features', ))

    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))

    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream,
                                         chance=0,
                                         which_sources=('targets', ))

        datastream = RandomLabelStrip(datastream,
                                      chance=0,
                                      which_sources=('targets', ))

        # HACK: allow variable stretch
        datastream = StretchLabels(datastream,
                                   length=128,
                                   which_sources=('targets', ))

    it = datastream.get_epoch_iterator()
    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it
Example No. 14
 def __init__(self, split='train', **kwargs):
     path = find_in_data_path(self._filename)
     self.split = split
     self.train = H5PYDataset(file_or_path=path, which_sets=['train'])
     self.train_labels = H5PYDataset(
         file_or_path=path,
         which_sets=['train'],
         sources=['targets'],
         load_in_memory=True).data_sources[0].ravel()
     self.test = H5PYDataset(file_or_path=path, which_sets=['test'])
     self.test_labels = H5PYDataset(
         file_or_path=path,
         which_sets=['test'],
         sources=['targets'],
         load_in_memory=True).data_sources[0].ravel()
     self.train_handle = self.train.open()
     self.test_handle = self.test.open()
     self.ntest = self.test.num_examples
     self.ntrain = self.train.num_examples
Example No. 15
def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True,
                         label_transforms=False, return_length=False):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(('features',))

    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))

    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream,
                                         chance=0,
                                         which_sources=('targets',))

        datastream = RandomLabelStrip(datastream,
                                      chance=0,
                                      which_sources=('targets',))

        # HACK: allow variable stretch
        datastream = StretchLabels(datastream,
                                   length=128,
                                   which_sources=('targets',))

    it = datastream.get_epoch_iterator()
    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it
Example No. 16
def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True):
    """Get iterator for dataset, split, targets (labels) and scaling (from 255 to 1.0)"""
    sources = []
    sources = sources + ['features'] if include_features else sources
    sources = sources + ['targets'] if include_targets else sources
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)

    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    datastream = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        datastream.default_transformers = uint8_pixels_to_floatX(('features',))

    train_stream = DataStream.default_stream(
        dataset=datastream,
        iteration_scheme=SequentialExampleScheme(datastream.num_examples))

    it = train_stream.get_epoch_iterator()
    return it
Example No. 17
def test_read_metadata_mat_file():
    skip_if_not_available(datasets=[DEVKIT_ARCHIVE])
    with tarfile.open(find_in_data_path(DEVKIT_ARCHIVE)) as tar:
        meta_mat = tar.extractfile(DEVKIT_META_PATH)
        synsets, cost_mat = read_metadata_mat_file(meta_mat)
    assert (synsets['ILSVRC2010_ID'] ==
            numpy.arange(1, len(synsets) + 1)).all()
    assert synsets['num_train_images'][1000:].sum() == 0
    assert (synsets['num_train_images'][:1000] > 0).all()
    assert synsets.ndim == 1
    assert synsets['wordnet_height'].min() == 0
    assert synsets['wordnet_height'].max() == 19
    assert synsets['WNID'].dtype == numpy.dtype('S9')
    assert (synsets['num_children'][:1000] == 0).all()
    assert (synsets['children'][:1000] == -1).all()

    # Assert the basics about the cost matrix.
    assert cost_mat.shape == (1000, 1000)
    assert cost_mat.dtype == 'uint8'
    assert cost_mat.min() == 0
    assert cost_mat.max() == 18
    assert (cost_mat == cost_mat.T).all()
    # Assert that the diagonal is 0.
    assert (cost_mat.flat[::1001] == 0).all()
Example No. 18
    def read_frame(cls, key, *args, **kwargs):
        """Load a set of features from the dataset as a pandas object.

        Args:
            key (str):
                The HDF5 key for required data.  Typically, this will be one of

                - structure: for the raw molecules
                - smiles: for the smiles
                - features/{feat_name}: for the features
                - targets/{targ_name}: for the targets

        Returns:
            pd.Series or pd.DataFrame or pd.Panel
                The data as a dataframe.
        """

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            data = pd.read_hdf(find_in_data_path(cls.filename), key, *args, **kwargs)
        if isinstance(data, pd.Panel):
            data = data.transpose(2, 1, 0)
        return data
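A hedged usage sketch; 'MoleculeDataset' and the feature key 'features/morgan' are hypothetical placeholders for whichever concrete class defines read_frame and whatever feature groups its HDF5 file contains.

# Hypothetical class and key names, for illustration only.
smiles = MoleculeDataset.read_frame('smiles')
features = MoleculeDataset.read_frame('features/morgan')
print(smiles.head(), features.shape)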
Example No. 19
def get_data(data_name):
    if data_name == 'mnist_transform':
        from fuel.datasets import H5PYDataset
        from fuel.utils import find_in_data_path
        img_size = (60, 60)
        channels = 1
        f_name = find_in_data_path('mnist_transform.hdf5')
        data_train = H5PYDataset(f_name, which_sets=['train'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_valid = H5PYDataset(f_name, which_sets=['valid'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_test = H5PYDataset(f_name, which_sets=['test'],
                                load_in_memory=True,
                                sources=['features', 'targets'])

    elif data_name == 'mnist_multi_translated':
        from fuel.datasets import H5PYDataset
        from fuel.utils import find_in_data_path
        img_size = (28 * 3, 28 * 3)
        channels = 1
        f_name = find_in_data_path('multi_mnist_translated.hdf5')

        data_train = H5PYDataset(f_name, which_sets=['train'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_valid = H5PYDataset(f_name, which_sets=['valid'],
                                 load_in_memory=True,
                                 sources=['features', 'targets'])
        data_test = H5PYDataset(f_name, which_sets=['test'],
                                load_in_memory=True,
                                sources=['features', 'targets'])

    elif data_name == 'mnist':
        from fuel.datasets import MNIST
        img_size = (28, 28)
        channels = 1
        data_train = MNIST(which_sets=["train"],
                           sources=['features', 'targets'])
        data_valid = MNIST(which_sets=["test"],
                           sources=['features', 'targets'])
        data_test = MNIST(which_sets=["test"],
                          sources=['features', 'targets'])
    elif data_name == 'bmnist':
        from fuel.datasets.binarized_mnist import BinarizedMNIST
        img_size = (28, 28)
        channels = 1
        data_train = BinarizedMNIST(which_sets=['train'],
                                    sources=['features'])
        data_valid = BinarizedMNIST(which_sets=['valid'],
                                    sources=['features'])
        data_test = BinarizedMNIST(which_sets=['test'],
                                   sources=['features'])
    # TODO: make a generic catch-all for
    # loading custom datasets like "colormnist"
    elif data_name == 'colormnist':
        from draw.colormnist import ColorMNIST
        img_size = (28, 28)
        channels = 3
        data_train = ColorMNIST(which_sets=['train'], sources=['features'])
        data_valid = ColorMNIST(which_sets=['test'], sources=['features'])
        data_test = ColorMNIST(which_sets=['test'], sources=['features'])
    elif data_name == 'cifar10':
        from fuel.datasets.cifar10 import CIFAR10
        img_size = (32, 32)
        channels = 3
        data_train = CIFAR10(which_sets=['train'], sources=['features'])
        data_valid = CIFAR10(which_sets=['test'], sources=['features'])
        data_test = CIFAR10(which_sets=['test'], sources=['features'])
    elif data_name == 'svhn2':
        from fuel.datasets.svhn import SVHN
        img_size = (32, 32)
        channels = 3
        data_train = SVHN(which_format=2,
                          which_sets=['train'], sources=['features'])
        data_valid = SVHN(which_format=2,
                          which_sets=['test'], sources=['features'])
        data_test = SVHN(which_format=2,
                         which_sets=['test'], sources=['features'])
    elif data_name == 'silhouettes':
        from fuel.datasets.caltech101_silhouettes import CalTech101Silhouettes
        size = 28
        img_size = (size, size)
        channels = 1
        data_train = CalTech101Silhouettes(which_sets=['train'],
                                           size=size, sources=['features'])
        data_valid = CalTech101Silhouettes(which_sets=['valid'],
                                           size=size, sources=['features'])
        data_test = CalTech101Silhouettes(which_sets=['test'],
                                          size=size, sources=['features'])
    elif data_name == 'tfd':
        from fuel.datasets.toronto_face_database import TorontoFaceDatabase
        img_size = (28, 28)
        channels = 1
        # `size` was previously undefined in this branch (it leaked from the
        # silhouettes branch above); assume it should match img_size here.
        size = img_size[0]
        data_train = TorontoFaceDatabase(which_sets=['unlabeled'],
                                         size=size, sources=['features'])
        data_valid = TorontoFaceDatabase(which_sets=['valid'],
                                         size=size, sources=['features'])
        data_test = TorontoFaceDatabase(which_sets=['test'],
                                        size=size, sources=['features'])
    else:
        raise ValueError("Unknown dataset %s" % data_name)

    return img_size, channels, data_train, data_valid, data_test
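A hedged sketch of consuming the tuple returned above and turning the training set into an example-wise stream; it assumes the chosen dataset files are already on the Fuel data path.

from fuel.schemes import SequentialExampleScheme
from fuel.streams import DataStream

img_size, channels, data_train, data_valid, data_test = get_data('mnist')
stream = DataStream.default_stream(
    data_train,
    iteration_scheme=SequentialExampleScheme(data_train.num_examples))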
Example No. 20
 def available_sets(cls):
     with h5py.File(find_in_data_path(cls.filename)) as f:
         return cls.get_all_splits(f)
Example No. 21
 def __init__(self, which_sets, **kwargs):
     kwargs.setdefault('load_in_memory', True)
     super(SubredditTopPhotosFeatures22, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets,
         **kwargs)
Example No. 22
 def __init__(self, which_sets, **kwargs):
     kwargs.setdefault("load_in_memory", False)
     super(SVHN17, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets, **kwargs)
Example No. 23
def setup_data(p, test_set=False):
    dataset_class = {
        'cifar10': (CIFAR10),
        'jos': (JOS),
        'mnist': (MNIST),
    }[p.dataset]

    training_set_size = p.unlabeled_samples

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(["train"])

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permutate them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    fn = find_in_data_path(train_set.filename)
    #iprint(fn)
    s1 = H5PYDataset(fn, ("train", ))
    handle = s1.open()
    in_dim = s1.get_data(handle, slice(0, 1))[0].shape[1:]
    s1.close(handle)
    #in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=list(i))[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm
Example No. 24
 def __init__(self, which_sets, **kwargs):
     super(MNISTCluttered, self).__init__(
         file_or_path=find_in_data_path('mnist_cluttered.hdf5'),
         which_sets=which_sets, **kwargs)
Example No. 25
 def data_path(self):
     return find_in_data_path(self.filename)
Example No. 26
 def __init__(self, which_sets, **kwargs):
     kwargs.setdefault('load_in_memory', True)
     super(MNIST, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets, **kwargs)
Example No. 27
 def test_returns_file_path(self):
     assert_equal(find_in_data_path('file_2.txt'),
                  os.path.join(self.tempdir, 'dir2', 'file_2.txt'))
Example No. 28
File: celeba.py  Project: Afrik/fuel
 def __init__(self, which_format, which_sets, **kwargs):
     self.which_format = which_format
     super(CelebA, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets, **kwargs)
Example No. 29
 def __init__(self, which_sets, **kwargs):
     super(MNISTCluttered, self).__init__(
         file_or_path=find_in_data_path('mnist_cluttered.hdf5'),
         which_sets=which_sets,
         **kwargs)
Example No. 30
 def test_returns_first_file_found(self):
     assert_equal(find_in_data_path('file_1.txt'),
                  os.path.join(self.tempdir, 'dir1', 'file_1.txt'))
Example No. 31
 def __init__(self, which_sets, load_in_memory=True, **kwargs):
     super(BinarizedMNIST,
           self).__init__(file_or_path=find_in_data_path(self.filename),
                          which_sets=which_sets,
                          load_in_memory=load_in_memory,
                          **kwargs)
Example No. 32
 def __init__(self, youtube_id, **kwargs):
     super(YouTubeAudio, self).__init__(
         file_or_path=find_in_data_path('{}.hdf5'.format(youtube_id)),
         which_sets=('train',), **kwargs
     )
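A brief usage sketch, following the pattern in the upstream Fuel documentation; the video id is a placeholder and the corresponding '<id>.hdf5' file must already be on the data path.

data = YouTubeAudio('XqaJ2Ol5cC4')  # placeholder YouTube id
stream = data.get_example_stream()
it = stream.get_epoch_iterator()
sequence, = next(it)                # raw audio samples from the 'train' set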
Example No. 33
def create_custom_streams(filename,
                          training_batch_size,
                          monitoring_batch_size,
                          include_targets=False,
                          color_convert=False,
                          allowed=None,
                          stretch=False,
                          split_names=['train', 'valid', 'test']):
    """Creates data streams from fuel hdf5 file.

    Currently features must be 64x64.

    Parameters
    ----------
    filename : string
        basename to hdf5 file for input
    training_batch_size : int
        Batch size for training.
    monitoring_batch_size : int
        Batch size for monitoring.
    include_targets : bool
        If ``True``, use both features and targets. If ``False``, use
        features only.
    color_convert : bool
        If ``True``, input is assumed to be one-channel, and so will
        be transformed to three-channel by duplication.

    Returns
    -------
    rval : tuple of data streams
        Data streams for the main loop, the training set monitor,
        the validation set monitor and the test set monitor.

    """
    sources = ('features', 'targets') if include_targets else ('features', )

    dataset_fname = find_in_data_path(filename + '.hdf5')
    data_train = H5PYDataset(dataset_fname,
                             which_sets=[split_names[0]],
                             sources=sources)
    data_valid = H5PYDataset(dataset_fname,
                             which_sets=[split_names[1]],
                             sources=sources)
    data_test = H5PYDataset(dataset_fname,
                            which_sets=[split_names[2]],
                            sources=sources)
    data_train.default_transformers = uint8_pixels_to_floatX(('features', ))
    data_valid.default_transformers = uint8_pixels_to_floatX(('features', ))
    data_test.default_transformers = uint8_pixels_to_floatX(('features', ))

    results = create_streams(data_train, data_valid, data_test,
                             training_batch_size, monitoring_batch_size)

    if color_convert:
        results = tuple(
            map(lambda s: Colorize(s, which_sources=('features', )), results))

    # wrap labels in stretcher if requested
    if stretch:
        results = tuple(
            map(lambda s: StretchLabels(s, which_sources=('targets', )),
                results))

    # wrap labels in scrubber if not all labels are allowed
    if allowed:
        results = tuple(
            map(
                lambda s: Scrubber(
                    s, allowed=allowed, which_sources=('targets', )), results))

    return results
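A hedged usage sketch for the stream builder above; 'my_images' is a placeholder basename for an HDF5 file on the Fuel data path with train/valid/test splits.

streams = create_custom_streams('my_images',
                                training_batch_size=128,
                                monitoring_batch_size=500,
                                include_targets=True)
main_loop_stream, train_monitor, valid_monitor, test_monitor = streams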
Example No. 34
 def __init__(self, **kwargs):
     kwargs.setdefault('load_in_memory', True)
     super(Dataset, self).__init__(
         file_or_path=find_in_data_path(self.filename), **kwargs)
Example No. 35
 def __init__(self, which_sets, **kwargs):
     super(DogsVsCats, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets, **kwargs)
Example No. 36
 def test_returns_first_file_found(self):
     assert_equal(find_in_data_path('file_1.txt'),
                  os.path.join(self.tempdir, 'dir1', 'file_1.txt'))
Example No. 37
 def __init__(self, which_sets, **kwargs):
     kwargs.setdefault('load_in_memory', False)
     super(ILSVRC2010, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets, **kwargs)
Example No. 38
 def get_filepath(filename=None):
     if filename is None:
         filename = MMImdbDataset.filename
     return find_in_data_path(filename)
Example No. 39
 def dataset_class(which_sets):
     return H5PYDataset(file_or_path=find_in_data_path(fn),
                        which_sets=which_sets,
                        load_in_memory=True)
Example No. 40
 def test_returns_file_path(self):
     assert_equal(find_in_data_path('file_2.txt'),
                  os.path.join(self.tempdir, 'dir2', 'file_2.txt'))
Example No. 41
def create_custom_streams(filename, training_batch_size, monitoring_batch_size,
                          include_targets=False, color_convert=False,
                          allowed=None, stretch=None, random_spread=False,
                          random_label_strip=False, add_label_uncertainty=False,
                          uuid_str=None,
                          split_names=['train', 'valid', 'test']):
    """Creates data streams from fuel hdf5 file.

    Currently features must be 64x64.

    Parameters
    ----------
    filename : string
        basename to hdf5 file for input
    training_batch_size : int
        Batch size for training.
    monitoring_batch_size : int
        Batch size for monitoring.
    include_targets : bool
        If ``True``, use both features and targets. If ``False``, use
        features only.
    color_convert : bool
        If ``True``, input is assumed to be one-channel, and so will
        be transformed to three-channel by duplication.

    Returns
    -------
    rval : tuple of data streams
        Data streams for the main loop, the training set monitor,
        the validation set monitor and the test set monitor.

    """
    sources = ('features', 'targets') if include_targets else ('features',)

    dataset_fname = find_in_data_path(filename+'.hdf5')
    data_train = H5PYDataset(dataset_fname, which_sets=[split_names[0]],
                             sources=sources)
    data_valid = H5PYDataset(dataset_fname, which_sets=[split_names[1]],
                             sources=sources)
    data_test = H5PYDataset(dataset_fname, which_sets=[split_names[2]],
                            sources=sources)
    data_train.default_transformers = uint8_pixels_to_floatX(('features',))
    data_valid.default_transformers = uint8_pixels_to_floatX(('features',))
    data_test.default_transformers = uint8_pixels_to_floatX(('features',))

    results = create_streams(data_train, data_valid, data_test,
                             training_batch_size, monitoring_batch_size)

    if color_convert:
        results = tuple(map(
                    lambda s: Colorize(s, which_sources=('features',)),
                    results))

    if add_label_uncertainty:
        results = tuple(map(
                    lambda s: AddLabelUncertainty(s, chance=add_label_uncertainty,
                                       which_sources=('targets',)),
                    results))

    if random_label_strip:
        results = tuple(map(
                    lambda s: RandomLabelStrip(s, chance=random_label_strip,
                                       which_sources=('targets',)),
                    results))

    # wrap labels in stretcher if requested
    if stretch is not None:
        results = tuple(map(
                    lambda s: StretchLabels(s, which_sources=('targets',), length=stretch),
                    results))

    # wrap labels in scrubber if not all labels are allowed
    if allowed:
        results = tuple(map(
                    lambda s: Scrubber(s, allowed=allowed,
                                       which_sources=('targets',)),
                    results))

    if random_spread:
        results = tuple(map(
                    lambda s: RandomLabelOptionalSpreader(s,
                                       which_sources=('targets',)),
                    results))

    if uuid_str is not None:
        results = tuple(map(
                    lambda s: UUIDStretch(s, uuid_str=uuid_str,
                                       which_sources=('targets',)),
                    results))

    return results
Example No. 42
 def __init__(self, which_sets, load_in_memory=True, **kwargs):
     super(BinarizedMNIST, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets,
         load_in_memory=load_in_memory, **kwargs)
Example No. 43
 def dataset_class(which_sets):
     return H5PYDataset(file_or_path=find_in_data_path(fn),
                        which_sets=which_sets,
                        load_in_memory=True)
Example No. 44
 def __init__(self, which_sets, **kwargs):
     kwargs.setdefault('load_in_memory', False)
     super(TinyILSVRC2012, self).__init__(
         file_or_path=find_in_data_path(self.filename),
         which_sets=which_sets, **kwargs)
Example No. 45
 def data_path(self):
     return find_in_data_path(self.filename)
Example No. 46
 def __init__(self, which_format, which_sets, **kwargs):
     self.which_format = which_format
     super(CelebA,
           self).__init__(file_or_path=find_in_data_path(self.filename),
                          which_sets=which_sets,
                          **kwargs)
Example No. 47
 def __init__(self, which_sets, **kwargs):
     super(DogsVsCats,
           self).__init__(file_or_path=find_in_data_path(self.filename),
                          which_sets=which_sets,
                          **kwargs)
Example No. 48
 def __init__(self, which_sets, **kwargs):
     kwargs.setdefault('load_in_memory', True)
     super(LFW,
           self).__init__(file_or_path=find_in_data_path(self.filename),
                          which_sets=which_sets,
                          **kwargs)
Example No. 49
File: run.py  Project: josvr/ladder
def setup_data(p, test_set=False):
    dataset_class = {
        'cifar10': (CIFAR10),
        'jos': (JOS),
        'mnist': (MNIST),
    }[p.dataset]

    training_set_size = p.unlabeled_samples

    # Allow overriding the default from command line
    if p.get('unlabeled_samples') is not None:
        training_set_size = p.unlabeled_samples

    train_set = dataset_class(["train"])

    # Make sure the MNIST data is in right format
    if p.dataset == 'mnist':
        d = train_set.data_sources[train_set.sources.index('features')]
        assert numpy.all(d <= 1.0) and numpy.all(d >= 0.0), \
            'Make sure data is in float format and in range 0 to 1'

    # Take all indices and permutate them
    all_ind = numpy.arange(train_set.num_examples)
    if p.get('dseed'):
        rng = numpy.random.RandomState(seed=p.dseed)
        rng.shuffle(all_ind)

    d = AttributeDict()

    # Choose the training set
    d.train = train_set
    d.train_ind = all_ind[:training_set_size]

    # Then choose validation set from the remaining indices
    d.valid = train_set
    d.valid_ind = numpy.setdiff1d(all_ind, d.train_ind)[:p.valid_set_size]
    logger.info('Using %d examples for validation' % len(d.valid_ind))

    # Only touch test data if requested
    if test_set:
        d.test = dataset_class(["test"])
        d.test_ind = numpy.arange(d.test.num_examples)

    # Setup optional whitening, only used for Cifar-10
    fn = find_in_data_path(train_set.filename)
    #iprint(fn)
    s1 = H5PYDataset(fn, ("train",))
    handle = s1.open()
    in_dim = s1.get_data(handle, slice(0, 1))[0].shape[1:]
    s1.close(handle)
    #in_dim = train_set.data_sources[train_set.sources.index('features')].shape[1:]
    if len(in_dim) > 1 and p.whiten_zca > 0:
        assert numpy.product(in_dim) == p.whiten_zca, \
            'Need %d whitening dimensions, not %d' % (numpy.product(in_dim),
                                                      p.whiten_zca)
    cnorm = ContrastNorm(p.contrast_norm) if p.contrast_norm != 0 else None

    def get_data(d, i):
        data = d.get_data(request=list(i))[d.sources.index('features')]
        # Fuel provides Cifar in uint8, convert to float32
        data = numpy.require(data, dtype=numpy.float32)
        return data if cnorm is None else cnorm.apply(data)

    if p.whiten_zca > 0:
        logger.info('Whitening using %d ZCA components' % p.whiten_zca)
        whiten = ZCA()
        whiten.fit(p.whiten_zca, get_data(d.train, d.train_ind))
    else:
        whiten = None

    return in_dim, d, whiten, cnorm