Example #1
def save_fuel_dataset(output_filename, train_features, valid_features,
                      test_features):
    """Converts the dataset to HDF5.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    datasource_dir = fuel.config.data_path[0]
    output_path = os.path.join(datasource_dir, output_filename)
    h5file = h5py.File(output_path, mode='w')

    data = (('train', 'features', train_features),
            ('valid', 'features', valid_features),
            ('test', 'features', test_features))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'

    h5file.flush()
    h5file.close()

    return (output_path, )
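
A minimal smoke test for the converter above (not part of the original snippet); it assumes the same imports plus numpy, and that fuel.config.data_path points at a writable directory. The array shapes are hypothetical.

import numpy

train = numpy.random.randint(0, 256, size=(8, 3, 32, 32)).astype('uint8')
valid = numpy.random.randint(0, 256, size=(4, 3, 32, 32)).astype('uint8')
test = numpy.random.randint(0, 256, size=(4, 3, 32, 32)).astype('uint8')

# Writes toy.hdf5 into fuel's data_path and returns its full path.
print(save_fuel_dataset('toy.hdf5', train, valid, test))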
Example #2
def convert_iris(directory, output_directory, output_filename='iris.hdf5'):
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    # numpy.loadtxt hands bytes to converters under Python 3, so the
    # class names are keyed as bytes.
    classes = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    data = numpy.loadtxt(
        os.path.join(directory, 'iris.data'),
        converters={4: lambda x: classes[x]},
        delimiter=',')
    numpy.random.shuffle(data)
    features = data[:, :-1].astype('float32')
    targets = data[:, -1].astype('uint8').reshape((-1, 1))
    train_features = features[:100]
    train_targets = targets[:100]
    valid_features = features[100:120]
    valid_targets = targets[100:120]
    test_features = features[120:]
    data = (('train', 'features', train_features),
            ('train', 'targets', train_targets),
            ('valid', 'features', valid_features),
            ('valid', 'targets', valid_targets),
            ('test', 'features', test_features))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #3
def read_in(size=64, limit=10, class_limit=10):
    all_vs = None
    vectors = []
    targets = []
    for i, c in list(zip(label_ids, labels))[:class_limit]:
        print(c)
        for j, fname in enumerate(os.listdir('food-101/images/' + c)[:limit]):
            infile = os.path.join('food-101/images', c, fname)
            outfname = re.sub('jpe?g$', 'png', fname)
            outfile = 'food-101-{0}x{0}/{1}_{2}'.format(size, c, outfname)
            try:
                im = Image.open(infile)
                im = ImageOps.fit(im, (size, size), Image.ANTIALIAS)
                if j < 10:
                    # Save the first 10 images as samples
                    im.save(outfile, "png")
                im = np.array(im, dtype=np.uint8)
                if len(im.shape) != 3:
                    continue
                imarr = im.transpose([2, 0, 1])
                if imarr.shape != (3, size, size):
                    continue
                targets.append(i)
                vectors.append(imarr)
                if len(vectors) >= 10000:
                    if all_vs is None:
                        all_vs = np.stack(vectors, 0)
                    else:
                        vs = np.stack(vectors, 0)
                        all_vs = np.concatenate([all_vs, vs], 0)
                    vectors = []
            except IOError:
                print("cannot create thumbnail for '%s'" % infile)

    vectors = np.stack(vectors, 0)
    if all_vs is not None:
        vectors = np.concatenate([all_vs, vectors], 0)
    targets = np.array(targets).reshape(-1, 1)
    assert len(vectors) == len(targets)

    data = (('train', 'features', vectors), ('train', 'targets', targets))

    h5file = h5py.File('food-101-{0}x{0}.hdf5'.format(size), mode='w')
    try:
        fill_hdf5_file(h5file, data)

        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'channel'
        h5file['features'].dims[2].label = 'height'
        h5file['features'].dims[3].label = 'width'
        h5file['targets'].dims[0].label = 'batch'
        h5file['targets'].dims[1].label = 'index'

        h5file.flush()
    finally:
        h5file.close()

    print('Done!')
    return vectors, targets
Example #4
def test_dtype(self):
    fill_hdf5_file(self.h5file,
                   (('train', 'features', self.train_features),
                    ('train', 'targets', self.train_targets),
                    ('test', 'features', self.test_features),
                    ('test', 'targets', self.test_targets)))
    assert_equal(str(self.h5file['features'].dtype), 'uint8')
    assert_equal(str(self.h5file['targets'].dtype), 'float32')
Example #5
def convert_binarized_mnist(directory, output_directory,
                            output_filename='binarized_mnist.hdf5'):
    """Converts the binarized MNIST dataset to HDF5.

    Converts the binarized MNIST dataset used in R. Salakhutdinov's DBN
    paper [DBN] to an HDF5 dataset compatible with
    :class:`fuel.datasets.BinarizedMNIST`. The converted dataset is
    saved as 'binarized_mnist.hdf5'.

    This method assumes the existence of the files
    `binarized_mnist_{train,valid,test}.amat`, which are accessible
    through Hugo Larochelle's website [HUGO].

    .. [DBN] Ruslan Salakhutdinov and Iain Murray, *On the Quantitative
       Analysis of Deep Belief Networks*, Proceedings of the 25th
       international conference on Machine learning, 2008, pp. 872-879.

    .. [HUGO] http://www.cs.toronto.edu/~larocheh/public/datasets/
       binarized_mnist/binarized_mnist_{train,valid,test}.amat

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'binarized_mnist.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')

    train_set = numpy.loadtxt(
        os.path.join(directory, TRAIN_FILE)).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    valid_set = numpy.loadtxt(
        os.path.join(directory, VALID_FILE)).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    test_set = numpy.loadtxt(
        os.path.join(directory, TEST_FILE)).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    data = (('train', 'features', train_set),
            ('valid', 'features', valid_set),
            ('test', 'features', test_set))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label

    h5file.flush()
    h5file.close()

    return (output_path,)
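
A hypothetical invocation of the converter above, assuming the three .amat files have already been downloaded into ./raw:

# Produces ./converted/binarized_mnist.hdf5 and returns its path.
output_paths = convert_binarized_mnist('./raw', './converted')
print(output_paths)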
Example #6
def convert_svhn_format_2(directory,
                          output_directory,
                          output_filename='svhn_format_2.hdf5'):
    """Converts the SVHN dataset (format 2) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}_32x32.mat`, which are accessible through the
    official website [SVHNSITE].

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_2.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')

    train_set = loadmat(os.path.join(directory, FORMAT_2_TRAIN_FILE))
    train_features = train_set['X'].transpose(3, 2, 0, 1)
    train_targets = train_set['y']
    train_targets[train_targets == 10] = 0

    test_set = loadmat(os.path.join(directory, FORMAT_2_TEST_FILE))
    test_features = test_set['X'].transpose(3, 2, 0, 1)
    test_targets = test_set['y']
    test_targets[test_targets == 10] = 0

    extra_set = loadmat(os.path.join(directory, FORMAT_2_EXTRA_FILE))
    extra_features = extra_set['X'].transpose(3, 2, 0, 1)
    extra_targets = extra_set['y']
    extra_targets[extra_targets == 10] = 0

    data = (('train', 'features', train_features),
            ('test', 'features', test_features),
            ('extra', 'features', extra_features),
            ('train', 'targets', train_targets),
            ('test', 'targets', test_targets),
            ('extra', 'targets', extra_targets))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['targets'].dims[i].label = label

    h5file.flush()
    h5file.close()

    return (output_path, )
Example #7
def test_dtype(self):
    fill_hdf5_file(
        self.h5file,
        (('train', 'features', self.train_features),
         ('train', 'targets', self.train_targets),
         ('test', 'features', self.test_features),
         ('test', 'targets', self.test_targets)))
    assert_equal(str(self.h5file['features'].dtype), 'uint8')
    assert_equal(str(self.h5file['targets'].dtype), 'float32')
Example #8
def convert_svhn_format_2(directory, output_directory,
                          output_filename='svhn_format_2.hdf5'):
    """Converts the SVHN dataset (format 2) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}_32x32.mat`, which are accessible through the
    official website [SVHNSITE].

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_2.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')

    train_set = loadmat(os.path.join(directory, FORMAT_2_TRAIN_FILE))
    train_features = train_set['X'].transpose(3, 2, 0, 1)
    train_targets = train_set['y']
    train_targets[train_targets == 10] = 0

    test_set = loadmat(os.path.join(directory, FORMAT_2_TEST_FILE))
    test_features = test_set['X'].transpose(3, 2, 0, 1)
    test_targets = test_set['y']
    test_targets[test_targets == 10] = 0

    extra_set = loadmat(os.path.join(directory, FORMAT_2_EXTRA_FILE))
    extra_features = extra_set['X'].transpose(3, 2, 0, 1)
    extra_targets = extra_set['y']
    extra_targets[extra_targets == 10] = 0

    data = (('train', 'features', train_features),
            ('test', 'features', test_features),
            ('extra', 'features', extra_features),
            ('train', 'targets', train_targets),
            ('test', 'targets', test_targets),
            ('extra', 'targets', extra_targets))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['targets'].dims[i].label = label

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #9
def convert_mnist(directory, output_file, dtype=None):
    """Converts the MNIST dataset to HDF5.

    Converts the MNIST dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.MNIST`. The converted dataset is
    saved as 'mnist.hdf5'.

    It assumes the existence of the following files:

    * `train-images-idx3-ubyte.gz`
    * `train-labels-idx1-ubyte.gz`
    * `t10k-images-idx3-ubyte.gz`
    * `t10k-labels-idx1-ubyte.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_file : str
        Where to save the converted dataset.
    dtype : str, optional
        Either 'float32', 'float64', or 'bool'. Defaults to `None`,
        in which case images will be returned in their original
        unsigned byte format.

    """
    h5file = h5py.File(output_file, mode="w")

    train_feat_path = os.path.join(directory, TRAIN_IMAGES)
    train_features = read_mnist_images(train_feat_path, dtype)
    train_lab_path = os.path.join(directory, TRAIN_LABELS)
    train_labels = read_mnist_labels(train_lab_path)
    test_feat_path = os.path.join(directory, TEST_IMAGES)
    test_features = read_mnist_images(test_feat_path, dtype)
    test_lab_path = os.path.join(directory, TEST_LABELS)
    test_labels = read_mnist_labels(test_lab_path)
    data = (
        ("train", "features", train_features),
        ("train", "targets", train_labels),
        ("test", "features", test_features),
        ("test", "targets", test_labels),
    )
    fill_hdf5_file(h5file, data)
    h5file["features"].dims[0].label = "batch"
    h5file["features"].dims[1].label = "channel"
    h5file["features"].dims[2].label = "height"
    h5file["features"].dims[3].label = "width"
    h5file["targets"].dims[0].label = "batch"
    h5file["targets"].dims[1].label = "index"

    h5file.flush()
    h5file.close()
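
A hedged usage sketch for the converter above; the ./raw directory and the module-level TRAIN_IMAGES/TEST_IMAGES constants naming the four .gz files are assumptions. Reading the file back shows the axis labels attached to each dimension.

import h5py

convert_mnist('./raw', 'mnist.hdf5')
with h5py.File('mnist.hdf5', 'r') as f:
    print(f['features'].shape)  # e.g. (70000, 1, 28, 28)
    print([dim.label for dim in f['features'].dims])
    # ['batch', 'channel', 'height', 'width']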
Example #10
def test_data(self):
    fill_hdf5_file(self.h5file,
                   (('train', 'features', self.train_features, '.'),
                    ('train', 'targets', self.train_targets),
                    ('test', 'features', self.test_features),
                    ('test', 'targets', self.test_targets)))
    assert_equal(self.h5file['features'],
                 numpy.vstack([self.train_features, self.test_features]))
    assert_equal(self.h5file['targets'],
                 numpy.vstack([self.train_targets, self.test_targets]))
Example #11
def convert_silhouettes(size,
                        directory,
                        output_directory,
                        output_filename=None):
    """ Convert the CalTech 101 Silhouettes Datasets.

    Parameters
    ----------
    size : {16, 28}
        Convert either the 16x16 or 28x28 sized version of the dataset.
    directory : str
        Directory in which the required input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to
        'caltech101_silhouettes{size}.hdf5'.

    """
    if size not in (16, 28):
        raise ValueError('size must be 16 or 28')

    if output_filename is None:
        output_filename = 'caltech101_silhouettes{}.hdf5'.format(size)
    output_file = os.path.join(output_directory, output_filename)

    input_file = 'caltech101_silhouettes_{}_split1.mat'.format(size)
    input_file = os.path.join(directory, input_file)

    if not os.path.isfile(input_file):
        raise MissingInputFiles('Required files missing', [input_file])

    with h5py.File(output_file, mode="w") as h5file:
        mat = loadmat(input_file)

        train_features = mat['train_data'].reshape([-1, 1, size, size])
        train_targets = mat['train_labels']
        valid_features = mat['val_data'].reshape([-1, 1, size, size])
        valid_targets = mat['val_labels']
        test_features = mat['test_data'].reshape([-1, 1, size, size])
        test_targets = mat['test_labels']

        data = (
            ('train', 'features', train_features),
            ('train', 'targets', train_targets),
            ('valid', 'features', valid_features),
            ('valid', 'targets', valid_targets),
            ('test', 'features', test_features),
            ('test', 'targets', test_targets),
        )
        fill_hdf5_file(h5file, data)

        for i, label in enumerate(('batch', 'channel', 'height', 'width')):
            h5file['features'].dims[i].label = label

        for i, label in enumerate(('batch', 'index')):
            h5file['targets'].dims[i].label = label
    return (output_file, )
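
A hypothetical invocation, assuming caltech101_silhouettes_28_split1.mat sits in ./raw:

# Writes ./converted/caltech101_silhouettes28.hdf5 and returns its path.
print(convert_silhouettes(28, './raw', './converted'))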
Example #12
def preprocess_svhn(main_loop, save_path):
    h5file = h5py.File(save_path, mode='w')

    ali, = Selector(main_loop.model.top_bricks).select('/ali').bricks
    x = tensor.tensor4('features')
    y = tensor.imatrix('targets')
    params = ali.encoder.apply(x)
    mu = params[:, :ali.encoder._nlat]
    acts = []
    acts += [mu]
    acts += VariableFilter(bricks=[
        ali.encoder.layers[-9], ali.encoder.layers[-6], ali.encoder.layers[-3]
    ],
                           roles=[OUTPUT])(ComputationGraph([mu]).variables)
    output = tensor.concatenate([act.flatten(ndim=2) for act in acts], axis=1)
    preprocess = theano.function([x, y], [output.flatten(ndim=2), y])

    train_set = SVHN(2,
                     which_sets=('train', ),
                     sources=('features', 'targets'))
    train_stream = DataStream.default_stream(train_set,
                                             iteration_scheme=SequentialScheme(
                                                 train_set.num_examples, 100))
    train_features, train_targets = map(
        numpy.vstack,
        list(
            zip(*[
                preprocess(*batch)
                for batch in train_stream.get_epoch_iterator()
            ])))

    test_set = SVHN(2, which_sets=('test', ), sources=('features', 'targets'))
    test_stream = DataStream.default_stream(test_set,
                                            iteration_scheme=SequentialScheme(
                                                test_set.num_examples, 100))
    test_features, test_targets = map(
        numpy.vstack,
        list(
            zip(*[
                preprocess(*batch)
                for batch in test_stream.get_epoch_iterator()
            ])))

    data = (('train', 'features', train_features),
            ('test', 'features', test_features),
            ('train', 'targets', train_targets),
            ('test', 'targets', test_targets))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'feature')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['targets'].dims[i].label = label

    h5file.flush()
    h5file.close()
Example #13
def convert_mnist(directory, output_file, dtype=None):
    """Converts the MNIST dataset to HDF5.

    Converts the MNIST dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.MNIST`. The converted dataset is
    saved as 'mnist.hdf5'.

    It assumes the existence of the following files:

    * `train-images-idx3-ubyte.gz`
    * `train-labels-idx1-ubyte.gz`
    * `t10k-images-idx3-ubyte.gz`
    * `t10k-labels-idx1-ubyte.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_file : str
        Where to save the converted dataset.
    dtype : str, optional
        Either 'float32', 'float64', or 'bool'. Defaults to `None`,
        in which case images will be returned in their original
        unsigned byte format.

    """
    h5file = h5py.File(output_file, mode='w')

    train_feat_path = os.path.join(directory, TRAIN_IMAGES)
    train_features = read_mnist_images(train_feat_path, dtype)
    train_lab_path = os.path.join(directory, TRAIN_LABELS)
    train_labels = read_mnist_labels(train_lab_path)
    test_feat_path = os.path.join(directory, TEST_IMAGES)
    test_features = read_mnist_images(test_feat_path, dtype)
    test_lab_path = os.path.join(directory, TEST_LABELS)
    test_labels = read_mnist_labels(test_lab_path)
    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()
Example #14
def convert_lfw(directory, basename, output_directory):
    tgz_filename = "{}.tgz".format(basename)
    tar_filename = "{}.tar".format(basename)
    output_filename = "{}.hdf5".format(basename)
    tar_subdir = "lfw_funneled" if basename == "lfw-funneled" else basename

    # it will be faster to decompress this tar file all at once
    print("--> Converting {} to tar".format(tgz_filename))
    with gzip.open(tgz_filename, 'rb') as f_in, open(tar_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    tar = tarfile.open(tar_filename)

    print("--> Building test/train lists")
    # build lists, discarding the header row
    with open('pairsDevTrain.txt', 'r') as csvfile:
        trainrows = list(csv.reader(csvfile, delimiter='\t'))[1:]
    with open('pairsDevTest.txt', 'r') as csvfile:
        testrows = list(csv.reader(csvfile, delimiter='\t'))[1:]

    print("--> Converting")
    # extract all images in set
    train_images = load_images("train", tar, tar_subdir, trainrows)
    test_images  = load_images("test",  tar, tar_subdir, testrows)

    # np.array(map(...)) yields a 0-d object array under Python 3;
    # build the lists explicitly instead.
    train_labels = np.array([loadLabelsFromRow(r) for r in trainrows])
    test_labels = np.array([loadLabelsFromRow(r) for r in testrows])

    train_features = np.array([[f[0,:,:,0], f[0,:,:,1], f[0,:,:,2], f[1,:,:,0], f[1,:,:,1], f[1,:,:,2]] for f in train_images])
    test_features  = np.array([[f[0,:,:,0], f[0,:,:,1], f[0,:,:,2], f[1,:,:,0], f[1,:,:,1], f[1,:,:,2]] for f in test_images])

    train_targets = np.array([[n] for n in train_labels])
    test_targets  = np.array([[n] for n in test_labels])

    print("train shapes: ", train_features.shape, train_targets.shape)
    print("test shapes:  ", test_features.shape, test_targets.shape)
    
    print("--> Writing hdf5 output file")
    output_path = os.path.join(output_directory, output_filename)
    with h5py.File(output_path, mode="w") as h5file:
        data = (('train', 'features', train_features),
                ('train', 'targets', train_targets),
                ('test', 'features', test_features),
                ('test', 'targets', test_targets))
        fill_hdf5_file(h5file, data)

        for i, label in enumerate(('batch', 'channel', 'height', 'width')):
            h5file['features'].dims[i].label = label

        for i, label in enumerate(('batch', 'index')):
            h5file['targets'].dims[i].label = label

    print("--> Done, removing tar file")
    os.remove(tar_filename)
    return (output_path,)
Example #15
def test_data(self):
    fill_hdf5_file(
        self.h5file,
        (('train', 'features', self.train_features, '.'),
         ('train', 'targets', self.train_targets),
         ('test', 'features', self.test_features),
         ('test', 'targets', self.test_targets)))
    assert_equal(self.h5file['features'],
                 numpy.vstack([self.train_features, self.test_features]))
    assert_equal(self.h5file['targets'],
                 numpy.vstack([self.train_targets, self.test_targets]))
Example #16
def convert_silhouettes(size, directory, output_directory,
                        output_file=None):
    """ Convert the CalTech 101 Silhouettes Datasets.

    Parameters
    ----------
    size : {16, 28}
        Convert either the 16x16 or 28x28 sized version of the dataset.
    directory : str
        Directory in which the required input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_file : str, optional
        Name of the saved dataset. Defaults to
        'caltech101_silhouettes{size}.hdf5'.

    """
    if size not in (16, 28):
        raise ValueError('size must be 16 or 28')

    if output_file is None:
        output_file = 'caltech101_silhouettes{}.hdf5'.format(size)
    output_file = os.path.join(output_directory, output_file)

    input_file = 'caltech101_silhouettes_{}_split1.mat'.format(size)
    input_file = os.path.join(directory, input_file)

    if not os.path.isfile(input_file):
        raise MissingInputFiles('Required files missing', [input_file])

    with h5py.File(output_file, mode="w") as h5file:
        mat = loadmat(input_file)

        train_features = mat['train_data'].reshape([-1, 1, size, size])
        train_targets = mat['train_labels']
        valid_features = mat['val_data'].reshape([-1, 1, size, size])
        valid_targets = mat['val_labels']
        test_features = mat['test_data'].reshape([-1, 1, size, size])
        test_targets = mat['test_labels']

        data = (
            ('train', 'features', train_features),
            ('train', 'targets', train_targets),
            ('valid', 'features', valid_features),
            ('valid', 'targets', valid_targets),
            ('test', 'features', test_features),
            ('test', 'targets', test_targets),
        )
        fill_hdf5_file(h5file, data)

        for i, label in enumerate(('batch', 'channel', 'height', 'width')):
            h5file['features'].dims[i].label = label

        for i, label in enumerate(('batch', 'index')):
            h5file['targets'].dims[i].label = label
    return (output_file,)
Example #17
def convert_youtube_audio(directory, output_directory, youtube_id, channels,
                          sample, output_filename=None):
    """Converts downloaded YouTube audio to HDF5 format.

    Requires `ffmpeg` to be installed and available on the command line
    (i.e. available on your `PATH`).

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    youtube_id : str
        11-character video ID (taken from YouTube URL)
    channels : int
        The number of audio channels to use in the PCM Wave file.
    sample : int
        The sampling rate to use in Hz, e.g. 44100 or 16000.
    output_filename : str, optional
        Name of the saved dataset. If `None` (the default),
        `youtube_id.hdf5` is used.

    """
    input_file = os.path.join(directory, '{}.m4a'.format(youtube_id))
    wav_filename = '{}.wav'.format(youtube_id)
    wav_file = os.path.join(directory, wav_filename)
    ffmpeg_not_available = subprocess.call(['ffmpeg', '-version'])
    if ffmpeg_not_available:
        raise RuntimeError('conversion requires ffmpeg')
    subprocess.check_call(['ffmpeg', '-y', '-i', input_file, '-ac',
                           str(channels), '-ar', str(sample), wav_file],
                          stdout=sys.stdout)

    # Load WAV into array
    _, data = scipy.io.wavfile.read(wav_file)
    if data.ndim == 1:
        data = data[:, None]
    data = data[None, :]

    # Store in HDF5
    if output_filename is None:
        output_filename = '{}.hdf5'.format(youtube_id)
    output_file = os.path.join(output_directory, output_filename)

    with h5py.File(output_file, 'w') as h5file:
        fill_hdf5_file(h5file, (('train', 'features', data),))
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'time'
        h5file['features'].dims[2].label = 'feature'

    return (output_file,)
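
A hypothetical invocation of the converter above; it assumes <youtube_id>.m4a was downloaded into ./raw and that ffmpeg is on the PATH (the video ID is borrowed from Example #32):

paths = convert_youtube_audio('./raw', './converted', 'XqaJ2Ol5cC4',
                              channels=1, sample=16000)
print(paths)  # ('./converted/XqaJ2Ol5cC4.hdf5',)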
Example #18
def main(path):
    train_features = []
    train_locations = []
    train_labels = []
    test_features = []
    test_locations = []
    test_labels = []
    for f in listdir('images'):
        if isfile(join('images', f)):
            number, label, x, y = f.split('.')[0].split('_')
            location = np.array(
                (0.28, 0, (int(x) + 14.0 - 50.0) / 50.0, 0, 0.28,
                 (int(y) + 14.0 - 50.0) / 50.0),
                ndmin=1,
                dtype=np.float32)
            image = np.array(Image.open(join('images', f)),
                             ndmin=3,
                             dtype=np.uint8)
            label = int(label)
            if int(number) <= 60000:
                train_features.append(image)
                train_locations.append(location)
                train_labels.append(label)
            else:
                test_features.append(image)
                test_locations.append(location)
                test_labels.append(label)

    h5file = h5py.File(path, mode='w')

    data = (
        ('train', 'features', np.array(train_features)),
        ('test', 'features', np.array(test_features)),
        ('train', 'locations', np.array(train_locations)),
        ('test', 'locations', np.array(test_locations)),
        ('train', 'labels', np.array(train_labels, dtype=np.uint8)),
        ('test', 'labels', np.array(test_labels, dtype=np.uint8)),
    )
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['locations'].dims[i].label = label
    for i, label in enumerate(('batch', )):
        h5file['labels'].dims[i].label = label

    h5file.flush()
    h5file.close()

    shutil.rmtree('images')
Example #19
def binarized_mnist(input_directory, save_path):
    """Converts the binarized MNIST dataset to HDF5.

    Converts the binarized MNIST dataset used in R. Salakhutdinov's DBN
    paper [DBN] to an HDF5 dataset compatible with
    :class:`fuel.datasets.BinarizedMNIST`. The converted dataset is
    saved as 'binarized_mnist.hdf5'.

    This method assumes the existence of the files
    `binarized_mnist_{train,valid,test}.amat`, which are accessible
    through Hugo Larochelle's website [HUGO].

    .. [DBN] Ruslan Salakhutdinov and Iain Murray, *On the Quantitative
       Analysis of Deep Belief Networks*, Proceedings of the 25th
       international conference on Machine learning, 2008, pp. 872-879.

    .. [HUGO] http://www.cs.toronto.edu/~larocheh/public/datasets/
       binarized_mnist/binarized_mnist_{train,valid,test}.amat

    Parameters
    ----------
    input_directory : str
        Directory in which the required input files reside.
    save_path : str
        Where to save the converted dataset.

    """
    h5file = h5py.File(save_path, mode="w")
    train_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_train.amat')).reshape(
            (-1, 1, 28, 28))
    valid_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_valid.amat')).reshape(
            (-1, 1, 28, 28))
    test_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_test.amat')).reshape(
            (-1, 1, 28, 28))
    data = ((train_set, valid_set, test_set),)
    source_names = ('features',)
    shapes = ((70000, 1, 28, 28),)
    dtypes = ('uint8',)
    split_names = ('train', 'valid', 'test')
    splits = ((0, 50000), (50000, 60000), (60000, 70000))

    fill_hdf5_file(
        h5file, data, source_names, shapes, dtypes, split_names, splits)

    h5file.flush()
    h5file.close()
Example #20
def mnist(input_directory, save_path, dtype=None):
    """Converts the MNIST dataset to HDF5.

    Converts the MNIST dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.MNIST`. The converted dataset is
    saved as 'mnist.hdf5'.

    This method assumes the existence of the following files:
    `train-images-idx3-ubyte.gz`, `train-labels-idx1-ubyte.gz`
    `t10k-images-idx3-ubyte.gz`, `t10k-labels-idx1-ubyte.gz`

    Parameters
    ----------
    input_directory : str
        Directory in which the required input files reside.
    save_path : str
        Where to save the converted dataset.
    dtype : 'float32', 'float64', or 'bool'
        If unspecified, images will be returned in their original
        unsigned byte format.

    """
    h5file = h5py.File(save_path, mode="w")
    train_feat_path = os.path.join(input_directory,
                                   'train-images-idx3-ubyte.gz')
    train_features = read_mnist_images(train_feat_path, dtype)
    train_lab_path = os.path.join(input_directory,
                                  'train-labels-idx1-ubyte.gz')
    train_labels = read_mnist_labels(train_lab_path)
    test_feat_path = os.path.join(input_directory,
                                  't10k-images-idx3-ubyte.gz')
    test_features = read_mnist_images(test_feat_path, dtype)
    test_lab_path = os.path.join(input_directory,
                                 't10k-labels-idx1-ubyte.gz')
    test_labels = read_mnist_labels(test_lab_path)
    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()
Example #21
def convert_iris(directory, output_directory, output_filename='iris.hdf5'):
    """Convert the Iris dataset to HDF5.

    Converts the Iris dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.Iris`. The converted dataset is
    saved as 'iris.hdf5'.
    This method assumes the existence of the file `iris.data`.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'iris.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    classes = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    data = numpy.loadtxt(
        os.path.join(directory, 'iris.data'),
        converters={4: lambda x: classes[x]},
        delimiter=',')
    features = data[:, :-1].astype('float32')
    targets = data[:, -1].astype('uint8').reshape((-1, 1))
    data = (('all', 'features', features),
            ('all', 'targets', targets))

    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
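
Assuming fuel is installed, the converted file can be read back with fuel's H5PYDataset; which_sets must match the 'all' split name used above. A sketch, not part of the original:

from fuel.datasets.hdf5 import H5PYDataset

iris = H5PYDataset('iris.hdf5', which_sets=('all',), load_in_memory=True)
features, targets = iris.data_sources
print(features.shape, targets.shape)  # (150, 4) (150, 1)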
Example #22
def binarized_mnist(input_directory, save_path):
    """Converts the binarized MNIST dataset to HDF5.

    Converts the binarized MNIST dataset used in R. Salakhutdinov's DBN
    paper [DBN] to an HDF5 dataset compatible with
    :class:`fuel.datasets.BinarizedMNIST`. The converted dataset is
    saved as 'binarized_mnist.hdf5'.

    This method assumes the existence of the files
    `binarized_mnist_{train,valid,test}.amat`, which are accessible
    through Hugo Larochelle's website [HUGO].

    .. [DBN] Ruslan Salakhutdinov and Iain Murray, *On the Quantitative
       Analysis of Deep Belief Networks*, Proceedings of the 25th
       international conference on Machine learning, 2008, pp. 872-879.

    .. [HUGO] http://www.cs.toronto.edu/~larocheh/public/datasets/
       binarized_mnist/binarized_mnist_{train,valid,test}.amat

    Parameters
    ----------
    input_directory : str
        Directory in which the required input files reside.
    save_path : str
        Where to save the converted dataset.

    """
    h5file = h5py.File(save_path, mode="w")
    train_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_train.amat')).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    valid_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_valid.amat')).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    test_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_test.amat')).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    data = (('train', 'features', train_set),
            ('valid', 'features', valid_set),
            ('test', 'features', test_set))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label

    h5file.flush()
    h5file.close()
Example #23
def convert(input_directory, save_path):
    h5file = h5py.File(save_path, 'w')
    split = ()
    split += read_stands(input_directory, h5file)
    split += read_taxis(input_directory, h5file, 'train')
    print('First origin_call not present in training set:',
          len(origin_call_dict))
    split += read_taxis(input_directory, h5file, 'test')
    split += unique(h5file)

    fill_hdf5_file(h5file, split)

    for name in ['stands_name', 'stands_latitude', 'stands_longitude',
                 'unique_taxi_id', 'unique_origin_call']:
        h5file[name].dims[0].label = 'index'
    for name in ['trip_id', 'call_type', 'origin_call', 'origin_stand',
                 'taxi_id', 'timestamp', 'day_type', 'missing_data',
                 'latitude', 'longitude']:
        h5file[name].dims[0].label = 'batch'

    h5file.flush()
    h5file.close()
Example #25
def main(path):
    train_features = []
    train_locations = []
    train_labels = []
    test_features = []
    test_locations = []
    test_labels = []
    for f in listdir('images'):
        if isfile(join('images', f)):
            number, label, x, y = f.split('.')[0].split('_')
            location = np.array((0.28, 0, (int(x) + 14.0 - 50.0) / 50.0, 0, 0.28, (int(y) + 14.0 - 50.0) / 50.0), ndmin=1, dtype=np.float32)
            image = np.array(Image.open(join('images', f)), ndmin=3, dtype=np.uint8)
            label = int(label)
            if int(number) <= 60000:
                train_features.append(image)
                train_locations.append(location)
                train_labels.append(label)
            else:
                test_features.append(image)
                test_locations.append(location)
                test_labels.append(label)

    h5file = h5py.File(path, mode='w')

    data = (
            ('train', 'features', np.array(train_features)),
            ('test', 'features', np.array(test_features)),
            ('train', 'locations', np.array(train_locations)),
            ('test', 'locations', np.array(test_locations)),
            ('train', 'labels', np.array(train_labels, dtype=np.uint8)),
            ('test', 'labels', np.array(test_labels, dtype=np.uint8)),
    )
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['locations'].dims[i].label = label
    for i, label in enumerate(('batch',)):
        h5file['labels'].dims[i].label = label

    h5file.flush()
    h5file.close()

    shutil.rmtree('images')
Example #26
def test_fill_hdf5_file():
    h5file = h5py.File(
        'tmp.hdf5', mode="w", driver='core', backing_store=False)
    train_features = numpy.arange(16, dtype='uint8').reshape((4, 2, 2))
    test_features = numpy.arange(8, dtype='uint8').reshape((2, 2, 2)) + 3
    train_targets = numpy.arange(4, dtype='float32').reshape((4, 1))
    test_targets = numpy.arange(2, dtype='float32').reshape((2, 1)) + 3
    fill_hdf5_file(
        h5file,
        (('train', 'features', train_features),
         ('train', 'targets', train_targets),
         ('test', 'features', test_features),
         ('test', 'targets', test_targets)))
    assert_equal(
        h5file['features'], numpy.vstack([train_features, test_features]))
    assert_equal(
        h5file['targets'], numpy.vstack([train_targets, test_targets]))
    assert h5file['features'].dtype == 'uint8'
    assert h5file['targets'].dtype == 'float32'
    h5file.close()
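
The test above checks data and dtypes; in current fuel versions fill_hdf5_file also records the split boundaries in the file's 'split' attribute (contrast the older per-split attrs API in Example #28). A sketch of inspecting it, under the same imports:

h5file = h5py.File(
    'tmp2.hdf5', mode='w', driver='core', backing_store=False)
features = numpy.arange(4, dtype='uint8').reshape((4, 1))
fill_hdf5_file(h5file, (('train', 'features', features),))
print(h5file.attrs['split'])  # one row per (split, source) pair
h5file.close()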
Example #27
def test_fill_hdf5_file():
    h5file = h5py.File('tmp.hdf5',
                       mode="w",
                       driver='core',
                       backing_store=False)
    train_features = numpy.arange(16, dtype='uint8').reshape((4, 2, 2))
    test_features = numpy.arange(8, dtype='uint8').reshape((2, 2, 2)) + 3
    train_targets = numpy.arange(4, dtype='float32').reshape((4, 1))
    test_targets = numpy.arange(2, dtype='float32').reshape((2, 1)) + 3
    fill_hdf5_file(h5file, (('train', 'features', train_features),
                            ('train', 'targets', train_targets),
                            ('test', 'features', test_features),
                            ('test', 'targets', test_targets)))
    assert_equal(h5file['features'],
                 numpy.vstack([train_features, test_features]))
    assert_equal(h5file['targets'], numpy.vstack([train_targets,
                                                  test_targets]))
    assert h5file['features'].dtype == 'uint8'
    assert h5file['targets'].dtype == 'float32'
    h5file.close()
Example #28
def test_fill_hdf5_file():
    h5file = h5py.File(
        'tmp.hdf5', mode="w", driver='core', backing_store=False)
    train_features = numpy.arange(16, dtype='uint8').reshape((4, 2, 2))
    test_features = numpy.arange(8, dtype='uint8').reshape((2, 2, 2)) + 3
    train_targets = numpy.arange(4, dtype='float32').reshape((4, 1))
    test_targets = numpy.arange(2, dtype='float32').reshape((2, 1)) + 3
    data = ((train_features, test_features), (train_targets, test_targets))
    source_names = ('features', 'targets')
    shapes = ((6, 2, 2), (6, 1))
    dtypes = ('uint8', 'float32')
    split_names = ('train', 'test')
    splits = ((0, 4), (4, 6))
    fill_hdf5_file(
        h5file, data, source_names, shapes, dtypes, split_names, splits)
    assert_equal(h5file.attrs['train'], [0, 4])
    assert_equal(h5file.attrs['test'], [4, 6])
    assert_equal(
        h5file['features'], numpy.vstack([train_features, test_features]))
    assert_equal(
        h5file['targets'], numpy.vstack([train_targets, test_targets]))
    assert h5file['features'].dtype == 'uint8'
    assert h5file['targets'].dtype == 'float32'
    h5file.close()
Example #29
def convert_adult(directory, output_directory, output_filename='adult.hdf5'):
    """
    Convert the Adult dataset to HDF5.

    Converts the Adult dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.Adult`. The converted dataset is saved as
    'adult.hdf5'.
    This method assumes the existence of the file `adult.data` and
    `adult.test`.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to `adult.hdf5`.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    train_path = os.path.join(directory, 'adult.data')
    test_path = os.path.join(directory, 'adult.test')
    output_path = os.path.join(output_directory, output_filename)

    train_content = open(train_path, 'r').readlines()
    test_content = open(test_path, 'r').readlines()
    train_content = train_content[:-1]
    test_content = test_content[1:-1]

    features_list = []
    targets_list = []
    for content in [train_content, test_content]:
        # strip out examples with missing features
        content = [line for line in content if line.find('?') == -1]
        # strip off endlines, separate entries
        content = list(map(lambda l: l[:-1].split(', '), content))

        features = list(map(lambda l: l[:-1], content))
        targets = list(map(lambda l: l[-1], content))
        del content
        y = list(map(lambda l: [l[0] == '>'], targets))
        y = numpy.array(y)
        del targets

        # Process features into a matrix
        variables = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'
        ]
        continuous = set([
            'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
            'hours-per-week'
        ])

        pieces = []
        for i, var in enumerate(variables):
            data = list(map(lambda l: l[i], features))
            if var in continuous:
                data = list(map(lambda l: float(l), data))
                data = numpy.array(data)
                data = data.reshape(data.shape[0], 1)
            else:
                unique_values = list(set(data))
                data = list(map(lambda l: unique_values.index(l), data))
                data = convert_to_one_hot(data)
            pieces.append(data)

        X = numpy.concatenate(pieces, axis=1)

        features_list.append(X)
        targets_list.append(y)

    # The largest value of the last variable in the test set is only 40,
    # so its one-hot representation has 40 columns, while in the training
    # set it has 41. Since it is the last variable, it is safe to simply
    # append a final column of zeros.
    features_list[1] = numpy.concatenate(
        (features_list[1],
         numpy.zeros(
             (features_list[1].shape[0], 1), dtype=features_list[1].dtype)),
        axis=1)
    h5file = h5py.File(output_path, mode='w')
    data = (('train', 'features', features_list[0]),
            ('train', 'targets', targets_list[0]),
            ('test', 'features', features_list[1]),
            ('test', 'targets', targets_list[1]))

    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path, )
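
convert_to_one_hot used above comes from fuel.converters.base in the original module; a minimal stand-in with the behavior the code relies on (non-negative integer codes mapped to rows of an identity matrix) could look like this hypothetical helper:

import numpy

def one_hot(indices):
    # Hypothetical stand-in for convert_to_one_hot: index k becomes
    # row k of the (K x K) identity matrix, K = max index + 1.
    indices = numpy.asarray(indices)
    return numpy.eye(indices.max() + 1, dtype='uint8')[indices]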
Example #30
def convert_cifar10(directory, output_file):
    """Converts the CIFAR-10 dataset to HDF5.

    Converts the CIFAR-10 dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as
    'cifar10.hdf5'.

    It assumes the existence of the following file:

    * `cifar-10-python.tar.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_file : str
        Where to save the converted dataset.

    """
    h5file = h5py.File(output_file, mode='w')
    input_file = os.path.join(directory, DISTRIBUTION_FILE)
    tar_file = tarfile.open(input_file, 'r:gz')

    train_batches = []
    for batch in range(1, 6):
        file = tar_file.extractfile('cifar-10-batches-py/data_batch_%d' %
                                    batch)
        try:
            if six.PY3:
                array = cPickle.load(file, encoding='latin1')
            else:
                array = cPickle.load(file)
            train_batches.append(array)
        finally:
            file.close()

    train_features = numpy.concatenate([
        batch['data'].reshape(batch['data'].shape[0], 3, 32, 32)
        for batch in train_batches
    ])
    train_labels = numpy.concatenate([
        numpy.array(batch['labels'], dtype=numpy.uint8)
        for batch in train_batches
    ])
    train_labels = numpy.expand_dims(train_labels, 1)

    file = tar_file.extractfile('cifar-10-batches-py/test_batch')
    try:
        if six.PY3:
            test = cPickle.load(file, encoding='latin1')
        else:
            test = cPickle.load(file)
    finally:
        file.close()

    test_features = test['data'].reshape(test['data'].shape[0], 3, 32, 32)
    test_labels = numpy.array(test['labels'], dtype=numpy.uint8)
    test_labels = numpy.expand_dims(test_labels, 1)

    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()
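
A hypothetical invocation of the converter above, assuming cifar-10-python.tar.gz is in ./raw and that the module-level DISTRIBUTION_FILE constant names it:

convert_cifar10('./raw', 'cifar10.hdf5')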
Example #31
def main(path, mode):

    # provide empty datasets
    train_features = []
    train_locations = []
    train_labels = []
    test_features = []
    test_locations = []
    test_labels = []

    # open source h5 file as 'f'
    if os.path.isfile(path):
        print("\n[INFO] Opening", path, "\n")
    else:
        print("[ERROR]", path, "does not exist\n")
        exit()

    try:
        f = h5py.File(path, 'r')
    except Exception as e:
        print(e)
        exit()

    # access the data
    X = f["X"]
    Y = f["Y"]
    px = f["px"]
    py = f["py"]

    # change format
    print("[INFO] Start processing data for " + mode + "...\n")
    for i in range(70000):

        # centered location of the digit patch in the image
        location = np.array(
            (0.28, 0, (int(px[i]) + 14.0 - 50.0) / 50.0, 0, 0.28,
             (int(py[i]) + 14.0 - 50.0) / 50.0),
            ndmin=1,
            dtype=np.float32)
        # image and down-scaled (coarse) image
        if mode == 'theano':
            # channel first
            image = np.array(X[i, ...], ndmin=3, dtype=np.uint8)
            image_coarse = np.array(cv2.resize(X[i, ...], (12, 12)),
                                    ndmin=3,
                                    dtype=np.uint8)
        else:
            # channel last
            image = np.array(X[i, ...], ndmin=2, dtype=np.uint8)
            image.shape = image.shape + (1, )
            image_coarse = np.array(cv2.resize(X[i, ...], (12, 12)),
                                    ndmin=2,
                                    dtype=np.uint8)
            image_coarse.shape = image_coarse.shape + (1, )

        # target output
        if mode == 'theano':
            # one-hot to digit label
            j = 0
            while Y[i, j] == 0 and j < 9:
                j += 1
            label = int(j)
        else:
            # one-hot
            label = Y[i, :]

        # first 60,000 examples are training data
        if int(i) < 60000:
            train_features.append(image)
            train_locations.append(location)
            train_labels.append(label)
        else:
            test_features.append(image)
            test_locations.append(location)
            test_labels.append(label)

        if (i + 1) % 1000 == 0:
            print("[INFO] Appended", i + 1, "rows of data")

    # save data
    if mode == 'theano':
        save_path = '/scratch/forch/EDRAM/datasets/mnist_cluttered_test.hdf5'
    elif mode == 'keras':
        save_path = '/scratch/forch/EDRAM/datasets/mnist_cluttered_keras.hdf5'
    else:
        save_path = '/scratch/forch/EDRAM/datasets/mnist_cluttered_' + mode + '.hdf5'

    h5file = h5py.File(save_path, mode='w')

    data = (
        ('train', 'features', np.array(train_features)),
        ('test', 'features', np.array(test_features)),
        ('train', 'locations', np.array(train_locations)),
        ('test', 'locations', np.array(test_locations)),
        ('train', 'labels', np.array(train_labels, dtype=np.uint8)),
        ('test', 'labels', np.array(test_labels, dtype=np.uint8)),
    )
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['locations'].dims[i].label = label
    for i, label in enumerate(('batch', )):
        h5file['labels'].dims[i].label = label

    h5file.flush()
    h5file.close()

    print("\n[INFO] Saved data to", save_path, "\n")
Example #32
import h5py
import scipy.io.wavfile
from fuel.converters.base import fill_hdf5_file


directory  = '/data/lisatmp4/taesup/data/YouTubeAudio/'
youtube_id = 'XqaJ2Ol5cC4'

wav_file = directory+youtube_id+'.wav'
output_file = directory+youtube_id+'_valid.hdf5'

_, data = scipy.io.wavfile.read(wav_file)
if data.ndim == 1:
    data = data[:, None]
data = data[None, :]

num_total  = data.shape[1]
num_trains = 160000000
num_valids = num_total-num_trains
train_data = data[:,0:num_trains, :]
valid_data = data[:,num_trains:, :]

with h5py.File(output_file, 'w') as h5file:
    # NB: only the validation slice is written here, under this
    # file's single 'train' split (the output is the *_valid set).
    fill_data = (('train', 'features', valid_data),)
    print('train_data : ', train_data.shape)
    print('valid_data : ', valid_data.shape)
    fill_hdf5_file(h5file, fill_data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'time'
    h5file['features'].dims[2].label = 'feature'
Example #33
    for ind, f in enumerate(val_features):
        fname = os.path.join(dpth, f + '.fea')
        fi = htkmfc.HTKFeat_read(fname)
        data = fi.getall()[:, :20]
        val_Mask[ind, :data.shape[0]] = 1.0
        pad = maxlen - data.shape[0]
        data = np.vstack((data, np.zeros((pad, 20), dtype='float32')))
        val_Data[ind, :, :] = data

    return Data, Mask, np.asarray(
        labelz, dtype='int32'), val_Data, val_Mask, np.asarray(val_labelz,
                                                               dtype='int32')


Data, Msk, Targets, val_Data, val_Msk, val_tars = load_dataset()

f = h5py.File('dataset.hdf5', mode='w')
data = (('train', 'features', Data), ('train', 'mask', Msk),
        ('train', 'targets', Targets), ('valid', 'features', val_Data),
        ('valid', 'mask', val_Msk), ('valid', 'targets', val_tars))
fill_hdf5_file(f, data)

for i, label in enumerate(('batch', 'maxlen', 'feat_dim')):
    f['features'].dims[i].label = label
for i, label in enumerate(('batch', 'maxlen')):
    f['mask'].dims[i].label = label
for i, label in enumerate(('batch', )):
    f['targets'].dims[i].label = label

f.flush()
f.close()
Example #34
def convert_cifar100(directory, output_directory,
                     output_filename='cifar100.hdf5'):
    """Converts the CIFAR-100 dataset to HDF5.

    Converts the CIFAR-100 dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CIFAR100`. The converted dataset is saved as
    'cifar100.hdf5'.

    This method assumes the existence of the following file:
    `cifar-100-python.tar.gz`

    Parameters
    ----------
    directory : str
        Directory in which the required input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'cifar100.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode="w")
    input_file = os.path.join(directory, 'cifar-100-python.tar.gz')
    tar_file = tarfile.open(input_file, 'r:gz')

    file = tar_file.extractfile('cifar-100-python/train')
    try:
        if six.PY3:
            train = cPickle.load(file, encoding='latin1')
        else:
            train = cPickle.load(file)
    finally:
        file.close()

    train_features = train['data'].reshape(train['data'].shape[0],
                                           3, 32, 32)
    train_coarse_labels = numpy.array(train['coarse_labels'],
                                      dtype=numpy.uint8)
    train_fine_labels = numpy.array(train['fine_labels'],
                                    dtype=numpy.uint8)

    file = tar_file.extractfile('cifar-100-python/test')
    try:
        if six.PY3:
            test = cPickle.load(file, encoding='latin1')
        else:
            test = cPickle.load(file)
    finally:
        file.close()

    test_features = test['data'].reshape(test['data'].shape[0],
                                         3, 32, 32)
    test_coarse_labels = numpy.array(test['coarse_labels'], dtype=numpy.uint8)
    test_fine_labels = numpy.array(test['fine_labels'], dtype=numpy.uint8)

    data = (('train', 'features', train_features),
            ('train', 'coarse_labels', train_coarse_labels.reshape((-1, 1))),
            ('train', 'fine_labels', train_fine_labels.reshape((-1, 1))),
            ('test', 'features', test_features),
            ('test', 'coarse_labels', test_coarse_labels.reshape((-1, 1))),
            ('test', 'fine_labels', test_fine_labels.reshape((-1, 1))))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['coarse_labels'].dims[0].label = 'batch'
    h5file['coarse_labels'].dims[1].label = 'index'
    h5file['fine_labels'].dims[0].label = 'batch'
    h5file['fine_labels'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #35
def convert_cifar10(directory, output_directory,
                    output_filename='cifar10.hdf5'):
    """Converts the CIFAR-10 dataset to HDF5.
    Converts the CIFAR-10 dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as
    'cifar10.hdf5'.
    It assumes the existence of the following file:
    * `cifar-10-python.tar.gz`
    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'cifar10.hdf5'.
    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.
    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    input_file = os.path.join(directory, DISTRIBUTION_FILE)
    tar_file = tarfile.open(input_file, 'r:gz')

    train_batches = []
    for batch in range(1, 6):
        file = tar_file.extractfile(
            'cifar-10-batches-py/data_batch_%d' % batch)
        try:
            if six.PY3:
                array = cPickle.load(file, encoding='latin1')
            else:
                array = cPickle.load(file)
            train_batches.append(array)
        finally:
            file.close()

    train_features = numpy.concatenate(
        [batch['data'].reshape(batch['data'].shape[0], 3, 32, 32)
            for batch in train_batches])
    train_labels = numpy.concatenate(
        [numpy.array(batch['labels'], dtype=numpy.uint8)
            for batch in train_batches])
    train_labels = numpy.expand_dims(train_labels, 1)

    print(train_features.shape)
    print(train_labels.shape)

    # Augment the training set with horizontally flipped copies,
    # interleaving each original image with its mirrored version.
    flipped_train_features = train_features[:, :, :, ::-1]

    train_features = numpy.array(
        [val for pair in zip(train_features, flipped_train_features)
         for val in pair])
    train_labels = numpy.repeat(train_labels, 2, axis=0)

    print(train_features.shape)
    print(train_labels.shape)

    file = tar_file.extractfile('cifar-10-batches-py/test_batch')
    try:
        if six.PY3:
            test = cPickle.load(file, encoding='latin1')
        else:
            test = cPickle.load(file)
    finally:
        file.close()

    test_features = test['data'].reshape(test['data'].shape[0],
                                         3, 32, 32)
    test_labels = numpy.array(test['labels'], dtype=numpy.uint8)
    test_labels = numpy.expand_dims(test_labels, 1)

    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
Example #36
def convert_mnist(directory, output_directory, output_filename=None,
                  dtype=None):
    """Converts the MNIST dataset to HDF5.

    Converts the MNIST dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.MNIST`. The converted dataset is
    saved as 'mnist.hdf5'.

    This method assumes the existence of the following files:

    * `train-images-idx3-ubyte.gz`
    * `train-labels-idx1-ubyte.gz`
    * `t10k-images-idx3-ubyte.gz`
    * `t10k-labels-idx1-ubyte.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to `None`, in which case a name
        based on `dtype` will be used.
    dtype : str, optional
        Either 'float32', 'float64', or 'bool'. Defaults to `None`,
        in which case images will be returned in their original
        unsigned byte format.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    if not output_filename:
        if dtype:
            output_filename = 'mnist_{}.hdf5'.format(dtype)
        else:
            output_filename = 'mnist.hdf5'
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')

    train_feat_path = os.path.join(directory, TRAIN_IMAGES)
    train_features = read_mnist_images(train_feat_path, dtype)
    train_lab_path = os.path.join(directory, TRAIN_LABELS)
    train_labels = read_mnist_labels(train_lab_path)
    test_feat_path = os.path.join(directory, TEST_IMAGES)
    test_features = read_mnist_images(test_feat_path, dtype)
    test_lab_path = os.path.join(directory, TEST_LABELS)
    test_labels = read_mnist_labels(test_lab_path)
    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
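As a usage sketch (the paths are hypothetical), the dtype argument determines both how the images are read and the default output filename:

# writes /data/fuel/mnist_float32.hdf5 and returns its path
output_paths = convert_mnist('/data/mnist', '/data/fuel', dtype='float32')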
Example #37
def convert_adult(directory, output_directory,
                  output_filename='adult.hdf5'):
    """
    Convert the Adult dataset to HDF5.

    Converts the Adult dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.Adult`. The converted dataset is saved as
    'adult.hdf5'.
    This method assumes the existence of the file `adult.data` and
    `adult.test`.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to `adult.hdf5`.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    train_path = os.path.join(directory, 'adult.data')
    test_path = os.path.join(directory, 'adult.test')
    output_path = os.path.join(output_directory, output_filename)

    with open(train_path, 'r') as f:
        train_content = f.readlines()
    with open(test_path, 'r') as f:
        test_content = f.readlines()
    # drop the trailing line, and the header line in the test file
    train_content = train_content[:-1]
    test_content = test_content[1:-1]

    features_list = []
    targets_list = []
    for content in [train_content, test_content]:
        # strip out examples with missing features
        content = [line for line in content if line.find('?') == -1]
        # strip off endlines, separate entries
        content = list(map(lambda l: l[:-1].split(', '), content))

        features = list(map(lambda l: l[:-1], content))
        targets = list(map(lambda l: l[-1], content))
        del content
        y = list(map(lambda l: [l[0] == '>'], targets))
        y = numpy.array(y)
        del targets

        # Process features into a matrix
        variables = [
            'age', 'workclass', 'fnlwgt', 'education', 'education-num',
            'marital-status', 'occupation', 'relationship', 'race', 'sex',
            'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'
        ]
        continuous = set([
            'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
            'hours-per-week'
        ])

        pieces = []
        for i, var in enumerate(variables):
            data = list(map(lambda l: l[i], features))
            if var in continuous:
                data = list(map(lambda l: float(l), data))
                data = numpy.array(data)
                data = data.reshape(data.shape[0], 1)
            else:
                unique_values = list(set(data))
                data = list(map(lambda l: unique_values.index(l), data))
                data = convert_to_one_hot(data)
            pieces.append(data)

        X = numpy.concatenate(pieces, axis=1)

        features_list.append(X)
        targets_list.append(y)

    # The last variable has only 40 distinct values in the test set but 41
    # in the training set, so its one-hot encoding is one column narrower
    # there. Since it is the last variable, it is safe to pad the test
    # features with a single zero column.
    features_list[1] = numpy.concatenate(
        (features_list[1],
         numpy.zeros((features_list[1].shape[0], 1),
                     dtype=features_list[1].dtype)),
        axis=1)
    h5file = h5py.File(output_path, mode='w')
    data = (('train', 'features', features_list[0]),
            ('train', 'targets', targets_list[0]),
            ('test', 'features', features_list[1]),
            ('test', 'targets', targets_list[1]))

    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()

    return (output_path,)
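The helper convert_to_one_hot is not defined in this snippet; a minimal stand-in consistent with its usage above (one row per example, one column per distinct category) could look like this:

import numpy

def convert_to_one_hot(indices):
    # Hypothetical helper: map integer category indices to an
    # (n_examples, n_categories) one-hot matrix.
    indices = numpy.asarray(indices)
    one_hot = numpy.zeros((len(indices), indices.max() + 1), dtype='uint8')
    one_hot[numpy.arange(len(indices)), indices] = 1
    return one_hot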
Example #38
def main(list_params, gpu_id, dataset_id, model_id, use_checkpoint_weights,
         load_path, batch_size, n_steps, glimpse_size, coarse_size, conv_sizes,
         n_filters, fc_dim, enc_dim, dec_dim, n_classes, clip_value,
         unique_emission, unique_glimpse, output_mode, use_init_matrix,
         output_emotion_dims, headless, scale_inputs, normalize_inputs,
         use_batch_norm, dropout, weighting, iterations, show_steps,
         zoom_factor):

    # mode = 0 if output_init_matrix==0 and mode==0 else 1
    if dataset_id > 0:
        n_classes = 7
    if dataset_id < 2:
        input_shape = config['input_shape']
    elif dataset_id == 2:
        input_shape = config['input_shape_400']
    else:
        input_shape = config['input_shape_200']

    glimpse_size = (glimpse_size, glimpse_size)
    coarse_size = (coarse_size, coarse_size)

    # select a GPU (check before announcing it)
    if gpu_id == -1:
        print(
            '[Error] You need to select a gpu. (e.g. python train.py --gpu=7)\n'
        )
        exit()
    print("[Info] Using GPU", gpu_id)
    #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    # create the model
    print(
        "\n[Info] Loading the model from:",
        load_path + ("model_weights.h5" if use_checkpoint_weights == 0 else
                     "checkpoint_weights.h5"))
    print()
    if model_id == 1:
        model = edram_model(input_shape,
                            learning_rate=1,
                            steps=n_steps,
                            glimpse_size=glimpse_size,
                            coarse_size=coarse_size,
                            hidden_init=0,
                            n_filters=128,
                            filter_sizes=(3, 5),
                            n_features=fc_dim,
                            RNN_size_1=enc_dim,
                            RNN_size_2=dec_dim,
                            n_classes=n_classes,
                            output_mode=output_mode,
                            use_init_matrix=use_init_matrix,
                            clip_value=clip_value,
                            output_emotion_dims=output_emotion_dims,
                            headless=headless,
                            bn=use_batch_norm,
                            dropout=dropout,
                            use_weighted_loss=False,
                            localisation_cost_factor=1)
    elif model_id == 2:
        model = tedram_model(input_shape,
                             learning_rate=1,
                             steps=n_steps,
                             glimpse_size=glimpse_size,
                             coarse_size=coarse_size,
                             hidden_init=0,
                             n_filters=128,
                             filter_sizes=(3, 5),
                             n_features=fc_dim,
                             RNN_size_1=enc_dim,
                             RNN_size_2=dec_dim,
                             n_classes=n_classes,
                             output_mode=output_mode,
                             use_init_matrix=use_init_matrix,
                             clip_value=clip_value,
                             output_emotion_dims=output_emotion_dims,
                             unique_emission=unique_emission,
                             unique_glimpse=unique_glimpse,
                             bn=use_batch_norm,
                             dropout=dropout,
                             use_weighted_loss=False,
                             localisation_cost_factor=1)
    elif model_id == 3:
        model = STN_model(learning_rate=1,
                          n_classes=n_classes,
                          use_weighted_loss=False,
                          output_mode=1)
    else:
        print('[Error] Only models 1, 2 and 3 are available!\n')
        exit()
    # load weights
    if use_checkpoint_weights:
        model.load_weights(load_path + 'checkpoint_weights.h5')
    else:
        model.load_weights(load_path + 'model_weights.h5')

    # load the data
    data_path = datasets[dataset_id]
    print("\n[Info] Opening", data_path)

    try:
        data = h5py.File(data_path, 'r')

    except Exception:
        print("[Error]", data_path, "does not exist.\n")
        exit()

    if dataset_id == 0:
        n_train = 60000
        n_test = 10000
    elif dataset_id == 1:
        n_train = data['features'].shape[0] - 3482
        n_test = data['features'].shape[0] - n_train
    else:
        n_train = data['X'].shape[0] - 3462
        n_test = data['X'].shape[0] - n_train

    if dataset_id < 2:

        features = data['features'][n_train:]
        labels = data['labels'][n_train:]
        locations = data['locations'][n_train:]
        if output_emotion_dims:
            dims1 = data['dimensions'][n_train:]
            dims2 = None
        else:
            dims1 = None
            dims2 = None
    else:

        features = data['X'][n_train:]
        labels = data['Y_lab'][n_train:]
        locations = None
        if output_emotion_dims:
            dims1 = data['Y_val'][n_train:]
            dims2 = data['Y_ars'][n_train:]
            dims1 = np.reshape(dims1, (dims1.shape[0], 1))
            dims2 = np.reshape(dims2, (dims2.shape[0], 1))
        else:
            dims1 = None
            dims2 = None

    # normalize input data
    if normalize_inputs:
        indices = list(range(n_test))
        random.shuffle(indices)
        samples = features[sorted(indices[:1000]), ...] / scale_inputs

        mean = np.mean(samples, axis=0)
        sd = np.std(samples, axis=0).clip(min=0.00001)
    else:
        mean = 0
        sd = 1

    print("[Info] Dataset Size\n")
    print(" using", iterations, "*", batch_size, "out of", n_test,
          "test examples")

    print("\n[Info] Data Dimensions\n")
    print("  Images:   ", features.shape[1], "x", features.shape[2], "x",
          features.shape[3])
    print("  Labels:   ", labels.shape[1])
    if locations is not None:
        print("  Locations:", locations.shape[1], "\n")
    else:
        print("  Locations:", 6, "\n")

    predicted_labels, predicted_dimensions, predicted_locations = [], [], []

    # get sample data
    indices = list(range(n_test))
    random.shuffle(indices)
    samples = sorted(indices[0:batch_size * iterations])

    # prepare the minibatch

    # input image (normalisation takes precedence and re-uses the scaling
    # that was applied when estimating mean and sd)
    if normalize_inputs:
        I = (np.array(features[samples, ...], dtype='float32') / scale_inputs
             - mean) / sd
    elif scale_inputs != 1 and scale_inputs != 0:
        I = np.array(features[samples, ...], dtype='float32') / scale_inputs
    else:
        I = np.array(features[samples, ...], dtype='float32')
    # transformation matrix with zoom parameters set to 1
    A = np.zeros((batch_size * iterations, 6), dtype='float32')
    A[:, (0, 4)] = 1
    # initial RNN states
    S1 = np.zeros((batch_size * iterations, enc_dim), dtype='float32')
    S2 = np.zeros((batch_size * iterations, dec_dim), dtype='float32')
    # biases
    if glimpse_size == (26, 26):
        B1 = np.ones((batch_size * iterations, 26, 26, 1), dtype='float32')
        B2 = np.ones((batch_size * iterations, 24, 24, 1), dtype='float32')
        B3 = np.ones((batch_size * iterations, 12, 12, 1), dtype='float32')
        B4 = np.ones((batch_size * iterations, 8, 8, 1), dtype='float32')
        B5 = np.ones((batch_size * iterations, 6, 6, 1), dtype='float32')
        B6 = np.ones((batch_size * iterations, 4, 4, 1), dtype='float32')
    else:
        B1 = np.ones((batch_size * iterations, 16, 16, 1), dtype='float32')
        B2 = np.ones((batch_size * iterations, 16, 16, 1), dtype='float32')
        B3 = np.ones((batch_size * iterations, 8, 8, 1), dtype='float32')
        B4 = np.ones((batch_size * iterations, 8, 8, 1), dtype='float32')
        B5 = np.ones((batch_size * iterations, 6, 6, 1), dtype='float32')
        B6 = np.ones((batch_size * iterations, 4, 4, 1), dtype='float32')
    # concatenation of target outputs for every step
    Y_cla = np.array(labels[samples, ...], dtype='float32')
    if zoom_factor == 1:
        Y_loc = np.array(locations[samples, ...], dtype='float32')
    else:
        Y_loc = np.zeros((batch_size * iterations, 6), dtype='float32')
        Y_loc[:, (0, 4)] = zoom_factor
    if dims1 is not None:
        if dims2 is None:
            Y_dim = np.array(dims1[samples, ...], dtype='float32')
        else:
            Y_dim = np.array(np.hstack(
                [dims1[samples, ...], dims2[samples, ...]]),
                             dtype='float32')

    if model_id == 1 or model_id == 2:
        inputs = {
            'input_image': I,
            'input_matrix': A,
            'initial_hidden_state_1': S1,
            'initial_cell_state_1': S1,
            'initial_cell_state_2': S2,
            'b26': B1,
            'b24': B2,
            'b12': B3,
            'b8': B4,
            'b6': B5,
            'b4': B6
        }
        if dims1 is not None:
            outputs = {
                'classifications': Y_cla,
                'dimensions': Y_dim,
                'localisations': Y_loc
            }
        else:
            outputs = {'classifications': Y_cla, 'localisations': Y_loc}
    elif model_id == 3:
        inputs = {'input_image': I}
        outputs = {'classifications': Y_cla}

    if dims1 is not None:
        predicted_labels, predicted_dimensions, predicted_locations = model.predict(
            inputs, batch_size=batch_size, verbose=1)
    else:
        predicted_labels, predicted_locations = model.predict(
            inputs, batch_size=batch_size, verbose=1)

    batch_size = batch_size * iterations

    # reshape
    if model_id == 1 or model_id == 2:
        if output_mode:
            predicted_locations = np.vstack([
                predicted_locations[:, i, :]
                for i in range(0, n_steps + use_init_matrix)
            ])
        if n_steps > 1 and not headless:
            predicted_labels = np.vstack(
                [predicted_labels[:, i, :] for i in range(0, n_steps)])
            if dims1 is not None:
                predicted_dimensions = np.vstack(
                    [predicted_dimensions[:, i, :] for i in range(0, n_steps)])

    # save sample data and predictions
    h5file = h5py.File(load_path + 'predictions.h5', mode='w')

    if dims1 is not None:
        data = (
            ('true', 'features',
             np.array(features[samples, ...], dtype='float32')),
            ('normalized', 'features', np.array(I, dtype='float32')),
            ('true', 'locations', np.array(Y_loc, dtype='float32')),
            ('predicted', 'locations',
             np.array(predicted_locations, dtype='float32')),
            ('true', 'dimensions', np.array(Y_dim, dtype='float32')),
            ('predicted', 'dimensions',
             np.array(predicted_dimensions, dtype='float32')),
            ('true', 'labels', np.array(Y_cla, dtype='float32')),
            ('predicted', 'labels', np.array(predicted_labels,
                                             dtype='float32')),
        )
    else:
        data = (
            ('true', 'features',
             np.array(features[samples, ...], dtype='float32')),
            ('normalized', 'features', np.array(I, dtype='float32')),
            ('true', 'locations', np.array(Y_loc, dtype='float32')),
            ('predicted', 'locations',
             np.array(predicted_locations, dtype='float32')),
            ('true', 'labels', np.array(Y_cla, dtype='float32')),
            ('predicted', 'labels', np.array(predicted_labels,
                                             dtype='float32')),
        )
    fill_hdf5_file(h5file, data)

    h5file.flush()
    h5file.close()

    print("\n[INFO] Saved data to", load_path + 'predictions.h5', "\n")

    # some statistics
    hist = np.zeros(n_classes, dtype='int')
    acc = np.zeros((1 if headless else n_steps, n_classes), dtype='int')
    acc_avg = np.zeros((1 if headless else n_steps, n_classes), dtype='int')
    pos = np.zeros((n_steps + use_init_matrix, n_classes, 2), dtype='float')
    zoom = np.zeros((n_steps + use_init_matrix, n_classes, 2), dtype='float')
    mse_pos = np.zeros((n_steps + use_init_matrix, n_classes), dtype='float')
    mse_zoom = np.zeros((n_steps + use_init_matrix, n_classes), dtype='float')
    val_ars = np.zeros((n_steps, n_classes, 2), dtype='float')
    mse_val = np.zeros((n_steps, n_classes), dtype='float')
    mse_ars = np.zeros((n_steps, n_classes), dtype='float')

    if weighting:
        # average predictions per step in inverted order
        for j in range(0, n_steps):
            k = 0
            predicted_labels_avg = predicted_labels[(n_steps - 1) *
                                                    batch_size:(n_steps) *
                                                    batch_size, :]
            for k in range(1, j + 1):
                predicted_labels_avg += predicted_labels[(n_steps - 1 - k) *
                                                         batch_size:(n_steps -
                                                                     k) *
                                                         batch_size, :]
            predicted_labels_avg /= k + 1
            for i in range(0, batch_size):
                # count correct classifications per class
                if np.argmax(Y_cla[i, :]) == np.argmax(
                        predicted_labels_avg[i, :]):
                    acc_avg[j, :] = acc_avg[j, :] + Y_cla[i, :]

    for i in range(0, batch_size):
        # count class occurences
        hist = hist + Y_cla[i, :]
        # count correct classifications per class
        for j in range(0, 1 if headless else n_steps):
            if np.argmax(Y_cla[i, :]) == np.argmax(
                    predicted_labels[i + j * batch_size, :]):
                acc[j, :] = acc[j, :] + Y_cla[i, :]

    # compute accuracy
    acc_mean = np.zeros(1 if headless else n_steps)
    for j in range(0, 1 if headless else n_steps):
        acc_mean[j] = np.dot(hist / batch_size, acc[j, :] / hist)
    hist[hist == 0] = 0.00000001
    acc = np.asarray(acc * 100 / (hist), dtype='int') / 100
    acc_avg = np.asarray(acc_avg * 100 / (hist), dtype='int') / 100
    hist[hist < 1] = 0
    # compute bb info per class and mse
    for j in range(0, n_steps + use_init_matrix):
        for i in range(0, n_classes):
            pos[j, i, :] = np.mean(
                predicted_locations[j * batch_size:(j + 1) *
                                    batch_size, :][Y_cla[:,
                                                         i] == 1, :][:,
                                                                     (2, 5)],
                axis=0)
            zoom[j, i, :] = np.mean(
                predicted_locations[j * batch_size:(j + 1) *
                                    batch_size, :][Y_cla[:,
                                                         i] == 1, :][:,
                                                                     (0, 4)],
                axis=0)
            mse_pos[j, i] = np.mean(
                np.square(Y_loc[Y_cla[:, i] == 1, :][:, (2, 5)] -
                          predicted_locations[j * batch_size:(j + 1) *
                                              batch_size, :][Y_cla[:, i] ==
                                                             1, :][:, (2, 5)]))
            mse_zoom[j, i] = np.mean(
                np.square(Y_loc[Y_cla[:, i] == 1, :][:, (0, 4)] -
                          predicted_locations[j * batch_size:(j + 1) *
                                              batch_size, :][Y_cla[:, i] ==
                                                             1, :][:, (0, 4)]))
    # compute mean dimensional ratings and mse
    if dims1 is not None:
        for j in range(0, n_steps):
            for i in range(0, n_classes):
                val_ars[j, i, :] = np.mean(
                    predicted_dimensions[j * batch_size:(j + 1) *
                                         batch_size, :][Y_cla[:, i] == 1, :],
                    axis=0)
                mse_val[j, i] = np.mean(
                    np.square(Y_dim[Y_cla[:, i] == 1, :][:, 0] -
                              predicted_dimensions[j * batch_size:(j + 1) *
                                                   batch_size, :]
                              [Y_cla[:, i] == 1, :][:, 0]))
                mse_ars[j, i] = np.mean(
                    np.square(Y_dim[Y_cla[:, i] == 1, :][:, 1] -
                              predicted_dimensions[j * batch_size:(j + 1) *
                                                   batch_size, :]
                              [Y_cla[:, i] == 1, :][:, 1]))

    print("Sample Class Distribution:", hist, "\n")
    print("Accuracy per Class:")
    for j in range(0, 1 if headless else n_steps):
        print("  Step " + str(j + 1) + ":                 ", acc[j, :],
              "= %.3f" % acc_mean[j])
    if weighting:
        print("\nWeighted Accuracy per Class:")
        for j in range(0, n_steps):
            print(
                "  Step " + str(n_steps - j) + " to " + str(n_steps) +
                ":             ", acc_avg[n_steps - 1 - j, :], "= %.3f" %
                np.dot(hist / batch_size, acc_avg[n_steps - 1 - j, :]))
    if dims1 is not None:
        print("\nValence Error per Class:")
        for j in range(n_steps - show_steps, n_steps):
            print("  Step " + str(j + 1) + ":                 ",
                  np.asarray(mse_val[j, :] * 100, dtype='int') / 100,
                  "= %.3f" % np.dot(hist / batch_size, mse_val[j, :]))
        print("\nAverage Valence per Class:")
        for j in range(n_steps - show_steps, n_steps):
            print("  Step " + str(j + 1) + ":                 ",
                  np.asarray(val_ars[j, :, 0] * 100, dtype='int') / 100,
                  "= %.3f" % np.dot(hist / batch_size, val_ars[j, :, 0]))
        print("\nArousal Error per Class:")
        for j in range(n_steps - show_steps, n_steps):
            print("  Step " + str(j + 1) + ":                 ",
                  np.asarray(mse_ars[j, :] * 100, dtype='int') / 100,
                  "= %.3f" % np.dot(hist / batch_size, mse_ars[j, :]))
        print("\nAverage Arousal per Class:")
        for j in range(n_steps - show_steps, n_steps):
            print("  Step " + str(j + 1) + ":                 ",
                  np.asarray(val_ars[j, :, 1] * 100, dtype='int') / 100,
                  "= %.3f" % np.dot(hist / batch_size, val_ars[j, :, 1]))
    print("\nAverage Position per Class:")
    for j in range(0, n_steps + use_init_matrix):
        print("  Step " + str(j) + ":                 ",
              np.asarray(pos[j, :, 0] * 100, dtype='int') / 100,
              "= %.3f" % np.dot(hist / batch_size, pos[j, :, 0]))
        print("                          ",
              np.asarray(pos[j, :, 1] * 100, dtype='int') / 100,
              "= %.3f" % np.dot(hist / batch_size, pos[j, :, 1]))
    print("\nPosition Variance: %.3f" % np.var(np.asarray(pos)),
          "  Position SD: %.3f" % np.std(np.asarray(pos)))
    if False:  # set to True to also print per-class location and zoom errors
        print("\nLocation Error per Class:")
        for j in range(0, n_steps + use_init_matrix):
            print("  Step " + str(j) + ":                 ",
                  np.asarray(mse_pos[j, :] * 100, dtype='int') / 100,
                  "= %.3f" % np.dot(hist / batch_size, mse_pos[j, :]))
        print("\nZoom Error per Class:")
        for j in range(0, n_steps + use_init_matrix):
            print("  Step " + str(j) + ":                 ",
                  np.asarray(mse_zoom[j, :] * 100, dtype='int') / 100,
                  "= %.3f" % np.dot(hist / batch_size, mse_zoom[j, :]))
    print("\nAverage Zoom per Class:")
    for j in range(0, n_steps + use_init_matrix):
        print("  Step " + str(j) + ":                 ",
              np.asarray(zoom[j, :, 0] * 100, dtype='int') / 100,
              "= %.3f" % np.dot(hist / batch_size, zoom[j, :, 0]))
        print("                          ",
              np.asarray(zoom[j, :, 1] * 100, dtype='int') / 100,
              "= %.3f" % np.dot(hist / batch_size, zoom[j, :, 1]))
    print("")

    exit()
Example #39
def convert_cifar10(directory, output_file):
    """Converts the CIFAR-10 dataset to HDF5.

    Converts the CIFAR-10 dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as
    'cifar10.hdf5'.

    It assumes the existence of the following file:

    * `cifar-10-python.tar.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_file : str
        Where to save the converted dataset.

    """
    h5file = h5py.File(output_file, mode='w')
    input_file = os.path.join(directory, DISTRIBUTION_FILE)
    tar_file = tarfile.open(input_file, 'r:gz')

    train_batches = []
    for batch in range(1, 6):
        file = tar_file.extractfile(
            'cifar-10-batches-py/data_batch_%d' % batch)
        try:
            if six.PY3:
                array = cPickle.load(file, encoding='latin1')
            else:
                array = cPickle.load(file)
            train_batches.append(array)
        finally:
            file.close()

    train_features = numpy.concatenate(
        [batch['data'].reshape(batch['data'].shape[0], 3, 32, 32)
            for batch in train_batches])
    train_labels = numpy.concatenate(
        [numpy.array(batch['labels'], dtype=numpy.uint8)
            for batch in train_batches])
    train_labels = numpy.expand_dims(train_labels, 1)

    file = tar_file.extractfile('cifar-10-batches-py/test_batch')
    try:
        if six.PY3:
            test = cPickle.load(file, encoding='latin1')
        else:
            test = cPickle.load(file)
    finally:
        file.close()

    test_features = test['data'].reshape(test['data'].shape[0],
                                         3, 32, 32)
    test_labels = numpy.array(test['labels'], dtype=numpy.uint8)
    test_labels = numpy.expand_dims(test_labels, 1)

    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()
Example #40
def convert_data(directory, output_file, dtype=None):
    """
    Convert new data to fuel HDF5 dataset.
    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_file : str
        Where to save the converted dataset.
    dtype : str, optional
        Either 'float32', 'float64', or 'bool'. Defaults to `None`,
        in which case images will be returned in their original
        unsigned byte format.
"""
    output_file = directory +"/new_dataset.hdf5"
    #print('output_file', output_file)
    #print('directory', directory)
    h5file = h5py.File(output_file, mode='w')

    train_file = os.path.join(directory, 'train_data.h5')
    train = h5py.File(train_file, mode='r')
    x_train = np.array(train["chars"])
    y_train = np.array(train["target"])
    # Balance the train set by selecting the first 18k responses per class
    zeroes = np.transpose(np.array(np.where(y_train == 0)))
    ones = np.transpose(np.array(np.where(y_train == 1)))
    twos = np.transpose(np.array(np.where(y_train == 2)))
    threes = np.transpose(np.array(np.where(y_train == 3)))
    x_train = np.concatenate((x_train[zeroes[:18000, 0]],
                              x_train[ones[:18000, 0]],
                              x_train[twos[:18000, 0]],
                              x_train[threes[:18000, 0]]), axis=0)
    y_train = np.concatenate((y_train[zeroes[:18000, 0]],
                              y_train[ones[:18000, 0]],
                              y_train[twos[:18000, 0]],
                              y_train[threes[:18000, 0]]), axis=0)
    y_train = np.reshape(y_train, (len(y_train), 1))
    print('x_train.shape', x_train.shape)
    print('y_train.shape', y_train.shape)
    valid_file = os.path.join(directory, 'valid_data.h5')
    valid = h5py.File(valid_file, mode='r')
    x_valid = valid["chars"]
    y_valid = valid["target"]
    y_valid = np.reshape(y_valid, (len(y_valid), 1))
    test_file = os.path.join(directory, 'test_data.h5')
    test = h5py.File(test_file, mode='r')
    x_test = test["chars"]
    y_test = test["target"]
    y_test = np.reshape(y_test, (len(y_test), 1))
    features = x_train[:,:].astype('float32')
    targets = y_train[:].astype('uint8')
    train_features = features
    train_targets = targets
    print(train_targets.shape)
    features = x_valid[:,:].astype('float32')
    targets = y_valid[:].astype('uint8')
    valid_features = features
    valid_targets = targets
    print(valid_targets.shape)
    features = x_test[:,:].astype('float32')
    targets = y_test[:].astype('uint8')
    test_features = features
    test_targets = targets
    print(test_targets.shape)
    data = (('train', 'features', train_features),
            ('train', 'targets', train_targets),
            ('valid', 'features', valid_features),
            ('valid', 'targets', valid_targets),
            ('test', 'features', test_features),
            ('test', 'targets', test_targets))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()
Example #42
def main(path, mode):

    # empty datasets
    train_features = []
    train_features_100 = []
    train_locations = []
    train_labels = []
    train_dims = []
    test_features = []
    test_features_100 = []
    test_locations = []
    test_labels = []
    test_dims = []

    # open h5 file as 'f'
    if os.path.isfile(path):
        print("Opening", path, "\n")
    else:
        print(path, "does not exist\n")
        exit()

    try:
        f = h5py.File(path, 'r')
    except Exception as e:
        print(e)
        exit()

    # access the data
    X = f['X']
    X_100 = f['X_100']
    Y = f['Y_lab']
    Y_val = f['Y_val']
    Y_ars = f['Y_ars']
    Y_loc = f['Y_loc']
    data = f['Train']

    hist = np.zeros(7)

    # change format
    print("[INFO] Start processing data...")
    for i in range(0, X.shape[0]):

        # index of the one-hot emotion category
        j = 0
        while Y[i, j] == 0 and j < 6:
            j += 1
        hist[j] = hist[j] + 1

        # target output
        if mode == 'theano':
            # emotion category as an integer
            label = int(j)
        else:
            # keep the one-hot vector
            label = Y[i, :]

        # select a focal point (www.pyimagesearch.com/wp-content/uploads/2017/04/facial_landmarks_68markup.jpg)
        # for a first try, simply the mouth region (landmark 62, i.e. index 61)
        px, py = Y_loc[i, 61, :]
        # flattened 2x3 affine matrix: (zoom, skew, x, skew, zoom, y)
        location = np.array((0.28, 0, (int(px) - 50.0) / 50.0,
                             0, 0.28, (int(py) - 50.0) / 50.0),
                            ndmin=1, dtype=np.float32)

        # image and down-scaled (coarse) image
        if mode == 'theano':
            # change image dim from (100, 100, 1) to (1, 100, 100)
            image = np.array(X[i,:,:,0], ndmin=3, dtype=np.float32)
            image_100 = np.array(X_100[i,:,:,0], ndmin=3, dtype=np.float32)
            # image_coarse = np.array(cv2.resize(X[i,:,:,0], (12,12)), ndmin=3, dtype=np.uint8)
        else:
            # keep image dim
            image = np.array(X[i, ...], ndmin=2, dtype=np.float32)
            image_100 = np.array(X_100[i, ...], ndmin=2, dtype=np.float32)
            # image_coarse = np.array(cv2.resize(X[i, ...], (12,12)), ndmin=2, dtype=np.uint8)

        # valence and arousal
        dims = np.array((Y_val[i], Y_ars[i]), ndmin=1, dtype=np.float32)

        # append data row
        if data[i] == b'train':
            train_features.append(image)
            train_features_100.append(image_100)
            train_locations.append(location)
            train_labels.append(label)
            train_dims.append(dims)
        else:
            test_features.append(image)
            test_features_100.append(image_100)
            test_locations.append(location)
            test_labels.append(label)
            test_dims.append(dims)

        # feedback
        if (i + 1) % 1000 == 0:
            print("[INFO] Appended", i + 1, "rows of data")

    print('\n', hist, '\n')

    # save data
    save_path = '/scratch/forch/EDRAM/datasets/AffectNet_train_data_'+mode+'.hdf5'

    h5file = h5py.File(save_path, mode='w')

    data = (
            ('train', 'features', np.array(train_features, dtype=np.float32)),
            ('test', 'features', np.array(test_features, dtype=np.float32)),
            ('train', 'features_100', np.array(train_features_100, dtype=np.float32)),
            ('test', 'features_100', np.array(test_features_100, dtype=np.float32)),
            ('train', 'locations', np.array(train_locations, dtype=np.float32)),
            ('test', 'locations', np.array(test_locations, dtype=np.float32)),
            ('train', 'labels', np.array(train_labels, dtype=np.uint8)),
            ('test', 'labels', np.array(test_labels, dtype=np.uint8)),
            ('train', 'dimensions', np.array(train_dims, dtype=np.float32)),
            ('test', 'dimensions', np.array(test_dims, dtype=np.float32)),
    )

    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features_100'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['locations'].dims[i].label = label
    for i, label in enumerate(('batch',)):
        h5file['labels'].dims[i].label = label
    for i, label in enumerate(('batch', 'val|ars')):
        h5file['dimensions'].dims[i].label = label

    h5file.flush()
    h5file.close()

    print("[INFO] Saved data to", save_path,"\n")
Example #44
         ('valid', 'codes', codes_valid),
         ('test', 'features', data_test.reshape(-1, width*height)),
         ('test', 'mask', grps_test.reshape((-1, nr_shapes, width*height))),
         ('test', 'targets', targets_test),
         ('test', 'codes', codes_test),
         ('single', 'features', data_single.reshape(-1, width*height)),
         ('single', 'mask', grps_single.reshape((-1, nr_shapes, width*height))),
         ('single', 'targets', targets_single),
         ('single', 'codes', codes_single))

for n, m, d in split:
    print(n, m, d.shape)

h5file = h5py.File(os.path.join(data_dir, 'shapes.h5f'), mode='w')

fill_hdf5_file(h5file, split)


h5file.attrs['description'] = """
Shapes Problem
==============

Binary images containing 3 random shapes each. Introduced in [1] to investigate
binding in deep networks.
All images are of size 1 x {height} x {width}.
There are {nr_train_examples} training examples and {nr_test_examples} test
examples with {nr_shapes} random shapes each.
There are also {nr_single_examples} examples with just a single random shape.
There are three different shapes: ['square', 'up-triangle', 'down-triangle'].

[1] David P. Reichert and Thomas Serre,