def save_fuel_dataset(output_filename, train_features, valid_features,
                      test_features):
    """Converts the dataset to HDF5.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    datasource_dir = fuel.config.data_path[0]
    output_path = os.path.join(datasource_dir, output_filename)
    h5file = h5py.File(output_path, mode='w')
    data = (('train', 'features', train_features),
            ('valid', 'features', valid_features),
            ('test', 'features', test_features))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file.flush()
    h5file.close()
    return (output_path,)
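# A minimal usage sketch for save_fuel_dataset above, assuming `fuel`, `h5py`,
# `os` and `numpy` are importable and fuel.config.data_path is configured.
# The array shapes below are illustrative placeholders, not taken from any
# real dataset.
import numpy

rng = numpy.random.RandomState(0)
train = rng.randint(0, 256, size=(100, 3, 32, 32)).astype('uint8')
valid = rng.randint(0, 256, size=(20, 3, 32, 32)).astype('uint8')
test = rng.randint(0, 256, size=(20, 3, 32, 32)).astype('uint8')
path, = save_fuel_dataset('toy_dataset.hdf5', train, valid, test)
print('Wrote', path)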
def convert_iris(directory, output_directory, output_filename='iris.hdf5'):
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    classes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    data = numpy.loadtxt(
        os.path.join(directory, 'iris.data'),
        converters={4: lambda x: classes[x]},
        delimiter=',')
    numpy.random.shuffle(data)
    features = data[:, :-1].astype('float32')
    targets = data[:, -1].astype('uint8')
    train_features = features[:100]
    train_targets = targets[:100]
    valid_features = features[100:120]
    valid_targets = targets[100:120]
    test_features = features[120:]
    data = (('train', 'features', train_features),
            ('train', 'targets', train_targets),
            ('valid', 'features', valid_features),
            ('valid', 'targets', valid_targets),
            ('test', 'features', test_features))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'
    h5file.flush()
    h5file.close()
    return (output_path,)
def read_in(size=64, limit=10, class_limit=10):
    all_vs = None
    vectors = []
    targets = []
    for i, c in list(zip(label_ids, labels))[:class_limit]:
        print(c)
        for j, fname in enumerate(os.listdir('food-101/images/' + c)[:limit]):
            infile = os.path.join('food-101/images', c, fname)
            outfname = re.sub('jpe?g$', 'png', fname)
            outfile = 'food-101-{0}x{0}/{1}_{2}'.format(size, c, outfname)
            try:
                im = Image.open(infile)
                im = ImageOps.fit(im, (size, size), Image.ANTIALIAS)
                if j < 10:
                    # Save the first 10 images of each class as samples
                    im.save(outfile, "png")
                im = np.array(im, dtype=np.uint8)
                if len(im.shape) != 3:
                    continue
                imarr = im.transpose([2, 0, 1])
                if imarr.shape != (3, size, size):
                    continue
                targets.append(i)
                vectors.append(imarr)
                # Periodically stack the accumulated images to limit memory use
                if len(vectors) >= 10000:
                    if all_vs is None:
                        all_vs = np.stack(vectors, 0)
                    else:
                        vs = np.stack(vectors, 0)
                        all_vs = np.concatenate([all_vs, vs], 0)
                    vectors = []
            except IOError:
                print("cannot create thumbnail for '%s'" % infile)
    vectors = np.stack(vectors, 0)
    if all_vs is not None:
        vectors = np.concatenate([all_vs, vectors], 0)
    targets = np.array(targets).reshape(-1, 1)
    assert len(vectors) == len(targets)
    data = (('train', 'features', vectors), ('train', 'targets', targets))
    h5file = h5py.File('food-101-{0}x{0}.hdf5'.format(size), mode='w')
    try:
        fill_hdf5_file(h5file, data)
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'channel'
        h5file['features'].dims[2].label = 'height'
        h5file['features'].dims[3].label = 'width'
        h5file['targets'].dims[0].label = 'batch'
        h5file['targets'].dims[1].label = 'index'
        h5file.flush()
    finally:
        h5file.close()
    return vectors, targets


print('Done!')
def test_dtype(self):
    fill_hdf5_file(self.h5file,
                   (('train', 'features', self.train_features),
                    ('train', 'targets', self.train_targets),
                    ('test', 'features', self.test_features),
                    ('test', 'targets', self.test_targets)))
    assert_equal(str(self.h5file['features'].dtype), 'uint8')
    assert_equal(str(self.h5file['targets'].dtype), 'float32')
def convert_binarized_mnist(directory, output_directory,
                            output_filename='binarized_mnist.hdf5'):
    """Converts the binarized MNIST dataset to HDF5.

    Converts the binarized MNIST dataset used in R. Salakhutdinov's DBN
    paper [DBN] to an HDF5 dataset compatible with
    :class:`fuel.datasets.BinarizedMNIST`. The converted dataset is
    saved as 'binarized_mnist.hdf5'.

    This method assumes the existence of the files
    `binarized_mnist_{train,valid,test}.amat`, which are accessible
    through Hugo Larochelle's website [HUGO].

    .. [DBN] Ruslan Salakhutdinov and Iain Murray, *On the Quantitative
       Analysis of Deep Belief Networks*, Proceedings of the 25th
       international conference on Machine learning, 2008, pp. 872-879.

    .. [HUGO] http://www.cs.toronto.edu/~larocheh/public/datasets/
       binarized_mnist/binarized_mnist_{train,valid,test}.amat

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'binarized_mnist.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    train_set = numpy.loadtxt(
        os.path.join(directory, TRAIN_FILE)).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    valid_set = numpy.loadtxt(
        os.path.join(directory, VALID_FILE)).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    test_set = numpy.loadtxt(
        os.path.join(directory, TEST_FILE)).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    data = (('train', 'features', train_set),
            ('valid', 'features', valid_set),
            ('test', 'features', test_set))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    h5file.flush()
    h5file.close()
    return (output_path,)
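# A quick sanity check for a file produced by convert_binarized_mnist above,
# using plain h5py only. The expected 4-D shape follows from the reshape calls
# in the converter; the exact number of examples is not asserted here.
import h5py

def inspect_binarized_mnist(path='binarized_mnist.hdf5'):
    with h5py.File(path, 'r') as f:
        print('features shape:', f['features'].shape)      # (N, 1, 28, 28)
        print('dimension labels:', [dim.label for dim in f['features'].dims])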
def convert_svhn_format_2(directory, output_directory,
                          output_filename='svhn_format_2.hdf5'):
    """Converts the SVHN dataset (format 2) to HDF5.

    This method assumes the existence of the files
    `{train,test,extra}_32x32.mat`, which are accessible through the
    official website [SVHNSITE].

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'svhn_format_2.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')

    train_set = loadmat(os.path.join(directory, FORMAT_2_TRAIN_FILE))
    train_features = train_set['X'].transpose(3, 2, 0, 1)
    train_targets = train_set['y']
    train_targets[train_targets == 10] = 0

    test_set = loadmat(os.path.join(directory, FORMAT_2_TEST_FILE))
    test_features = test_set['X'].transpose(3, 2, 0, 1)
    test_targets = test_set['y']
    test_targets[test_targets == 10] = 0

    extra_set = loadmat(os.path.join(directory, FORMAT_2_EXTRA_FILE))
    extra_features = extra_set['X'].transpose(3, 2, 0, 1)
    extra_targets = extra_set['y']
    extra_targets[extra_targets == 10] = 0

    data = (('train', 'features', train_features),
            ('test', 'features', test_features),
            ('extra', 'features', extra_features),
            ('train', 'targets', train_targets),
            ('test', 'targets', test_targets),
            ('extra', 'targets', extra_targets))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['targets'].dims[i].label = label
    h5file.flush()
    h5file.close()
    return (output_path,)
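# The two SVHN-specific transformations above, shown on toy numpy arrays: the
# .mat files store images as (height, width, channel, batch), so the
# transpose(3, 2, 0, 1) moves them to (batch, channel, height, width), and the
# digit '0' is stored as label 10 and remapped to 0. Toy shapes only.
import numpy

X = numpy.zeros((32, 32, 3, 5), dtype='uint8')   # as loaded from the .mat
y = numpy.array([[10], [1], [2], [10], [9]])     # '0' encoded as 10
features = X.transpose(3, 2, 0, 1)               # -> shape (5, 3, 32, 32)
y[y == 10] = 0                                   # -> [[0], [1], [2], [0], [9]]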
def convert_mnist(directory, output_file, dtype=None):
    """Converts the MNIST dataset to HDF5.

    Converts the MNIST dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.MNIST`. The converted dataset is saved as
    'mnist.hdf5'.

    It assumes the existence of the following files:

    * `train-images-idx3-ubyte.gz`
    * `train-labels-idx1-ubyte.gz`
    * `t10k-images-idx3-ubyte.gz`
    * `t10k-labels-idx1-ubyte.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_file : str
        Where to save the converted dataset.
    dtype : str, optional
        Either 'float32', 'float64', or 'bool'. Defaults to `None`, in which
        case images will be returned in their original unsigned byte format.

    """
    h5file = h5py.File(output_file, mode="w")
    train_feat_path = os.path.join(directory, TRAIN_IMAGES)
    train_features = read_mnist_images(train_feat_path, dtype)
    train_lab_path = os.path.join(directory, TRAIN_LABELS)
    train_labels = read_mnist_labels(train_lab_path)
    test_feat_path = os.path.join(directory, TEST_IMAGES)
    test_features = read_mnist_images(test_feat_path, dtype)
    test_lab_path = os.path.join(directory, TEST_LABELS)
    test_labels = read_mnist_labels(test_lab_path)
    data = (
        ("train", "features", train_features),
        ("train", "targets", train_labels),
        ("test", "features", test_features),
        ("test", "targets", test_labels),
    )
    fill_hdf5_file(h5file, data)
    h5file["features"].dims[0].label = "batch"
    h5file["features"].dims[1].label = "channel"
    h5file["features"].dims[2].label = "height"
    h5file["features"].dims[3].label = "width"
    h5file["targets"].dims[0].label = "batch"
    h5file["targets"].dims[1].label = "index"
    h5file.flush()
    h5file.close()
def test_data(self):
    fill_hdf5_file(self.h5file,
                   (('train', 'features', self.train_features, '.'),
                    ('train', 'targets', self.train_targets),
                    ('test', 'features', self.test_features),
                    ('test', 'targets', self.test_targets)))
    assert_equal(self.h5file['features'],
                 numpy.vstack([self.train_features, self.test_features]))
    assert_equal(self.h5file['targets'],
                 numpy.vstack([self.train_targets, self.test_targets]))
def convert_silhouettes(size, directory, output_directory,
                        output_filename=None):
    """ Convert the CalTech 101 Silhouettes Datasets.

    Parameters
    ----------
    size : {16, 28}
        Convert either the 16x16 or 28x28 sized version of the dataset.
    directory : str
        Directory in which the required input files reside.
    output_filename : str
        Where to save the converted dataset.

    """
    if size not in (16, 28):
        raise ValueError('size must be 16 or 28')
    if output_filename is None:
        output_filename = 'caltech101_silhouettes{}.hdf5'.format(size)
    output_file = os.path.join(output_directory, output_filename)

    input_file = 'caltech101_silhouettes_{}_split1.mat'.format(size)
    input_file = os.path.join(directory, input_file)
    if not os.path.isfile(input_file):
        raise MissingInputFiles('Required files missing', [input_file])

    with h5py.File(output_file, mode="w") as h5file:
        mat = loadmat(input_file)
        train_features = mat['train_data'].reshape([-1, 1, size, size])
        train_targets = mat['train_labels']
        valid_features = mat['val_data'].reshape([-1, 1, size, size])
        valid_targets = mat['val_labels']
        test_features = mat['test_data'].reshape([-1, 1, size, size])
        test_targets = mat['test_labels']

        data = (
            ('train', 'features', train_features),
            ('train', 'targets', train_targets),
            ('valid', 'features', valid_features),
            ('valid', 'targets', valid_targets),
            ('test', 'features', test_features),
            ('test', 'targets', test_targets),
        )
        fill_hdf5_file(h5file, data)

        for i, label in enumerate(('batch', 'channel', 'height', 'width')):
            h5file['features'].dims[i].label = label
        for i, label in enumerate(('batch', 'index')):
            h5file['targets'].dims[i].label = label

    return (output_file,)
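# Loading a file written by convert_silhouettes through Fuel. This sketch
# assumes the standard fuel.datasets.hdf5.H5PYDataset API; the path is a
# placeholder for wherever the converted file was saved.
from fuel.datasets.hdf5 import H5PYDataset

train_set = H5PYDataset('caltech101_silhouettes28.hdf5', which_sets=('train',))
handle = train_set.open()
features, targets = train_set.get_data(handle, slice(0, 8))  # first 8 examples
train_set.close(handle)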
def preprocess_svhn(main_loop, save_path): h5file = h5py.File(save_path, mode='w') ali, = Selector(main_loop.model.top_bricks).select('/ali').bricks x = tensor.tensor4('features') y = tensor.imatrix('targets') params = ali.encoder.apply(x) mu = params[:, :ali.encoder._nlat] acts = [] acts += [mu] acts += VariableFilter(bricks=[ ali.encoder.layers[-9], ali.encoder.layers[-6], ali.encoder.layers[-3] ], roles=[OUTPUT])(ComputationGraph([mu]).variables) output = tensor.concatenate([act.flatten(ndim=2) for act in acts], axis=1) preprocess = theano.function([x, y], [output.flatten(ndim=2), y]) train_set = SVHN(2, which_sets=('train', ), sources=('features', 'targets')) train_stream = DataStream.default_stream(train_set, iteration_scheme=SequentialScheme( train_set.num_examples, 100)) train_features, train_targets = map( numpy.vstack, list( zip(*[ preprocess(*batch) for batch in train_stream.get_epoch_iterator() ]))) test_set = SVHN(2, which_sets=('test', ), sources=('features', 'targets')) test_stream = DataStream.default_stream(test_set, iteration_scheme=SequentialScheme( test_set.num_examples, 100)) test_features, test_targets = map( numpy.vstack, list( zip(*[ preprocess(*batch) for batch in test_stream.get_epoch_iterator() ]))) data = (('train', 'features', train_features), ('test', 'features', test_features), ('train', 'targets', train_targets), ('test', 'targets', test_targets)) fill_hdf5_file(h5file, data) for i, label in enumerate(('batch', 'feature')): h5file['features'].dims[i].label = label for i, label in enumerate(('batch', 'index')): h5file['targets'].dims[i].label = label h5file.flush() h5file.close()
def convert_lfw(directory, basename, output_directory):
    tgz_filename = "{}.tgz".format(basename)
    tar_filename = "{}.tar".format(basename)
    output_filename = "{}.hdf5".format(basename)
    tar_subdir = "lfw_funneled" if basename == "lfw-funneled" else basename

    # it will be faster to decompress this tar file all at once
    print("--> Converting {} to tar".format(tgz_filename))
    with gzip.open(tgz_filename, 'rb') as f_in, \
            open(tar_filename, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    tar = tarfile.open(tar_filename)

    print("--> Building test/train lists")
    # build lists, throwing away heading
    with open('pairsDevTrain.txt', 'r') as csvfile:
        trainrows = list(csv.reader(csvfile, delimiter='\t'))[1:]
    with open('pairsDevTest.txt', 'r') as csvfile:
        testrows = list(csv.reader(csvfile, delimiter='\t'))[1:]

    print("--> Converting")
    # extract all images in set
    train_images = load_images("train", tar, tar_subdir, trainrows)
    test_images = load_images("test", tar, tar_subdir, testrows)

    train_labels = np.array([loadLabelsFromRow(r) for r in trainrows])
    test_labels = np.array([loadLabelsFromRow(r) for r in testrows])

    # stack the two images of each pair into six channels (RGB + RGB)
    train_features = np.array([[f[0, :, :, 0], f[0, :, :, 1], f[0, :, :, 2],
                                f[1, :, :, 0], f[1, :, :, 1], f[1, :, :, 2]]
                               for f in train_images])
    test_features = np.array([[f[0, :, :, 0], f[0, :, :, 1], f[0, :, :, 2],
                               f[1, :, :, 0], f[1, :, :, 1], f[1, :, :, 2]]
                              for f in test_images])
    train_targets = np.array([[n] for n in train_labels])
    test_targets = np.array([[n] for n in test_labels])

    print("train shapes: ", train_features.shape, train_targets.shape)
    print("test shapes: ", test_features.shape, test_targets.shape)

    print("--> Writing hdf5 output file")
    output_path = os.path.join(output_directory, output_filename)
    with h5py.File(output_path, mode="w") as h5file:
        data = (('train', 'features', train_features),
                ('train', 'targets', train_targets),
                ('test', 'features', test_features),
                ('test', 'targets', test_targets))
        fill_hdf5_file(h5file, data)
        for i, label in enumerate(('batch', 'channel', 'height', 'width')):
            h5file['features'].dims[i].label = label
        for i, label in enumerate(('batch', 'index')):
            h5file['targets'].dims[i].label = label

    print("--> Done, removing tar file")
    os.remove(tar_filename)
    return (output_path,)
def convert_youtube_audio(directory, output_directory, youtube_id, channels,
                          sample, output_filename=None):
    """Converts downloaded YouTube audio to HDF5 format.

    Requires `ffmpeg` to be installed and available on the command line
    (i.e. available on your `PATH`).

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    youtube_id : str
        11-character video ID (taken from YouTube URL)
    channels : int
        The number of audio channels to use in the PCM Wave file.
    sample : int
        The sampling rate to use in Hz, e.g. 44100 or 16000.
    output_filename : str, optional
        Name of the saved dataset. If `None` (the default),
        `youtube_id.hdf5` is used.

    """
    input_file = os.path.join(directory, '{}.m4a'.format(youtube_id))
    wav_filename = '{}.wav'.format(youtube_id)
    wav_file = os.path.join(directory, wav_filename)

    ffmpeg_not_available = subprocess.call(['ffmpeg', '-version'])
    if ffmpeg_not_available:
        raise RuntimeError('conversion requires ffmpeg')
    subprocess.check_call(['ffmpeg', '-y', '-i', input_file, '-ac',
                           str(channels), '-ar', str(sample), wav_file],
                          stdout=sys.stdout)

    # Load WAV into array
    _, data = scipy.io.wavfile.read(wav_file)
    if data.ndim == 1:
        data = data[:, None]
    data = data[None, :]

    # Store in HDF5
    if output_filename is None:
        output_filename = '{}.hdf5'.format(youtube_id)
    output_file = os.path.join(output_directory, output_filename)
    with h5py.File(output_file, 'w') as h5file:
        fill_hdf5_file(h5file, (('train', 'features', data),))
        h5file['features'].dims[0].label = 'batch'
        h5file['features'].dims[1].label = 'time'
        h5file['features'].dims[2].label = 'feature'

    return (output_file,)
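# A hedged usage sketch for convert_youtube_audio above. The directories are
# placeholders, the video ID is the one used in the standalone script further
# below, and ffmpeg must already be on the PATH as noted in the docstring.
paths = convert_youtube_audio(directory='.', output_directory='.',
                              youtube_id='XqaJ2Ol5cC4', channels=1,
                              sample=16000, output_filename=None)
print(paths)  # ('./XqaJ2Ol5cC4.hdf5',)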
def main(path): train_features = [] train_locations = [] train_labels = [] test_features = [] test_locations = [] test_labels = [] for f in listdir('images'): if isfile(join('images', f)): number, label, x, y = f.split('.')[0].split('_') location = np.array( (0.28, 0, (int(x) + 14.0 - 50.0) / 50.0, 0, 0.28, (int(y) + 14.0 - 50.0) / 50.0), ndmin=1, dtype=np.float32) image = np.array(Image.open(join('images', f)), ndmin=3, dtype=np.uint8) label = int(label) if int(number) <= 60000: train_features.append(image) train_locations.append(location) train_labels.append(label) else: test_features.append(image) test_locations.append(location) test_labels.append(label) h5file = h5py.File(path, mode='w') data = ( ('train', 'features', np.array(train_features)), ('test', 'features', np.array(test_features)), ('train', 'locations', np.array(train_locations)), ('test', 'locations', np.array(test_locations)), ('train', 'labels', np.array(train_labels, dtype=np.uint8)), ('test', 'labels', np.array(test_labels, dtype=np.uint8)), ) fill_hdf5_file(h5file, data) for i, label in enumerate(('batch', 'channel', 'height', 'width')): h5file['features'].dims[i].label = label for i, label in enumerate(('batch', 'index')): h5file['locations'].dims[i].label = label for i, label in enumerate(('batch', )): h5file['labels'].dims[i].label = label h5file.flush() h5file.close() shutil.rmtree('images')
def binarized_mnist(input_directory, save_path):
    """Converts the binarized MNIST dataset to HDF5.

    Converts the binarized MNIST dataset used in R. Salakhutdinov's DBN
    paper [DBN] to an HDF5 dataset compatible with
    :class:`fuel.datasets.BinarizedMNIST`. The converted dataset is
    saved as 'binarized_mnist.hdf5'.

    This method assumes the existence of the files
    `binarized_mnist_{train,valid,test}.amat`, which are accessible
    through Hugo Larochelle's website [HUGO].

    .. [DBN] Ruslan Salakhutdinov and Iain Murray, *On the Quantitative
       Analysis of Deep Belief Networks*, Proceedings of the 25th
       international conference on Machine learning, 2008, pp. 872-879.

    .. [HUGO] http://www.cs.toronto.edu/~larocheh/public/datasets/
       binarized_mnist/binarized_mnist_{train,valid,test}.amat

    Parameters
    ----------
    input_directory : str
        Directory in which the required input files reside.
    save_path : str
        Where to save the converted dataset.

    """
    h5file = h5py.File(save_path, mode="w")
    train_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_train.amat')).reshape(
            (-1, 1, 28, 28))
    valid_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_valid.amat')).reshape(
            (-1, 1, 28, 28))
    test_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_test.amat')).reshape(
            (-1, 1, 28, 28))
    data = ((train_set, valid_set, test_set),)
    source_names = ('features',)
    shapes = ((70000, 1, 28, 28),)
    dtypes = ('uint8',)
    split_names = ('train', 'valid', 'test')
    splits = ((0, 50000), (50000, 60000), (60000, 70000))
    fill_hdf5_file(
        h5file, data, source_names, shapes, dtypes, split_names, splits)
    h5file.flush()
    h5file.close()
def mnist(input_directory, save_path, dtype=None):
    """Converts the MNIST dataset to HDF5.

    Converts the MNIST dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.MNIST`. The converted dataset is saved as
    'mnist.hdf5'.

    This method assumes the existence of the following files:
    `train-images-idx3-ubyte.gz`, `train-labels-idx1-ubyte.gz`
    `t10k-images-idx3-ubyte.gz`, `t10k-labels-idx1-ubyte.gz`

    Parameters
    ----------
    input_directory : str
        Directory in which the required input files reside.
    save_path : str
        Where to save the converted dataset.
    dtype : 'float32', 'float64', or 'bool'
        If unspecified, images will be returned in their original
        unsigned byte format.

    """
    h5file = h5py.File(save_path, mode="w")
    train_feat_path = os.path.join(input_directory,
                                   'train-images-idx3-ubyte.gz')
    train_features = read_mnist_images(train_feat_path, dtype)
    train_lab_path = os.path.join(input_directory,
                                  'train-labels-idx1-ubyte.gz')
    train_labels = read_mnist_labels(train_lab_path)
    test_feat_path = os.path.join(input_directory,
                                  't10k-images-idx3-ubyte.gz')
    test_features = read_mnist_images(test_feat_path, dtype)
    test_lab_path = os.path.join(input_directory,
                                 't10k-labels-idx1-ubyte.gz')
    test_labels = read_mnist_labels(test_lab_path)
    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'
    h5file.flush()
    h5file.close()
def convert_iris(directory, output_directory, output_filename='iris.hdf5'):
    """Convert the Iris dataset to HDF5.

    Converts the Iris dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.Iris`. The converted dataset is saved as
    'iris.hdf5'. This method assumes the existence of the file
    `iris.data`.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'iris.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    classes = {b'Iris-setosa': 0, b'Iris-versicolor': 1, b'Iris-virginica': 2}
    data = numpy.loadtxt(
        os.path.join(directory, 'iris.data'),
        converters={4: lambda x: classes[x]},
        delimiter=',')
    features = data[:, :-1].astype('float32')
    targets = data[:, -1].astype('uint8').reshape((-1, 1))
    data = (('all', 'features', features),
            ('all', 'targets', targets))

    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'
    h5file.flush()
    h5file.close()

    return (output_path,)
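# Converting and reading back the Iris data, assuming `iris.data` sits in the
# current directory. The H5PYDataset call mirrors the silhouettes example
# above and is an assumption about the current Fuel API, not part of the
# converter itself.
from fuel.datasets.hdf5 import H5PYDataset

output_path, = convert_iris('.', '.')
iris = H5PYDataset(output_path, which_sets=('all',), load_in_memory=True)
features, targets = iris.data_sources
print(features.shape, targets.shape)  # expected (150, 4) and (150, 1)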
def binarized_mnist(input_directory, save_path):
    """Converts the binarized MNIST dataset to HDF5.

    Converts the binarized MNIST dataset used in R. Salakhutdinov's DBN
    paper [DBN] to an HDF5 dataset compatible with
    :class:`fuel.datasets.BinarizedMNIST`. The converted dataset is
    saved as 'binarized_mnist.hdf5'.

    This method assumes the existence of the files
    `binarized_mnist_{train,valid,test}.amat`, which are accessible
    through Hugo Larochelle's website [HUGO].

    .. [DBN] Ruslan Salakhutdinov and Iain Murray, *On the Quantitative
       Analysis of Deep Belief Networks*, Proceedings of the 25th
       international conference on Machine learning, 2008, pp. 872-879.

    .. [HUGO] http://www.cs.toronto.edu/~larocheh/public/datasets/
       binarized_mnist/binarized_mnist_{train,valid,test}.amat

    Parameters
    ----------
    input_directory : str
        Directory in which the required input files reside.
    save_path : str
        Where to save the converted dataset.

    """
    h5file = h5py.File(save_path, mode="w")
    train_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_train.amat')).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    valid_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_valid.amat')).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    test_set = numpy.loadtxt(
        os.path.join(input_directory, 'binarized_mnist_test.amat')).reshape(
            (-1, 1, 28, 28)).astype('uint8')
    data = (('train', 'features', train_set),
            ('valid', 'features', valid_set),
            ('test', 'features', test_set))
    fill_hdf5_file(h5file, data)
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    h5file.flush()
    h5file.close()
def convert(input_directory, save_path):
    h5file = h5py.File(save_path, 'w')
    split = ()
    split += read_stands(input_directory, h5file)
    split += read_taxis(input_directory, h5file, 'train')
    print('First origin_call not present in training set: ',
          len(origin_call_dict))
    split += read_taxis(input_directory, h5file, 'test')
    split += unique(h5file)

    fill_hdf5_file(h5file, split)
    for name in ['stands_name', 'stands_latitude', 'stands_longitude',
                 'unique_taxi_id', 'unique_origin_call']:
        h5file[name].dims[0].label = 'index'
    for name in ['trip_id', 'call_type', 'origin_call', 'origin_stand',
                 'taxi_id', 'timestamp', 'day_type', 'missing_data',
                 'latitude', 'longitude']:
        h5file[name].dims[0].label = 'batch'
    h5file.flush()
    h5file.close()
def test_fill_hdf5_file():
    h5file = h5py.File(
        'tmp.hdf5', mode="w", driver='core', backing_store=False)
    train_features = numpy.arange(16, dtype='uint8').reshape((4, 2, 2))
    test_features = numpy.arange(8, dtype='uint8').reshape((2, 2, 2)) + 3
    train_targets = numpy.arange(4, dtype='float32').reshape((4, 1))
    test_targets = numpy.arange(2, dtype='float32').reshape((2, 1)) + 3
    fill_hdf5_file(
        h5file,
        (('train', 'features', train_features),
         ('train', 'targets', train_targets),
         ('test', 'features', test_features),
         ('test', 'targets', test_targets)))
    assert_equal(
        h5file['features'], numpy.vstack([train_features, test_features]))
    assert_equal(
        h5file['targets'], numpy.vstack([train_targets, test_targets]))
    assert h5file['features'].dtype == 'uint8'
    assert h5file['targets'].dtype == 'float32'
    h5file.close()
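# The in-memory file created by the test above can also be inspected directly.
# Where the split boundaries live depends on the Fuel version (the older test
# variant further below reads h5file.attrs['train'] directly), so this sketch
# only lists whatever attributes and datasets fill_hdf5_file created.
def show_split_metadata(h5file):
    for key, value in h5file.attrs.items():
        print(key, value)
    print(list(h5file.keys()))  # created sources, e.g. ['features', 'targets']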
def test_fill_hdf5_file():
    h5file = h5py.File(
        'tmp.hdf5', mode="w", driver='core', backing_store=False)
    train_features = numpy.arange(16, dtype='uint8').reshape((4, 2, 2))
    test_features = numpy.arange(8, dtype='uint8').reshape((2, 2, 2)) + 3
    train_targets = numpy.arange(4, dtype='float32').reshape((4, 1))
    test_targets = numpy.arange(2, dtype='float32').reshape((2, 1)) + 3
    data = ((train_features, test_features), (train_targets, test_targets))
    source_names = ('features', 'targets')
    shapes = ((6, 2, 2), (6, 1))
    dtypes = ('uint8', 'float32')
    split_names = ('train', 'test')
    splits = ((0, 4), (4, 6))
    fill_hdf5_file(
        h5file, data, source_names, shapes, dtypes, split_names, splits)
    assert_equal(h5file.attrs['train'], [0, 4])
    assert_equal(h5file.attrs['test'], [4, 6])
    assert_equal(
        h5file['features'], numpy.vstack([train_features, test_features]))
    assert_equal(
        h5file['targets'], numpy.vstack([train_targets, test_targets]))
    assert h5file['features'].dtype == 'uint8'
    assert h5file['targets'].dtype == 'float32'
    h5file.close()
def convert_adult(directory, output_directory, output_filename='adult.hdf5'): """ Convert the Adult dataset to HDF5. Converts the Adult dataset to an HDF5 dataset compatible with :class:`fuel.datasets.Adult`. The converted dataset is saved as 'adult.hdf5'. This method assumes the existence of the file `adult.data` and `adult.test`. Parameters ---------- directory : str Directory in which input files reside. output_directory : str Directory in which to save the converted dataset. output_filename : str, optional Name of the saved dataset. Defaults to `adult.hdf5`. Returns ------- output_paths : tuple of str Single-element tuple containing the path to the converted dataset. """ train_path = os.path.join(directory, 'adult.data') test_path = os.path.join(directory, 'adult.test') output_path = os.path.join(output_directory, output_filename) train_content = open(train_path, 'r').readlines() test_content = open(test_path, 'r').readlines() train_content = train_content[:-1] test_content = test_content[1:-1] features_list = [] targets_list = [] for content in [train_content, test_content]: # strip out examples with missing features content = [line for line in content if line.find('?') == -1] # strip off endlines, separate entries content = list(map(lambda l: l[:-1].split(', '), content)) features = list(map(lambda l: l[:-1], content)) targets = list(map(lambda l: l[-1], content)) del content y = list(map(lambda l: [l[0] == '>'], targets)) y = numpy.array(y) del targets # Process features into a matrix variables = [ 'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country' ] continuous = set([ 'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week' ]) pieces = [] for i, var in enumerate(variables): data = list(map(lambda l: l[i], features)) if var in continuous: data = list(map(lambda l: float(l), data)) data = numpy.array(data) data = data.reshape(data.shape[0], 1) else: unique_values = list(set(data)) data = list(map(lambda l: unique_values.index(l), data)) data = convert_to_one_hot(data) pieces.append(data) X = numpy.concatenate(pieces, axis=1) features_list.append(X) targets_list.append(y) # the largets value in the last variable of test set is only 40, thus # the one hot representation has 40 at the second dimention. While in # training set it is 41. Since it lies in the last variable, so it is # safe to simply add a last column with zeros. features_list[1] = numpy.concatenate( (features_list[1], numpy.zeros( (features_list[1].shape[0], 1), dtype=features_list[1].dtype)), axis=1) h5file = h5py.File(output_path, mode='w') data = (('train', 'features', features_list[0]), ('train', 'targets', targets_list[0]), ('test', 'features', features_list[1]), ('test', 'targets', targets_list[1])) fill_hdf5_file(h5file, data) h5file['features'].dims[0].label = 'batch' h5file['features'].dims[1].label = 'feature' h5file['targets'].dims[0].label = 'batch' h5file['targets'].dims[1].label = 'index' h5file.flush() h5file.close() return (output_path, )
def convert_cifar10(directory, output_file): """Converts the CIFAR-10 dataset to HDF5. Converts the CIFAR-10 dataset to an HDF5 dataset compatible with :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as 'cifar10.hdf5'. It assumes the existence of the following file: * `cifar-10-python.tar.gz` Parameters ---------- directory : str Directory in which input files reside. output_file : str Where to save the converted dataset. """ h5file = h5py.File(output_file, mode='w') input_file = os.path.join(directory, DISTRIBUTION_FILE) tar_file = tarfile.open(input_file, 'r:gz') train_batches = [] for batch in range(1, 6): file = tar_file.extractfile('cifar-10-batches-py/data_batch_%d' % batch) try: if six.PY3: array = cPickle.load(file, encoding='latin1') else: array = cPickle.load(file) train_batches.append(array) finally: file.close() train_features = numpy.concatenate([ batch['data'].reshape(batch['data'].shape[0], 3, 32, 32) for batch in train_batches ]) train_labels = numpy.concatenate([ numpy.array(batch['labels'], dtype=numpy.uint8) for batch in train_batches ]) train_labels = numpy.expand_dims(train_labels, 1) file = tar_file.extractfile('cifar-10-batches-py/test_batch') try: if six.PY3: test = cPickle.load(file, encoding='latin1') else: test = cPickle.load(file) finally: file.close() test_features = test['data'].reshape(test['data'].shape[0], 3, 32, 32) test_labels = numpy.array(test['labels'], dtype=numpy.uint8) test_labels = numpy.expand_dims(test_labels, 1) data = (('train', 'features', train_features), ('train', 'targets', train_labels), ('test', 'features', test_features), ('test', 'targets', test_labels)) fill_hdf5_file(h5file, data) h5file['features'].dims[0].label = 'batch' h5file['features'].dims[1].label = 'channel' h5file['features'].dims[2].label = 'height' h5file['features'].dims[3].label = 'width' h5file['targets'].dims[0].label = 'batch' h5file['targets'].dims[1].label = 'index' h5file.flush() h5file.close()
def main(path, mode): # provide empty datasets train_features = [] train_locations = [] train_labels = [] test_features = [] test_locations = [] test_labels = [] # open source h5 file as 'f' if (os.path.isfile(path)): print("\n[INFO] Opening", path, "\n") else: print("[ERROR]", path, "does not exist\n") exit() try: f = h5py.File(path, 'r') except Exception as e: print(e) exit() # access the data X = f["X"] Y = f["Y"] px = f["px"] py = f["py"] # change format print("[INFO] Start processing data for " + mode + "...\n") for i in range(70000): # centered location of the digit patch in the image location = np.array( (0.28, 0, (int(px[i]) + 14.0 - 50.0) / 50.0, 0, 0.28, (int(py[i]) + 14.0 - 50.0) / 50.0), ndmin=1, dtype=np.float32) # image and down-scaled (coarse) image if mode == 'theano': # channel first image = np.array(X[i, ...], ndmin=3, dtype=np.uint8) image_coarse = np.array(cv2.resize(X[i, ...], (12, 12)), ndmin=3, dtype=np.uint8) else: # chanel last image = np.array(X[i, ...], ndmin=2, dtype=np.uint8) image.shape = image.shape + (1, ) image_coarse = np.array(cv2.resize(X[i, ...], (12, 12)), ndmin=2, dtype=np.uint8) image_coarse.shape = image_coarse.shape + (1, ) # target output if mode == 'theano': # one-hot to digit label j = 0 while Y[i, j] == 0 and j < 9: j += 1 label = int(j) else: # one-hot label = Y[i, :] # first 60.000 examples are training data if int(i) < 60000: train_features.append(image) train_locations.append(location) train_labels.append(label) else: test_features.append(image) test_locations.append(location) test_labels.append(label) if (i + 1) % 1000 == 0: print("[INFO] Appended", i + 1, "rows of data") # save data if mode == 'theano': save_path = '/scratch/forch/EDRAM/datasets/mnist_cluttered_test.hdf5' elif mode == 'keras': save_path = '/scratch/forch/EDRAM/datasets/mnist_cluttered_keras.hdf5' else: save_path = '/scratch/forch/EDRAM/datasets/mnist_cluttered_' + mode + '.hdf5' h5file = h5py.File(save_path, mode='w') data = ( ('train', 'features', np.array(train_features)), ('test', 'features', np.array(test_features)), ('train', 'locations', np.array(train_locations)), ('test', 'locations', np.array(test_locations)), ('train', 'labels', np.array(train_labels, dtype=np.uint8)), ('test', 'labels', np.array(test_labels, dtype=np.uint8)), ) fill_hdf5_file(h5file, data) for i, label in enumerate(('batch', 'channel', 'height', 'width')): h5file['features'].dims[i].label = label for i, label in enumerate(('batch', 'index')): h5file['locations'].dims[i].label = label for i, label in enumerate(('batch', )): h5file['labels'].dims[i].label = label h5file.flush() h5file.close() print("\n[INFO] Saved data to", save_path, "\n")
import h5py
import scipy.io.wavfile

from fuel.converters.base import fill_hdf5_file

directory = '/data/lisatmp4/taesup/data/YouTubeAudio/'
youtube_id = 'XqaJ2Ol5cC4'
wav_file = directory + youtube_id + '.wav'
output_file = directory + youtube_id + '_valid.hdf5'

_, data = scipy.io.wavfile.read(wav_file)
if data.ndim == 1:
    data = data[:, None]
data = data[None, :]

num_total = data.shape[1]
num_trains = 160000000
num_valids = num_total - num_trains

train_data = data[:, 0:num_trains, :]
valid_data = data[:, num_trains:, :]

with h5py.File(output_file, 'w') as h5file:
    fill_data = (('train', 'features', valid_data),)
    print('train_data : ', train_data.shape)
    print('valid_data : ', valid_data.shape)
    fill_hdf5_file(h5file, fill_data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'time'
    h5file['features'].dims[2].label = 'feature'
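# Reading the validation file written above back with plain h5py; `output_file`
# is the same hard-coded path, so adjust it to wherever the file actually lives.
with h5py.File(output_file, 'r') as f:
    features = f['features']
    print(features.shape)                         # (1, num_valids, channels)
    print([dim.label for dim in features.dims])   # ['batch', 'time', 'feature']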
    # (tail of load_dataset: pad the validation utterances and return arrays)
    for ind, f in enumerate(val_features):
        fname = os.path.join(dpth, f + '.fea')
        fi = htkmfc.HTKFeat_read(fname)
        data = fi.getall()[:, :20]
        val_Mask[ind, :data.shape[0]] = 1.0
        pad = maxlen - data.shape[0]
        data = np.vstack((data, np.zeros((pad, 20), dtype='float32')))
        val_Data[ind, :, :] = data
    return (Data, Mask, np.asarray(labelz, dtype='int32'),
            val_Data, val_Mask, np.asarray(val_labelz, dtype='int32'))


Data, Msk, Targets, val_Data, val_Msk, val_tars = load_dataset()

f = h5py.File('dataset.hdf5', mode='w')
data = (('train', 'features', Data),
        ('train', 'mask', Msk),
        ('train', 'targets', Targets),
        ('valid', 'features', val_Data),
        ('valid', 'mask', val_Msk),
        ('valid', 'targets', val_tars))
fill_hdf5_file(f, data)
for i, label in enumerate(('batch', 'maxlen', 'feat_dim')):
    f['features'].dims[i].label = label
for i, label in enumerate(('batch', 'maxlen')):
    f['mask'].dims[i].label = label
for i, label in enumerate(('batch',)):
    f['targets'].dims[i].label = label
f.flush()
f.close()
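# Iterating over the padded-speech file written above, assuming the usual Fuel
# stream classes; the sources come back following the dataset's `sources`
# attribute, here ('features', 'mask', 'targets').
from fuel.datasets.hdf5 import H5PYDataset
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme

train_set = H5PYDataset('dataset.hdf5', which_sets=('train',))
stream = DataStream.default_stream(
    train_set,
    iteration_scheme=SequentialScheme(train_set.num_examples, 32))
for features, mask, targets in stream.get_epoch_iterator():
    pass  # one padded minibatch of 32 utterances per iteration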
def convert_cifar100(directory, output_directory, output_filename='cifar100.hdf5'): """Converts the CIFAR-100 dataset to HDF5. Converts the CIFAR-100 dataset to an HDF5 dataset compatible with :class:`fuel.datasets.CIFAR100`. The converted dataset is saved as 'cifar100.hdf5'. This method assumes the existence of the following file: `cifar-100-python.tar.gz` Parameters ---------- directory : str Directory in which the required input files reside. output_directory : str Directory in which to save the converted dataset. output_filename : str, optional Name of the saved dataset. Defaults to 'cifar100.hdf5'. Returns ------- output_paths : tuple of str Single-element tuple containing the path to the converted dataset. """ output_path = os.path.join(output_directory, output_filename) h5file = h5py.File(output_path, mode="w") input_file = os.path.join(directory, 'cifar-100-python.tar.gz') tar_file = tarfile.open(input_file, 'r:gz') file = tar_file.extractfile('cifar-100-python/train') try: if six.PY3: train = cPickle.load(file, encoding='latin1') else: train = cPickle.load(file) finally: file.close() train_features = train['data'].reshape(train['data'].shape[0], 3, 32, 32) train_coarse_labels = numpy.array(train['coarse_labels'], dtype=numpy.uint8) train_fine_labels = numpy.array(train['fine_labels'], dtype=numpy.uint8) file = tar_file.extractfile('cifar-100-python/test') try: if six.PY3: test = cPickle.load(file, encoding='latin1') else: test = cPickle.load(file) finally: file.close() test_features = test['data'].reshape(test['data'].shape[0], 3, 32, 32) test_coarse_labels = numpy.array(test['coarse_labels'], dtype=numpy.uint8) test_fine_labels = numpy.array(test['fine_labels'], dtype=numpy.uint8) data = (('train', 'features', train_features), ('train', 'coarse_labels', train_coarse_labels.reshape((-1, 1))), ('train', 'fine_labels', train_fine_labels.reshape((-1, 1))), ('test', 'features', test_features), ('test', 'coarse_labels', test_coarse_labels.reshape((-1, 1))), ('test', 'fine_labels', test_fine_labels.reshape((-1, 1)))) fill_hdf5_file(h5file, data) h5file['features'].dims[0].label = 'batch' h5file['features'].dims[1].label = 'channel' h5file['features'].dims[2].label = 'height' h5file['features'].dims[3].label = 'width' h5file['coarse_labels'].dims[0].label = 'batch' h5file['coarse_labels'].dims[1].label = 'index' h5file['fine_labels'].dims[0].label = 'batch' h5file['fine_labels'].dims[1].label = 'index' h5file.flush() h5file.close() return (output_path,)
def convert_cifar10(directory, output_directory,
                    output_filename='cifar10.hdf5'):
    """Converts the CIFAR-10 dataset to HDF5.

    Converts the CIFAR-10 dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as
    'cifar10.hdf5'.

    It assumes the existence of the following file:

    * `cifar-10-python.tar.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to 'cifar10.hdf5'.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    output_path = os.path.join(output_directory, output_filename)
    h5file = File(output_path, mode='w')
    input_file = os.path.join(directory, DISTRIBUTION_FILE)
    tar_file = tarfile.open(input_file, 'r:gz')

    train_batches = []
    for batch in range(1, 6):
        file = tar_file.extractfile(
            'cifar-10-batches-py/data_batch_%d' % batch)
        try:
            if six.PY3:
                array = cPickle.load(file, encoding='latin1')
            else:
                array = cPickle.load(file)
            train_batches.append(array)
        finally:
            file.close()

    train_features = numpy.concatenate(
        [batch['data'].reshape(batch['data'].shape[0], 3, 32, 32)
         for batch in train_batches])
    train_labels = numpy.concatenate(
        [numpy.array(batch['labels'], dtype=numpy.uint8)
         for batch in train_batches])
    train_labels = numpy.expand_dims(train_labels, 1)
    print(train_features.shape)
    print(train_labels.shape)

    # Augment the training set with horizontally flipped copies, interleaving
    # each image with its mirror image and repeating the labels accordingly.
    flipped_train_features = train_features[:, :, :, ::-1]
    train_features = numpy.array(
        [val for pair in zip(train_features, flipped_train_features)
         for val in pair])
    train_labels = numpy.repeat(train_labels, 2, axis=0)
    print(train_features.shape)
    print(train_labels.shape)

    file = tar_file.extractfile('cifar-10-batches-py/test_batch')
    try:
        if six.PY3:
            test = cPickle.load(file, encoding='latin1')
        else:
            test = cPickle.load(file)
    finally:
        file.close()

    test_features = test['data'].reshape(test['data'].shape[0], 3, 32, 32)
    test_labels = numpy.array(test['labels'], dtype=numpy.uint8)
    test_labels = numpy.expand_dims(test_labels, 1)

    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'
    h5file.flush()
    h5file.close()
    return (output_path,)
def convert_mnist(directory, output_directory, output_filename=None,
                  dtype=None):
    """Converts the MNIST dataset to HDF5.

    Converts the MNIST dataset to an HDF5 dataset compatible with
    :class:`fuel.datasets.MNIST`. The converted dataset is saved as
    'mnist.hdf5'.

    It assumes the existence of the following files:

    * `train-images-idx3-ubyte.gz`
    * `train-labels-idx1-ubyte.gz`
    * `t10k-images-idx3-ubyte.gz`
    * `t10k-labels-idx1-ubyte.gz`

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_directory : str
        Directory in which to save the converted dataset.
    output_filename : str, optional
        Name of the saved dataset. Defaults to `None`, in which case a name
        based on `dtype` will be used.
    dtype : str, optional
        Either 'float32', 'float64', or 'bool'. Defaults to `None`, in which
        case images will be returned in their original unsigned byte format.

    Returns
    -------
    output_paths : tuple of str
        Single-element tuple containing the path to the converted dataset.

    """
    if not output_filename:
        if dtype:
            output_filename = 'mnist_{}.hdf5'.format(dtype)
        else:
            output_filename = 'mnist.hdf5'
    output_path = os.path.join(output_directory, output_filename)
    h5file = h5py.File(output_path, mode='w')
    train_feat_path = os.path.join(directory, TRAIN_IMAGES)
    train_features = read_mnist_images(train_feat_path, dtype)
    train_lab_path = os.path.join(directory, TRAIN_LABELS)
    train_labels = read_mnist_labels(train_lab_path)
    test_feat_path = os.path.join(directory, TEST_IMAGES)
    test_features = read_mnist_images(test_feat_path, dtype)
    test_lab_path = os.path.join(directory, TEST_LABELS)
    test_labels = read_mnist_labels(test_lab_path)
    data = (('train', 'features', train_features),
            ('train', 'targets', train_labels),
            ('test', 'features', test_features),
            ('test', 'targets', test_labels))
    fill_hdf5_file(h5file, data)
    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'channel'
    h5file['features'].dims[2].label = 'height'
    h5file['features'].dims[3].label = 'width'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'
    h5file.flush()
    h5file.close()
    return (output_path,)
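# Filename selection behavior of the convert_mnist variant above, assuming the
# standard MNIST archives sit in the current directory:
convert_mnist('.', '.')                   # writes ./mnist.hdf5 with uint8 images
convert_mnist('.', '.', dtype='float32')  # writes ./mnist_float32.hdf5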
def main(list_params, gpu_id, dataset_id, model_id, use_checkpoint_weights, load_path, batch_size, n_steps, glimpse_size, coarse_size, conv_sizes, n_filters, fc_dim, enc_dim, dec_dim, n_classes, clip_value, unique_emission, unique_glimpse, output_mode, use_init_matrix, output_emotion_dims, headless, scale_inputs, normalize_inputs, use_batch_norm, dropout, weighting, iterations, show_steps, zoom_factor): # mode = 0 if output_init_matrix==0 and mode==0 else 1 if dataset_id > 0: n_classes = 7 if dataset_id < 2: input_shape = config['input_shape'] elif dataset_id == 2: input_shape = config['input_shape_400'] else: input_shape = config['input_shape_200'] glimpse_size = (glimpse_size, glimpse_size) coarse_size = (coarse_size, coarse_size) # select a GPU print("[Info] Using GPU", gpu_id) if gpu_id == -1: print( '[Error] You need to select a gpu. (e.g. python train.py --gpu=7)\n' ) exit() #os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152 os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) # create the model print( "\n[Info] Loading the model from:", load_path + ("model_weights.h5" if use_checkpoint_weights == 0 else "checkpoint_weights.h5")) print() if model_id == 1: model = edram_model(input_shape, learning_rate=1, steps=n_steps, glimpse_size=glimpse_size, coarse_size=coarse_size, hidden_init=0, n_filters=128, filter_sizes=(3, 5), n_features=fc_dim, RNN_size_1=enc_dim, RNN_size_2=dec_dim, n_classes=n_classes, output_mode=output_mode, use_init_matrix=use_init_matrix, clip_value=clip_value, output_emotion_dims=output_emotion_dims, headless=headless, bn=use_batch_norm, dropout=dropout, use_weighted_loss=False, localisation_cost_factor=1) elif model_id == 2: model = tedram_model(input_shape, learning_rate=1, steps=n_steps, glimpse_size=glimpse_size, coarse_size=coarse_size, hidden_init=0, n_filters=128, filter_sizes=(3, 5), n_features=fc_dim, RNN_size_1=enc_dim, RNN_size_2=dec_dim, n_classes=n_classes, output_mode=output_mode, use_init_matrix=use_init_matrix, clip_value=clip_value, output_emotion_dims=output_emotion_dims, unique_emission=unique_emission, unique_glimpse=unique_glimpse, bn=use_batch_norm, dropout=dropout, use_weighted_loss=False, localisation_cost_factor=1) elif model_id == 3: model = STN_model(learning_rate=1, n_classes=n_classes, use_weighted_loss=False, output_mode=1) else: print('[Error] Only model 1 and 2 is available!\n') exit() # load weights if use_checkpoint_weights: model.load_weights(load_path + 'checkpoint_weights.h5') else: model.load_weights(load_path + 'model_weights.h5') # load the data data_path = datasets[dataset_id] print("\n[Info] Opening", data_path) try: data = h5py.File(data_path, 'r') except Exception: print("[Error]", data_path, "does not exist.\n") exit() if dataset_id == 0: n_train = 60000 n_test = 10000 elif dataset_id == 1: n_train = data['features'].shape[0] - 3482 n_test = data['features'].shape[0] - n_train else: n_train = data['X'].shape[0] - 3462 n_test = data['X'].shape[0] - n_train if dataset_id < 2: features = data['features'][n_train:] labels = data['labels'][n_train:] locations = data['locations'][n_train:] if output_emotion_dims: dims1 = data['dimensions'][n_train:] dims2 = None else: dims1 = None dims2 = None else: features = data['X'][n_train:] labels = data['Y_lab'][n_train:] locations = None if output_emotion_dims: dims1 = data['Y_val'][n_train:] dims2 = data['Y_ars'][n_train:] dims1 = np.reshape(dims1, (dims1.shape[0], 1)) dims2 = np.reshape(dims2, (dims2.shape[0], 1)) else: dims1 = None dims2 = None # normalize input data if 
normalize_inputs: indices = list(range(n_test)) random.shuffle(indices) samples = features[sorted(indices[:1000]), ...] / scale_inputs mean = np.mean(samples, axis=0) sd = np.std(samples, axis=0).clip(min=0.00001) else: mean = 0 sd = 1 print("[Info] Dataset Size\n") print(" using", iterations, "*", batch_size, "out of", n_test, "test examples") print("\n[Info] Data Dimensions\n") print(" Images: ", features.shape[1], "x", features.shape[2], "x", features.shape[3]) print(" Labels: ", labels.shape[1]) if locations is not None: print(" Locations:", locations.shape[1], "\n") else: print(" Locations:", 6, "\n") predicted_labels, predicted_dimensions, predicted_locations = [], [], [] # get sample data indices = list(range(n_test)) random.shuffle(indices) samples = sorted(indices[0:batch_size * iterations]) # prepare the minibatch # input image if scale_inputs != 1 and scale_inputs != 0: I = np.array(features[samples, ...], dtype='float32') / scale_inputs if normalize_inputs: I = (np.array(features[samples, ...], dtype='float32') - mean) / sd else: I = np.array(features[samples, ...], dtype='float32') # transformation matrix with zoom paramters set to 1 A = np.zeros((batch_size * iterations, 6), dtype='float32') A[:, (0, 4)] = 1 # initial RNN states S1 = np.zeros((batch_size * iterations, enc_dim), dtype='float32') S2 = np.zeros((batch_size * iterations, dec_dim), dtype='float32') # biases if glimpse_size == (26, 26): B1 = np.ones((batch_size * iterations, 26, 26, 1), dtype='float32') B2 = np.ones((batch_size * iterations, 24, 24, 1), dtype='float32') B3 = np.ones((batch_size * iterations, 12, 12, 1), dtype='float32') B4 = np.ones((batch_size * iterations, 8, 8, 1), dtype='float32') B5 = np.ones((batch_size * iterations, 6, 6, 1), dtype='float32') B6 = np.ones((batch_size * iterations, 4, 4, 1), dtype='float32') else: B1 = np.ones((batch_size * iterations, 16, 16, 1), dtype='float32') B2 = np.ones((batch_size * iterations, 16, 16, 1), dtype='float32') B3 = np.ones((batch_size * iterations, 8, 8, 1), dtype='float32') B4 = np.ones((batch_size * iterations, 8, 8, 1), dtype='float32') B5 = np.ones((batch_size * iterations, 6, 6, 1), dtype='float32') B6 = np.ones((batch_size * iterations, 4, 4, 1), dtype='float32') # concatenation of target outputs for every step Y_cla = np.array(labels[samples, ...], dtype='float32') if zoom_factor == 1: Y_loc = np.array(locations[samples, ...], dtype='float32') else: Y_loc = np.zeros((batch_size * iterations, 6), dtype='float32') Y_loc[:, (0, 4)] = zoom_factor if dims1 is not None: if dims2 is None: Y_dim = np.array(dims1[samples, ...], dtype='float32') else: Y_dim = np.array(np.hstack( [dims1[samples, ...], dims2[samples, ...]]), dtype='float32') if model_id == 1 or model_id == 2: inputs = { 'input_image': I, 'input_matrix': A, 'initial_hidden_state_1': S1, 'initial_cell_state_1': S1, 'initial_cell_state_2': S2, 'b26': B1, 'b24': B2, 'b12': B3, 'b8': B4, 'b6': B5, 'b4': B6 } if dims1 is not None: outputs = { 'classifications': Y_cla, 'dimensions': Y_dim, 'localisations': Y_loc } else: outputs = {'classifications': Y_cla, 'localisations': Y_loc} elif model_id == 3: inputs = {'input_image': I} outputs = {'classifications': Y_cla} if dims1 is not None: predicted_labels, predicted_dimensions, predicted_locations = model.predict( inputs, batch_size=batch_size, verbose=1) else: predicted_labels, predicted_locations = model.predict( inputs, batch_size=batch_size, verbose=1) batch_size = batch_size * iterations # reshape if model_id == 1 or model_id == 2: if output_mode: 
predicted_locations = np.vstack([ predicted_locations[:, i, :] for i in range(0, n_steps + use_init_matrix) ]) if n_steps > 1 and headless == False: predicted_labels = np.vstack( [predicted_labels[:, i, :] for i in range(0, n_steps)]) if dims1 is not None: predicted_dimensions = np.vstack( [predicted_dimensions[:, i, :] for i in range(0, n_steps)]) # save smaple data and predictions h5file = h5py.File(load_path + 'predictions.h5', mode='w') if dims1 is not None: data = ( ('true', 'features', np.array(features[samples, ...], dtype='float32')), ('normalized', 'features', np.array(I, dtype='float32')), ('true', 'locations', np.array(Y_loc, dtype='float32')), ('predicted', 'locations', np.array(predicted_locations, dtype='float32')), ('true', 'dimension', np.array(Y_dim, dtype='float32')), ('predcited', 'dimensions', np.array(predicted_dimensions, dtype='float32')), ('true', 'labels', np.array(Y_cla, dtype='float32')), ('predcited', 'labels', np.array(predicted_labels, dtype='float32')), ) else: data = ( ('true', 'features', np.array(features[samples, ...], dtype='float32')), ('normalized', 'features', np.array(I, dtype='float32')), ('true', 'locations', np.array(Y_loc, dtype='float32')), ('predicted', 'locations', np.array(predicted_locations, dtype='float32')), ('true', 'labels', np.array(Y_cla, dtype='float32')), ('predcited', 'labels', np.array(predicted_labels, dtype='float32')), ) fill_hdf5_file(h5file, data) h5file.flush() h5file.close() print("\n[INFO] Saved data to", load_path + 'predictions.h5', "\n") # some statistics hist = np.zeros(n_classes, dtype='int') acc = np.zeros((1 if headless else n_steps, n_classes), dtype='int') acc_avg = np.zeros((1 if headless else n_steps, n_classes), dtype='int') pos = np.zeros((n_steps + use_init_matrix, n_classes, 2), dtype='float') zoom = np.zeros((n_steps + use_init_matrix, n_classes, 2), dtype='float') mse_pos = np.zeros((n_steps + use_init_matrix, n_classes), dtype='float') mse_zoom = np.zeros((n_steps + use_init_matrix, n_classes), dtype='float') val_ars = np.zeros((n_steps, n_classes, 2), dtype='float') mse_val = np.zeros((n_steps, n_classes), dtype='float') mse_ars = np.zeros((n_steps, n_classes), dtype='float') if weighting: # average predictions per step in inverted order for j in range(0, n_steps): k = 0 predicted_labels_avg = predicted_labels[(n_steps - 1) * batch_size:(n_steps) * batch_size, :] for k in range(1, j + 1): predicted_labels_avg += predicted_labels[(n_steps - 1 - k) * batch_size:(n_steps - k) * batch_size, :] predicted_labels_avg /= k + 1 for i in range(0, batch_size): # count correct classifications per class if np.argmax(Y_cla[i, :]) == np.argmax( predicted_labels_avg[i, :]): acc_avg[j, :] = acc_avg[j, :] + Y_cla[i, :] for i in range(0, batch_size): # count class occurences hist = hist + Y_cla[i, :] # count correct classifications per class for j in range(0, 1 if headless else n_steps): if np.argmax(Y_cla[i, :]) == np.argmax( predicted_labels[i + j * batch_size, :]): acc[j, :] = acc[j, :] + Y_cla[i, :] # compute accuracy acc_mean = np.zeros(1 if headless else n_steps) for j in range(0, 1 if headless else n_steps): acc_mean[j] = np.dot(hist / batch_size, acc[j, :] / hist) hist[hist == 0] = 0.00000001 acc = np.asarray(acc * 100 / (hist), dtype='int') / 100 acc_avg = np.asarray(acc_avg * 100 / (hist), dtype='int') / 100 hist[hist < 1] = 0 # compute bb info per class and mse for j in range(0, n_steps + use_init_matrix): for i in range(0, n_classes): pos[j, i, :] = np.mean( predicted_locations[j * batch_size:(j + 1) * 
batch_size, :][Y_cla[:, i] == 1, :][:, (2, 5)], axis=0) zoom[j, i, :] = np.mean( predicted_locations[j * batch_size:(j + 1) * batch_size, :][Y_cla[:, i] == 1, :][:, (0, 4)], axis=0) mse_pos[j, i] = np.mean( np.square(Y_loc[Y_cla[:, i] == 1, :][:, (2, 5)] - predicted_locations[j * batch_size:(j + 1) * batch_size, :][Y_cla[:, i] == 1, :][:, (2, 5)])) mse_zoom[j, i] = np.mean( np.square(Y_loc[Y_cla[:, i] == 1, :][:, (0, 4)] - predicted_locations[j * batch_size:(j + 1) * batch_size, :][Y_cla[:, i] == 1, :][:, (0, 4)])) # compute mean dimensional ratings and mse if dims1 is not None: for j in range(0, n_steps): for i in range(0, n_classes): val_ars[j, i, :] = np.mean( predicted_dimensions[j * batch_size:(j + 1) * batch_size, :][Y_cla[:, i] == 1, :], axis=0) mse_val[j, i] = np.mean( np.square(Y_dim[Y_cla[:, i] == 1, :][:, 0] - predicted_dimensions[j * batch_size:(j + 1) * batch_size, :] [Y_cla[:, i] == 1, :][:, 0])) mse_ars[j, i] = np.mean( np.square(Y_dim[Y_cla[:, i] == 1, :][:, 1] - predicted_dimensions[j * batch_size:(j + 1) * batch_size, :] [Y_cla[:, i] == 1, :][:, 1])) print("Sample Class Distribution:", hist, "\n") print("Accuracy per Class:") for j in range(0, 1 if headless else n_steps): print(" Step " + str(j + 1) + ": ", acc[j, :], "= %.3f" % acc_mean[j]) if weighting: print("\nWeighted Accuracy per Class:") for j in range(0, n_steps): print( " Step " + str(n_steps - j) + " to " + str(n_steps) + ": ", acc_avg[n_steps - 1 - j, :], "= %.3f" % np.dot(hist / batch_size, acc_avg[n_steps - 1 - j, :])) if dims1 is not None: print("\nValence Error per Class:") for j in range(n_steps - show_steps, n_steps): print(" Step " + str(j + 1) + ": ", np.asarray(mse_val[j, :] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, mse_val[j, :])) print("\nAverage Valence per Class:") for j in range(n_steps - show_steps, n_steps): print(" Step " + str(j + 1) + ": ", np.asarray(val_ars[j, :, 0] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, val_ars[j, :, 0])) print("\nArousal Error per Class:") for j in range(n_steps - show_steps, n_steps): print(" Step " + str(j + 1) + ": ", np.asarray(mse_ars[j, :] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, mse_ars[j, :])) print("\nAverage Arousal per Class:") for j in range(n_steps - show_steps, n_steps): print(" Step " + str(j + 1) + ": ", np.asarray(val_ars[j, :, 1] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, val_ars[j, :, 1])) print("\nAverage Position per Class:") for j in range(0, n_steps + use_init_matrix): print(" Step " + str(j) + ": ", np.asarray(pos[j, :, 0] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, pos[j, :, 0])) print(" ", np.asarray(pos[j, :, 1] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, pos[j, :, 1])) print("\nPosition Variance: %.3f" % np.var(np.asarray(pos)), " Position SD: %.3f" % np.std(np.asarray(pos))) if False: print("\nLocation Error per Class:") for j in range(0, n_steps + use_init_matrix): print(" Step " + str(j) + ": ", np.asarray(mse_pos[j, :] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, mse_pos[j, :])) print("\nZoom Error per Class:") for j in range(0, n_steps + use_init_matrix): print(" Step " + str(j) + ": ", np.asarray(mse_zoom[j, :] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, mse_zoom[j, :])) print("\nAverage Zoom per Class:") for j in range(0, n_steps + use_init_matrix): print(" Step " + str(j) + ": ", np.asarray(zoom[j, :, 0] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, 
zoom[j, :, 0])) print(" ", np.asarray(zoom[j, :, 1] * 100, dtype='int') / 100, "= %.3f" % np.dot(hist / batch_size, zoom[j, :, 1])) print("") exit()
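# The per-class accuracy bookkeeping above can also be expressed in vectorized
# form. This is only an illustrative sketch (it assumes `Y_cla` is a one-hot
# target matrix of shape (batch_size, n_classes) and `pred` is the matching
# block of predicted class scores for a single step):
def per_class_accuracy(Y_cla, pred):
    true_idx = np.argmax(Y_cla, axis=1)
    pred_idx = np.argmax(pred, axis=1)
    correct = true_idx == pred_idx
    hist = Y_cla.sum(axis=0)  # class occurrences
    hits = np.array([correct[true_idx == c].sum()
                     for c in range(Y_cla.shape[1])])
    return hits / np.maximum(hist, 1e-8), hist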
def convert_cifar10(directory, output_file): """Converts the CIFAR-10 dataset to HDF5. Converts the CIFAR-10 dataset to an HDF5 dataset compatible with :class:`fuel.datasets.CIFAR10`. The converted dataset is saved as 'cifar10.hdf5'. It assumes the existence of the following file: * `cifar-10-python.tar.gz` Parameters ---------- directory : str Directory in which input files reside. output_file : str Where to save the converted dataset. """ h5file = h5py.File(output_file, mode='w') input_file = os.path.join(directory, DISTRIBUTION_FILE) tar_file = tarfile.open(input_file, 'r:gz') train_batches = [] for batch in range(1, 6): file = tar_file.extractfile( 'cifar-10-batches-py/data_batch_%d' % batch) try: if six.PY3: array = cPickle.load(file, encoding='latin1') else: array = cPickle.load(file) train_batches.append(array) finally: file.close() train_features = numpy.concatenate( [batch['data'].reshape(batch['data'].shape[0], 3, 32, 32) for batch in train_batches]) train_labels = numpy.concatenate( [numpy.array(batch['labels'], dtype=numpy.uint8) for batch in train_batches]) train_labels = numpy.expand_dims(train_labels, 1) file = tar_file.extractfile('cifar-10-batches-py/test_batch') try: if six.PY3: test = cPickle.load(file, encoding='latin1') else: test = cPickle.load(file) finally: file.close() test_features = test['data'].reshape(test['data'].shape[0], 3, 32, 32) test_labels = numpy.array(test['labels'], dtype=numpy.uint8) test_labels = numpy.expand_dims(test_labels, 1) data = (('train', 'features', train_features), ('train', 'targets', train_labels), ('test', 'features', test_features), ('test', 'targets', test_labels)) fill_hdf5_file(h5file, data) h5file['features'].dims[0].label = 'batch' h5file['features'].dims[1].label = 'channel' h5file['features'].dims[2].label = 'height' h5file['features'].dims[3].label = 'width' h5file['targets'].dims[0].label = 'batch' h5file['targets'].dims[1].label = 'index' h5file.flush() h5file.close()
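# A quick sanity check of the converted CIFAR-10 file (a hedged sketch;
# 'cifar10.hdf5' stands in for whatever was passed as output_file above):
with h5py.File('cifar10.hdf5', 'r') as check:
    print(check['features'].shape)  # expected (60000, 3, 32, 32): 50k train + 10k test
    print(check['targets'].shape)   # expected (60000, 1)
    print('split' in check.attrs)   # fill_hdf5_file records the split boundaries here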
def convert_data(directory, output_file, dtype=None):
    """Convert new data to a fuel HDF5 dataset.

    Parameters
    ----------
    directory : str
        Directory in which input files reside.
    output_file : str
        Where to save the converted dataset. Note that this value is
        currently overridden below: the dataset is always written to
        `new_dataset.hdf5` inside `directory`.
    dtype : str, optional
        Either 'float32', 'float64', or 'bool'. Defaults to `None`, in which
        case images will be returned in their original unsigned byte format.
        Currently unused.

    """
    output_file = os.path.join(directory, "new_dataset.hdf5")
    # print('output_file', output_file)
    # print('directory', directory)
    h5file = h5py.File(output_file, mode='w')

    train_file = os.path.join(directory, 'train_data.h5')
    train = h5py.File(train_file, mode='r')
    x_train = np.array(train["chars"])
    y_train = np.array(train["target"])

    # Balance the train set by selecting the first 18k responses from each class
    zeroes = np.transpose(np.array(np.where(y_train == 0)))
    ones = np.transpose(np.array(np.where(y_train == 1)))
    twos = np.transpose(np.array(np.where(y_train == 2)))
    threes = np.transpose(np.array(np.where(y_train == 3)))
    x_train = np.concatenate((x_train[zeroes[:18000, 0]],
                              x_train[ones[:18000, 0]],
                              x_train[twos[:18000, 0]],
                              x_train[threes[:18000, 0]]), axis=0)
    y_train = np.concatenate((y_train[zeroes[:18000, 0]],
                              y_train[ones[:18000, 0]],
                              y_train[twos[:18000, 0]],
                              y_train[threes[:18000, 0]]), axis=0)
    y_train = np.reshape(y_train, (len(y_train), 1))
    print('x_train.shape', x_train.shape)
    print('y_train.shape', y_train.shape)
    # print('zeroes', len(y_train[y_train==0]))
    # print('ones', len(y_train[y_train==1]))
    # print('twos', len(y_train[y_train==2]))
    # print('threes', len(y_train[y_train==3]))

    valid_file = os.path.join(directory, 'valid_data.h5')
    valid = h5py.File(valid_file, mode='r')
    x_valid = valid["chars"]
    y_valid = valid["target"]
    y_valid = np.reshape(y_valid, (len(y_valid), 1))

    test_file = os.path.join(directory, 'test_data.h5')
    test = h5py.File(test_file, mode='r')
    x_test = test["chars"]
    y_test = test["target"]
    y_test = np.reshape(y_test, (len(y_test), 1))

    train_features = x_train[:, :].astype('float32')
    train_targets = y_train[:].astype('uint8')
    print(train_targets.shape)

    valid_features = x_valid[:, :].astype('float32')
    valid_targets = y_valid[:].astype('uint8')
    print(valid_targets.shape)

    test_features = x_test[:, :].astype('float32')
    test_targets = y_test[:].astype('uint8')
    print(test_targets.shape)

    data = (('train', 'features', train_features),
            ('train', 'targets', train_targets),
            ('valid', 'features', valid_features),
            ('valid', 'targets', valid_targets),
            ('test', 'features', test_features),
            ('test', 'targets', test_targets))
    fill_hdf5_file(h5file, data)

    h5file['features'].dims[0].label = 'batch'
    h5file['features'].dims[1].label = 'feature'
    h5file['targets'].dims[0].label = 'batch'
    h5file['targets'].dims[1].label = 'index'

    h5file.flush()
    h5file.close()
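# A hedged sketch of consuming the converted file with fuel's stream API
# (assumes fuel is installed; 'new_dataset.hdf5' is the file written above,
# and the batch size of 128 is arbitrary):
from fuel.datasets import H5PYDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

train_set = H5PYDataset('new_dataset.hdf5', which_sets=('train',))
stream = DataStream(train_set,
                    iteration_scheme=SequentialScheme(train_set.num_examples, 128))
for batch in stream.get_epoch_iterator():
    print([part.shape for part in batch])  # one array per source, ordered as train_set.sources
    break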
def main(path, mode):

    # empty datasets
    train_features = []
    train_features_100 = []
    train_locations = []
    train_labels = []
    train_dims = []
    test_features = []
    test_features_100 = []
    test_locations = []
    test_labels = []
    test_dims = []

    # open h5 file as 'f'
    if os.path.isfile(path):
        print("Opening", path, "\n")
    else:
        print(path, "does not exist\n")
        exit()
    try:
        f = h5py.File(path, 'r')
    except Exception as e:
        print(e)
        exit()

    # access the data
    X = f['X']
    X_100 = f['X_100']
    Y = f['Y_lab']
    Y_val = f['Y_val']
    Y_ars = f['Y_ars']
    Y_loc = f['Y_loc']
    data = f['Train']

    hist = np.zeros(7)

    # change format
    print("[INFO] Start processing data...")
    for i in range(0, X.shape[0]):
        # target output
        if mode == 'theano':
            # one-hot to emotion category
            j = 0
            while Y[i, j] == 0 and j < 6:
                j += 1
            label = int(j)
        else:
            # one-hot
            label = Y[i, :]
            j = 0
            while Y[i, j] == 0 and j < 6:
                j += 1
        hist[j] = hist[j] + 1

        # select focal point (www.pyimagesearch.com/wp-content/uploads/2017/04/facial_landmarks_68markup.jpg)
        # for a first try, simply the mouth region (point 62)
        px, py = Y_loc[i, 61, :]
        # zoom, skew, x, skew, zoom, y
        location = np.array((0.28, 0, (int(px) - 50.0) / 50.0,
                             0, 0.28, (int(py) - 50.0) / 50.0),
                            ndmin=1, dtype=np.float32)

        # image and down-scaled (coarse) image
        if mode == 'theano':
            # change image dim from (100, 100, 1) to (1, 100, 100)
            image = np.array(X[i, :, :, 0], ndmin=3, dtype=np.float32)
            image_100 = np.array(X_100[i, :, :, 0], ndmin=3, dtype=np.float32)
            # image_coarse = np.array(cv2.resize(X[i,:,:,0], (12,12)), ndmin=3, dtype=np.uint8)
        else:
            # keep image dim
            image = np.array(X[i, ...], ndmin=2, dtype=np.float32)
            image_100 = np.array(X_100[i, ...], ndmin=2, dtype=np.float32)
            # image_coarse = np.array(cv2.resize(X[i, ...], (12,12)), ndmin=2, dtype=np.uint8)

        # valence and arousal
        dims = np.array((Y_val[i], Y_ars[i]), ndmin=1, dtype=np.float32)

        # append data row
        if data[i] == b'train':
            train_features.append(image)
            train_features_100.append(image_100)
            train_locations.append(location)
            train_labels.append(label)
            train_dims.append(dims)
        else:
            test_features.append(image)
            test_features_100.append(image_100)
            test_locations.append(location)
            test_labels.append(label)
            test_dims.append(dims)

        # feedback
        if (i + 1) % 1000 == 0:
            print("[INFO] Appended", i + 1, "rows of data")

    print('\n', hist, '\n')

    # save data
    save_path = '/scratch/forch/EDRAM/datasets/AffectNet_train_data_' + mode + '.hdf5'
    h5file = h5py.File(save_path, mode='w')
    data = (
        ('train', 'features', np.array(train_features, dtype=np.float32)),
        ('test', 'features', np.array(test_features, dtype=np.float32)),
        ('train', 'features_100', np.array(train_features_100, dtype=np.float32)),
        ('test', 'features_100', np.array(test_features_100, dtype=np.float32)),
        ('train', 'locations', np.array(train_locations, dtype=np.float32)),
        ('test', 'locations', np.array(test_locations, dtype=np.float32)),
        ('train', 'labels', np.array(train_labels, dtype=np.uint8)),
        ('test', 'labels', np.array(test_labels, dtype=np.uint8)),
        ('train', 'dimensions', np.array(train_dims, dtype=np.float32)),
        ('test', 'dimensions', np.array(test_dims, dtype=np.float32)),
    )
    fill_hdf5_file(h5file, data)

    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features'].dims[i].label = label
    for i, label in enumerate(('batch', 'channel', 'height', 'width')):
        h5file['features_100'].dims[i].label = label
    for i, label in enumerate(('batch', 'index')):
        h5file['locations'].dims[i].label = label
    for i, label in enumerate(('batch',)):
        h5file['labels'].dims[i].label = label
    for i, label in enumerate(('batch', 'val|ars')):
        h5file['dimensions'].dims[i].label = label

    h5file.flush()
    h5file.close()

    print("[INFO] Saved data to", save_path, "\n")
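# The 6-element location vector built in the loop above is a flattened 2x3
# affine matrix (zoom, skew, x, skew, zoom, y) with the landmark mapped from
# pixel coordinates into [-1, 1]. A small illustrative helper (the 0.28 zoom
# and the 100x100 image size are taken from the code above):
def landmark_to_affine(px, py, zoom=0.28, img_size=100.0):
    half = img_size / 2.0
    return np.array([zoom, 0.0, (float(px) - half) / half,
                     0.0, zoom, (float(py) - half) / half],
                    dtype=np.float32)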
fi = htkmfc.HTKFeat_read(fname) data = fi.getall()[:,:20] val_Mask[ind,:data.shape[0]] = 1.0 pad = maxlen - data.shape[0] data = np.vstack((data, np.zeros((pad,20), dtype='float32'))) val_Data[ind,:,:] = data return Data, Mask, np.asarray(labelz, dtype='int32'), val_Data, val_Mask, np.asarray(val_labelz, dtype='int32') Data, Msk, Targets, val_Data, val_Msk, val_tars = load_dataset() f = h5py.File('dataset.hdf5', mode='w') data = (('train', 'features', Data), ('train', 'mask', Msk), ('train', 'targets', Targets), ('valid', 'features', val_Data), ('valid', 'mask', val_Msk), ('valid', 'targets', val_tars)) fill_hdf5_file(f, data) for i, label in enumerate(('batch', 'maxlen', 'feat_dim')): f['features'].dims[i].label = label for i, label in enumerate(('batch', 'maxlen')): f['mask'].dims[i].label = label for i, label in enumerate(('batch',)): f['targets'].dims[i].label = label f.flush() f.close()
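# A hedged sketch of reading the file written above back with fuel (assumes
# fuel is installed; the source order of the returned tuple follows
# train_set.sources, and the slice request is purely illustrative):
from fuel.datasets import H5PYDataset

train_set = H5PYDataset('dataset.hdf5', which_sets=('train',),
                        sources=('features', 'mask', 'targets'))
handle = train_set.open()
features, mask, targets = train_set.get_data(handle, slice(0, 8))
print(features.shape, mask.shape, targets.shape)
train_set.close(handle)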
('valid', 'codes', codes_valid), ('test', 'features', data_test.reshape(-1, width*height)), ('test', 'mask', grps_test.reshape((-1, nr_shapes, width*height))), ('test', 'targets', targets_test), ('test', 'codes', codes_test), ('single', 'features', data_single.reshape(-1, width*height)), ('single', 'mask', grps_single.reshape((-1, nr_shapes, width*height))), ('single', 'targets', targets_single), ('single', 'codes', codes_single)) for n, m, d in split: print(n, m, d.shape) h5file = h5py.File(os.path.join(data_dir, 'shapes.h5f'), mode='w') fill_hdf5_file(h5file, split) h5file.attrs['description'] = """ Shapes Problem ============== Binary images containing 3 random shapes each. Introduced in [1] to investigate binding in deep networks. All images are of size 1 x {height} x {width}. There are {nr_train_examples} training examples and {nr_test_examples} test examples with {nr_shapes} random shapes each. There are also {nr_single_examples} examples with just a single random shape. There are three different shapes: ['square', 'up-triangle', 'down-triangle']. [1] David P. Reichert and Thomas Serre,