def test_transform_source_example(self):
    assert_equal(list(self.stream_example.get_epoch_iterator()),
                 [(numpy.ones((2, 2)),
                   numpy.array([[1, 0, 0, 0, 1, 0, 0, 1]])),
                  (numpy.ones((2, 2)),
                   numpy.array([[0, 1, 0, 1, 0, 0, 1, 0]])),
                  (numpy.ones((2, 2)),
                   numpy.array([[0, 1, 0, 0, 1, 0, 1, 0]])),
                  (numpy.ones((2, 2)),
                   numpy.array([[0, 0, 1, 1, 0, 1, 0, 0]]))])
    stream_example_invalid = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=[2, 3, 3],
        which_sources=('targets',))
    assert_raises(ValueError, list,
                  stream_example_invalid.get_epoch_iterator())
    source_example_negative = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.neg_data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        which_sources=('targets',))
    assert_raises(ValueError, list,
                  source_example_negative.get_epoch_iterator())
def test_concatenated_scheme_infers_request_type():
    assert not ConcatenatedScheme(schemes=[
        ConstantScheme(batch_size=10, times=5),
        ConstantScheme(batch_size=10, times=5)
    ]).requests_examples
    assert ConcatenatedScheme(schemes=[
        SequentialExampleScheme(examples=10),
        SequentialExampleScheme(examples=10)
    ]).requests_examples
def test(self, req_vars):
    prefix_stream = DataStream(
        self.test_dataset,
        iteration_scheme=SequentialExampleScheme(
            self.test_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    if not data.tvt:
        prefix_stream = transformers.taxi_remove_test_only_clients(
            prefix_stream)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))
    candidate_stream = self.candidate_stream(
        self.config.test_candidate_size)
    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def __init__(self, bs=64, num=82611, dataset='train', img_size=32,
             lang_N=57, **kwargs):
    self.provides_sources = ('features', 'captions', 'mask')
    super(MSCoco, self).__init__(**kwargs)
    self.num_examples = num + (num % bs)
    self.bs = bs
    self.num = num
    self.lang_N = lang_N
    self.example_iteration_scheme = SequentialExampleScheme(
        self.num_examples)
    self.index = 0
    self.done = False
    # Load pre-processed COCO images and captions from .npy files.
    self.imgs = np.load('coco/' + dataset + '-images-' + str(img_size) +
                        'x' + str(img_size) + '.npy')
    self.imgs = self.imgs.reshape((num, 3, img_size * img_size))
    self.caps = np.load('coco/' + dataset + '-captions.npy')
    self.caps = self.caps.reshape(
        (num, self.caps.shape[0] / num, self.caps.shape[-1]))
    # Zero-pad captions up to a fixed length of lang_N tokens.
    if self.caps.shape[-1] != self.lang_N:
        c = np.zeros((num, self.caps.shape[1], self.lang_N))
        c[:, :, :self.caps.shape[-1]] = self.caps
        self.caps = c
    self.images = np.zeros((bs, 3, img_size * img_size)).astype('float32')
    self.captions = np.zeros((bs, self.caps.shape[-1])).astype(int)
    self.mask = np.ones((bs, self.caps.shape[-1])).astype(int)
def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True):
    """Return an epoch iterator over `dataset` for the given split.

    Optionally includes features and/or targets (labels) and scales
    uint8 pixel values from [0, 255] down to [0.0, 1.0].
    """
    sources = []
    if include_features:
        sources.append('features')
    if include_targets:
        sources.append('targets')
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)
    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features',))
    train_stream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))
    it = train_stream.get_epoch_iterator()
    return it
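# Hedged usage sketch (not from the original source): how get_dataset_iterator
# above might be consumed. The dataset name 'mnist' and the helper name are
# assumptions made only for illustration; any Fuel-style HDF5 dataset found via
# find_in_data_path would work the same way.
def _example_consume_iterator():
    it = get_dataset_iterator('mnist', 'train', include_targets=True)
    # The example scheme yields one (features, targets) pair per iteration.
    first_features, first_targets = next(it)
    return first_features.shape, first_targets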
def __init__(self, file_or_path, which_sets, subset=None,
             load_in_memory=False, driver=None, sort_indices=True,
             **kwargs):
    if isinstance(file_or_path, h5py.File):
        self.path = file_or_path.filename
        self.external_file_handle = file_or_path
    else:
        self.path = file_or_path
        self.path = cache_file(self.path)
        self.external_file_handle = None
    which_sets_invalid_value = (
        isinstance(which_sets, six.string_types) or
        not all(isinstance(s, six.string_types) for s in which_sets))
    if which_sets_invalid_value:
        raise ValueError('`which_sets` should be an iterable of strings')
    self.which_sets = which_sets
    self.user_given_subset = subset if subset else slice(None)
    self.load_in_memory = load_in_memory
    self.driver = driver
    self.sort_indices = sort_indices
    self._parse_dataset_info()
    kwargs.setdefault('axis_labels', self.default_axis_labels)
    super(H5PYDataset, self).__init__(**kwargs)
    # The iteration scheme must be created here: accessing
    # self.num_examples before calling super(...).__init__ would crash.
    self.example_iteration_scheme = SequentialExampleScheme(
        self.num_examples)
def valid(self, req_vars):
    prefix_stream = DataStream(
        self.valid_dataset,
        iteration_scheme=SequentialExampleScheme(
            self.valid_dataset.num_examples))
    # prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.balanced_batch(
        prefix_stream,
        key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)
    prefix_stream = Padding(prefix_stream,
                            mask_sources=['latitude', 'longitude'])
    candidate_stream = self.candidate_stream(
        self.config.valid_candidate_size)
    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    # stream = MultiProcessing(stream)
    return stream
def test_flatten_examples(self):
    wrapper = Flatten(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        which_sources=('features',))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones(4), 0), (numpy.ones(4), 1)] * 2)
def __init__(self, banned, bs=64, num=10000, dataset='train', **kwargs):
    self.provides_sources = ('features', 'captions', 'mask')
    super(CaptionedMNIST, self).__init__(**kwargs)
    self.num_examples = num + (num % bs)
    self.bs = bs
    self.example_iteration_scheme = SequentialExampleScheme(
        self.num_examples)
    self.index = -1
    self.done = False
    self.banned = banned
    # Load the pickled MNIST splits.
    f = gzip.open('mnist.pkl.gz', 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    if dataset == 'train':
        self.labels = train_set[1]
        self.data = train_set[0]
    elif dataset == 'valid':
        self.labels = valid_set[1]
        self.data = valid_set[0]
    elif dataset == 'test':
        self.labels = test_set[1]
        self.data = test_set[0]
    print self.labels.shape
    self.images = np.zeros((bs, 60 * 60)).astype('float32')
    self.captions = np.zeros((bs, 12)).astype(int)
    self.mask = np.ones((bs, 12)).astype(int)
def get_stream_raw(dataset, which_set, mini_batch_size):
    data = get_data(dataset)

    # dataset is a 3D array of shape: Time X Batch X Features
    dataset = data[which_set]
    time, batch, features = dataset.shape
    nb_mini_batches = batch / mini_batch_size
    dataset = dataset[:, :nb_mini_batches * mini_batch_size, :]

    # Create the target dataset (inputs shifted by one time step)
    targets_dataset = dataset[1:, :, :]

    # Cut the dataset into several minibatches
    # dataset is now 4D (nb_mini_batches X Time X mini_batch_size X Features)
    dataset = numpy.swapaxes(dataset, 0, 1)
    targets_dataset = numpy.swapaxes(targets_dataset, 0, 1)
    dataset = numpy.reshape(dataset, (nb_mini_batches, mini_batch_size,
                                      time, features))
    targets_dataset = numpy.reshape(
        targets_dataset,
        (nb_mini_batches, mini_batch_size, time - 1, features))
    dataset = numpy.swapaxes(dataset, 1, 2)
    targets_dataset = numpy.swapaxes(targets_dataset, 1, 2)

    # Create fuel dataset
    dataset = IndexableDataset({'features': dataset,
                                'targets': targets_dataset})
    stream = DataStream(
        dataset, iteration_scheme=SequentialExampleScheme(nb_mini_batches))
    return stream
def test_one_hot_examples_invalid_inputs(self):
    wrapper = OneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=2, which_sources=('targets',))
    assert_raises(ValueError, list, wrapper.get_epoch_iterator())
def test(self, req_vars):
    prefix_stream = DataStream(
        self.test_dataset,
        iteration_scheme=SequentialExampleScheme(
            self.test_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    if not data.tvt:
        prefix_stream = transformers.taxi_remove_test_only_clients(
            prefix_stream)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))
    prefix_stream = Padding(prefix_stream,
                            mask_sources=['latitude', 'longitude'])
    candidate_stream = self.candidate_stream(
        self.config.test_candidate_size, False)
    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    # stream = MultiProcessing(stream)
    return stream
def test_progressbar_iter_per_epoch_indices():
    iter_per_epoch = 100
    progress_bar = ProgressBar()
    main_loop = setup_mainloop(
        None, iteration_scheme=SequentialExampleScheme(iter_per_epoch))
    progress_bar.main_loop = main_loop
    assert progress_bar.get_iter_per_epoch() == iter_per_epoch
def get_dev_stream(valid_file, **kwargs):
    valid_data = cPickle.load(open(valid_file))
    images = [example[0] for example in valid_data]
    targets = [example[1] for example in valid_data]
    dataset = IndexableDataset(
        OrderedDict([('input', images), ('output', targets)]))
    return DataStream(dataset,
                      iteration_scheme=SequentialExampleScheme(len(images)))
def common_setup(self):
    ex_scheme = SequentialExampleScheme(self.dataset.num_examples)
    self.example_stream = DataStream(self.dataset,
                                     iteration_scheme=ex_scheme)
    self.batch_size = 2
    scheme = ShuffledScheme(self.dataset.num_examples,
                            batch_size=self.batch_size)
    self.batch_stream = DataStream(self.dataset, iteration_scheme=scheme)
def test_axis_labels_on_flatten_examples(self):
    wrapper = Flatten(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4),
                   axis_labels={'features': ('batch', 'width', 'height'),
                                'targets': ('batch', 'index')}),
        which_sources=('features',))
    assert_equal(wrapper.axis_labels, {'features': ('feature',),
                                       'targets': ('index',)})
def __init__(self, which_set, filename='data.hdf5', iteration_scheme=None,
             **kwargs):
    dataset = TaxiDataset(which_set, filename, **kwargs)
    if iteration_scheme is None:
        iteration_scheme = SequentialExampleScheme(dataset.num_examples)
    super(TaxiStream, self).__init__(dataset,
                                     iteration_scheme=iteration_scheme)
def setup_datastream(batch_size, **kwargs):
    ds = ToyDataset(**kwargs)
    stream = DataStream(ds, iteration_scheme=SequentialExampleScheme(
        kwargs['nb_examples']))
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input', 'output'])
    return ds, stream
def test_one_hot_examples(self):
    wrapper = OneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=4, which_sources=('targets',))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones((2, 2)), numpy.array([[1, 0, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 0, 1]]))])
def test_ignore_groups(self):
    stream_example = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        ignore_groups=[0, 2],
        which_sources=('targets',))
    assert_equal(list(stream_example.get_epoch_iterator()),
                 [(numpy.ones((2, 2)), numpy.array([[0, 1]])),
                  (numpy.ones((2, 2)), numpy.array([[1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1]])),
                  (numpy.ones((2, 2)), numpy.array([[1, 0]]))])
    stream_example2 = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        ignore_groups=[1, 2],
        which_sources=('targets',))
    assert_equal(list(stream_example2.get_epoch_iterator()),
                 [(numpy.ones((2, 2)), numpy.array([[1, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 1]]))])
    stream_batch = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialScheme(4, 2)),
        num_classes=self.num_classes,
        ignore_groups=[0, 2],
        which_sources=('targets',))
    assert_equal(list(stream_batch.get_epoch_iterator()),
                 [(numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]])),
                  (numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]]))])
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(dataset, iteration_scheme=iteration_scheme)
    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) +
                           tuple(add_sources))
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(self.eos_label))
    if self.add_bos:
        stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                           times=self.add_bos))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing, log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def setUp(self):
    self.data = OrderedDict([
        ('features', numpy.ones((4, 2, 2))),
        ('targets', numpy.array([[0, 1, 2], [1, 0, 1],
                                 [1, 1, 1], [2, 0, 0]]))])
    self.neg_data = OrderedDict([
        ('features', numpy.ones((4, 2, 2))),
        ('targets', numpy.array([[0, -1, 2], [1, 0, -3],
                                 [1, 1, 1], [2, 0, 0]]))])
    self.num_classes = (3, 2, 3)
    self.stream_example = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        which_sources=('targets',))
def get_dataset_iterator(dataset, split, include_features=True,
                         include_targets=False, unit_scale=True,
                         label_transforms=False, return_length=False):
    """Return an epoch iterator over `dataset` for the given split.

    Optionally includes features and/or targets (labels), scales uint8
    pixel values from [0, 255] down to [0.0, 1.0], and applies label
    transformations.
    """
    sources = []
    if include_features:
        sources.append('features')
    if include_targets:
        sources.append('targets')
    if split == "all":
        splits = ('train', 'valid', 'test')
    elif split == "nontrain":
        splits = ('valid', 'test')
    else:
        splits = (split,)
    dataset_fname = find_in_data_path("{}.hdf5".format(dataset))
    h5_dataset = H5PYDataset(dataset_fname, which_sets=splits,
                             sources=sources)
    if unit_scale:
        h5_dataset.default_transformers = uint8_pixels_to_floatX(
            ('features',))
    datastream = DataStream.default_stream(
        dataset=h5_dataset,
        iteration_scheme=SequentialExampleScheme(h5_dataset.num_examples))
    if label_transforms:
        # TODO: maybe refactor this common bit with get_custom_streams below
        datastream = AddLabelUncertainty(datastream, chance=0,
                                         which_sources=('targets',))
        datastream = RandomLabelStrip(datastream, chance=0,
                                      which_sources=('targets',))
        # HACK: allow variable stretch
        datastream = StretchLabels(datastream, length=128,
                                   which_sources=('targets',))
    it = datastream.get_epoch_iterator()
    if return_length:
        return it, h5_dataset.num_examples
    else:
        return it
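# Hedged usage sketch (illustrative only): the return_length flag added in this
# variant also returns the number of examples, which is handy for sizing a
# progress bar or preallocating buffers. The dataset name 'mnist' and the
# helper name below are assumptions, not part of the original source.
def _example_iterate_with_length():
    it, n_examples = get_dataset_iterator('mnist', 'valid',
                                          include_targets=True,
                                          label_transforms=True,
                                          return_length=True)
    for i, (features, targets) in enumerate(it):
        pass  # process one example at a time; i runs from 0 to n_examples - 1
    return n_examples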
def prep_dataset(dataset):
    # Truncate so the data divides evenly into (seq_len * batch_size) chunks,
    # then reshape to (chunks, batch_size, seq_len).
    dataset = dataset[:(len(dataset) - (len(dataset) %
                                        (seq_len * batch_size)))]
    dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

    stream = DataStream(
        IndexableDataset(indexables=OrderedDict([('data', dataset)])),
        iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
    stream = Transpose(stream, [(1, 0)])
    stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                               drop_prob_igates, layer_size, num_layers,
                               False, stoch_depth, share_mask,
                               gaussian_drop, alphabetsize)
    stream.sources = ('data',) * 3 + stream.sources + (
        'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
    return (stream,)
def stream_handwriting(which_sets, batch_size, seq_size, num_letters,
                       sorting_mult=20):
    assert sorting_mult > 0

    dataset = Handwriting(which_sets)
    sorting_size = batch_size * sorting_mult
    num_examples = sorting_size * (dataset.num_examples / sorting_size)

    if which_sets == ('train',):
        print "Random order."
        scheme = ShuffledExampleScheme(num_examples)
    else:
        print "Sequential order."
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    # Sort by length of the data sequence.
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))

    data_stream = Padding(data_stream)
    data_stream = SourceMapping(
        data_stream, _transpose, which_sources=('features', 'features_mask'))
    data_stream = SegmentSequence(
        data_stream,
        seq_size=seq_size + 1,
        share_value=True,
        return_last=True,
        which_sources=('features', 'features_mask'),
        add_flag=True)
    return data_stream
def _construct_sequential_stream(self, dataset, for_type='train'):
    '''Construct a sequential stream from an IndexableDataset object.

    Subclasses should add transformations on the stream, e.g.,
    1. Sort samples by size
    2. Batch the dataset
    3. Add masks to the samples

    :param dataset: fuel.datasets.IndexableDataset
        Constructed by the self._construct_dataset method.
    :return: fuel.streams.DataStream
        A sequential stream (SequentialExampleScheme) to which basic
        transformations can be applied.
    '''
    it = SequentialExampleScheme(dataset.num_examples)
    stream = DataStream(dataset, iteration_scheme=it)
    # # Batch examples
    # stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
    # # Add mask on inputs
    # for source in self.need_mask_sources.iteritems():
    #     stream = Padding(stream, mask_sources=[source[0]],
    #                      mask_dtype=source[1])
    return stream
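# Hedged sketch (an assumption, not part of the original class): what the
# batching and masking transformations described in the docstring above might
# look like if a subclass enabled them, using Fuel's Batch and Padding
# transformers. The helper name _add_batch_and_mask is hypothetical.
def _add_batch_and_mask(self, stream):
    from fuel.schemes import ConstantScheme
    from fuel.transformers import Batch, Padding
    # Group single examples into fixed-size batches.
    stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
    # Pad variable-length sources and emit a mask for each of them.
    for source, dtype in self.need_mask_sources.iteritems():
        stream = Padding(stream, mask_sources=[source], mask_dtype=dtype)
    return stream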
def __init__(self, indexables, start=None, stop=None, **kwargs):
    if isinstance(indexables, dict):
        self.provides_sources = tuple(indexables.keys())
    else:
        self.provides_sources = ('data',)
    super(IndexableDataset, self).__init__(**kwargs)
    if isinstance(indexables, dict):
        self.indexables = [indexables[source][start:stop]
                           for source in self.sources]
        if not all(len(indexable) == len(self.indexables[0])
                   for indexable in self.indexables):
            raise ValueError("sources have different lengths")
    else:
        self.indexables = [indexables]

    self.example_iteration_scheme = SequentialExampleScheme(
        self.num_examples)

    self.start = start
    self.stop = stop
    self.subset = Subset(slice(start, stop), self.num_examples)
def get_stream_char(dataset, which_set, time_length, mini_batch_size,
                    total_train_chars=None):
    data = get_data(dataset)

    # dataset is one long string containing the whole sequence of indexes
    dataset = data[which_set]
    if total_train_chars is None:
        total_train_chars = dataset.shape[0]

    nb_mini_batches = total_train_chars / (mini_batch_size * time_length)
    total_train_chars = nb_mini_batches * mini_batch_size * time_length

    dataset = dataset[:total_train_chars]
    dataset = dataset.reshape(mini_batch_size,
                              total_train_chars / mini_batch_size)
    dataset = dataset.T

    targets_dataset = dataset[1:, :]
    targets_dataset = numpy.concatenate(
        (targets_dataset,
         numpy.zeros((1, mini_batch_size)).astype(numpy.int64)), axis=0)

    dataset = dataset.reshape(nb_mini_batches, time_length, mini_batch_size)
    targets_dataset = targets_dataset.reshape(nb_mini_batches, time_length,
                                              mini_batch_size)

    dataset = IndexableDataset({'features': dataset,
                                'targets': targets_dataset})
    stream = DataStream(
        dataset, iteration_scheme=SequentialExampleScheme(nb_mini_batches))
    return stream
def get_stream_char(dataset, which_set, time_length, mini_batch_size,
                    total_train_chars=None):
    data = get_data(dataset)
    dataset = data[which_set]
    if total_train_chars is None:
        total_train_chars = dataset.shape[0]
    nb_mini_batches = total_train_chars / (mini_batch_size * time_length)
    total_train_chars = nb_mini_batches * mini_batch_size * time_length
    dataset = dataset[:total_train_chars]
    dataset = dataset.reshape(mini_batch_size,
                              total_train_chars / mini_batch_size)
    dataset = dataset.T
    targets_dataset = dataset[1:, :]
    targets_dataset = numpy.concatenate(
        (targets_dataset,
         numpy.zeros((1, mini_batch_size)).astype(numpy.int64)), axis=0)
    dataset = dataset.reshape(
        total_train_chars / (mini_batch_size * time_length),
        time_length, mini_batch_size)
    targets_dataset = targets_dataset.reshape(
        total_train_chars / (mini_batch_size * time_length),
        time_length, mini_batch_size)
    # print dataset.shape
    # print targets_dataset.shape
    dataset = IndexableDataset({'features': dataset,
                                'targets': targets_dataset})
    stream = DataStream(
        dataset,
        iteration_scheme=SequentialExampleScheme(nb_mini_batches))
    # stream = MakeRecurrent(time_length, stream)
    return stream, total_train_chars
def valid(self, req_vars):
    valid_dataset = TaxiDataset(self.config.valid_set, 'valid.hdf5')
    train_dataset = TaxiDataset('train')
    valid_trips_ids = valid_dataset.get_data(
        None, slice(0, valid_dataset.num_examples))[
            valid_dataset.sources.index('trip_id')]

    prefix_stream = DataStream(
        valid_dataset,
        iteration_scheme=SequentialExampleScheme(
            valid_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))

    candidate_stream = DataStream(
        train_dataset,
        iteration_scheme=ShuffledExampleScheme(train_dataset.num_examples))
    candidate_stream = transformers.TaxiExcludeTrips(candidate_stream,
                                                     valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    candidate_stream = transformers.taxi_add_first_last_len(
        candidate_stream, self.config.n_begin_end_pts)
    candidate_stream = Batch(candidate_stream,
                             iteration_scheme=ConstantScheme(
                                 self.config.valid_candidate_size))

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream