def test_two_sources(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert len(next(transformer.get_epoch_iterator())) == 4

def indexData(self):
    labCounts = graph_helper.getLabelCounts(
        self.G, self.trainNodes + self.validationNodes)
    trainXY, trainIDs = encode_data_VarLen(
        self.G, self.trainNodes, self.attrKey, self.maxNeighbors,
        usePrevWeights=self.usePrevWeights, useActualLabs=self.useActualLabs,
        onlyLabs=self.onlyLabs, useInputX2=self.useInputX2,
        labCounts=labCounts, dataAug=self.dataAug,
        pageRankOrder=self.pageRankOrder, usePro=self.usePro,
        lastH=self.lastHH, nodeIDs=True)
    validationXY, testIDs = encode_data_VarLen(
        self.G, self.validationNodes, self.attrKey, self.maxNeighbors,
        labCounts=labCounts, usePrevWeights=self.usePrevWeights,
        useActualLabs=self.useActualLabs, onlyLabs=self.onlyLabs,
        useInputX2=self.useInputX2, pageRankOrder=self.pageRankOrder,
        usePro=self.usePro, lastH=self.lastHH, nodeIDs=True)
    self.input_dimx1 = trainXY['x'][0].shape[1]
    if 'x2' in trainXY:
        self.input_dimx2 = trainXY['x2'].shape[1]
    dataset_train = IndexableDataset(trainXY)
    dataset_valid = IndexableDataset(validationXY)
    self.num_examples_train = dataset_train.num_examples
    self.num_examples_valid = dataset_valid.num_examples
    if self.usePro:
        transpose_stream = self.transpose_streamPro
    else:
        transpose_stream = self.transpose_stream
    self.stream_train = DataStream(dataset=dataset_train,
                                   iteration_scheme=ShuffledScheme(
                                       examples=dataset_train.num_examples,
                                       batch_size=self.batch_size))
    self.stream_train = Padding(self.stream_train, mask_sources=['x'])
    self.stream_train = Mapping(self.stream_train, transpose_stream)
    self.stream_valid = DataStream(dataset=dataset_valid,
                                   iteration_scheme=ShuffledScheme(
                                       examples=dataset_valid.num_examples,
                                       batch_size=self.batch_size))
    self.stream_valid = Padding(self.stream_valid, mask_sources=['x'])
    self.stream_valid = Mapping(self.stream_valid, transpose_stream)

def test_mask_sources(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                OrderedDict([('features', [[1], [2, 3]]),
                             ('targets', [[4, 5, 6], [7]])]))),
        ConstantScheme(2)),
        mask_sources=('features',))
    assert_equal(len(next(transformer.get_epoch_iterator())), 3)

def test_mask_dtype(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)),
        mask_dtype='uint8')
    assert_equal(
        str(next(transformer.get_epoch_iterator())[1].dtype), 'uint8')

def test_1d_sequences(self):
    stream = Batch(
        DataStream(IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
        ConstantScheme(2))
    transformer = Padding(stream)
    assert_equal(transformer.sources, ("data", "data_mask"))
    assert_equal(list(transformer.get_epoch_iterator()),
                 [(numpy.array([[1, 0], [2, 3]]),
                   numpy.array([[1, 0], [1, 1]])),
                  (numpy.array([[0, 0, 0], [4, 5, 6]]),
                   numpy.array([[0, 0, 0], [1, 1, 1]])),
                  (numpy.array([[7]]), numpy.array([[1]]))])

def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    # A: raw input features, B: phone annotations,
    # C: per-sequence (start, end) indices into B
    A = numpy.load(os.path.join(
        path, ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    B = numpy.load(os.path.join(
        path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    C = numpy.load(os.path.join(
        path, ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))
    # D: phone label sequence (column 2 of B) for each input sequence
    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    # sort within pools of sort_batch_count batches to get similar lengths
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])
    return ds, stream

def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):
    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    # Merge
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))
    # Filter -- TODO
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))
    # Map - no need
    # Batch - Sort
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] *
                       configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(configuration['batch_size']))
    # Pad
    # Note that </s> = 0; Fuel pads with 0 by default.
    masked_stream = Padding(stream)
    return masked_stream

def candidate_stream(self, n_candidates, sortmap=True):
    candidate_stream = DataStream(self.train_dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      self.train_dataset.num_examples))
    if not data.tvt:
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, self.valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    if not data.tvt:
        candidate_stream = transformers.add_destination(candidate_stream)
    if sortmap:
        candidate_stream = transformers.balanced_batch(
            candidate_stream, key='latitude',
            batch_size=n_candidates,
            batch_sort_size=self.config.batch_sort_size)
    else:
        candidate_stream = Batch(
            candidate_stream, iteration_scheme=ConstantScheme(n_candidates))
    candidate_stream = Padding(candidate_stream,
                               mask_sources=['latitude', 'longitude'])
    return candidate_stream

def setup_datastream(path, vocab_file, config):
    ds = QADataset(path, vocab_file, config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)
    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    print('sources')
    print(stream.sources)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')
    print('sources2')
    print(stream.sources)
    return ds, stream

def framewise_timit_datastream(path, which_set, batch_size, local_copy=False):
    # load frame-wise dataset
    timit_dataset = FramewiseTimit(which_set=which_set, path=path,
                                   local_copy=local_copy)
    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)
    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=batch_size,
        rng=shuffle_rng)
    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)
    # reshape data stream (data_source, shape_source)
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream,
                             iteration_scheme=iterator_scheme)
    # sort data stream by sequence length
    sort_stream = Mapping(data_stream=reshape_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))
    # padding data stream
    padded_stream = Padding(data_stream=sort_stream)
    return padded_stream

def test_value_error_on_request(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert_raises(ValueError, transformer.get_data, [0, 1])

def valid(self, req_vars):
    prefix_stream = DataStream(self.valid_dataset,
                               iteration_scheme=SequentialExampleScheme(
                                   self.valid_dataset.num_examples))
    # prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.balanced_batch(
        prefix_stream, key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)
    prefix_stream = Padding(prefix_stream,
                            mask_sources=['latitude', 'longitude'])

    candidate_stream = self.candidate_stream(self.config.valid_candidate_size)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    # stream = MultiProcessing(stream)
    return stream

def train(self, req_vars):
    valid = TaxiDataset(self.config.valid_set, 'valid.hdf5',
                        sources=('trip_id',))
    valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

    stream = TaxiDataset('train')
    stream = DataStream(stream,
                        iteration_scheme=ShuffledExampleScheme(
                            stream.num_examples))
    stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
    stream = transformers.TaxiExcludeEmptyTrips(stream)
    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.add_destination(stream)
    stream = transformers.Select(
        stream, tuple(v for v in req_vars if not v.endswith('_mask')))
    stream = transformers.balanced_batch(
        stream, key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)
    stream = Padding(stream, mask_sources=['latitude', 'longitude'])
    stream = transformers.Select(stream, req_vars)
    return stream

def test(self, req_vars):
    prefix_stream = DataStream(self.test_dataset,
                               iteration_scheme=SequentialExampleScheme(
                                   self.test_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    if not data.tvt:
        prefix_stream = transformers.taxi_remove_test_only_clients(
            prefix_stream)
    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))
    prefix_stream = Padding(prefix_stream,
                            mask_sources=['latitude', 'longitude'])

    candidate_stream = self.candidate_stream(
        self.config.test_candidate_size, False)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    # stream = MultiProcessing(stream)
    return stream

def setup_squad_ranker_datastream(path, vocab_file, config,
                                  example_count=1836975):
    ds = SQuADRankerDataset(path, vocab_file)
    it = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['question', 'answer', 'better', 'worse',
                                   'b_left', 'b_right', 'w_left', 'w_right'],
                     mask_dtype='int32')
    return ds, stream

def train(self, req_vars):
    prefix_stream = DataStream(self.train_dataset,
                               iteration_scheme=ShuffledExampleScheme(
                                   self.train_dataset.num_examples))
    if not data.tvt:
        prefix_stream = transformers.TaxiExcludeTrips(prefix_stream,
                                                      self.valid_trips_ids)
    prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
    prefix_stream = transformers.TaxiGenerateSplits(
        prefix_stream, max_splits=self.config.max_splits)
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.balanced_batch(
        prefix_stream, key='latitude',
        batch_size=self.config.batch_size,
        batch_sort_size=self.config.batch_sort_size)
    prefix_stream = Padding(prefix_stream,
                            mask_sources=['latitude', 'longitude'])

    candidate_stream = self.candidate_stream(self.config.train_candidate_size)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    # stream = MultiProcessing(stream)
    return stream

def get_stream(self, part, batch_size, seed=None, raw_text=False):
    d = self.get_dataset(part)
    print("Dataset with {} examples".format(d.num_examples))
    it = ShuffledExampleScheme(d.num_examples,
                               rng=numpy.random.RandomState(seed))
    stream = DataStream(d, iteration_scheme=it)
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    if self._retrieval:
        stream = FixedMapping(
            stream,
            functools.partial(retrieve_and_pad_snli, self._retrieval),
            # a tuple is needed here: there is a bug in Fuel, it cannot
            # concatenate a tuple and a list
            add_sources=("defs", "def_mask",
                         "sentence1_def_map", "sentence2_def_map"))

    if not raw_text:
        stream = SourcewiseMapping(stream,
                                   functools.partial(digitize, self.vocab),
                                   which_sources=('sentence1', 'sentence2'))

    # doubles the number of outputs: one mask per padded source
    stream = Padding(stream, mask_sources=('sentence1', 'sentence2'))
    return stream

def get_datastream(path, which_set, batch_size=1, norm_path=None,
                   use_ivectors=False, truncate_ivectors=False,
                   ivector_dim=100, shuffled=True):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    if shuffled:
        iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                         examples=wsj_dataset.num_examples)
    else:
        iterator_scheme = SequentialScheme(batch_size=batch_size,
                                           examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    if norm_path:
        data_mean_std = numpy.load(norm_path)
        base_stream = Normalize(data_stream=base_stream,
                                means=data_mean_std['mean'],
                                stds=data_mean_std['std'])
    if use_ivectors:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'ivectors', 'targets'])
        if truncate_ivectors:
            fs = TruncateTransformer(fs, 'ivectors', ivector_dim)
        # fs = ConcatenateTransformer(fs, ['features', 'ivectors'], 'features')
    else:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'targets'])
    return Padding(fs)

def setup_squad_datastream(path, vocab_file, config):
    ds = SQuADDataset(path, vocab_file)
    it = SQuADIterator(path)
    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<DUMMY>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer',
                                   'ans_indices', 'ans_boundaries'],
                     mask_dtype='int32')
    return ds, stream

def get_stream(self, part, batch_size=None, max_length=None, seed=None,
               remove_keys=False, add_bos_=True, remove_n_identical_keys=True):
    dataset = self.get_dataset(part, max_length)
    if self._layout == 'lambada' and part == 'train':
        stream = DataStream(dataset,
                            iteration_scheme=RandomSpanScheme(
                                dataset.num_examples, max_length, seed))
        stream = Mapping(stream, listify)
    else:
        stream = dataset.get_example_stream()

    if add_bos_:
        stream = SourcewiseMapping(stream,
                                   functools.partial(add_bos, Vocabulary.BOS),
                                   which_sources=('words',))
    if max_length is not None:
        stream = SourcewiseMapping(stream,
                                   functools.partial(cut_if_too_long,
                                                     max_length),
                                   which_sources=('words',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('words',))
    stream = SourcewiseMapping(stream, word_to_singleton_list,
                               which_sources=('keys',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('keys',))
    stream = Flatten(stream, which_sources=('keys',))

    if self._layout == 'dict':
        if remove_keys:
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'keys'])
        if remove_n_identical_keys:
            print("remove identical keys")
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'n_identical_keys'])
    if not batch_size:
        return stream

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=('words',))
    # stream = Flatten(stream, which_sources=('n_identical_keys',))
    # if self._layout == 'dict':
    #     stream = FilterSources(stream, [source for source in stream.sources
    #                                     if source != 'keys_mask'])
    #     stream = FilterSources(stream, [source for source in stream.sources
    #                                     if source != 'n_identical_keys_mask'])
    return stream

def get_test_stream(sfiles, svocab_dict):
    dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                       unk_token='<unk>', level='word', preprocess=None,
                       encoding='utf8')
    stream = Merge([dataset.get_example_stream()], ('source',))
    stream = Batch(stream, iteration_scheme=ConstantScheme(10))
    stream = Padding(stream)
    return stream

def get_feat_stream(path, which_set='test_eval92', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    print(path, which_set)
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples,
                                       batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['features'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream

def get_datastream(path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set,))
    print(path, which_set)
    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream

def replaceTestData(self, testNodes, maxNeighbors=1000, maskNames=['x']):
    if self.batchesInferences:
        batch_size = self.batch_size
    else:
        batch_size = 1
    testing, testIDs = encode_data_VarLen(self.G, testNodes, self.attrKey,
                                          maxNeighbors,
                                          useActualLabs=self.useActualLabs,
                                          useInputX2=self.useInputX2,
                                          onlyLabs=self.onlyLabs,
                                          lastH=self.lastHH, nodeIDs=True)
    dataset_test = IndexableDataset(testing)
    self.stream_test = DataStream(dataset=dataset_test,
                                  iteration_scheme=SequentialScheme(
                                      examples=dataset_test.num_examples,
                                      batch_size=batch_size))
    # add masks; has to be done per source to avoid an
    # "all dimensions must be equal" error
    # write own padding transformer, theirs sucks ...
    self.stream_test = Padding(self.stream_test, mask_sources=maskNames)
    # transpose them for rnn input
    self.stream_test = Mapping(self.stream_test, self.transpose_streamTest)
    self.num_examples_test = dataset_test.num_examples
    # replace sharedData with test_all data
    self.test_all, names = self.iterateShared(self.stream_test,
                                              makeShared=False,
                                              name="test")
    # if we are doing test in batches
    if self.batchesInferences:
        for key in self.test_all:
            totalTestBatches = len(self.test_all[key])
            if key != 'nodeID':
                for i in range(0, totalTestBatches):
                    # if test data has more batches, we add more to the
                    # shared data list; else we just reset
                    if i >= self.totalBatches:
                        newKey = key + '_myinput'
                        self.sharedData[key].append(
                            shared(self.test_all[key][i],
                                   name=self.sharedName + '_' + newKey +
                                   '_test_' + str(i)))
                    else:
                        self.sharedData[key][i].set_value(
                            self.test_all[key][i], borrow=True)
                self.sharedBatch[key].set_value(
                    self.sharedData[key][0].get_value(borrow=True),
                    borrow=True)
        self.stream_test_int = IntStream(0, totalTestBatches, 1, 'int_stream')

def setup_datastream(batch_size, **kwargs):
    ds = ToyDataset(**kwargs)
    stream = DataStream(ds,
                        iteration_scheme=SequentialExampleScheme(
                            kwargs['nb_examples']))
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input', 'output'])
    return ds, stream

def valid(self, req_vars):
    stream = TaxiStream(self.config.valid_set, 'valid.hdf5')
    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.add_destination(stream)
    stream = transformers.Select(
        stream, tuple(v for v in req_vars if not v.endswith('_mask')))
    stream = Batch(stream, iteration_scheme=ConstantScheme(1))
    stream = Padding(stream, mask_sources=['latitude', 'longitude'])
    stream = transformers.Select(stream, req_vars)
    return stream

def _liacl_data_stream(dataset, rel2index, batch_size, word2index,
                       target='negative_sampling', name="", k=3,
                       shuffle=False, neg_sample_kwargs={}):
    batches_per_epoch = int(np.ceil(dataset.num_examples / float(batch_size)))
    if shuffle:
        iteration_scheme = ShuffledScheme(dataset.num_examples, batch_size)
    else:
        iteration_scheme = SequentialScheme(dataset.num_examples, batch_size)
    data_stream = DataStream(dataset, iteration_scheme=iteration_scheme)
    data_stream = NumberizeWords(data_stream, word2index,
                                 default=word2index[UNKNOWN_TOKEN],
                                 which_sources=('head', 'tail'))
    data_stream = NumberizeWords(data_stream, rel2index,
                                 which_sources=('rel',))

    if target == "score":
        data_stream = Rename(data_stream, {'score': 'target'})
    else:
        data_stream = FilterSources(data_stream,
                                    sources=('head', 'tail', 'rel'))

    data_stream = Padding(data_stream, mask_sources=('head', 'tail'),
                          mask_dtype=np.float32)

    if target == 'negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is negative sampling')
        data_stream = NegativeSampling(data_stream, k=k)
    elif target == 'filtered_negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is filtered negative sampling')
        data_stream = FilteredNegativeSampling(data_stream, k=k,
                                               **neg_sample_kwargs)
    elif target == 'score':
        logger.info('target for data stream ' + str(name) + ' is score')
    else:
        raise NotImplementedError(
            'target %s must be one of "score" or "negative_sampling"' % target)

    data_stream = MergeSource(data_stream,
                              merge_sources=('head', 'tail', 'head_mask',
                                             'tail_mask', 'rel'),
                              merge_name='input')
    return data_stream, batches_per_epoch

def test(self, req_vars):
    stream = TaxiStream('test')
    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.taxi_remove_test_only_clients(stream)
    stream = transformers.Select(
        stream, tuple(v for v in req_vars if not v.endswith('_mask')))
    stream = Batch(stream, iteration_scheme=ConstantScheme(1))
    stream = Padding(stream, mask_sources=['latitude', 'longitude'])
    stream = transformers.Select(stream, req_vars)
    return stream

def construct_stream(dataset, rng, pool_size, maximum_frames,
                     window_features, **kwargs):
    """Construct data stream.

    Parameters
    ----------
    dataset : Dataset
        Dataset to use.
    rng : numpy.random.RandomState
        Random number generator.
    pool_size : int
        Pool size for TIMIT dataset.
    maximum_frames : int
        Maximum frames for TIMIT dataset.
    subsample : bool, optional
        Subsample features.
    pretrain_alignment : bool, optional
        Use phoneme alignment for pretraining.
    uniform_alignment : bool, optional
        Use uniform alignment for pretraining.

    """
    kwargs.setdefault('subsample', False)
    kwargs.setdefault('pretrain_alignment', False)
    kwargs.setdefault('uniform_alignment', False)
    stream = DataStream(dataset,
                        iteration_scheme=SequentialShuffledScheme(
                            dataset.num_examples, pool_size, rng))
    if kwargs['pretrain_alignment'] and kwargs['uniform_alignment']:
        stream = AddUniformAlignmentMask(stream)
    stream = Reshape('features', 'features_shapes', data_stream=stream)
    means, stds = dataset.get_normalization_factors()
    stream = Normalize(stream, means, stds)
    if window_features != 1:
        stream = WindowFeatures(stream, 'features', window_features)
    if kwargs['pretrain_alignment']:
        stream = Reshape('alignments', 'alignments_shapes', data_stream=stream)
    stream = Mapping(stream, SortMapping(key=key))
    stream = MaximumFrameCache(max_frames=maximum_frames, data_stream=stream,
                               rng=rng)
    stream = Padding(data_stream=stream,
                     mask_sources=['features', 'phonemes'])
    if kwargs['pretrain_alignment']:
        stream = AlignmentPadding(stream, 'alignments')
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0),
                                    (2, 1, 0)])
    else:
        stream = Transpose(stream, [(1, 0, 2), (1, 0), (1, 0), (1, 0)])

    stream = ForceFloatX(stream)
    if kwargs['subsample']:
        stream = Subsample(stream, 'features', 5)
        stream = Subsample(stream, 'features_mask', 5)
    return stream

def test_2d_sequences(self):
    stream = Batch(
        DataStream(
            IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])),
        ConstantScheme(2))
    it = Padding(stream).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

def test_padding():
    # 1-D sequences
    stream = Batch(
        DataStream(IterableDataset([[1], [2, 3], [], [4, 5, 6], [7]])),
        ConstantScheme(2))
    mask_stream = Padding(stream)
    assert mask_stream.sources == ("data", "data_mask")
    it = mask_stream.get_epoch_iterator()
    data, mask = next(it)
    assert (data == numpy.array([[1, 0], [2, 3]])).all()
    assert (mask == numpy.array([[1, 0], [1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[0, 0, 0], [4, 5, 6]])).all()
    assert (mask == numpy.array([[0, 0, 0], [1, 1, 1]])).all()
    data, mask = next(it)
    assert (data == numpy.array([[7]])).all()
    assert (mask == numpy.array([[1]])).all()

    # 2-D sequences
    stream2 = Batch(
        DataStream(
            IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])),
        ConstantScheme(2))
    it = Padding(stream2).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()

    # two sources
    stream3 = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert len(next(stream3.get_epoch_iterator())) == 4

batch_size = 20
frame_size = 3
k = 20
target_size = frame_size * k
hidden_size_recurrent = 400
readout_size = 6 * k + 1
lr = 3e-4

dataset = Handwriting(("train",))
data_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(dataset.num_examples, batch_size))
# data_stream = FilterSources(data_stream,
#                             sources=('features',))
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)
# data_stream = ForceFloatX(data_stream)

dataset = Handwriting(("valid",))
valid_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(dataset.num_examples, 10 * batch_size))
valid_stream = Padding(valid_stream)
valid_stream = Mapping(valid_stream, _transpose)

x_tr = next(data_stream.get_epoch_iterator())

x = tensor.tensor3("features")
x_mask = tensor.matrix("features_mask")