def combine_datastreams(ds_labeled, ds_unlabeled):
    """Merge a labeled and an unlabeled stream into one semi-supervised stream.

    Either argument may be ``None``, in which case the other stream is
    returned on its own (after its sources have been renamed).
    """
    if ds_labeled is not None:
        # Tag labeled sources so they cannot collide with unlabeled ones.
        rename_map = {'features': 'features_labeled',
                      'targets': 'targets_labeled'}
        if 'mask' in ds_labeled.sources:
            rename_map['mask'] = 'masks_labeled'
        ds_labeled = Rename(ds_labeled, rename_map)
    if ds_unlabeled is not None:
        # Hide the labels of the unlabeled stream and tag its sources.
        kept_sources = list(ds_unlabeled.sources)
        kept_sources.remove('targets')
        rename_map = {'features': 'features_unlabeled'}
        if 'mask' in ds_unlabeled.sources:
            rename_map['mask'] = 'masks_unlabeled'
        ds_unlabeled = Rename(FilterSources(ds_unlabeled, kept_sources),
                              names=rename_map)
    if ds_labeled is None:
        return ds_unlabeled
    if ds_unlabeled is None:
        return ds_labeled
    return SemiDataStream(data_stream_labeled=ds_labeled,
                          data_stream_unlabeled=ds_unlabeled)
def setUp(self):
    """Build a tiny two-source dataset and a Rename transformer over it."""
    dataset = IndexableDataset(
        OrderedDict([('X', numpy.ones((4, 2, 2))),
                     ('y', numpy.array([0, 1, 0, 1]))]),
        axis_labels={'X': ('batch', 'width', 'height'),
                     'y': ('batch',)})
    self.stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(4, 2))
    self.transformer = Rename(self.stream,
                              {'X': 'features', 'y': 'targets'})
def test_rename():
    """Rename should relabel sources, keep data intact and reject bad keys."""
    dataset = IndexableDataset(
        OrderedDict([('X', numpy.ones((4, 2, 2))),
                     ('y', numpy.array([0, 1, 0, 1]))]))
    stream = DataStream(dataset, iteration_scheme=SequentialScheme(4, 2))
    transformer = Rename(stream, {'X': 'features', 'y': 'targets'})
    # Source names are replaced according to the mapping.
    assert_equal(transformer.sources, ('features', 'targets'))
    # The underlying batches pass through untouched.
    expected_batch = (numpy.ones((2, 2, 2)), numpy.array([0, 1]))
    assert_equal(list(transformer.get_epoch_iterator()),
                 [expected_batch, expected_batch])
    # get_data with an explicit request is rejected.
    assert_raises(ValueError, transformer.get_data, [0, 1])
    # Renaming a source that does not exist is an error.
    assert_raises(KeyError, Rename, stream, {'Z': 'features'})
def test_rename():
    """Check source renaming, data passthrough and error handling."""
    data = OrderedDict()
    data["X"] = numpy.ones((4, 2, 2))
    data["y"] = numpy.array([0, 1, 0, 1])
    stream = DataStream(IndexableDataset(data),
                        iteration_scheme=SequentialScheme(4, 2))
    transformer = Rename(stream, {"X": "features", "y": "targets"})
    assert_equal(transformer.sources, ("features", "targets"))
    # Two batches of two examples each, values unchanged.
    batches = list(transformer.get_epoch_iterator())
    assert_equal(batches,
                 [(numpy.ones((2, 2, 2)), numpy.array([0, 1]))] * 2)
    # Explicit requests are not supported by this transformer.
    assert_raises(ValueError, transformer.get_data, [0, 1])
    # Unknown source names in the mapping are rejected.
    assert_raises(KeyError, Rename, stream, {"Z": "features"})
def test_rename():
    """Rename relabels 'X'/'y' as 'features'/'targets' without touching data."""
    stream = DataStream(
        IndexableDataset(OrderedDict([
            ('X', numpy.ones((4, 2, 2))),
            ('y', numpy.array([0, 1, 0, 1])),
        ])),
        iteration_scheme=SequentialScheme(4, 2))
    renamed = Rename(stream, {'X': 'features', 'y': 'targets'})
    assert_equal(renamed.sources, ('features', 'targets'))
    epoch = list(renamed.get_epoch_iterator())
    expected = [(numpy.ones((2, 2, 2)), numpy.array([0, 1])),
                (numpy.ones((2, 2, 2)), numpy.array([0, 1]))]
    assert_equal(epoch, expected)
    # Request-driven access is rejected; unknown mapping keys raise.
    assert_raises(ValueError, renamed.get_data, [0, 1])
    assert_raises(KeyError, Rename, stream, {'Z': 'features'})
class TestRename(object):
    """Tests for the Rename transformer (axis-label variant)."""

    def setUp(self):
        dataset = IndexableDataset(
            OrderedDict([('X', numpy.ones((4, 2, 2))),
                         ('y', numpy.array([0, 1, 0, 1]))]),
            axis_labels={'X': ('batch', 'width', 'height'),
                         'y': ('batch',)})
        self.stream = DataStream(dataset,
                                 iteration_scheme=SequentialScheme(4, 2))
        self.transformer = Rename(self.stream,
                                  {'X': 'features', 'y': 'targets'})

    def test_renames_sources(self):
        assert_equal(self.transformer.sources, ('features', 'targets'))

    def test_leaves_data_unchanged(self):
        batch = (numpy.ones((2, 2, 2)), numpy.array([0, 1]))
        assert_equal(list(self.transformer.get_epoch_iterator()),
                     [batch, batch])

    def test_raises_error_on_nonexistent_source_name(self):
        assert_raises(KeyError, Rename, self.stream, {'Z': 'features'})

    def test_renames_axis_labels(self):
        expected = {'features': ('batch', 'width', 'height'),
                    'targets': ('batch',)}
        assert_equal(self.transformer.axis_labels, expected)
class TestRename(object):
    """Tests for the Rename transformer, including the on_non_existent modes."""

    def setUp(self):
        dataset = IndexableDataset(
            OrderedDict([('X', numpy.ones((4, 2, 2))),
                         ('y', numpy.array([0, 1, 0, 1]))]),
            axis_labels={'X': ('batch', 'width', 'height'),
                         'y': ('batch',)})
        self.stream = DataStream(dataset,
                                 iteration_scheme=SequentialScheme(4, 2))
        self.transformer = Rename(self.stream,
                                  {'X': 'features', 'y': 'targets'})

    def test_renames_sources(self):
        assert_equal(self.transformer.sources, ('features', 'targets'))

    def test_leaves_data_unchanged(self):
        batch = (numpy.ones((2, 2, 2)), numpy.array([0, 1]))
        assert_equal(list(self.transformer.get_epoch_iterator()),
                     [batch, batch])

    def test_raises_error_on_nonexistent_source_name(self):
        assert_raises(KeyError, Rename, self.stream, {'Z': 'features'})

    def test_raises_on_invalid_kwargs(self):
        # Only recognised on_non_existent modes are accepted.
        assert_raises(ValueError, Rename, self.stream,
                      {'X': 'features'}, on_non_existent='foo')

    def test_name_clash(self):
        # Renaming onto an existing, un-renamed source is an error.
        assert_raises(KeyError, Rename, self.stream, {'X': 'y'})

    def test_name_swap(self):
        swapped = Rename(self.stream, {'X': 'y', 'y': 'X'},
                         on_non_existent='ignore')
        assert_equal(swapped.sources, ('y', 'X'))

    def test_raises_on_not_one_to_one(self):
        assert_raises(KeyError, Rename, self.stream,
                      {'X': 'features', 'y': 'features'})

    def test_intentionally_ignore_missing(self):
        renamed = Rename(self.stream,
                         {'X': 'features', 'y': 'targets',
                          'Z': 'fudgesicle'},
                         on_non_existent='ignore')
        assert_equal(renamed.sources, ('features', 'targets'))

    def test_not_one_to_one_ok_if_not_a_source_in_data_stream(self):
        renamed = Rename(self.stream,
                         {'X': 'features', 'y': 'targets', 'Z': 'targets'},
                         on_non_existent='ignore')
        assert_equal(renamed.sources, ('features', 'targets'))

    def test_renames_axis_labels(self):
        assert_equal(self.transformer.axis_labels,
                     {'features': ('batch', 'width', 'height'),
                      'targets': ('batch',)})
def test_not_really_a_name_clash(self):
    """Regression test: renaming a non-existent source must not clash.

    With ``on_non_existent='ignore'``, renaming a source that does not
    exist cannot create a name clash, so no ``KeyError`` may be raised.
    """
    try:
        Rename(self.stream, {'foobar': 'y'}, on_non_existent='ignore')
    except KeyError:
        # Was ``assert False``, which is silently stripped under
        # ``python -O`` and would let the regression pass unnoticed.
        raise AssertionError(
            "Rename raised KeyError for a non-existent source despite "
            "on_non_existent='ignore'")
def test_not_one_to_one_ok_if_not_a_source_in_data_stream(self):
    """A duplicate target name is fine when its origin is not a real source."""
    mapping = {'X': 'features', 'y': 'targets', 'Z': 'targets'}
    renamed = Rename(self.stream, mapping, on_non_existent='ignore')
    assert_equal(renamed.sources, ('features', 'targets'))
def test_intentionally_ignore_missing(self):
    """Entries for non-existent sources are dropped with on_non_existent='ignore'."""
    mapping = {'X': 'features', 'y': 'targets', 'Z': 'fudgesicle'}
    renamed = Rename(self.stream, mapping, on_non_existent='ignore')
    assert_equal(renamed.sources, ('features', 'targets'))
def stream_loader(self, split, split_args): # This is technically not respecting in the interface as it also uses imported functions. # However, the final call to fuel.datastreams is defined here and the piping to the imported # functions is intricate, so it is not too farfetched to use this method. if not split_args.has_key('ssl'): print "INFO: No ssl specified for split {}, defaulting to regular loading way".format( split) return self.load_stream_external_func(split, split_args) dataset = self.config['dataset'] split_args = dictify_type(split_args, dict) ssl_args = split_args.pop('ssl') print_epoch_done = ssl_args.pop('print_epoch_done', False) def popargs(args): args.pop('split', None) # it should not exist anyway but just to make sure try: batch_size = args.pop('batch_size') except KeyError: raise KeyError( "No batch size could be found for stream {} and \ split {}".format(dataset, split)) return batch_size ssl_kwargs = [ 'normalization', 'load_in_memory', 'test', 'nb_class', 'examples_per_class', 'examples_start' ] kwargs = [ 'batch_size', 'sources', 'normalization', 'load_in_memory', 'test' ] ssl_args = self.merge_config_with_priority_args(ssl_args, ssl_kwargs) ssl_args.update({'normalization': '01'}) split_args = self.merge_config_with_priority_args(split_args, kwargs) batch_size = popargs(split_args) ssl_batch_size = popargs(ssl_args) split = assert_iterable_return_iterable(split) split_args.update({'split': split}) ssl_args.update({'split': split}) stream = create_stream(dataset, batch_size, **split_args) ssl_stream = create_ssl_stream(dataset, ssl_batch_size, **ssl_args) name_dict = {'features': 'ssl_features', 'targets': 'ssl_targets'} ssl_stream = Rename(ssl_stream, name_dict) streams = [stream, ssl_stream] sources = flatten([s.sources for s in streams]) rval = MultipleStreams(streams, sources, print_other_streams_epoch_done=print_epoch_done) return rval
def _liacl_data_stream(dataset, rel2index, batch_size, word2index,
                       target='negative_sampling', name="", k=3,
                       shuffle=False, neg_sample_kwargs=None):
    """Build the LiACL data stream pipeline.

    Numberizes head/tail/rel sources, pads them, and attaches either a
    score target or (filtered) negative sampling, then merges the inputs
    into a single 'input' source.

    Returns a ``(data_stream, batches_per_epoch)`` tuple.
    """
    # FIX: avoid a mutable default argument.
    if neg_sample_kwargs is None:
        neg_sample_kwargs = {}
    batches_per_epoch = int(np.ceil(dataset.num_examples / float(batch_size)))
    if shuffle:
        iteration_scheme = ShuffledScheme(dataset.num_examples, batch_size)
    else:
        iteration_scheme = SequentialScheme(dataset.num_examples, batch_size)
    data_stream = DataStream(dataset, iteration_scheme=iteration_scheme)
    data_stream = NumberizeWords(data_stream, word2index,
                                 default=word2index[UNKNOWN_TOKEN],
                                 which_sources=('head', 'tail'))
    # FIX: ('rel') is just the string 'rel'; a one-element tuple needs a
    # trailing comma.
    data_stream = NumberizeWords(data_stream, rel2index,
                                 which_sources=('rel',))
    if target == "score":
        data_stream = Rename(data_stream, {'score': 'target'})
    else:
        data_stream = FilterSources(data_stream,
                                    sources=('head', 'tail', 'rel'))
    # FIX: ('head, tail') was one string 'head, tail'; the intent is the
    # two sources 'head' and 'tail'.
    data_stream = Padding(data_stream, mask_sources=('head', 'tail'),
                          mask_dtype=np.float32)
    if target == 'negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is negative sampling')
        data_stream = NegativeSampling(data_stream, k=k)
    elif target == 'filtered_negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is filtered negative sampling')
        data_stream = FilteredNegativeSampling(data_stream, k=k,
                                               **neg_sample_kwargs)
    elif target == 'score':
        logger.info('target for data stream ' + str(name) + ' is score')
    else:
        # FIX: the original message omitted 'filtered_negative_sampling'
        # and passed a tuple of fragments instead of one string.
        raise NotImplementedError(
            'target {!r} must be one of "score", "negative_sampling" or '
            '"filtered_negative_sampling"'.format(target))
    data_stream = MergeSource(data_stream,
                              merge_sources=('head', 'tail', 'head_mask',
                                             'tail_mask', 'rel'),
                              merge_name='input')
    return data_stream, batches_per_epoch
def pair_data_stream(dataset, batch_size):
    """Return a merged stream yielding two copies of the dataset's sources.

    Each copy's sources are suffixed with ``_1`` / ``_2`` so the merged
    stream exposes both without name collisions.
    """
    def _renamed_copy(index):
        suffix_map = {source: '{}_{}'.format(source, index)
                      for source in dataset.sources}
        return Rename(_data_stream(dataset=dataset, batch_size=batch_size),
                      names=suffix_map)

    first = _renamed_copy(1)
    second = _renamed_copy(2)
    merged = Merge(data_streams=[first, second],
                   sources=first.sources + second.sources)
    # Advance the first copy once so the two copies are not in the same
    # order (as in the original implementation).
    first.get_epoch_iterator()
    return merged
def get_one_stream(self, part, lang=None, batches=True, shuffle=True,
                   add_sources=(), num_examples=None, rng=None, seed=None,
                   num_result=None, soften_distributions=None,
                   only_stream=False):
    """Build a data stream over one language of the dataset part ``part``.

    Applies, in order: example scheme selection, optional label
    softening, binary-mask conversion, optional EOS/BOS label insertion,
    length filtering, sort-by-length bucketing, normalization, float
    casting and source renaming, and finally optional batching.

    Returns a ``(stream, num_examples)`` tuple.

    NOTE(review): ``seed`` is accepted but never read in this body —
    confirm whether it is dead or consumed by an override.
    """
    assert lang in self.langs
    dataset = self.get_dataset(part, lang, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples
    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)
    if num_result is None:
        num_result = num_examples
    # Non-primary languages (unless only_stream) resample examples
    # randomly, overriding the scheme chosen above.
    if lang != self.langs[0] and not only_stream:
        iteration_scheme = RandomExampleScheme(num_examples,
                                               num_result=num_result,
                                               rng=rng)
    stream = DataStream(
        dataset, iteration_scheme=iteration_scheme)
    if soften_distributions:
        stream = Mapping(stream, SoftenResult(self.default_sources,
                                              soften_distributions))
    # Convert every binary-convertible source present into a mask.
    for bconv in self._binary_convertable_data:
        if bconv in self.default_sources:
            stream = Mapping(stream,
                             ConvertToMask(self.default_sources,
                                           bconv,
                                           self.num_features(bconv)))
    # Append EOS and/or prepend BOS labels to the labels source.
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(
            self.eos_label,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = Mapping(stream, _AddLabel(
            self.bos_label, append=False, times=self.add_bos,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.max_length:
        stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        # Gather sort_k_batches worth of examples, sort them by length,
        # and unpack again so nearby examples have similar lengths.
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            self.batch_size * self.sort_k_batches))
        # Hardcode 0 for source on which to sort.  This will be good, as
        # most source lengths are correlated and, furthermore, the
        # labels will typically be the last source, thus in a
        # single-input case this sorts on input lengths.
        stream = Mapping(stream, SortMapping(_Length(
            index=0)))
        stream = Unpack(stream)
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    # Rename dataset-specific sources back to canonical names
    # (inverse of sources_map), skipping ones that are absent.
    stream = Rename(stream,
                    names=dict_subset({v: k for (k, v)
                                       in self.sources_map.items()},
                                      stream.sources, must_have=False))
    if not batches:
        return stream, num_examples
    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    # Mark the stream as batch-producing for downstream consumers.
    stream._produces_examples = False
    return stream, num_examples
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    """Build a data stream over the dataset part ``part``.

    Applies, in order: example scheme selection, optional EOS/BOS label
    insertion, length filtering, sort-by-length bucketing, normalization,
    float casting, source renaming and — when ``batches`` — batching,
    padding, axis switching and C-contiguity enforcement.

    NOTE(review): ``seed`` is accepted but never read in this body —
    confirm whether it is dead or consumed by an override.
    """
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples
    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)
    stream = DataStream(
        dataset, iteration_scheme=iteration_scheme)
    # Append EOS and/or prepend BOS labels to the labels source.
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(
            self.eos_label,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.add_bos:
        if self.bos_label is None:
            raise Exception('No bos label given')
        stream = Mapping(stream, _AddLabel(
            self.bos_label, append=False, times=self.add_bos,
            index=stream.sources.index(self.sources_map['labels'])))
    if self.max_length:
        stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        # Gather sort_k_batches worth of examples, sort them by length,
        # and unpack again so nearby examples have similar lengths.
        stream = Batch(stream, iteration_scheme=ConstantScheme(
            self.batch_size * self.sort_k_batches))
        # Hardcode 0 for source on which to sort.  This will be good, as
        # most source lengths are correlated and, furthermore, the
        # labels will typically be the last source, thus in a
        # single-input case this sorts on input lengths.
        stream = Mapping(stream, SortMapping(_Length(
            index=0)))
        stream = Unpack(stream)
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    # Rename dataset-specific sources back to canonical names
    # (inverse of sources_map), skipping ones that are absent.
    stream = Rename(stream,
                    names=dict_subset({v: k for (k, v)
                                       in self.sources_map.items()},
                                      stream.sources, must_have=False))
    if not batches:
        return stream
    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream = Padding(stream)
    # Put the time axis first (batch-major -> time-major, presumably —
    # confirm against switch_first_two_axes).
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    # Mark the stream as batch-producing for downstream consumers.
    stream._produces_examples = False
    return stream
def parrot_stream(voice, use_speaker=False, which_sets=('train', ),
                  batch_size=32, seq_size=50, num_examples=None,
                  sorting_mult=4, noise_level=None,
                  labels_type='full_labels', check_ratio=False,
                  raw_data=True, q_type='mu-law', q_level=256):
    """Build the data stream for a VoiceData voice.

    Sorts examples by length in buckets of ``batch_size * sorting_mult``,
    batches, pads, filters incomplete batches, optionally quantizes raw
    audio, and segments sequences into chunks of ``seq_size``.
    """
    assert labels_type in [
        'full_labels', 'phonemes', 'unconditional', 'unaligned_phonemes',
        'text'
    ]
    dataset = VoiceData(voice=voice, which_sets=which_sets)
    sorting_size = batch_size * sorting_mult
    if not num_examples:
        num_examples = dataset.num_examples
    # Shuffle for training, deterministic order otherwise.
    if 'train' in which_sets:
        scheme = ShuffledExampleScheme(num_examples)
    else:
        scheme = SequentialExampleScheme(num_examples)
    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)
    # Optionally drop examples whose feature/label length ratio is
    # outside a labels_type-specific window.
    if check_ratio and labels_type in ['unaligned_phonemes', 'text']:
        idx = data_stream.sources.index(labels_type)
        min_val = 8 if labels_type == 'text' else 12.
        max_val = 16 if labels_type == 'text' else 25.
        data_stream = Filter(
            data_stream,
            lambda x: _check_ratio(x, 0, idx, min_val, max_val))
    # segment_sources get transposed and segmented; all_sources is the
    # final set kept by FilterSources.
    segment_sources = ('features', 'features_mask')
    all_sources = segment_sources
    if raw_data:
        raw_sources = ('raw_audio', )
        all_sources += raw_sources
    else:
        raw_sources = ()
    if labels_type != 'unconditional':
        all_sources += ('labels', )
        # Expose the chosen label variant under the canonical name.
        data_stream = Rename(data_stream, {labels_type: 'labels'})
        if labels_type in ['full_labels', 'phonemes']:
            # Aligned labels are segmented together with the features.
            segment_sources += ('labels', )
        elif labels_type in ['unaligned_phonemes', 'text']:
            # Unaligned labels keep their own mask instead.
            all_sources += ('labels_mask', )
    # Bucket by length: batch sorting_size examples, sort, unpack,
    # then re-batch at the real batch size.
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    # Drop trailing batches smaller than batch_size.
    data_stream = Filter(data_stream,
                         lambda x: _check_batch_size(x, batch_size))
    data_stream = Padding(data_stream)
    if use_speaker:
        data_stream = FilterSources(data_stream,
                                    all_sources + ('speaker_index', ))
    else:
        data_stream = FilterSources(data_stream, all_sources)
    data_stream = SourceMapping(data_stream,
                                _transpose,
                                which_sources=segment_sources)
    # The conditional is not necessary, but I'm still adding it for
    # clarity.
    if raw_data:
        data_stream = SourceMapping(data_stream, _chunk,
                                    which_sources=raw_sources)
        # Quantize the raw audio (e.g. mu-law with q_level levels).
        raw_transformer = get_raw_transformer(q_type, q_level)
        data_stream = SourceMapping(data_stream, raw_transformer,
                                    which_sources=raw_sources)
    # seq_size + 1 with share_value=1: consecutive segments overlap by
    # one step — presumably for teacher forcing; confirm in
    # SegmentSequence.
    data_stream = SegmentSequence(data_stream,
                                  seq_size=seq_size + 1,
                                  share_value=1,
                                  return_last=False,
                                  add_flag=True,
                                  which_sources=segment_sources +
                                  raw_sources)
    if noise_level is not None:
        data_stream = AddConstantSource(data_stream, noise_level,
                                        'feedback_noise_level')
    return data_stream
def test_name_swap(self):
    """Swapping two source names in one mapping must succeed."""
    swapped = Rename(self.stream, {'X': 'y', 'y': 'X'},
                     on_non_existent='ignore')
    assert_equal(swapped.sources, ('y', 'X'))
class TestRename(object):
    """Tests for the Rename transformer, including the regression for
    name clashes with ignored, non-existent sources."""

    def setUp(self):
        base_dataset = IndexableDataset(
            OrderedDict([('X', numpy.ones((4, 2, 2))),
                         ('y', numpy.array([0, 1, 0, 1]))]),
            axis_labels={'X': ('batch', 'width', 'height'),
                         'y': ('batch',)})
        self.stream = DataStream(
            base_dataset, iteration_scheme=SequentialScheme(4, 2))
        self.transformer = Rename(
            self.stream, {'X': 'features', 'y': 'targets'})

    def test_renames_sources(self):
        assert_equal(self.transformer.sources, ('features', 'targets'))

    def test_leaves_data_unchanged(self):
        expected_batch = (numpy.ones((2, 2, 2)), numpy.array([0, 1]))
        assert_equal(list(self.transformer.get_epoch_iterator()),
                     [expected_batch, expected_batch])

    def test_raises_error_on_nonexistent_source_name(self):
        assert_raises(KeyError, Rename, self.stream, {'Z': 'features'})

    def test_raises_on_invalid_kwargs(self):
        # Unknown on_non_existent modes are rejected.
        assert_raises(ValueError, Rename, self.stream,
                      {'X': 'features'}, on_non_existent='foo')

    def test_name_clash(self):
        # Renaming onto an existing, un-renamed source is an error.
        assert_raises(KeyError, Rename, self.stream, {'X': 'y'})

    def test_not_really_a_name_clash(self):
        # Regression: renaming a non-existent source cannot create a
        # name clash when on_non_existent='ignore', so this must not
        # raise KeyError.
        try:
            Rename(self.stream, {'foobar': 'y'}, on_non_existent='ignore')
        except KeyError:
            assert False

    def test_name_swap(self):
        swapped = Rename(self.stream, {'X': 'y', 'y': 'X'},
                         on_non_existent='ignore')
        assert_equal(swapped.sources, ('y', 'X'))

    def test_raises_on_not_one_to_one(self):
        assert_raises(KeyError, Rename, self.stream,
                      {'X': 'features', 'y': 'features'})

    def test_intentionally_ignore_missing(self):
        mapping = {'X': 'features', 'y': 'targets', 'Z': 'fudgesicle'}
        result = Rename(self.stream, mapping, on_non_existent='ignore')
        assert_equal(result.sources, ('features', 'targets'))

    def test_not_one_to_one_ok_if_not_a_source_in_data_stream(self):
        mapping = {'X': 'features', 'y': 'targets', 'Z': 'targets'}
        result = Rename(self.stream, mapping, on_non_existent='ignore')
        assert_equal(result.sources, ('features', 'targets'))

    def test_renames_axis_labels(self):
        assert_equal(self.transformer.axis_labels,
                     {'features': ('batch', 'width', 'height'),
                      'targets': ('batch',)})