def get_datastream(path, which_set, batch_size=1, norm_path=None,
                   use_ivectors=False, truncate_ivectors=False,
                   ivector_dim=100, shuffled=True):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    if shuffled:
        iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                         examples=wsj_dataset.num_examples)
    else:
        iterator_scheme = SequentialScheme(batch_size=batch_size,
                                           examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    if norm_path:
        data_mean_std = numpy.load(norm_path)
        base_stream = Normalize(data_stream=base_stream,
                                means=data_mean_std['mean'],
                                stds=data_mean_std['std'])
    if use_ivectors:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'ivectors', 'targets'])
        if truncate_ivectors:
            fs = TruncateTransformer(fs, 'ivectors', ivector_dim)
        # fs = ConcatenateTransformer(fs, ['features', 'ivectors'], 'features')
    else:
        fs = FilterSources(data_stream=base_stream,
                           sources=['features', 'targets'])
    return Padding(fs)
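A minimal usage sketch for the function above. The HDF5 path, the set name and the .npz file holding 'mean'/'std' arrays are hypothetical stand-ins, not part of the original code.

# Hypothetical file names, used for illustration only.
train_stream = get_datastream('wsj_fbank.h5', 'train_si84',
                              batch_size=16,
                              norm_path='wsj_fbank_mean_std.npz',
                              use_ivectors=True,
                              truncate_ivectors=True,
                              ivector_dim=100)
# Padding appends a '<source>_mask' entry after each masked source, so a batch
# typically unpacks as (features, features_mask, ivectors, ivectors_mask,
# targets, targets_mask).
batch = next(train_stream.get_epoch_iterator())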
def test_works_on_all_sources(self):
    transformer = FilterSources(self.stream, sources=("features", "targets"))
    assert_equal(transformer.sources, ('features', 'targets'))
    assert_equal(list(transformer.get_epoch_iterator()),
                 [(numpy.ones((2, 2, 2)), numpy.array([0, 1])),
                  (numpy.ones((2, 2, 2)), numpy.array([0, 1]))])
def get_stream(self, part, batch_size=None, max_length=None, seed=None,
               remove_keys=False, add_bos_=True, remove_n_identical_keys=True):
    dataset = self.get_dataset(part, max_length)
    if self._layout == 'lambada' and part == 'train':
        stream = DataStream(
            dataset,
            iteration_scheme=RandomSpanScheme(
                dataset.num_examples, max_length, seed))
        stream = Mapping(stream, listify)
    else:
        stream = dataset.get_example_stream()

    if add_bos_:
        stream = SourcewiseMapping(stream,
                                   functools.partial(add_bos, Vocabulary.BOS),
                                   which_sources=('words',))
    if max_length is not None:
        stream = SourcewiseMapping(stream,
                                   functools.partial(cut_if_too_long, max_length),
                                   which_sources=('words',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('words',))
    stream = SourcewiseMapping(stream, word_to_singleton_list,
                               which_sources=('keys',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('keys',))
    stream = Flatten(stream, which_sources=('keys',))

    if self._layout == 'dict':
        if remove_keys:
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'keys'])
        if remove_n_identical_keys:
            print "remove identical keys"
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'n_identical_keys'])

    if not batch_size:
        return stream

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=('words',))
    # stream = Flatten(stream, which_sources=('n_identical_keys'))
    # if self._layout == 'dict':
    #     stream = FilterSources(stream, [source for source in stream.sources
    #                                     if source != 'keys_mask'])
    #     stream = FilterSources(stream, [source for source in stream.sources
    #                                     if source != 'n_identical_keys_mask'])
    return stream
def create_ivector_test_datastream(path, which_set, batch_size=1, delay=0):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])
    if delay:
        fs = DelayTransformer(fs, delay)
    fs = FilterSources(data_stream=fs, sources=['features', 'ivectors'])
    return Padding(fs)
def combine_datastreams(ds_labeled, ds_unlabeled):
    # Rename the sources for clarity
    if ds_labeled is not None:
        names = {'features': 'features_labeled', 'targets': 'targets_labeled'}
        if 'mask' in ds_labeled.sources:
            names['mask'] = 'masks_labeled'
        ds_labeled = Rename(ds_labeled, names)

    # Rename the source for input pixels and hide its labels!
    if ds_unlabeled is not None:
        sources = list(ds_unlabeled.sources)
        # Remove targets
        del sources[sources.index('targets')]
        names = {'features': 'features_unlabeled'}
        if 'mask' in ds_unlabeled.sources:
            names['mask'] = 'masks_unlabeled'
        ds_unlabeled = Rename(FilterSources(ds_unlabeled, sources), names=names)

    if ds_labeled is None:
        return ds_unlabeled
    if ds_unlabeled is None:
        return ds_labeled

    return SemiDataStream(data_stream_labeled=ds_labeled,
                          data_stream_unlabeled=ds_unlabeled)
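A minimal sketch of the unlabeled-only branch of the function above. It assumes Fuel's MNIST data file is available locally, and it passes ds_labeled=None so that only the standard Fuel transformers (FilterSources, Rename) are exercised, not the project-specific SemiDataStream.

from fuel.datasets import MNIST
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

# Assumption: the MNIST HDF5 file has been downloaded to the Fuel data path.
mnist = MNIST(('train',))
unlabeled = DataStream(mnist,
                       iteration_scheme=SequentialScheme(mnist.num_examples,
                                                         100))
# 'targets' is dropped by FilterSources and 'features' is renamed.
stream = combine_datastreams(None, unlabeled)
print stream.sources   # ('features_unlabeled',)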
def open_stream(which_sets=('train', ), port=5557, num_examples=None):
    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = dataset.num_examples

    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=SequentialScheme(num_examples, batch_size))
    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / data_std)
    data_stream = Mapping(data_stream, _downsample_and_upsample,
                          add_sources=('upsampled', ))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual', ))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual', ))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    data_stream = ForceFloatX(data_stream)
    start_server(data_stream, port=port)
def get_spkid_stream(path, which_set, batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples,
                                       batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['spks'])
    return fs
def get_uttid_stream(path, which_set='test_eval92', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print path, which_set
    iterator_scheme = SequentialScheme(examples=wsj_dataset.num_examples,
                                       batch_size=batch_size)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream, sources=['uttids'])
    return fs
def get_datastream(path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    print path, which_set
    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
def _liacl_data_stream(dataset, rel2index, batch_size, word2index,
                       target='negative_sampling', name="", k=3,
                       shuffle=False, neg_sample_kwargs={}):
    batches_per_epoch = int(np.ceil(dataset.num_examples / float(batch_size)))
    if shuffle:
        iteration_scheme = ShuffledScheme(dataset.num_examples, batch_size)
    else:
        iteration_scheme = SequentialScheme(dataset.num_examples, batch_size)
    data_stream = DataStream(dataset, iteration_scheme=iteration_scheme)
    data_stream = NumberizeWords(data_stream, word2index,
                                 default=word2index[UNKNOWN_TOKEN],
                                 which_sources=('head', 'tail'))
    data_stream = NumberizeWords(data_stream, rel2index,
                                 which_sources=('rel',))

    if target == "score":
        data_stream = Rename(data_stream, {'score': 'target'})
    else:
        data_stream = FilterSources(data_stream,
                                    sources=('head', 'tail', 'rel'))

    data_stream = Padding(data_stream, mask_sources=('head', 'tail'),
                          mask_dtype=np.float32)

    if target == 'negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is negative sampling')
        data_stream = NegativeSampling(data_stream, k=k)
    elif target == 'filtered_negative_sampling':
        logger.info('target for data stream ' + str(name) +
                    ' is filtered negative sampling')
        data_stream = FilteredNegativeSampling(data_stream, k=k,
                                               **neg_sample_kwargs)
    elif target == 'score':
        logger.info('target for data stream ' + str(name) + ' is score')
    else:
        raise NotImplementedError(
            'target ' + str(target) + ' must be one of "score", '
            '"negative_sampling" or "filtered_negative_sampling"')

    data_stream = MergeSource(data_stream,
                              merge_sources=('head', 'tail', 'head_mask',
                                             'tail_mask', 'rel'),
                              merge_name='input')

    return data_stream, batches_per_epoch
def create_ivector_test_datastream(path, which_set='test_eval92',
                                   batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = SequentialScheme(batch_size=batch_size,
                                       examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors'])
    return Padding(fs)
def get_stream(batch_size, source_window=4000, target_window=1000,
               num_examples=5000):
    from fuel.datasets.youtube_audio import YouTubeAudio
    data = YouTubeAudio('XqaJ2Ol5cC4')
    train_stream = data.get_example_stream()
    train_stream = ForceFloatX(train_stream)
    window_stream = Window(0, source_window, target_window, overlapping=False,
                           data_stream=train_stream)
    source_stream = FilterSources(window_stream, sources=('features', ))
    feats_stream = Mapping(source_stream, mfcc)
    targets_stream = FilterSources(window_stream, sources=('targets', ))
    targets_stream = Flatten(targets_stream)
    stream = Merge((feats_stream, targets_stream),
                   sources=('features', 'targets'))
    # Add a random scheme?
    it_scheme = ConstantScheme(batch_size, num_examples)
    batched_stream = Batch(stream, it_scheme, strictness=1)
    return batched_stream
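A short sketch of how the stream above might be consumed. It assumes the YouTube audio HDF5 for 'XqaJ2Ol5cC4' has already been fetched into the Fuel data path, and that the mfcc mapping referenced by get_stream is defined elsewhere in the script.

# Hypothetical parameter values, chosen only to keep the example small.
batched_stream = get_stream(batch_size=8, source_window=4000,
                            target_window=1000, num_examples=64)
# Each batch pairs MFCC features of a source window with the flattened raw
# samples of the following target window, in the order set by Merge.
features, targets = next(batched_stream.get_epoch_iterator())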
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(dataset, iteration_scheme=iteration_scheme)
    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) + tuple(add_sources))
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(self.eos_label))
    if self.add_bos:
        stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                           times=self.add_bos))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(stream,
                         functools.partial(apply_preprocessing,
                                           log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       self.batch_size if part == 'train'
                       else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def get_datastream(path, norm_path, which_set='train_si84', batch_size=1):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    data_mean_std = numpy.load(norm_path)
    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    base_stream = Normalize(data_stream=base_stream,
                            means=data_mean_std['mean'],
                            stds=data_mean_std['std'])
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'targets'])
    padded_stream = Padding(data_stream=fs)
    return padded_stream
def define_stream(which_sets=('train',), initial_scale=1, scale=0.5,
                  batch_size=64, seq_length=64, frame_size=128,
                  tbptt_flag=True, num_examples=None):
    def _segment_axis(data):
        # Defined inside so that frame_size is available
        x = tuple([numpy.array([segment_axis(x, frame_size, 0) for x in var])
                   for var in data])
        return x

    scale = float(scale)

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = batch_size * (dataset.num_examples / batch_size)

    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=SequentialScheme(num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / float(data_std))

    # Original sampling rate
    data_stream = Resample(data_stream, scale=initial_scale)
    data_stream = Mapping(data_stream, _copy, add_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=scale,
                           which_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=1 / scale,
                           which_sources=('upsampled',))

    # data_stream = Mapping(data_stream, _downsample_and_upsample,
    #                       add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    return data_stream
def stream_handwriting(which_sets, batch_size, seq_size, tbptt=True):
    dataset = Handwriting(which_sets)
    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=ShuffledScheme(
            batch_size * (dataset.num_examples / batch_size), batch_size))
    data_stream = FilterSources(data_stream, sources=('features', ))
    data_stream = Padding(data_stream)
    data_stream = Mapping(data_stream, _transpose)
    if tbptt:
        data_stream = SegmentSequence(data_stream, add_flag=True,
                                      seq_size=seq_size)
    data_stream = ForceFloatX(data_stream)
    return data_stream
def test_filter_sources():
    stream = DataStream(
        IndexableDataset(
            OrderedDict([('features', numpy.ones((4, 2, 2))),
                         ('targets', numpy.array([0, 1, 0, 1]))])),
        iteration_scheme=SequentialScheme(4, 2))

    transformer = FilterSources(stream, sources=("features",))
    assert_equal(transformer.sources, ('features',))
    assert len(next(transformer.get_epoch_iterator())) == 1

    transformer = FilterSources(stream, sources=("features", "targets"))
    assert_equal(transformer.sources, ('features', 'targets'))
    assert len(next(transformer.get_epoch_iterator())) == 2

    transformer = FilterSources(stream, sources=("targets", "features"))
    assert_equal(transformer.sources, ('features', 'targets'))
    assert len(next(transformer.get_epoch_iterator())) == 2

    assert_raises(ValueError, transformer.get_data, [0, 1])
    assert_raises(ValueError, FilterSources, stream, ['error', 'targets'])
def get_stream(self, part, batches=True, shuffle=True, add_sources=()):
    dataset = self.get_dataset(part, add_sources=add_sources)
    stream = (DataStream(dataset,
                         iteration_scheme=ShuffledExampleScheme(
                             dataset.num_examples))
              if shuffle
              else dataset.get_example_stream())

    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) + tuple(add_sources))
    if self.add_eos:
        if self.prepend_eos:
            stream = Mapping(stream, _AddEosLabelBeginEnd(self.eos_label))
        else:
            stream = Mapping(stream, _AddEosLabelEnd(self.eos_label))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)
    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)
    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(stream,
                         functools.partial(apply_preprocessing,
                                           log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(stream, iteration_scheme=ConstantScheme(self.batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def create_ivector_datastream(path, which_set, batch_size=1, delay=0,
                              min_after_cache=1024, length_sort=False):
    wsj_dataset = H5PYDataset(path, which_sets=(which_set, ))
    iterator_scheme = ShuffledScheme(batch_size=batch_size,
                                     examples=wsj_dataset.num_examples)
    base_stream = DataStream(dataset=wsj_dataset,
                             iteration_scheme=iterator_scheme)
    fs = FilterSources(data_stream=base_stream,
                       sources=['features', 'ivectors', 'targets'])
    if length_sort:
        fs = LengthSortTransformer(fs, batch_size, min_after_cache)
    if delay:
        fs = DelayTransformer(fs, delay)
    return Padding(fs)
def test_filter_sources():
    stream = DataStream(
        IndexableDataset(
            OrderedDict([("features", numpy.ones((4, 2, 2))),
                         ("targets", numpy.array([0, 1, 0, 1]))])),
        iteration_scheme=SequentialScheme(4, 2),
    )

    transformer = FilterSources(stream, sources=("features",))
    assert_equal(transformer.sources, ("features",))
    assert len(next(transformer.get_epoch_iterator())) == 1

    transformer = FilterSources(stream, sources=("features", "targets"))
    assert_equal(transformer.sources, ("features", "targets"))
    assert len(next(transformer.get_epoch_iterator())) == 2

    transformer = FilterSources(stream, sources=("targets", "features"))
    assert_equal(transformer.sources, ("features", "targets"))
    assert len(next(transformer.get_epoch_iterator())) == 2

    assert_raises(ValueError, transformer.get_data, [0, 1])
    assert_raises(ValueError, FilterSources, stream, ["error", "targets"])
for batch in data_stream.get_epoch_iterator():
    for element in batch[0]:
        all_data = numpy.hstack([all_data, element])

mean_data = all_data.mean()
std_data = all_data.std()

data_stream = ScaleAndShift(data_stream,
                            scale=1 / std_data,
                            shift=-mean_data / std_data)
data_stream = Mapping(data_stream, _downsample_and_upsample,
                      add_sources=('upsampled', ))
data_stream = Mapping(data_stream, _equalize_size)
data_stream = Mapping(data_stream, _get_residual,
                      add_sources=('residual', ))
data_stream = FilterSources(data_stream,
                            sources=('upsampled', 'residual', ))
data_stream = Mapping(data_stream, _segment_axis)
data_stream = Padding(data_stream)
data_stream = FilterSources(data_stream,
                            sources=('upsampled', 'residual',
                                     'residual_mask'))
data_stream = Mapping(data_stream, _transpose)
data_stream = ForceFloatX(data_stream)

#################
# Model
#################

activations_x = [Rectifier()] * depth_x

dims_x = [frame_size] + [hidden_size_mlp_x]*(depth_x-1) + \
def parrot_stream(voice, use_speaker=False, which_sets=('train', ),
                  batch_size=32, seq_size=50, num_examples=None,
                  sorting_mult=4, noise_level=None, labels_type='full_labels',
                  check_ratio=False, raw_data=True, q_type='mu-law',
                  q_level=256):
    assert labels_type in [
        'full_labels', 'phonemes', 'unconditional',
        'unaligned_phonemes', 'text']

    dataset = VoiceData(voice=voice, which_sets=which_sets)

    sorting_size = batch_size * sorting_mult

    if not num_examples:
        num_examples = dataset.num_examples

    if 'train' in which_sets:
        scheme = ShuffledExampleScheme(num_examples)
    else:
        scheme = SequentialExampleScheme(num_examples)

    data_stream = DataStream.default_stream(dataset, iteration_scheme=scheme)

    if check_ratio and labels_type in ['unaligned_phonemes', 'text']:
        idx = data_stream.sources.index(labels_type)
        min_val = 8 if labels_type == 'text' else 12.
        max_val = 16 if labels_type == 'text' else 25.
        data_stream = Filter(
            data_stream, lambda x: _check_ratio(x, 0, idx, min_val, max_val))

    segment_sources = ('features', 'features_mask')
    all_sources = segment_sources

    if raw_data:
        raw_sources = ('raw_audio', )
        all_sources += raw_sources
    else:
        raw_sources = ()

    if labels_type != 'unconditional':
        all_sources += ('labels', )
        data_stream = Rename(data_stream, {labels_type: 'labels'})

        if labels_type in ['full_labels', 'phonemes']:
            segment_sources += ('labels', )
        elif labels_type in ['unaligned_phonemes', 'text']:
            all_sources += ('labels_mask', )

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(sorting_size))
    data_stream = Mapping(data_stream, SortMapping(_length))
    data_stream = Unpack(data_stream)
    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Filter(data_stream,
                         lambda x: _check_batch_size(x, batch_size))
    data_stream = Padding(data_stream)

    if use_speaker:
        data_stream = FilterSources(data_stream,
                                    all_sources + ('speaker_index', ))
    else:
        data_stream = FilterSources(data_stream, all_sources)

    data_stream = SourceMapping(data_stream, _transpose,
                                which_sources=segment_sources)

    # The conditional is not necessary, but I'm still adding it for clarity.
    if raw_data:
        data_stream = SourceMapping(data_stream, _chunk,
                                    which_sources=raw_sources)
        raw_transformer = get_raw_transformer(q_type, q_level)
        data_stream = SourceMapping(data_stream, raw_transformer,
                                    which_sources=raw_sources)

    data_stream = SegmentSequence(data_stream,
                                  seq_size=seq_size + 1,
                                  share_value=1,
                                  return_last=False,
                                  add_flag=True,
                                  which_sources=segment_sources + raw_sources)

    if noise_level is not None:
        data_stream = AddConstantSource(data_stream, noise_level,
                                        'feedback_noise_level')

    return data_stream
import ipdb
from scribe.datasets.handwriting import Handwriting
from fuel.transformers import Mapping, Padding, FilterSources, ForceFloatX
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream


def _transpose(data):
    return tuple(array.swapaxes(0, 1) for array in data)

batch_size = 10

dataset = Handwriting(('train',))
data_stream = DataStream.default_stream(
    dataset, iteration_scheme=SequentialScheme(50, batch_size))
data_stream = FilterSources(data_stream, sources=('features',))
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)

epoch = data_stream.get_epoch_iterator()
for batch in epoch:
    print batch[0].shape

print "Segmented:"

data_stream = SegmentSequence(data_stream, add_flag=True)
epoch = data_stream.get_epoch_iterator()
for batch in epoch:
    print batch[0].shape, batch[2]

# ipdb.set_trace()
def _transpose(data):
    return tuple(array.swapaxes(0, 1) for array in data)

data_dir = os.environ['FUEL_DATA_PATH']
data_dir = os.path.join(data_dir, 'blizzard/', 'sp_standardize.npz')

data_stats = numpy.load(data_dir)
sp_mean = data_stats['sp_mean']
sp_std = data_stats['sp_std']

dataset = Blizzard(which_sets=('train', ), filename="sp_blizzard.hdf5")
data_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(
        batch_size * (dataset.num_examples / batch_size), batch_size))
data_stream = FilterSources(data_stream, ('sp', ))
data_stream = ScaleAndShift(data_stream,
                            scale=1 / sp_std,
                            shift=-sp_mean / sp_std,
                            which_sources=('sp', ))
data_stream = Mapping(data_stream, _transpose)
data_stream = SegmentSequence(data_stream, seq_size, add_flag=True)
data_stream = ForceFloatX(data_stream)
train_stream = data_stream

num_valid_examples = 4 * 64
dataset = Blizzard(which_sets=('valid', ), filename="sp_blizzard.hdf5")
data_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(num_valid_examples, batch_size))
data_stream = FilterSources(data_stream, ('sp', ))
batch_size = 20
frame_size = 3
k = 20
target_size = frame_size * k
hidden_size_recurrent = 400
readout_size = 6 * k + 1

lr = 3e-4

dataset = Handwriting(('train', ))
data_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(dataset.num_examples, batch_size))
data_stream = FilterSources(data_stream, sources=('features', ))
data_stream = Padding(data_stream)
data_stream = Mapping(data_stream, _transpose)
data_stream = ForceFloatX(data_stream)

dataset = Handwriting(('valid', ))
valid_stream = DataStream.default_stream(
    dataset,
    iteration_scheme=SequentialScheme(dataset.num_examples, 10 * batch_size))
valid_stream = FilterSources(valid_stream, sources=('features', ))
valid_stream = Padding(valid_stream)
valid_stream = Mapping(valid_stream, _transpose)
valid_stream = ForceFloatX(valid_stream)

x_tr = next(data_stream.get_epoch_iterator())
def get_stream(self, part, batch_size=None, shuffle=False, max_length=None,
               raw_text=False, q_ids=False, seed=None, dataset=None):
    if not seed:
        seed = fuel.config.default_seed
    rng = numpy.random.RandomState(seed)

    if not dataset:
        dataset = self.get_dataset(part)
    if shuffle:
        stream = DataStream(dataset,
                            iteration_scheme=ShuffledExampleScheme(
                                dataset.num_examples, rng=rng))
    else:
        stream = dataset.get_example_stream()

    if not q_ids:
        stream = FilterSources(stream,
                               [source for source in dataset.sources
                                if source != 'q_ids'])
    else:
        stream = SourcewiseMapping(stream, _str2vec,
                                   which_sources=('q_ids',))

    stream = PutTextTransfomer(stream, dataset, raw_text=True)

    # <eos> is added for two purposes: to serve as a sentinel for coattention,
    # and also to ensure the answer span ends at a token
    eos = self.vocab.EOS
    stream = SourcewiseMapping(stream, functools.partial(add_eos, eos),
                               which_sources=('contexts', 'questions'))
    stream = Mapping(stream, functools.partial(select_random_answer, rng),
                     mapping_accepts=dict)

    if not batch_size:
        if self._retrieval:
            raise NotImplementedError()
        return stream

    if raw_text:
        stream = Mapping(stream, keep_text, mapping_accepts=dict,
                         add_sources=('contexts_text', 'questions_text'))

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    if self._retrieval:
        stream = Mapping(stream,
                         functools.partial(retrieve_and_pad_squad,
                                           self._retrieval),
                         mapping_accepts=dict,
                         add_sources=('defs', 'def_mask',
                                      'contexts_def_map',
                                      'questions_def_map'))

    stream = SourcewiseMapping(stream,
                               functools.partial(digitize, self.vocab),
                               which_sources=('contexts', 'questions'))
    stream = Padding(stream,
                     mask_sources=['contexts', 'questions']
                     + (['contexts_text'] if raw_text else []))
    return stream
def test_filters_axis_labels(self):
    transformer = FilterSources(self.stream, sources=("features",))
    assert_equal(transformer.axis_labels,
                 {'features': ('batch', 'width', 'height')})
def test_works_on_unsorted_sources(self):
    transformer = FilterSources(self.stream, sources=("targets", "features"))
    assert_equal(transformer.sources, ('features', 'targets'))
def test_works_on_sourcessubset(self):
    transformer = FilterSources(self.stream, sources=("features",))
    assert_equal(transformer.sources, ('features',))
    assert_equal(list(transformer.get_epoch_iterator()),
                 [(numpy.ones((2, 2, 2)),), (numpy.ones((2, 2, 2)),)])