def get_data_stream(iterable):
    dataset = IterableDataset({'numbers': iterable})
    data_stream = Mapping(dataset.get_example_stream(), _data_sqrt,
                          add_sources=('roots',))
    data_stream = Mapping(data_stream, _array_tuple)
    return Batch(data_stream, ConstantScheme(20))
def open_stream(which_sets=('train',), port=5557, num_examples=None):
    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = dataset.num_examples

    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=SequentialScheme(num_examples, batch_size))
    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / data_std)
    data_stream = Mapping(data_stream, _downsample_and_upsample,
                          add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)
    data_stream = ForceFloatX(data_stream)

    start_server(data_stream, port=port)
def indexData(self):
    labCounts = graph_helper.getLabelCounts(
        self.G, self.trainNodes + self.validationNodes)
    trainXY, trainIDs = encode_data_VarLen(
        self.G, self.trainNodes, self.attrKey, self.maxNeighbors,
        usePrevWeights=self.usePrevWeights, useActualLabs=self.useActualLabs,
        onlyLabs=self.onlyLabs, useInputX2=self.useInputX2,
        labCounts=labCounts, dataAug=self.dataAug,
        pageRankOrder=self.pageRankOrder, usePro=self.usePro,
        lastH=self.lastHH, nodeIDs=True)
    validationXY, testIDs = encode_data_VarLen(
        self.G, self.validationNodes, self.attrKey, self.maxNeighbors,
        labCounts=labCounts, usePrevWeights=self.usePrevWeights,
        useActualLabs=self.useActualLabs, onlyLabs=self.onlyLabs,
        useInputX2=self.useInputX2, pageRankOrder=self.pageRankOrder,
        usePro=self.usePro, lastH=self.lastHH, nodeIDs=True)

    self.input_dimx1 = trainXY['x'][0].shape[1]
    if 'x2' in trainXY:
        self.input_dimx2 = trainXY['x2'].shape[1]

    dataset_train = IndexableDataset(trainXY)
    dataset_valid = IndexableDataset(validationXY)
    self.num_examples_train = dataset_train.num_examples
    self.num_examples_valid = dataset_valid.num_examples

    if self.usePro:
        transpose_stream = self.transpose_streamPro
    else:
        transpose_stream = self.transpose_stream

    self.stream_train = DataStream(
        dataset=dataset_train,
        iteration_scheme=ShuffledScheme(examples=dataset_train.num_examples,
                                        batch_size=self.batch_size))
    self.stream_train = Padding(self.stream_train, mask_sources=['x'])
    self.stream_train = Mapping(self.stream_train, transpose_stream)

    self.stream_valid = DataStream(
        dataset=dataset_valid,
        iteration_scheme=ShuffledScheme(examples=dataset_valid.num_examples,
                                        batch_size=self.batch_size))
    self.stream_valid = Padding(self.stream_valid, mask_sources=['x'])
    self.stream_valid = Mapping(self.stream_valid, transpose_stream)
def get_data_stream(iterable): """Returns a 'fuel.Batch' datastream of [x~input~numbers, y~targets~roots], with each iteration returning a batch of 20 training examples """ dataset = IterableDataset({'numbers': iterable}) data_stream = Mapping(dataset.get_example_stream(), _data_sqrt, add_sources=('roots', )) data_stream = Mapping(data_stream, _array_tuple) return Batch(data_stream, ConstantScheme(20))
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
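# The filtering/sorting callables used in this pipeline (and referenced again
# further down), `_too_long`, `_oov_to_unk` and `_length`, live in the
# surrounding module and are not shown here. A rough sketch of plausible
# implementations, assuming sentence pairs arrive as (source_ids, target_ids)
# tuples (these are assumptions, not the project's actual code):

class _too_long(object):
    def __init__(self, seq_len=50):
        self.seq_len = seq_len

    def __call__(self, sentence_pair):
        # Keep the pair only if both sides fit in the length budget.
        return all(len(sentence) <= self.seq_len
                   for sentence in sentence_pair)


class _oov_to_unk(object):
    def __init__(self, src_vocab_size=30000, trg_vocab_size=30000, unk_id=1):
        self.src_vocab_size = src_vocab_size
        self.trg_vocab_size = trg_vocab_size
        self.unk_id = unk_id

    def __call__(self, sentence_pair):
        # Map any id outside the truncated vocabulary to the unk id.
        return ([w if w < self.src_vocab_size else self.unk_id
                 for w in sentence_pair[0]],
                [w if w < self.trg_vocab_size else self.unk_id
                 for w in sentence_pair[1]])


def _length(sentence_pair):
    # Sort key used by SortMapping: length of the target sentence.
    return len(sentence_pair[-1])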
def test_mapping():
    data = [1, 2, 3]
    data_doubled = [2, 4, 6]
    stream = DataStream(IterableDataset(data))
    wrapper1 = Mapping(stream, lambda d: (2 * d[0],))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_doubled))
    wrapper2 = Mapping(stream, lambda d: (2 * d[0],),
                       add_sources=("doubled",))
    assert wrapper2.sources == ("data", "doubled")
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data, data_doubled))
def test_mapping_sort():
    data = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data_sorted = [[1, 2, 3]] * 3
    data_sorted_rev = [[3, 2, 1]] * 3
    stream = DataStream(IterableDataset(data))
    wrapper1 = Mapping(stream, SortMapping(operator.itemgetter(0)))
    assert list(wrapper1.get_epoch_iterator()) == list(zip(data_sorted))
    wrapper2 = Mapping(stream, SortMapping(lambda x: -x[0]))
    assert list(wrapper2.get_epoch_iterator()) == list(zip(data_sorted_rev))
    wrapper3 = Mapping(stream, SortMapping(operator.itemgetter(0),
                                           reverse=True))
    assert list(wrapper3.get_epoch_iterator()) == list(zip(data_sorted_rev))
def get_sgnmt_tr_stream(src_data, trg_data,
                        src_vocab_size=30000, trg_vocab_size=30000,
                        unk_id=1, seq_len=50, batch_size=80,
                        sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds
    to ``get_sgnmt_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def get_sgnmt_shuffled_tr_stream(src_data, trg_data,
                                 src_vocab_size=30000, trg_vocab_size=30000,
                                 unk_id=1, seq_len=50, batch_size=80,
                                 sort_k_batches=12, **kwargs):
    """Prepares the shuffled training data stream. This is similar to
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def setup_squad_datastream(path, vocab_file, config):
    ds = SQuADDataset(path, vocab_file)
    it = SQuADIterator(path)
    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<DUMMY>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'answer',
                                   'ans_indices', 'ans_boundaries'],
                     mask_dtype='int32')

    return ds, stream
def test_mapping_accepts_list_or_dict(self):
    def mapping(d):
        return [2 * i for i in d[0]],

    stream = DataStream(IterableDataset(self.data))
    assert_raises(ValueError,
                  lambda: Mapping(stream, mapping, mapping_accepts=int))
def get_train_stream(configuration, sfiles, tfiles, svocab_dict, tvocab_dict):
    s_dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')
    t_dataset = TextFile(tfiles, tvocab_dict, bos_token=None, eos_token=None,
                         unk_token='<unk>', level='word', preprocess=None,
                         encoding='utf8')

    # Merge
    stream = Merge([s_dataset.get_example_stream(),
                    t_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter -- TODO
    stream = Filter(stream,
                    predicate=_too_long(seq_len=configuration['seq_len']))

    # Map - no need

    # Batch - Sort
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size'] *
                       configuration['sort_k_batches']))
    stream = Mapping(stream, SortMapping(_length))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       configuration['batch_size']))

    # Pad
    # Note that </s>=0. Fuel only allows padding 0 by default
    masked_stream = Padding(stream)

    return masked_stream
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000,
                  unk_id=0, eos_id=1, bos_id=2, train_noise=0,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    src_stream = get_stream(src_vocab, src_data, src_vocab_size,
                            unk_id, eos_id, bos_id, train_noise)
    trg_stream = get_stream(trg_vocab, trg_data, trg_vocab_size,
                            unk_id, eos_id, bos_id, 0)

    # Merge them to get a source, target pair
    stream = Merge([src_stream, trg_stream], ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_not_too_long(seq_len))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    return PaddingWithEOS(stream, [eos_id, eos_id])
def train(self):
    print "Loading data"
    datafile = self.get_datafile()
    nbexamples = datafile.num_examples
    nbexamples -= nbexamples % (self.sequence_dim * self.time_dim)

    train_stream = ReshapeTransformer(
        DataStream(dataset=datafile,
                   iteration_scheme=ShuffledBatchChunkScheme(
                       nbexamples, self.sequence_dim * self.time_dim)),
        self.sequence_dim,
        self.time_dim)

    if self.image_size is not None:
        train_stream = Mapping(train_stream, spec_mapping,
                               add_sources=['spectrogram'])

    print "Building Theano Graph"
    algorithm, self.fprop = self.build_theano_functions()

    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=train_stream,
        model=self.model,
        extensions=[
            FinishAfter(after_n_epochs=EPOCHS),
            TrainingDataMonitoring(
                [aggregation.mean(self.model.outputs[0])],
                prefix="train",
                after_epoch=True),
            Printing(),
            SaveParams(EXP_PATH + NAME, after_epoch=True)
        ])

    main_loop.run()
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    A = numpy.load(os.path.join(path, ('valid_x_raw.npy' if valid
                                       else 'train_x_raw.npy')))
    B = numpy.load(os.path.join(path, ('valid_phn.npy' if valid
                                       else 'train_phn.npy')))
    C = numpy.load(os.path.join(path, ('valid_seq_to_phn.npy' if valid
                                       else 'train_seq_to_phn.npy')))
    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
def setup_squad_ranker_datastream(path, vocab_file, config,
                                  example_count=1836975):
    ds = SQuADRankerDataset(path, vocab_file)
    it = ShuffledExampleScheme(examples=example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('question'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['question', 'answer', 'better', 'worse',
                                   'b_left', 'b_right', 'w_left', 'w_right'],
                     mask_dtype='int32')

    return ds, stream
def setup_datastream(path, vocab_file, config):
    ds = QADataset(path, vocab_file, config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)
    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index('question' if config.concat_ctx_and_question
                             else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    print('sources')
    print(stream.sources)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')
    print('sources2')
    print(stream.sources)

    return ds, stream
def framewise_timit_datastream(path, which_set, batch_size, local_copy=False):
    # load frame-wise dataset
    timit_dataset = FramewiseTimit(which_set=which_set, path=path,
                                   local_copy=local_copy)

    # set shuffle range
    shuffle_rng = numpy.random.RandomState(123)

    # set iterator scheme
    iterator_scheme = SequentialShuffledScheme(
        num_examples=timit_dataset.num_examples,
        batch_size=batch_size,
        rng=shuffle_rng)

    # base data stream
    base_stream = DataStream(dataset=timit_dataset,
                             iteration_scheme=iterator_scheme)

    # reshape data stream (data_source, shape_source)
    reshape_stream = Reshape(data_source='features',
                             shape_source='features_shapes',
                             data_stream=base_stream,
                             iteration_scheme=iterator_scheme)

    # sort data stream
    sort_stream = Mapping(data_stream=reshape_stream,
                          mapping=SortMapping(key=lambda x: x[0].shape[0]))

    # padding data stream
    padded_stream = Padding(data_stream=sort_stream)

    return padded_stream
def train(self, req_vars):
    stream = TaxiDataset('train', data.traintest_ds)

    if (hasattr(self.config, 'use_cuts_for_training')
            and self.config.use_cuts_for_training):
        stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
    else:
        stream = DataStream(
            stream,
            iteration_scheme=ShuffledExampleScheme(stream.num_examples))

    if not data.tvt:
        valid = TaxiDataset(data.valid_set, data.valid_ds,
                            sources=('trip_id',))
        valid_trips_ids = valid.get_data(None,
                                         slice(0, valid.num_examples))[0]
        stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)

    stream = transformers.TaxiGenerateSplits(
        stream, max_splits=self.config.max_splits)

    if hasattr(self.config, 'shuffle_batch_size'):
        stream = transformers.Batch(
            stream,
            iteration_scheme=ConstantScheme(self.config.shuffle_batch_size))
        stream = Mapping(stream, SortMapping(key=UniformGenerator()))
        stream = Unpack(stream)

    stream = transformers.taxi_add_datetime(stream)
    stream = transformers.taxi_add_first_last_len(
        stream, self.config.n_begin_end_pts)
    stream = transformers.Select(stream, tuple(req_vars))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(self.config.batch_size))

    stream = MultiProcessing(stream)

    return stream
def add_destination(stream):
    fun = _add_destination_helper(stream.sources.index('latitude'),
                                  stream.sources.index('longitude'))
    return Mapping(stream, fun,
                   add_sources=('destination_latitude',
                                'destination_longitude'))
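# `_add_destination_helper` is defined elsewhere. As a hedged sketch, it is
# assumed here to build a closure that reads the final GPS point of each
# trajectory as the destination (an assumption, not the project's actual
# helper):

def _add_destination_helper(lat_index, lon_index):
    def fun(data):
        # Use the last recorded latitude/longitude as the destination.
        return (data[lat_index][-1], data[lon_index][-1])
    return fun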
def test_add_sources(self):
    stream = DataStream(IterableDataset(self.data))
    transformer = Mapping(stream,
                          lambda d: ([2 * i for i in d[0]],),
                          add_sources=('doubled',))
    assert_equal(transformer.sources, ('data', 'doubled'))
    assert_equal(list(transformer.get_epoch_iterator()),
                 list(zip(self.data, [[2, 4, 6], [4, 6, 2], [6, 4, 2]])))
def get_log_prob_stream(cg, config):
    eid, did = p_(cg)
    dataset = config['log_prob_sets'][cg]

    # Prepare source vocabs and files, make sure special tokens are there
    src_vocab = cPickle.load(open(config['src_vocabs'][eid]))
    src_vocab['<S>'] = 0
    src_vocab['</S>'] = config['src_eos_idxs'][eid]
    src_vocab['<UNK>'] = config['unk_id']

    # Prepare target vocabs and files, make sure special tokens are there
    trg_vocab = cPickle.load(open(config['trg_vocabs'][did]))
    trg_vocab['<S>'] = 0
    trg_vocab['</S>'] = config['trg_eos_idxs'][did]
    trg_vocab['<UNK>'] = config['unk_id']

    # Build the preprocessing pipeline for individual streams
    logger.info('Building logprob stream for cg:[{}]'.format(cg))
    src_dataset = TextFile([dataset[0]], src_vocab, None)
    trg_dataset = TextFile([dataset[1]], trg_vocab, None)
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    stream = Mapping(
        stream,
        _oov_to_unk(src_vocab_size=config['src_vocab_sizes'][eid],
                    trg_vocab_size=config['trg_vocab_sizes'][did],
                    unk_id=config['unk_id']))

    bs = 100
    if 'log_prob_bs' in config:
        if isinstance(config['log_prob_bs'], dict):
            bs = config['log_prob_bs'][cg]
        else:
            bs = config['log_prob_bs']

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       bs, num_examples=get_num_lines(dataset[0])))

    masked_stream = Padding(stream)
    masked_stream = Mapping(
        masked_stream,
        _remapWordIdx([(0, 0, config['src_eos_idxs'][eid]),
                       (2, 0, config['trg_eos_idxs'][did])]))

    return masked_stream
def get_stream(self, part, batch_size=None, max_length=None, seed=None,
               remove_keys=False, add_bos_=True,
               remove_n_identical_keys=True):
    dataset = self.get_dataset(part, max_length)
    if self._layout == 'lambada' and part == 'train':
        stream = DataStream(dataset,
                            iteration_scheme=RandomSpanScheme(
                                dataset.num_examples, max_length, seed))
        stream = Mapping(stream, listify)
    else:
        stream = dataset.get_example_stream()

    if add_bos_:
        stream = SourcewiseMapping(stream,
                                   functools.partial(add_bos, Vocabulary.BOS),
                                   which_sources=('words',))
    if max_length is not None:
        stream = SourcewiseMapping(stream,
                                   functools.partial(cut_if_too_long,
                                                     max_length),
                                   which_sources=('words',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('words',))
    stream = SourcewiseMapping(stream, word_to_singleton_list,
                               which_sources=('keys',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('keys',))
    stream = Flatten(stream, which_sources=('keys',))

    if self._layout == 'dict':
        if remove_keys:
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'keys'])
        if remove_n_identical_keys:
            print "remove identical keys"
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'n_identical_keys'])
    if not batch_size:
        return stream

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=('words',))
    #stream = Flatten(stream, which_sources=('n_identical_keys'))
    #if self._layout == 'dict':
    #    stream = FilterSources(stream, [source for source in stream.sources
    #                                    if source != 'keys_mask'])
    #    stream = FilterSources(stream, [source for source in stream.sources
    #                                    if source != 'n_identical_keys_mask'])

    return stream
def balanced_batch(stream, key, batch_size, batch_sort_size):
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   batch_sort_size))
    comparison = _balanced_batch_helper(stream.sources.index(key))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    return Batch(stream, iteration_scheme=ConstantScheme(batch_size))
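# `_balanced_batch_helper` is used as the sort key in several of the
# pipelines above but is not defined in this file. A minimal sketch, assuming
# it simply keys each example by the length of one chosen source (an
# assumption, not necessarily the original implementation):

class _balanced_batch_helper(object):
    def __init__(self, key):
        # Index of the source whose length drives the sort.
        self.key = key

    def __call__(self, data):
        # Sorting by length groups examples of similar size into the same
        # batch, which keeps padding to a minimum.
        return len(data[self.key])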
def taxi_add_first_last_len(stream, k):
    fun = _taxi_add_first_last_len_helper(k,
                                          stream.sources.index('latitude'),
                                          stream.sources.index('longitude'))
    return Mapping(stream, fun,
                   add_sources=('first_k_latitude', 'first_k_longitude',
                                'last_k_latitude', 'last_k_longitude',
                                'input_time'))
def define_stream(which_sets=('train',),
                  initial_scale=1,
                  scale=0.5,
                  batch_size=64,
                  seq_length=64,
                  frame_size=128,
                  tbptt_flag=True,
                  num_examples=None):

    def _segment_axis(data):
        # Defined inside so that frame_size is available
        x = tuple([numpy.array([segment_axis(x, frame_size, 0) for x in var])
                   for var in data])
        return x

    scale = float(scale)

    dataset = Blizzard(which_sets=which_sets)

    if num_examples is None:
        num_examples = batch_size * (dataset.num_examples / batch_size)

    data_stream = DataStream.default_stream(
        dataset,
        iteration_scheme=SequentialScheme(num_examples, batch_size))

    data_stream = ScaleAndShift(data_stream,
                                scale=1 / data_std,
                                shift=-data_mean / float(data_std))

    # Original sampling rate
    data_stream = Resample(data_stream, scale=initial_scale)
    data_stream = Mapping(data_stream, _copy, add_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=scale,
                           which_sources=('upsampled',))
    data_stream = Resample(data_stream, scale=1 / scale,
                           which_sources=('upsampled',))

    # data_stream = Mapping(data_stream, _downsample_and_upsample,
    #                       add_sources=('upsampled',))
    data_stream = Mapping(data_stream, _equalize_size)
    data_stream = Mapping(data_stream, _get_residual,
                          add_sources=('residual',))
    data_stream = FilterSources(data_stream,
                                sources=('upsampled', 'residual',))
    data_stream = Mapping(data_stream, _segment_axis)
    data_stream = Mapping(data_stream, _transpose)

    return data_stream
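# The element-wise helpers used in the two Blizzard pipelines above (`_copy`,
# `_equalize_size`, `_get_residual`, `_transpose`) are defined elsewhere.
# A rough sketch of plausible implementations, assuming each batch arrives as
# a tuple of 2-D (batch, time) arrays; these are assumptions, not the
# original code:

import numpy


def _copy(data):
    # Duplicate the raw waveform so the copy can be resampled independently
    # as the 'upsampled' source.
    return (numpy.copy(data[0]),)


def _equalize_size(data):
    # Trim every source along the time axis to the shortest length.
    min_len = min(var.shape[1] for var in data)
    return tuple(var[:, :min_len] for var in data)


def _get_residual(data):
    # Residual between the original signal and its upsampled approximation.
    return (data[0] - data[1],)


def _transpose(data):
    # Move the time axis to the front: (batch, time, ...) -> (time, batch, ...)
    return tuple(numpy.swapaxes(var, 0, 1) for var in data)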
def test_mapping_sort_multisource(self):
    data = OrderedDict([('x', self.data_x), ('y', self.data_y)])
    data_sorted = [([1, 2, 3], [6, 5, 4]),
                   ([1, 2, 3], [4, 6, 5]),
                   ([1, 2, 3], [4, 5, 6])]
    stream = DataStream(IterableDataset(data))
    transformer = Mapping(stream,
                          mapping=SortMapping(operator.itemgetter(0)))
    assert_equal(list(transformer.get_epoch_iterator()), data_sorted)
def test_mapping_dict(self):
    def mapping(d):
        return {'data': [2 * i for i in d['data']]}

    stream = DataStream(IterableDataset(self.data))
    transformer = Mapping(stream, mapping, mapping_accepts=dict)
    assert_equal(list(transformer.get_epoch_iterator()),
                 list(zip([[2, 4, 6], [4, 6, 2], [6, 4, 2]])))
def test_mapping_sort_multisource():
    data = OrderedDict()
    data['x'] = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]
    data['y'] = [[6, 5, 4], [6, 5, 4], [6, 5, 4]]
    data_sorted = [([1, 2, 3], [6, 5, 4]),
                   ([1, 2, 3], [4, 6, 5]),
                   ([1, 2, 3], [4, 5, 6])]
    stream = DataStream(IterableDataset(data))
    wrapper = Mapping(stream, mapping=SortMapping(operator.itemgetter(0)))
    assert list(wrapper.get_epoch_iterator()) == data_sorted
def get_dev_stream_with_prefixes(val_set=None, val_set_grndtruth=None,
                                 src_vocab=None, src_vocab_size=30000,
                                 trg_vocab=None, trg_vocab_size=30000,
                                 unk_id=1, return_vocab=False, **kwargs):
    """Setup development set stream if necessary."""

    dev_stream = None
    if val_set is not None and val_set_grndtruth is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
        trg_vocab = _ensure_special_tokens(
            trg_vocab if isinstance(trg_vocab, dict)
            else cPickle.load(open(trg_vocab)),
            bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

        dev_source_dataset = TextFile([val_set], src_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')
        dev_target_dataset = TextFile([val_set_grndtruth], trg_vocab,
                                      bos_token='<S>',
                                      eos_token='</S>',
                                      unk_token='<UNK>')

        dev_stream = Merge([dev_source_dataset.get_example_stream(),
                            dev_target_dataset.get_example_stream()],
                           ('source', 'target'))

        # now add prefix and suffixes to this stream
        dev_stream = Mapping(
            dev_stream,
            PrefixSuffixStreamTransformer(
                sample_ratio=kwargs.get('dev_sample_ratio', 1.)),
            add_sources=('target_prefix', 'target_suffix'))

        dev_stream = Mapping(dev_stream,
                             CopySourceAndTargetToMatchPrefixes(dev_stream))

        # changing stream.produces_examples is a little hack which lets us
        # use Unpack to flatten
        dev_stream.produces_examples = False

        # flatten the stream back out into
        # (source, target, target_prefix, target_suffix)
        dev_stream = Unpack(dev_stream)

    if return_vocab:
        return dev_stream, src_vocab, trg_vocab
    else:
        return dev_stream