def test_strictness_2(self):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5, 6]))
    transformer = Batch(stream, ConstantScheme(2), strictness=2)
    assert_equal(
        list(transformer.get_epoch_iterator()),
        [(numpy.array([1, 2]),), (numpy.array([3, 4]),),
         (numpy.array([5, 6]),)])
def setUp(self):
    data = range(10)
    self.stream = Batch(
        DataStream(IterableDataset(data)),
        iteration_scheme=ConstantScheme(2))
    data_np = numpy.arange(10)
    self.stream_np = Batch(
        DataStream(IterableDataset(data_np)),
        iteration_scheme=ConstantScheme(2))
def get_tr_stream(src_vocab, trg_vocab, src_data, trg_data,
                  src_vocab_size=30000, trg_vocab_size=30000, unk_id=1,
                  seq_len=50, batch_size=80, sort_k_batches=12, **kwargs):
    """Prepares the training data stream."""

    # Load dictionaries and ensure special tokens exist
    src_vocab = _ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab, 'rb')),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = _ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab, 'rb')),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    stream = Merge([src_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    # Filter sequences that are too long
    stream = Filter(stream, predicate=_too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    stream = Mapping(stream, _oov_to_unk(src_vocab_size=src_vocab_size,
                                         trg_vocab_size=trg_vocab_size,
                                         unk_id=unk_id))

    # Build a batched version of stream to read k batches ahead
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(
                       batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    stream = Mapping(stream, SortMapping(_length))

    # Convert it into a stream again
    stream = Unpack(stream)

    # Construct batches from the stream with specified batch size
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = PaddingWithEOS(
        stream, [src_vocab_size - 1, trg_vocab_size - 1])

    return masked_stream
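# --- Usage sketch (illustration only, not from the original source) -------
# A minimal, hypothetical call to get_tr_stream: the pickle and text file
# paths below are placeholders, and the vocabularies are assumed to be
# {token: id} dicts as expected by fuel's TextFile.
tr_stream = get_tr_stream(
    src_vocab='vocab.src.pkl', trg_vocab='vocab.trg.pkl',
    src_data='train.src', trg_data='train.trg',
    batch_size=80, sort_k_batches=12)
# PaddingWithEOS yields (source, source_mask, target, target_mask),
# each shaped (batch, time).
source, source_mask, target, target_mask = next(
    tr_stream.get_epoch_iterator())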
def get_sgnmt_tr_stream(src_data, trg_data,
                        src_vocab_size=30000, trg_vocab_size=30000,
                        unk_id=1, seq_len=50, batch_size=80,
                        sort_k_batches=12, **kwargs):
    """Prepares the unshuffled training data stream. This corresponds
    to ``get_tr_stream`` in ``machine_translation/stream`` in the
    blocks examples."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def setUp(self):
    self.streams = (
        DataStream(IterableDataset(['Hello world!'])),
        DataStream(IterableDataset(['Bonjour le monde!'])))
    self.batch_streams = (
        Batch(DataStream(IterableDataset(['Hello world!', 'Hi!'])),
              iteration_scheme=ConstantScheme(2)),
        Batch(DataStream(IterableDataset(['Bonjour le monde!', 'Salut!'])),
              iteration_scheme=ConstantScheme(2)))
    self.transformer = Merge(
        self.streams, ('english', 'french'))
    self.batch_transformer = Merge(
        self.batch_streams, ('english', 'french'))
def get_stream(self, part, batches=True, shuffle=True, add_sources=(),
               num_examples=None, rng=None, seed=None):
    dataset = self.get_dataset(part, add_sources=add_sources)
    if num_examples is None:
        num_examples = dataset.num_examples

    if shuffle:
        iteration_scheme = ShuffledExampleScheme(num_examples, rng=rng)
    else:
        iteration_scheme = SequentialExampleScheme(num_examples)

    stream = DataStream(dataset, iteration_scheme=iteration_scheme)

    stream = FilterSources(stream, (self.recordings_source,
                                    self.labels_source) + tuple(add_sources))
    if self.add_eos:
        stream = Mapping(stream, _AddLabel(self.eos_label))
    if self.add_bos:
        stream = Mapping(stream, _AddLabel(self.bos_label, append=False,
                                           times=self.add_bos))
    if self.preprocess_text:
        stream = Mapping(stream, lvsr.datasets.wsj.preprocess_text)
    stream = Filter(stream, self.length_filter)

    if self.sort_k_batches and batches:
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(
                           self.batch_size * self.sort_k_batches))
        stream = Mapping(stream, SortMapping(_length))
        stream = Unpack(stream)

    if self.preprocess_features == 'log_spectrogram':
        stream = Mapping(
            stream, functools.partial(apply_preprocessing, log_spectrogram))
    if self.normalization:
        stream = self.normalization.wrap_stream(stream)
    stream = ForceFloatX(stream)
    if not batches:
        return stream

    stream = Batch(
        stream,
        iteration_scheme=ConstantScheme(self.batch_size if part == 'train'
                                        else self.validation_batch_size))
    stream = Padding(stream)
    stream = Mapping(stream, switch_first_two_axes)
    stream = ForceCContiguous(stream)
    return stream
def get_sgnmt_shuffled_tr_stream(src_data, trg_data,
                                 src_vocab_size=30000, trg_vocab_size=30000,
                                 unk_id=1, seq_len=50, batch_size=80,
                                 sort_k_batches=12, **kwargs):
    """Prepares the shuffled training data stream. This is similar to
    ``get_sgnmt_tr_stream`` but uses ``ParallelTextFile`` in combination
    with ``ShuffledExampleScheme`` to support reshuffling."""

    # Build dummy vocabulary to make TextFile happy
    src_vocab = add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    parallel_dataset = ParallelTextFile(src_data, trg_data,
                                        src_vocab, trg_vocab, None)
    #iter_scheme = SequentialExampleScheme(parallel_dataset.num_examples)
    iter_scheme = ShuffledExampleScheme(parallel_dataset.num_examples)
    s = DataStream(parallel_dataset, iteration_scheme=iter_scheme)

    # Filter sequences that are too long
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))

    # Replace out of vocabulary tokens with unk token
    s = Mapping(s, stream._oov_to_unk(src_vocab_size=src_vocab_size,
                                      trg_vocab_size=trg_vocab_size,
                                      unk_id=utils.UNK_ID))

    # Build a batched version of stream to read k batches ahead
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size * sort_k_batches))

    # Sort all samples in the read-ahead batch
    s = Mapping(s, SortMapping(stream._length))

    # Convert it into a stream again
    s = Unpack(s)

    # Construct batches from the stream with specified batch size
    s = Batch(s, iteration_scheme=ConstantScheme(batch_size))

    # Pad sequences that are short
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])

    return masked_stream
def create_data_generator(path, vocab_file, config):
    ds = QADataset(path, vocab_file, config.n_entities,
                   need_sep_token=config.concat_ctx_and_question)
    it = QAIterator(path, shuffle=config.shuffle_questions)

    stream = DataStream(ds, iteration_scheme=it)

    if config.concat_ctx_and_question:
        stream = ConcatCtxAndQuestion(stream, config.concat_question_before,
                                      ds.reverse_vocab['<SEP>'])

    # Sort sets of multiple batches to make batches of similar sizes
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(
        stream.sources.index(
            'question' if config.concat_ctx_and_question else 'context'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream,
                     mask_sources=['context', 'question', 'candidates'],
                     mask_dtype='int32')

    def gen():
        if not config.concat_ctx_and_question:
            for (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask,
                 tg, candidates,
                 candidates_mask) in stream.get_epoch_iterator():
                seq_cont_mask = seq_cont_mask.astype('float32')
                seq_quest_mask = seq_quest_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')
                yield (seq_cont, seq_cont_mask, seq_quest, seq_quest_mask,
                       tg, candidates, candidates_mask)
        else:
            for (seq, seq_mask, tg, candidates, candidates_mask) \
                    in stream.get_epoch_iterator():
                seq_mask = seq_mask.astype('float32')
                candidates_mask = candidates_mask.astype('float32')
                yield (seq, seq_mask, tg, candidates, candidates_mask)

    return gen
def setup_sorter_datastream(path, config):
    ds = SorterDataset(path)
    it = ShuffledExampleScheme(examples=config.example_count)
    stream = DataStream(ds, iteration_scheme=it)

    # Read several batches ahead and sort them by length so that each final
    # batch contains sequences of similar size
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(config.batch_size *
                                                   config.sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('unsorted'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream, iteration_scheme=ConstantScheme(config.batch_size))
    stream = Padding(stream, mask_sources=['answer', 'unsorted'],
                     mask_dtype='int32')
    return ds, stream
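# --- Usage sketch (illustration only, not from the original source) -------
# SorterDataset and _balanced_batch_helper come from the surrounding
# project; the config object and the data path below are hypothetical
# stand-ins covering only the attributes the function actually reads.
from argparse import Namespace

toy_config = Namespace(example_count=1000, batch_size=32, sort_batch_count=20)
ds, sorted_stream = setup_sorter_datastream('data/sort.txt', toy_config)
batch = next(sorted_stream.get_epoch_iterator())
# The batch interleaves each padded source with its int32 mask.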
def train(self, req_vars):
    valid = TaxiDataset(self.config.valid_set, 'valid.hdf5',
                        sources=('trip_id',))
    valid_trips_ids = valid.get_data(None, slice(0, valid.num_examples))[0]

    stream = TaxiDataset('train')

    if hasattr(self.config, 'use_cuts_for_training') \
            and self.config.use_cuts_for_training:
        stream = DataStream(stream, iteration_scheme=TaxiTimeCutScheme())
    else:
        stream = DataStream(stream,
                            iteration_scheme=ShuffledExampleScheme(
                                stream.num_examples))

    stream = transformers.TaxiExcludeTrips(stream, valid_trips_ids)
    stream = transformers.TaxiGenerateSplits(
        stream, max_splits=self.config.max_splits)

    stream = transformers.taxi_add_datetime(stream)
    # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
    stream = transformers.Select(stream, tuple(req_vars))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(self.config.batch_size))

    stream = MultiProcessing(stream)

    return stream
def valid(self, req_vars):
    stream = TaxiStream(self.config.valid_set, 'valid.hdf5')

    stream = transformers.taxi_add_datetime(stream)
    # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
    stream = transformers.Select(stream, tuple(req_vars))

    return Batch(stream, iteration_scheme=ConstantScheme(1000))
def get_stream(self, part, batch_size, seed=None, raw_text=False):
    d = self.get_dataset(part)
    print("Dataset with {} examples".format(d.num_examples))
    it = ShuffledExampleScheme(d.num_examples,
                               rng=numpy.random.RandomState(seed))
    stream = DataStream(d, iteration_scheme=it)
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))

    if self._retrieval:
        stream = FixedMapping(
            stream,
            functools.partial(retrieve_and_pad_snli, self._retrieval),
            add_sources=("defs", "def_mask",
                         "sentence1_def_map", "sentence2_def_map"))
        # This is because there is bug in Fuel :( Cannot concatenate tuple
        # and list

    if not raw_text:
        stream = SourcewiseMapping(stream,
                                   functools.partial(digitize, self.vocab),
                                   which_sources=('sentence1', 'sentence2'))

    stream = Padding(stream,
                     mask_sources=('sentence1',
                                   'sentence2'))  # Increases amount of outputs by x2
    return stream
def test(self, req_vars):
    prefix_stream = DataStream(self.test_dataset,
                               iteration_scheme=SequentialExampleScheme(
                                   self.test_dataset.num_examples))

    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    if not data.tvt:
        prefix_stream = transformers.taxi_remove_test_only_clients(
            prefix_stream)

    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))
    prefix_stream = Padding(prefix_stream,
                            mask_sources=['latitude', 'longitude'])

    candidate_stream = self.candidate_stream(
        self.config.test_candidate_size, False)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)

    stream = transformers.Select(stream, tuple(req_vars))
    # stream = MultiProcessing(stream)
    return stream
def test_two_sources(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert len(next(transformer.get_epoch_iterator())) == 4
def _get_align_stream(src_data, trg_data, src_vocab_size, trg_vocab_size,
                      seq_len, **kwargs):
    """Creates the stream which is used for the main loop.

    Args:
        src_data (string): Path to the source sentences
        trg_data (string): Path to the target sentences
        src_vocab_size (int): Size of the source vocabulary in the NMT model
        trg_vocab_size (int): Size of the target vocabulary in the NMT model
        seq_len (int): Maximum length of any source or target sentence

    Returns:
        ExplicitNext. Alignment data stream which can be iterated explicitly
    """
    # Build dummy vocabulary to make TextFile happy
    src_vocab = _add_special_ids({str(i): i for i in xrange(src_vocab_size)})
    trg_vocab = _add_special_ids({str(i): i for i in xrange(trg_vocab_size)})

    # Get text files from both source and target
    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    # Merge them to get a source, target pair
    s = Merge([src_dataset.get_example_stream(),
               trg_dataset.get_example_stream()],
              ('source', 'target'))
    s = Filter(s, predicate=stream._too_long(seq_len=seq_len))
    s = Batch(s, iteration_scheme=ConstantScheme(1))
    masked_stream = stream.PaddingWithEOS(s, [utils.EOS_ID, utils.EOS_ID])
    return ExplicitNext(masked_stream)
def test_adds_batch_to_axis_labels(self):
    stream = DataStream(
        IterableDataset(
            {'features': [1, 2, 3, 4, 5]},
            axis_labels={'features': ('index',)}))
    transformer = Batch(stream, ConstantScheme(2), strictness=0)
    assert_equal(transformer.axis_labels, {'features': ('batch', 'index')})
def candidate_stream(self, n_candidates, sortmap=True):
    candidate_stream = DataStream(self.train_dataset,
                                  iteration_scheme=ShuffledExampleScheme(
                                      self.train_dataset.num_examples))
    if not data.tvt:
        candidate_stream = transformers.TaxiExcludeTrips(
            candidate_stream, self.valid_trips_ids)
    candidate_stream = transformers.TaxiExcludeEmptyTrips(candidate_stream)
    candidate_stream = transformers.taxi_add_datetime(candidate_stream)
    if not data.tvt:
        candidate_stream = transformers.add_destination(candidate_stream)

    if sortmap:
        candidate_stream = transformers.balanced_batch(
            candidate_stream, key='latitude',
            batch_size=n_candidates,
            batch_sort_size=self.config.batch_sort_size)
    else:
        candidate_stream = Batch(
            candidate_stream,
            iteration_scheme=ConstantScheme(n_candidates))

    candidate_stream = Padding(candidate_stream,
                               mask_sources=['latitude', 'longitude'])

    return candidate_stream
def get_data_stream(iterable):
    dataset = IterableDataset({'numbers': iterable})
    data_stream = Mapping(dataset.get_example_stream(),
                          _data_sqrt, add_sources=('roots',))
    data_stream = Mapping(data_stream, _array_tuple)
    return Batch(data_stream, ConstantScheme(20))
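# --- Illustration only: the two helpers below are plausible guesses at
# --- what the project's _data_sqrt and _array_tuple do, not its originals.
import numpy
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Mapping


def _data_sqrt(data):
    # With add_sources, Mapping appends the returned tuple as new sources.
    return (numpy.sqrt(data[0]),)


def _array_tuple(data):
    # Turn every element of the example into a numpy array.
    return tuple(numpy.asarray(d) for d in data)


numbers, roots = next(get_data_stream(range(1, 101)).get_epoch_iterator())
# `numbers` and `roots` each hold 20 examples (the batch size above).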
def test(self, req_vars):
    prefix_stream = DataStream(self.test_dataset,
                               iteration_scheme=SequentialExampleScheme(
                                   self.test_dataset.num_examples))
    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)

    if not data.tvt:
        prefix_stream = transformers.taxi_remove_test_only_clients(
            prefix_stream)

    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))

    candidate_stream = self.candidate_stream(self.config.test_candidate_size)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def train(self, req_vars):
    prefix_stream = DataStream(self.train_dataset,
                               iteration_scheme=ShuffledExampleScheme(
                                   self.train_dataset.num_examples))

    if not data.tvt:
        prefix_stream = transformers.TaxiExcludeTrips(
            prefix_stream, self.valid_trips_ids)
    prefix_stream = transformers.TaxiExcludeEmptyTrips(prefix_stream)
    prefix_stream = transformers.TaxiGenerateSplits(
        prefix_stream, max_splits=self.config.max_splits)

    prefix_stream = transformers.taxi_add_datetime(prefix_stream)
    prefix_stream = transformers.taxi_add_first_last_len(
        prefix_stream, self.config.n_begin_end_pts)

    prefix_stream = Batch(prefix_stream,
                          iteration_scheme=ConstantScheme(
                              self.config.batch_size))

    candidate_stream = self.candidate_stream(self.config.train_candidate_size)

    sources = prefix_stream.sources + tuple(
        'candidate_%s' % k for k in candidate_stream.sources)
    stream = Merge((prefix_stream, candidate_stream), sources)
    stream = transformers.Select(stream, tuple(req_vars))
    stream = MultiProcessing(stream)
    return stream
def test_value_error_on_request(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert_raises(ValueError, transformer.get_data, [0, 1])
def test_cache():
    dataset = IterableDataset(range(100))
    stream = DataStream(dataset)
    batched_stream = Batch(stream, ConstantScheme(11))
    cached_stream = Cache(batched_stream, ConstantScheme(7))
    epoch = cached_stream.get_epoch_iterator()

    # Make sure that cache is filled as expected
    for (features,), cache_size in zip(epoch, [4, 8, 1, 5, 9, 2, 6, 10, 3,
                                               7, 0, 4]):
        assert len(cached_stream.cache[0]) == cache_size

    # Make sure that the epoch finishes correctly
    for (features,) in cached_stream.get_epoch_iterator():
        pass
    assert len(features) == 100 % 7
    assert not cached_stream.cache[0]

    # Ensure that the epoch transition is correct
    cached_stream = Cache(batched_stream, ConstantScheme(7, times=3))
    for _, epoch in zip(range(2), cached_stream.iterate_epochs()):
        cache_sizes = [4, 8, 1]
        for i, (features,) in enumerate(epoch):
            assert len(cached_stream.cache[0]) == cache_sizes[i]
            assert len(features) == 7
            assert numpy.all(list(range(100))[i * 7:(i + 1) * 7] == features)
        assert i == 2
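# --- Illustration only (not part of the test above) -----------------------
# Cache reads batches from the wrapped stream (here, size 11) and serves
# them in the size requested by its own scheme (here, size 7), keeping the
# leftover examples in an internal cache between requests.
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch, Cache

rebatched = Cache(Batch(DataStream(IterableDataset(range(100))),
                        ConstantScheme(11)),
                  ConstantScheme(7))
sizes = [len(batch[0]) for batch in rebatched.get_epoch_iterator()]
# 14 full batches of 7 and one final batch of 2 (100 = 14 * 7 + 2)
assert sizes == [7] * 14 + [2]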
def get_stream(self, part, batch_size=None, max_length=None, seed=None,
               remove_keys=False, add_bos_=True,
               remove_n_identical_keys=True):
    dataset = self.get_dataset(part, max_length)
    if self._layout == 'lambada' and part == 'train':
        stream = DataStream(
            dataset,
            iteration_scheme=RandomSpanScheme(
                dataset.num_examples, max_length, seed))
        stream = Mapping(stream, listify)
    else:
        stream = dataset.get_example_stream()

    if add_bos_:
        stream = SourcewiseMapping(stream,
                                   functools.partial(add_bos, Vocabulary.BOS),
                                   which_sources=('words',))
    if max_length is not None:
        stream = SourcewiseMapping(stream,
                                   functools.partial(cut_if_too_long,
                                                     max_length),
                                   which_sources=('words',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('words',))
    stream = SourcewiseMapping(stream, word_to_singleton_list,
                               which_sources=('keys',))
    stream = SourcewiseMapping(stream, vectorize, which_sources=('keys',))
    stream = Flatten(stream, which_sources=('keys',))

    if self._layout == 'dict':
        if remove_keys:
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'keys'])
        if remove_n_identical_keys:
            print("remove identical keys")
            stream = FilterSources(stream,
                                   [source for source in stream.sources
                                    if source != 'n_identical_keys'])

    if not batch_size:
        return stream

    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=('words',))
    # stream = Flatten(stream, which_sources=('n_identical_keys',))
    # if self._layout == 'dict':
    #     stream = FilterSources(stream, [source for source in stream.sources
    #                                     if source != 'keys_mask'])
    #     stream = FilterSources(stream, [source for source in stream.sources
    #                                     if source != 'n_identical_keys_mask'])
    return stream
def get_test_stream(sfiles, svocab_dict):
    dataset = TextFile(sfiles, svocab_dict, bos_token=None, eos_token=None,
                       unk_token='<unk>', level='word', preprocess=None,
                       encoding='utf8')
    stream = Merge([dataset.get_example_stream()], ('source',))
    stream = Batch(stream, iteration_scheme=ConstantScheme(10))
    stream = Padding(stream)
    return stream
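# --- Usage sketch (illustration only; the file path and tiny vocabulary
# --- below are hypothetical placeholders) ----------------------------------
vocab = {'<unk>': 0, 'the': 1, 'cat': 2, 'sat': 3}
test_stream = get_test_stream(['test.tok.txt'], vocab)
# Padding yields (source, source_mask), each shaped (batch<=10, max_len).
source, source_mask = next(test_stream.get_epoch_iterator())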
def test(self, req_vars):
    stream = TaxiStream('test')

    stream = transformers.taxi_add_datetime(stream)
    # stream = transformers.taxi_add_first_last_len(stream, self.config.n_begin_end_pts)
    stream = transformers.taxi_remove_test_only_clients(stream)

    return Batch(stream, iteration_scheme=ConstantScheme(1))
def test_unpack_transformer():
    data = range(10)
    stream = DataStream(IterableDataset(data))
    stream = Batch(stream, iteration_scheme=ConstantScheme(2))
    stream = Unpack(stream)
    epoch = stream.get_epoch_iterator()
    for i, v in enumerate(epoch):
        assert numpy.shape(v)[0] == 1
        assert v[0] == i
def test_mask_sources(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                OrderedDict([('features', [[1], [2, 3]]),
                             ('targets', [[4, 5, 6], [7]])]))),
        ConstantScheme(2)), mask_sources=('features',))
    assert_equal(len(next(transformer.get_epoch_iterator())), 3)
def test_mask_dtype(self):
    transformer = Padding(Batch(
        DataStream(
            IterableDataset(
                dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)), mask_dtype='uint8')
    assert_equal(
        str(next(transformer.get_epoch_iterator())[1].dtype), 'uint8')
def load_parallel_data(src_file, tgt_file, batch_size, sort_k_batches,
                       dictionary, training=False):
    def preproc(s):
        s = s.replace('``', '"')
        s = s.replace('\'\'', '"')
        return s

    enc_dset = TextFile(files=[src_file], dictionary=dictionary,
                        bos_token=None, eos_token=None,
                        unk_token=CHAR_UNK_TOK, level='character',
                        preprocess=preproc)
    dec_dset = TextFile(files=[tgt_file], dictionary=dictionary,
                        bos_token=CHAR_SOS_TOK, eos_token=CHAR_EOS_TOK,
                        unk_token=CHAR_UNK_TOK, level='character',
                        preprocess=preproc)

    # NOTE merge encoder and decoder setup together
    stream = Merge([enc_dset.get_example_stream(),
                    dec_dset.get_example_stream()],
                   ('source', 'target'))

    if training:
        # filter sequences that are too long
        stream = Filter(stream, predicate=TooLong(seq_len=CHAR_MAX_SEQ_LEN))
        # batch and read k batches ahead
        stream = Batch(stream,
                       iteration_scheme=ConstantScheme(batch_size *
                                                       sort_k_batches))
        # sort all samples in read-ahead batch
        stream = Mapping(stream, SortMapping(lambda x: len(x[1])))
        # turn back into stream
        stream = Unpack(stream)

    # batch again
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    masked_stream = Padding(stream)
    return masked_stream
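# --- Usage sketch (illustration only; the file paths are placeholders, the
# --- CHAR_* constants come from the surrounding project, and
# --- build_char_dictionary is a hypothetical helper returning {char: id}) --
char_dict = build_char_dictionary()
train_stream = load_parallel_data('train.src.txt', 'train.tgt.txt',
                                  batch_size=32, sort_k_batches=12,
                                  dictionary=char_dict, training=True)
# Each batch is (source, source_mask, target, target_mask); sorting the
# read-ahead block by target length keeps padding inside a batch small.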
def setup_datastream(batch_size, **kwargs):
    ds = ToyDataset(**kwargs)

    stream = DataStream(ds, iteration_scheme=SequentialExampleScheme(
        kwargs['nb_examples']))
    stream = Batch(stream, iteration_scheme=ConstantScheme(batch_size))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
def test_batch():
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    wrapper = Batch(stream, ConstantScheme(2))
    batches = list(wrapper.get_epoch_iterator())
    expected = [(numpy.array([1, 2]),),
                (numpy.array([3, 4]),),
                (numpy.array([5]),)]
    assert len(batches) == len(expected)
    for b, e in zip(batches, expected):
        assert (b[0] == e[0]).all()

    # Check the `strict` flag
    def try_strict(strictness):
        return list(Batch(stream, ConstantScheme(2), strictness=strictness)
                    .get_epoch_iterator())
    assert_raises(ValueError, try_strict, 2)
    assert len(try_strict(1)) == 2
    stream2 = DataStream(IterableDataset([1, 2, 3, 4, 5, 6]))
    assert len(list(Batch(stream2, ConstantScheme(2), strictness=2)
                    .get_epoch_iterator())) == 3
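# --- Illustration only: what the `strictness` flag of Batch means ---------
# strictness=0 (default) keeps a final, smaller batch; strictness=1 silently
# drops it; strictness=2 raises ValueError unless the examples divide evenly
# into full batches.
from fuel.datasets import IterableDataset
from fuel.schemes import ConstantScheme
from fuel.streams import DataStream
from fuel.transformers import Batch

five = DataStream(IterableDataset([1, 2, 3, 4, 5]))
assert len(list(Batch(five, ConstantScheme(2),
                      strictness=0).get_epoch_iterator())) == 3  # 2 + 2 + 1
assert len(list(Batch(five, ConstantScheme(2),
                      strictness=1).get_epoch_iterator())) == 2  # drops the 1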
def test_2d_sequences(self):
    stream = Batch(
        DataStream(
            IterableDataset([numpy.ones((3, 4)), 2 * numpy.ones((2, 4))])),
        ConstantScheme(2))
    it = Padding(stream).get_epoch_iterator()
    data, mask = next(it)
    assert data.shape == (2, 3, 4)
    assert (data[0, :, :] == 1).all()
    assert (data[1, :2, :] == 2).all()
    assert (mask == numpy.array([[1, 1, 1], [1, 1, 0]])).all()
from theano import tensor

from blocks.bricks import Linear
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Flatten, Mapping
from numpy import swapaxes

from datasets.addition import AdditionTask
from extensions.plot import Plot


def _transpose(data):
    # Put the time axis first for recurrent bricks: (batch, time, ...) ->
    # (time, batch, ...), leaving low-rank arrays (e.g. targets) untouched.
    return tuple(swapaxes(array, 0, 1) if len(array.shape) > 2 else array
                 for array in data)


dataset = AdditionTask(1000)

train_stream = dataset.get_example_stream()
train_stream = Batch(train_stream, iteration_scheme=ConstantScheme(10))
train_stream = Mapping(train_stream, _transpose)

features_test, targets_test = next(train_stream.get_epoch_iterator())

x = tensor.tensor3('features')
y = tensor.matrix('targets')

n_batchs = 1000
h_dim = 2
x_dim = 2

encode = Linear(name='encode', input_dim=x_dim, output_dim=h_dim)
def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel
    dictionary = pkl.load(open(DICT_FILE, 'r'))
    dictionary['~'] = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # idem dito for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)

    val_stream = DataStream.default_stream(val)

    # organize data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream,
                       iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)

    print('Building model')

    # Set the random number generator's seeds for consistency
    rng = numpy.random.RandomState(12345)

    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask, n_in=111, n_h=n_h)

    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=111)

    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each model parameter. We thus create the
    # updates list by automatically looping over all (params[i], grads[i])
    # pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])

    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])

    start_time = time.clock()

    iteration = 0

    for epoch in range(n_epochs):
        print 'epoch:', epoch

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text after each 40 minibatches
            if iteration % 40 == 0:
                try:
                    prediction = numpy.ones(111, dtype=config.floatX) / 111.0
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t,
                                                           h_p.flatten(),
                                                           c_p.flatten())
                    sample = numpy.random.multinomial(1, prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t,
                                                           h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(
                            1, prediction.flatten())
                    print 'LSTM: "' + sentence + '"'
                except ValueError:
                    print 'Something went wrong during sentence generation.'

            if iteration % 40 == 0:
                print 'epoch:', epoch, ' minibatch:', iteration
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print 'Average validation CE per sentence:', \
                    numpy.mean(val_scores)

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))
def test_strictness_2_error(self):
    stream = DataStream(IterableDataset([1, 2, 3, 4, 5]))
    transformer = Batch(stream, ConstantScheme(2), strictness=2)
    assert_raises(ValueError, list, transformer.get_epoch_iterator())
def DStream(datatype, config):
    if datatype == 'train':
        filename = config['train_file']
        filename_morph = config['train_morph_file']
        filename_rel = config['train_rel_file']
    elif datatype == 'valid':
        filename = config['valid_file']
        filename_morph = config['valid_morph_file']
        filename_rel = config['valid_rel_file']
    elif datatype == 'test':
        filename = config['test_file']
        filename_morph = config['test_morph_file']
        filename_rel = config['test_rel_file']
    else:
        logger.error('wrong datatype, train, valid, or test')

    data = TextFile(files=[filename],
                    dictionary=pickle.load(open(config['train_dic'], 'rb')),
                    unk_token=config['unk_token'],
                    level='word',
                    bos_token=config['bos_token'],
                    eos_token=config['eos_token'])
    data_morph = TextFile(files=[filename_morph],
                          dictionary=pickle.load(
                              open(config['train_morph_dic'], 'rb')),
                          unk_token=config['unk_token'],
                          level='word',
                          bos_token=config['bos_token'],
                          eos_token=config['eos_token'])

    data_stream = DataStream.default_stream(data)
    data_stream.sources = ('sentence',)
    data_morph_stream = DataStream.default_stream(data_morph)
    data_morph_stream.sources = ('sentence',)

    # organize data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']

    # read the relation file in chunks of batch_size lines
    rels_stream = []
    with open(filename_rel, "r") as fin:
        lines = fin.readlines()
        i = 0
        while i < len(lines):
            if i + batch_size < len(lines):
                rels_stream.append(padding(lines[i:i + batch_size]))
            else:
                rels_stream.append(padding(lines[i:len(lines)]))
            i = i + batch_size

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)

    data_morph_stream = Batch(data_morph_stream,
                              iteration_scheme=ConstantScheme(batch_size))
    data_morph_stream = Padding(data_morph_stream)

    data_morph_tensor3 = []
    mask_morph_tensor3 = []
    # data_morph_stream : batch_num * batch * sentence
    # rels_stream       : batch_num * batch * sentence
    # data_morph_tensor3: batch_num * batch * sentence * morph
    for data_morph_tuple, rel in zip(data_morph_stream.get_epoch_iterator(),
                                     rels_stream):
        data_morph, mask_morph = data_morph_tuple
        # data_morph : batch * sentence
        # rel        : batch * sentence
        tmp = []
        tmp_mask = []
        for m, mask, r in zip(data_morph, mask_morph, rel):
            # m : sentence, r : sentence
            start = 0
            tmp2 = []
            tmp_mask2 = []
            for idx in r:
                tmp2.append(m[start:start + idx].tolist())
                tmp_mask2.append(mask[start:start + idx].tolist())
                start = start + idx
            tmp.append(tmp2)
            tmp_mask.append(tmp_mask2)
        data_morph_tensor3.append(np.array(padding2(tmp)))
        mask_morph_tensor3.append(np.array(padding2(tmp_mask),
                                           dtype='float32'))

    return data_stream, data_morph_tensor3, mask_morph_tensor3
from addition import AdditionTask
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Mapping
from numpy import swapaxes


def _transpose(data):
    # Swap batch and time axes for high-rank arrays; lower-rank arrays
    # (e.g. targets) are passed through unchanged rather than dropped.
    return tuple(swapaxes(array, 0, 1) if len(array.shape) > 2 else array
                 for array in data)


dataset = AdditionTask(17)

data_stream = dataset.get_example_stream()
data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(14))
data_stream = Mapping(data_stream, _transpose)

print next(data_stream.get_epoch_iterator())[0].shape
def main(mode, save_path, steps, num_batches, load_params):
    chars = (list(string.ascii_uppercase) + list(range(10)) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    ind_to_char = {v: k for k, v in char_to_ind.iteritems()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))
    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        main_loop = load(open(save_path, "rb"))
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        states, outputs, costs = [data[:, 0] for data in sample()]
        print("".join([ind_to_char[s] for s in outputs]))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()

        trans_freqs = numpy.zeros((vocab_size, vocab_size), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(batch_cost, features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape) for key, value
             in Selector(generator).get_parameters().items()],
            width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost, parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.),
                                     AdaDelta(epsilon=1e-8)
                                     # , Restrict(VariableClipping(1.0, axis=0),
                                     #            maxnorm_subjects)
                                     ]))

        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        for name, param in parameters.items():
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)

        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm],
            prefix="average", every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid",
            every_n_batches=1500, before_training=False,
            data_stream=valid_stream)

        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path),
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                    .add_condition(
                        ['after_epoch'],
                        OnLogRecord(track_the_best_bpc.notification_name),
                        (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(
                          algorithm.total_step_norm)],
                      [average_monitoring.record_name(
                          algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/'
                  'wsj_trigram_with_initial_eos/lexicon.txt') as f:
            raw_words = [line.split()[1:-1] for line in f.readlines()]
        words = [[char_to_ind[c] if c in char_to_ind else char_to_ind['<UNK>']
                  for c in w] for w in raw_words]
        max_word_length = max([len(w) for w in words])

        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(features,
                                                 mask=features_mask,
                                                 states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]

        compute_cost = theano.function(
            [features, features_mask, initial_states],
            [cost_matrix_step.sum(axis=0), states])

        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                example = example[:(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_ \
                    .get_value()[None, :]
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i + 1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    costs = numpy.exp(-compute_cost(
                        examples, all_masks,
                        numpy.tile(state, [examples.shape[1], 1]))[0])

                    _, space_states = compute_cost(
                        single_space,
                        numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))

        print("Average cost", total_word_cost / num_words)
        print("PPL", numpy.exp(total_word_cost / num_words))
        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
def DStream(datatype, config):
    if datatype in ['train', 'valid', 'test']:
        filename = config[datatype + '_file']
        filename_morph = config[datatype + '_morph_file']
        filename_rel = config[datatype + '_rel_file']
    else:
        logger.error('wrong datatype, train, valid, or test')

    data_stream = getTextFile(filename, config['train_dic'], config)
    data_morph_stream = getTextFile(filename_morph,
                                    config['train_morph_dic'], config)

    # organize data in batches and pad shorter sequences with zeros
    batch_size = config['batch_size']

    # read the relation file in chunks of batch_size lines
    rels_stream = []
    with open(filename_rel, "r") as fin:
        lines = fin.readlines()
        i = 0
        while i < len(lines):
            if i + batch_size < len(lines):
                rels_stream.append(padding(lines[i:i + batch_size]))
            else:
                rels_stream.append(padding(lines[i:len(lines)]))
            i = i + batch_size

    data_stream = Batch(data_stream,
                        iteration_scheme=ConstantScheme(batch_size))
    data_stream = Padding(data_stream)

    data_morph_stream = Batch(data_morph_stream,
                              iteration_scheme=ConstantScheme(batch_size))
    data_morph_stream = Padding(data_morph_stream)

    data_morph_tensor3 = []
    mask_morph_tensor3 = []
    # data_morph_stream : batch_num * batch * sentence
    # rels_stream       : batch_num * batch * sentence
    # data_morph_tensor3: batch_num * batch * sentence * morph
    for data_morph_tuple, rel in zip(data_morph_stream.get_epoch_iterator(),
                                     rels_stream):
        data_morph, mask_morph = data_morph_tuple
        # data_morph : batch * sentence
        # rel        : batch * sentence
        tmp = []
        tmp_mask = []
        for m, mask, r in zip(data_morph, mask_morph, rel):
            start = 0
            tmp2 = []
            tmp_mask2 = []
            for idx in r:
                tmp2.append(m[start:start + idx].tolist())
                tmp_mask2.append(mask[start:start + idx].tolist())
                start = start + idx
            tmp.append(tmp2)
            tmp_mask.append(tmp_mask2)
        data_morph_tensor3.append(np.array(padding2(tmp)))
        mask_morph_tensor3.append(np.array(padding2(tmp_mask),
                                           dtype='float32'))

    return data_stream, data_morph_tensor3, mask_morph_tensor3