def test_dataset_evaluators():
    X = theano.tensor.matrix('X')
    brick = TestBrick(name='test_brick')
    Y = brick.apply(X)
    graph = ComputationGraph([Y])
    monitor_variables = [v for v in graph.auxiliary_variables]
    validator = DatasetEvaluator(monitor_variables)

    data = [numpy.arange(1, 5, dtype=floatX).reshape(2, 2),
            numpy.arange(10, 16, dtype=floatX).reshape(3, 2)]
    data_stream = IterableDataset(dict(X=data)).get_example_stream()

    values = validator.evaluate(data_stream)
    assert values['test_brick_apply_V_squared'] == 4
    numpy.testing.assert_allclose(
        values['test_brick_apply_mean_row_mean'], numpy.vstack(data).mean())
    per_batch_mean = numpy.mean([batch.mean() for batch in data])
    numpy.testing.assert_allclose(
        values['test_brick_apply_mean_batch_element'], per_batch_mean)

    with assert_raises(Exception) as ar:
        data_stream = IterableDataset(dict(X2=data)).get_example_stream()
        validator.evaluate(data_stream)
    assert "Not all data sources" in ar.exception.args[0]
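# `TestBrick` is not defined in this excerpt. A minimal sketch of a brick
# that would satisfy the assertions above, assuming Blocks' `Brick`,
# `application`, and `aggregation` APIs; the exact aggregation schemes
# attached here are an assumption inferred from the expected values:
class TestBrick(Brick):
    def _allocate(self):
        self.parameters = [shared_floatx(2, name='V')]

    @application
    def apply(self, x, application_call):
        V = self.parameters[0]
        # With V == 2, 'V_squared' evaluates to 4, as asserted above.
        application_call.add_auxiliary_variable(
            (V ** 2).sum(), name='V_squared')
        # Aggregated over the whole stream: the mean of all matrix elements.
        application_call.add_auxiliary_variable(
            aggregation.mean(x.mean(axis=1).sum(), x.shape[0]),
            name='mean_row_mean')
        # No aggregation scheme attached: averaged batch-wise.
        application_call.add_auxiliary_variable(
            x.mean(), name='mean_batch_element')
        return x + V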
def test_reset_calls_reset_on_all_streams(self):
    streams = [FlagDataStream(IterableDataset([1, 2, 3])),
               FlagDataStream(IterableDataset([4, 5, 6])),
               FlagDataStream(IterableDataset([7, 8, 9]))]
    transformer = Merge(streams, ('1', '2', '3'))
    transformer.reset()
    assert all(stream.reset_called for stream in streams)
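# `FlagDataStream` is not shown in this excerpt. A minimal sketch of such a
# test double, assuming Fuel's DataStream interface:
class FlagDataStream(DataStream):
    reset_called = False

    def reset(self):
        # Record that reset was propagated to this stream.
        self.reset_called = True
        super(FlagDataStream, self).reset()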
def setUp(self):
    data = range(10)
    self.stream = Batch(DataStream(IterableDataset(data)),
                        iteration_scheme=ConstantScheme(2))
    data_np = numpy.arange(10)
    self.stream_np = Batch(DataStream(IterableDataset(data_np)),
                           iteration_scheme=ConstantScheme(2))
def test_merge():
    english = IterableDataset(['Hello world!'])
    french = IterableDataset(['Bonjour le monde!'])
    streams = (english.get_example_stream(),
               french.get_example_stream())
    merged_stream = Merge(streams, ('english', 'french'))
    assert merged_stream.sources == ('english', 'french')
    assert (next(merged_stream.get_epoch_iterator()) ==
            ('Hello world!', 'Bonjour le monde!'))
def test_sources_selection():
    features = [5, 6, 7, 1]
    targets = [1, 0, 1, 1]
    stream = DataStream(IterableDataset(
        OrderedDict([('features', features), ('targets', targets)])))
    assert list(stream.get_epoch_iterator()) == list(zip(features, targets))

    stream = DataStream(IterableDataset(
        {'features': features, 'targets': targets},
        sources=('targets',)))
    assert list(stream.get_epoch_iterator()) == list(zip(targets))
def get_seq_mnist_streams(hidden_dim, batch_size=100, drop_prob=0.5):
    permutation = np.random.randint(0, 784, size=(784,))

    train_set, valid_set, test_set = load_data('mnist.pkl.gz')
    train_x = train_set[0].reshape((50000 / batch_size, batch_size, 784))
    train_x = np.swapaxes(train_x, 2, 1)
    train_x = train_x[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    # The label for each time-step is -1; only the last one is the real label.
    train_y = np.zeros(train_set[0].shape) - 1
    train_y[:, -1] = train_set[1]
    train_y = train_y.reshape((50000 / batch_size, batch_size, 784))
    train_y = np.swapaxes(train_y, 2, 1)
    train_y = train_y[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    valid_x = valid_set[0].reshape((10000 / batch_size, batch_size, 784))
    valid_x = np.swapaxes(valid_x, 2, 1)
    valid_x = valid_x[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    # As above: -1 everywhere except the real label at the last time-step.
    valid_y = np.zeros(valid_set[0].shape) - 1
    valid_y[:, -1] = valid_set[1]
    valid_y = valid_y.reshape((10000 / batch_size, batch_size, 784))
    valid_y = np.swapaxes(valid_y, 2, 1)
    valid_y = valid_y[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    train_x = train_x[:, permutation]
    valid_x = valid_x[:, permutation]

    train = IterableDataset({'x': train_x.astype(floatX),
                             'y': train_y[:, -1, :, 0].astype('int32')})
    train_stream = DataStream(train)
    train_stream = SampleDrops(train_stream, drop_prob, hidden_dim, False)
    train_stream.sources = ('y', 'x', 'drops')
    train_stream.get_epoch_iterator().next()

    valid = IterableDataset({'x': valid_x.astype(floatX),
                             'y': valid_y[:, -1, :, 0].astype('int32')})
    valid_stream = DataStream(valid)
    valid_stream = SampleDrops(valid_stream, drop_prob, hidden_dim, True)
    valid_stream.sources = ('y', 'x', 'drops')

    return train_stream, valid_stream
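# `SampleDrops` is not defined in this excerpt. A rough sketch of what such a
# transformer could look like, assuming Fuel's Transformer interface and
# assuming that 'drops' is a per-timestep Bernoulli mask over hidden units
# (both inferred from how it is used above):
class SampleDrops(Transformer):
    def __init__(self, data_stream, drop_prob, hidden_dim, is_test, **kwargs):
        super(SampleDrops, self).__init__(data_stream, **kwargs)
        self.drop_prob = drop_prob
        self.hidden_dim = hidden_dim
        self.is_test = is_test

    def get_data(self, request=None):
        data = dict(zip(self.data_stream.sources,
                        next(self.child_epoch_iterator)))
        x = data['x']
        shape = (x.shape[0], x.shape[1], self.hidden_dim)
        if self.is_test:
            # Deterministic expected value of the mask at test time.
            drops = np.ones(shape, dtype=floatX) * (1 - self.drop_prob)
        else:
            drops = np.random.binomial(
                1, 1 - self.drop_prob, size=shape).astype(floatX)
        data['drops'] = drops
        return tuple(data[source] for source in self.sources)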
def setUp(self):
    self.streams = (
        DataStream(IterableDataset(['Hello world!'])),
        DataStream(IterableDataset(['Bonjour le monde!'])))
    self.batch_streams = (
        Batch(DataStream(IterableDataset(['Hello world!', 'Hi!'])),
              iteration_scheme=ConstantScheme(2)),
        Batch(DataStream(IterableDataset(['Bonjour le monde!', 'Salut!'])),
              iteration_scheme=ConstantScheme(2)))
    self.transformer = Merge(self.streams, ('english', 'french'))
    self.batch_transformer = Merge(self.batch_streams, ('english', 'french'))
def test_num_examples():
    assert_raises(ValueError, IterableDataset,
                  {'features': range(10), 'targets': range(7)})
    dataset = IterableDataset({'features': range(7), 'targets': range(7)})
    assert dataset.num_examples == 7
    dataset = IterableDataset(repeat(1))
    assert numpy.isnan(dataset.num_examples)
    x = numpy.random.rand(5, 3)
    y = numpy.random.rand(5, 4)
    dataset = IndexableDataset({'features': x, 'targets': y})
    assert dataset.num_examples == 5
    assert_raises(ValueError, IndexableDataset,
                  {'features': x, 'targets': y[:4]})
def test_cache():
    dataset = IterableDataset(range(100))
    stream = DataStream(dataset)
    batched_stream = Batch(stream, ConstantScheme(11))
    cached_stream = Cache(batched_stream, ConstantScheme(7))
    epoch = cached_stream.get_epoch_iterator()

    # Make sure that the cache is filled as expected
    for (features,), cache_size in zip(epoch,
                                       [4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 0, 4]):
        assert len(cached_stream.cache[0]) == cache_size

    # Make sure that the epoch finishes correctly
    for (features,) in cached_stream.get_epoch_iterator():
        pass
    assert len(features) == 100 % 7
    assert not cached_stream.cache[0]

    # Ensure that the epoch transition is correct
    cached_stream = Cache(batched_stream, ConstantScheme(7, times=3))
    for _, epoch in zip(range(2), cached_stream.iterate_epochs()):
        cache_sizes = [4, 8, 1]
        for i, (features,) in enumerate(epoch):
            assert len(cached_stream.cache[0]) == cache_sizes[i]
            assert len(features) == 7
            assert numpy.all(list(range(100))[i * 7:(i + 1) * 7] == features)
        assert i == 2
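# Where the cache sizes above come from: the cache is refilled with batches
# of 11 whenever it holds fewer than the 7 requested examples. A quick,
# self-contained check of that arithmetic (`expected_cache_sizes` is a
# hypothetical helper, not part of the code under test):
def expected_cache_sizes(batch_size=11, request_size=7, n_requests=12):
    sizes, size = [], 0
    for _ in range(n_requests):
        while size < request_size:
            size += batch_size  # refill from the batched stream
        size -= request_size    # serve one request
        sizes.append(size)
    return sizes

assert expected_cache_sizes() == [4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 0, 4]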
def test_mapping_accepts_list_or_dict(self):
    def mapping(d):
        return [2 * i for i in d[0]],
    stream = DataStream(IterableDataset(self.data))
    assert_raises(ValueError,
                  lambda: Mapping(stream, mapping, mapping_accepts=int))
def do_test(with_serialization):
    data_stream = IterableDataset(range(10)).get_example_stream()
    main_loop = MainLoop(
        MockAlgorithm(), data_stream,
        extensions=[WriteBatchExtension(),
                    FinishAfter(after_n_batches=14)])
    main_loop.run()
    assert main_loop.log.status['iterations_done'] == 14

    if with_serialization:
        main_loop = cPickle.loads(cPickle.dumps(main_loop))

    finish_after = unpack(
        [ext for ext in main_loop.extensions
         if isinstance(ext, FinishAfter)], singleton=True)
    finish_after.add_condition(
        ["after_batch"],
        predicate=lambda log: log.status['iterations_done'] == 27)
    main_loop.run()
    assert main_loop.log.status['iterations_done'] == 27
    assert main_loop.log.status['epochs_done'] == 2
    for i in range(27):
        assert main_loop.log[i + 1]['batch'] == {"data": i % 10}
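# `MockAlgorithm` and `WriteBatchExtension` are not shown in this excerpt.
# Minimal sketches of such test doubles, assuming Blocks'
# TrainingAlgorithm and TrainingExtension interfaces:
class MockAlgorithm(TrainingAlgorithm):
    def initialize(self):
        pass

    def process_batch(self, batch):
        pass


class WriteBatchExtension(TrainingExtension):
    """Writes the current batch to the log, as the test above expects."""
    def after_batch(self, batch):
        self.main_loop.log.current_row['batch'] = batch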
def test_training_data_monitoring_updates_algorithm():
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 5], [5, 8]]]
    targets = numpy.array([f.sum() for f in features])
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    m = x.mean().copy(name='features_mean')
    t = y.sum().copy(name='targets_sum')

    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=UpdatesAlgorithm(),
        extensions=[TrainingDataMonitoring([m, t], prefix="train1",
                                           after_batch=True)])
    main_loop.extensions[0].main_loop = main_loop
    assert len(main_loop.algorithm.updates) == 0
    main_loop.extensions[0].do('before_training')
    assert len(main_loop.algorithm.updates) > 0
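# The point of the test above: TrainingDataMonitoring does its bookkeeping by
# pushing the aggregation updates for its monitored variables into the
# algorithm's `updates` list when it handles 'before_training', so the update
# list grows without a single training step being run.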
def test_adds_batch_to_axis_labels(self):
    stream = DataStream(
        IterableDataset({'features': [1, 2, 3, 4, 5]},
                        axis_labels={'features': ('index',)}))
    transformer = Batch(stream, ConstantScheme(2), strictness=0)
    assert_equal(transformer.axis_labels,
                 {'features': ('batch', 'index')})
def test_perclass_accuracy_monitor():
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)
    label_i_to_c = {0: "a", 1: "b", 2: "c"}
    test_probs = shared_floatx(
        numpy.array([[0.0, 0.0, 1.0],
                     [0.75, 0.25, 0.0],
                     [0.0, 0.75, 0.25],
                     [0.25, 0.75, 0.0]], dtype=floatX))
    targets = shared_floatx(
        numpy.array([[2.0], [0.0], [1.0], [2.0]], dtype=floatX))
    perclass_accuracy_monitor = PerClassAccuracyMonitor(
        datastream,
        prediction=numpy.argmax(test_probs, axis=1),
        targets=targets.ravel(),
        label_i_to_c=label_i_to_c)
    perclass_accuracy_monitor.main_loop = setup_mainloop([])
    perclass_accuracy_monitor.do('after_batch')
    assert perclass_accuracy_monitor.main_loop.log[0][
        'perclass accuracy_a'] == 1.0
    assert perclass_accuracy_monitor.main_loop.log[0][
        'perclass accuracy_b'] == 1.0
    assert perclass_accuracy_monitor.main_loop.log[0][
        'perclass accuracy_c'] == 0.5
def setUp(self):
    dataset = IterableDataset(
        OrderedDict([('features', [1, 2, 3]), ('targets', [0, 1, 0])]),
        # Axis labels must be tuples; ('batch') would just be a string.
        axis_labels={'features': ('batch',), 'targets': ('batch',)})
    self.stream = DataStream(dataset)
    self.wrapper = ScaleAndShift(
        self.stream, 2, -1, which_sources=('targets',))
def get_dev_stream_with_context_features(val_context_features=None,
                                         val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1,
                                         **kwargs):
    """Set up the development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict)
            else cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # Now add the source with the image features: create the image
        # data stream (iterate over a file line-by-line).
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        # dev_stream = DataStream(dev_dataset)
        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream],
                           ('source', 'initial_context'))
        # dev_stream = dev_stream.get_example_stream()

    return dev_stream
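# The context-features file is assumed here to be an .npz archive holding a
# single unnamed array (hence the 'arr_0' key), i.e. something saved along
# the lines of:
#     numpy.savez('val_context_features.npz', features_array)
# where `features_array` is a hypothetical (num_sentences x feature_dim)
# matrix of image features aligned with the lines of `val_set`.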
def test_shared_variable_modifier_two_parameters():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, parameters=[W], step_rule=step_rule)
    # The two-argument callback receives the number of iterations done and
    # the current value of the shared variable.
    modifier = SharedVariableModifier(
        step_rule.learning_rate,
        lambda _, val: numpy.cast[theano.config.floatX](val * 0.2))
    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=[FinishAfter(after_n_epochs=1), modifier])
    main_loop.run()

    new_value = step_rule.learning_rate.get_value()
    assert_allclose(new_value, 0.001 * 0.2 ** n_batches, atol=1e-5)
def test_confusion_matrix():
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)
    label_i_to_c = {0: "a", 1: "b", 2: "c"}
    test_probs = shared_floatx(
        numpy.array([[0.75, 0.0, 0.0],
                     [0.75, 0.0, 0.0],
                     [0.0, 0.0, 0.75],
                     [0.0, 0.0, 0.75],
                     [0.75, 0.0, 0.0],
                     [0.0, 0.0, 0.75]], dtype=floatX))
    targets = shared_floatx(
        numpy.array([[2.0], [0.0], [2.0], [2.0], [0.0], [1.0]],
                    dtype=floatX))
    d = DirectoryCreator(directory="confusionMatrixTest")
    extension = ConfusionMatrixMonitor(
        datastream,
        prediction=numpy.argmax(test_probs, axis=1),
        targets=targets.ravel(),
        dest_directory="confusionMatrixTest",
        every_n_batches=3)
    main_loop = setup_mainloop([d, extension])
    main_loop.run()

    path = 'confusionMatrixTest/confusion_iterations_3.npz'
    expected = numpy.array([[1.0, 0.0, 0.0],
                            [0.0, 0.0, 1.0],
                            [1.0 / 3.0, 0.0, 2.0 / 3.0]], dtype=floatX)
    assert_allclose(numpy.load(path), expected)
    shutil.rmtree('confusionMatrixTest')
def setup_mainloop(extensions):
    """Create a MainLoop, register the given extensions, and supply it
    with a DataStream and a minimal model/cost to optimize.
    """
    features = [numpy.array(f, dtype=floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)

    W = shared_floatx([0, 0], name='W')
    add_role(W, PARAMETER)
    x = tensor.vector('features')
    cost = tensor.sum((x - W) ** 2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))
    main_loop = MainLoop(
        model=Model(cost),
        data_stream=datastream,
        algorithm=algorithm,
        extensions=[FinishAfter(after_n_epochs=1)] + extensions)
    return main_loop
def test_floatx():
    x = [numpy.array(d, dtype="float64") for d in [[1, 2], [3, 4], [5, 6]]]
    y = [numpy.array(d, dtype="int64") for d in [1, 2, 3]]
    dataset = IterableDataset(OrderedDict([("x", x), ("y", y)]))
    data = next(ForceFloatX(DataStream(dataset)).get_epoch_iterator())
    # ForceFloatX casts only floating-point sources; integer sources
    # pass through unchanged.
    assert str(data[0].dtype) == floatX
    assert str(data[1].dtype) == "int64"
def test_filter_examples(self):
    data = [1, 2, 3]
    data_filtered = [1, 3]
    stream = DataStream(IterableDataset(data))
    # The predicate receives the example tuple, hence the d[0] indexing.
    wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
    assert_equal(list(wrapper.get_epoch_iterator()),
                 list(zip(data_filtered)))
def test_add_sources(self):
    stream = DataStream(IterableDataset(self.data))
    transformer = Mapping(stream, lambda d: ([2 * i for i in d[0]],),
                          add_sources=('doubled',))
    assert_equal(transformer.sources, ('data', 'doubled'))
    assert_equal(list(transformer.get_epoch_iterator()),
                 list(zip(self.data, [[2, 4, 6], [4, 6, 2], [6, 4, 2]])))
def setup_mainloop(extension):
    """Set up a simple main loop for progress bar tests.

    Create a MainLoop, register the given extension, and supply it with a
    DataStream and a minimal model/cost to optimize.
    """
    # Since progressbar2 3.6.0, the `maxval` kwarg has been replaced by
    # `max_value`, which has a default value of 100. If we were still
    # passing `maxval` by accident, this test would fail, complaining that
    # the progress bar received a value out of range.
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2]] * 101]
    dataset = IterableDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x - W) ** 2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))
    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=algorithm,
        extensions=[FinishAfter(after_n_epochs=1), extension])
    return main_loop
def test_two_sources(self):
    transformer = Padding(Batch(
        DataStream(IterableDataset(
            dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    # Two padded sources plus one mask per source gives four elements.
    assert len(next(transformer.get_epoch_iterator())) == 4
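# For reference, what Padding yields for the batch above, assuming Fuel's
# default behavior of zero-padding each source to the longest sequence in
# the batch and appending a '<source>_mask' source:
#   features:      [[1, 0], [2, 3]]
#   features_mask: [[1, 0], [1, 1]]
#   targets:       [[4, 5, 6], [7, 0, 0]]
#   targets_mask:  [[1, 1, 1], [1, 0, 0]]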
def test_value_error_on_request(self):
    transformer = Padding(Batch(
        DataStream(IterableDataset(
            dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
        ConstantScheme(2)))
    assert_raises(ValueError, transformer.get_data, [0, 1])
def setup_mainloop(extension):
    """Set up a simple main loop for progress bar tests.

    Create a MainLoop, register the given extension, and supply it with a
    DataStream and a minimal model/cost to optimize.
    """
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x - W) ** 2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))
    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=algorithm,
        extensions=[FinishAfter(after_n_epochs=1), extension])
    return main_loop
def get_data_stream(iterable):
    dataset = IterableDataset({'numbers': iterable})
    data_stream = Mapping(dataset.get_example_stream(), _data_sqrt,
                          add_sources=('roots',))
    data_stream = Mapping(data_stream, _array_tuple)
    return Batch(data_stream, ConstantScheme(20))
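# `_data_sqrt` and `_array_tuple` are not shown in this excerpt. Plausible
# definitions, inferred from how they are used above (both are assumptions):
def _data_sqrt(data):
    # Computes the added 'roots' source from the 'numbers' source.
    return (numpy.sqrt(data[0]),)


def _array_tuple(data):
    # Casts every source to a numpy array.
    return tuple(numpy.asarray(d) for d in data)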
def test_training_data_monitoring():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = named_copy(W.sum(), 'W_sum')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        def before_batch(self, data):
            self.main_loop.log.current_row['true_cost'] = (
                ((W.get_value() * data["features"]).sum() -
                 data["targets"]) ** 2)

    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=GradientDescent(cost=cost, parameters=[W],
                                  step_rule=Scale(0.001)),
        extensions=[
            FinishAfter(after_n_epochs=1),
            TrainingDataMonitoring([W_sum, cost, V], prefix="train1",
                                   after_batch=True),
            TrainingDataMonitoring([aggregation.mean(W_sum), cost],
                                   prefix="train2", after_epoch=True),
            TrueCostExtension()])
    main_loop.run()

    # Check monitoring of a shared variable
    assert_allclose(main_loop.log.current_row['train1_V'], 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, whereas the extension writes after the batch is
        # processed. This is why the iteration numbers differ here.
        assert_allclose(main_loop.log[i]['true_cost'],
                        main_loop.log[i + 1]['train1_cost'])
    assert_allclose(
        main_loop.log[n_batches]['train2_cost'],
        sum([main_loop.log[i]['true_cost']
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches]['train2_W_sum'],
        sum([main_loop.log[i]['train1_W_sum']
             for i in range(1, n_batches + 1)]) / n_batches)
def test_ngram_stream():
    sentences = [list(numpy.random.randint(10, size=sentence_length))
                 for sentence_length in [3, 5, 7]]
    stream = IterableDataset(sentences).get_example_stream()
    ngrams = NGrams(4, stream)
    assert len(list(ngrams.get_epoch_iterator())) == 4
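# Where the 4 comes from: each (ngram, target) pair needs a window of 4
# tokens plus the token to predict, so a sentence of length L contributes
# max(L - 4, 0) examples: (3 -> 0) + (5 -> 1) + (7 -> 3) = 4.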
def test_ngram_stream_raises_error_on_request():
    sentences = [list(numpy.random.randint(10, size=sentence_length))
                 for sentence_length in [3, 5, 7]]
    stream = DataStream(IterableDataset(sentences))
    ngrams = NGrams(4, stream)
    assert_raises(ValueError, ngrams.get_data, [0, 1])