Example #1
def test_dataset_evaluators():
    X = theano.tensor.matrix('X')
    brick = TestBrick(name='test_brick')
    Y = brick.apply(X)
    graph = ComputationGraph([Y])
    monitor_variables = [v for v in graph.auxiliary_variables]
    validator = DatasetEvaluator(monitor_variables)

    data = [
        numpy.arange(1, 5, dtype=floatX).reshape(2, 2),
        numpy.arange(10, 16, dtype=floatX).reshape(3, 2)
    ]
    data_stream = IterableDataset(dict(X=data)).get_example_stream()

    values = validator.evaluate(data_stream)
    assert values['test_brick_apply_V_squared'] == 4
    numpy.testing.assert_allclose(values['test_brick_apply_mean_row_mean'],
                                  numpy.vstack(data).mean())
    per_batch_mean = numpy.mean([batch.mean() for batch in data])
    numpy.testing.assert_allclose(
        values['test_brick_apply_mean_batch_element'], per_batch_mean)

    with assert_raises(Exception) as ar:
        data_stream = IterableDataset(dict(X2=data)).get_example_stream()
        validator.evaluate(data_stream)
    assert "Not all data sources" in ar.exception.args[0]
Example #2
    def test_reset_calls_reset_on_all_streams(self):
        streams = [FlagDataStream(IterableDataset([1, 2, 3])),
                   FlagDataStream(IterableDataset([4, 5, 6])),
                   FlagDataStream(IterableDataset([7, 8, 9]))]
        transformer = Merge(streams, ('1', '2', '3'))
        transformer.reset()
        assert all(stream.reset_called for stream in streams)
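FlagDataStream is not defined in the snippet itself; a minimal sketch of the helper it assumes could look like the following (hypothetical: a DataStream subclass that only records that reset() was called before delegating to the parent):

class FlagDataStream(DataStream):
    """A DataStream that remembers whether reset() was invoked."""
    reset_called = False

    def reset(self):
        self.reset_called = True
        super(FlagDataStream, self).reset()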
Example #3
    def setUp(self):
        data = range(10)
        self.stream = Batch(DataStream(IterableDataset(data)),
                            iteration_scheme=ConstantScheme(2))
        data_np = numpy.arange(10)
        self.stream_np = Batch(DataStream(IterableDataset(data_np)),
                               iteration_scheme=ConstantScheme(2))
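With ConstantScheme(2), every request should pull two consecutive examples from the underlying dataset. A quick sanity check could look like this (a sketch, assuming fuel's Batch packs the collected examples into numpy arrays):

    def test_first_batch(self):
        assert_equal(next(self.stream_np.get_epoch_iterator()),
                     (numpy.array([0, 1]),))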
Example #4
def test_merge():
    english = IterableDataset(['Hello world!'])
    french = IterableDataset(['Bonjour le monde!'])
    streams = (english.get_example_stream(), french.get_example_stream())
    merged_stream = Merge(streams, ('english', 'french'))
    assert merged_stream.sources == ('english', 'french')
    assert (next(merged_stream.get_epoch_iterator()) == ('Hello world!',
                                                         'Bonjour le monde!'))
Example #5
def test_sources_selection():
    features = [5, 6, 7, 1]
    targets = [1, 0, 1, 1]
    stream = DataStream(IterableDataset(OrderedDict(
        [('features', features), ('targets', targets)])))
    assert list(stream.get_epoch_iterator()) == list(zip(features, targets))

    stream = DataStream(IterableDataset(
        {'features': features, 'targets': targets},
        sources=('targets',)))
    assert list(stream.get_epoch_iterator()) == list(zip(targets))
Example #6
def get_seq_mnist_streams(hidden_dim, batch_size=100, drop_prob=0.5):
    permutation = np.random.permutation(784)

    train_set, valid_set, test_set = load_data('mnist.pkl.gz')
    train_x = train_set[0].reshape((50000 // batch_size, batch_size, 784))
    train_x = np.swapaxes(train_x, 2, 1)
    train_x = train_x[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    train_y = (np.zeros(train_set[0].shape) - 1)
    # label for each time-step is -1 and for the last one is the real label
    train_y[:, -1] = train_set[1]
    train_y = train_y.reshape((50000 // batch_size, batch_size, 784))
    train_y = np.swapaxes(train_y, 2, 1)
    train_y = train_y[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    valid_x = valid_set[0].reshape((10000 // batch_size, batch_size, 784))
    valid_x = np.swapaxes(valid_x, 2, 1)
    valid_x = valid_x[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    valid_y = (np.zeros(valid_set[0].shape) - 1)
    # label for each time-step is -1 and for the last one is the real label
    valid_y[:, -1] = valid_set[1]
    valid_y = valid_y.reshape((10000 // batch_size, batch_size, 784))
    valid_y = np.swapaxes(valid_y, 2, 1)
    valid_y = valid_y[:, :, :, np.newaxis]
    # Now the dimension is num_batches x 784 x batch_size x 1

    train_x = train_x[:, permutation]
    valid_x = valid_x[:, permutation]

    train = IterableDataset({
        'x': train_x.astype(floatX),
        'y': train_y[:, -1, :, 0].astype('int32')
    })
    train_stream = DataStream(train)
    train_stream = SampleDrops(train_stream, drop_prob, hidden_dim, False)
    train_stream.sources = ('y', 'x', 'drops')

    next(train_stream.get_epoch_iterator())

    valid = IterableDataset({
        'x': valid_x.astype(floatX),
        'y': valid_y[:, -1, :, 0].astype('int32')
    })
    valid_stream = DataStream(valid)
    valid_stream = SampleDrops(valid_stream, drop_prob, hidden_dim, True)
    valid_stream.sources = ('y', 'x', 'drops')

    return train_stream, valid_stream
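SampleDrops comes from the surrounding project and is not shown above. A rough sketch of what such a transformer could look like (hypothetical: a fuel Transformer that appends a per-timestep Bernoulli keep-mask as a 'drops' source and uses the mask's expectation at test time; the real implementation may differ):

from fuel.transformers import Transformer

class SampleDrops(Transformer):
    """Append a 'drops' source: one Bernoulli mask per hidden unit
    and time step."""
    def __init__(self, data_stream, drop_prob, hidden_dim, is_test,
                 **kwargs):
        super(SampleDrops, self).__init__(
            data_stream, produces_examples=data_stream.produces_examples,
            **kwargs)
        self.drop_prob = drop_prob
        self.hidden_dim = hidden_dim
        self.is_test = is_test

    def get_data(self, request=None):
        data = next(self.child_epoch_iterator)
        # x has shape (time, batch, 1); the mask covers the hidden units
        x = data[self.data_stream.sources.index('x')]
        shape = (x.shape[0], x.shape[1], self.hidden_dim)
        if self.is_test:
            # deterministic mask: the expected keep-probability
            drops = np.full(shape, 1 - self.drop_prob, dtype=floatX)
        else:
            drops = (np.random.rand(*shape) < 1 - self.drop_prob
                     ).astype(floatX)
        return data + (drops,)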
Example #7
    def setUp(self):
        self.streams = (
            DataStream(IterableDataset(['Hello world!'])),
            DataStream(IterableDataset(['Bonjour le monde!'])))
        self.batch_streams = (
            Batch(DataStream(IterableDataset(['Hello world!', 'Hi!'])),
                  iteration_scheme=ConstantScheme(2)),
            Batch(DataStream(IterableDataset(['Bonjour le monde!', 'Salut!'])),
                  iteration_scheme=ConstantScheme(2)))
        self.transformer = Merge(
            self.streams, ('english', 'french'))
        self.batch_transformer = Merge(
            self.batch_streams, ('english', 'french'))
Example #8
def test_num_examples():
    assert_raises(ValueError, IterableDataset,
                  {'features': range(10), 'targets': range(7)})
    dataset = IterableDataset({'features': range(7),
                               'targets': range(7)})
    assert dataset.num_examples == 7
    dataset = IterableDataset(repeat(1))
    assert numpy.isnan(dataset.num_examples)
    x = numpy.random.rand(5, 3)
    y = numpy.random.rand(5, 4)
    dataset = IndexableDataset({'features': x, 'targets': y})
    assert dataset.num_examples == 5
    assert_raises(ValueError, IndexableDataset,
                  {'features': x, 'targets': y[:4]})
Example #9
def test_cache():
    dataset = IterableDataset(range(100))
    stream = DataStream(dataset)
    batched_stream = Batch(stream, ConstantScheme(11))
    cached_stream = Cache(batched_stream, ConstantScheme(7))
    epoch = cached_stream.get_epoch_iterator()

    # Make sure that the cache is filled as expected: batches of 11 enter
    # the cache while requests of 7 leave it, so the leftover size cycles
    # through 4, 8, 1, 5, ...
    for (features, ), cache_size in zip(epoch,
                                        [4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 0, 4]):
        assert len(cached_stream.cache[0]) == cache_size

    # Make sure that the epoch finishes correctly
    for (features, ) in cached_stream.get_epoch_iterator():
        pass
    assert len(features) == 100 % 7
    assert not cached_stream.cache[0]

    # Ensure that the epoch transition is correct
    cached_stream = Cache(batched_stream, ConstantScheme(7, times=3))
    for _, epoch in zip(range(2), cached_stream.iterate_epochs()):
        cache_sizes = [4, 8, 1]
        for i, (features, ) in enumerate(epoch):
            assert len(cached_stream.cache[0]) == cache_sizes[i]
            assert len(features) == 7
            assert numpy.all(list(range(100))[i * 7:(i + 1) * 7] == features)
        assert i == 2
Example #10
    def test_mapping_accepts_list_or_dict(self):
        def mapping(d):
            return [2 * i for i in d[0]],

        stream = DataStream(IterableDataset(self.data))
        assert_raises(ValueError,
                      lambda: Mapping(stream, mapping, mapping_accepts=int))
Example #11
    def do_test(with_serialization):
        data_stream = IterableDataset(range(10)).get_example_stream()
        main_loop = MainLoop(MockAlgorithm(),
                             data_stream,
                             extensions=[
                                 WriteBatchExtension(),
                                 FinishAfter(after_n_batches=14)
                             ])
        main_loop.run()
        assert main_loop.log.status['iterations_done'] == 14

        if with_serialization:
            main_loop = cPickle.loads(cPickle.dumps(main_loop))

        finish_after = unpack(
            [ext for ext in main_loop.extensions
             if isinstance(ext, FinishAfter)],
            singleton=True)
        finish_after.add_condition(
            ["after_batch"],
            predicate=lambda log: log.status['iterations_done'] == 27)
        main_loop.run()
        assert main_loop.log.status['iterations_done'] == 27
        assert main_loop.log.status['epochs_done'] == 2
        for i in range(27):
            assert main_loop.log[i + 1]['batch'] == {"data": i % 10}
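MockAlgorithm and WriteBatchExtension are helpers from the test suite this snippet was lifted from. Roughly (a sketch built on blocks' TrainingAlgorithm and TrainingExtension base classes; the real definitions may differ):

from blocks.algorithms import TrainingAlgorithm
from blocks.extensions import TrainingExtension

class MockAlgorithm(TrainingAlgorithm):
    """Stores the batch it is given instead of doing any training."""
    def initialize(self):
        pass

    def process_batch(self, batch):
        self.batch = batch

class WriteBatchExtension(TrainingExtension):
    """Copies the algorithm's last batch into the log after every batch."""
    def after_batch(self, _):
        self.main_loop.log.current_row['batch'] = (
            self.main_loop.algorithm.batch)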
Example #12
def test_training_data_monitoring_updates_algorithm():
    features = [
        numpy.array(f, dtype=theano.config.floatX)
        for f in [[1, 2], [3, 5], [5, 8]]
    ]
    targets = numpy.array([f.sum() for f in features])
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    m = x.mean().copy(name='features_mean')
    t = y.sum().copy(name='targets_sum')

    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=UpdatesAlgorithm(),
        extensions=[
            TrainingDataMonitoring([m, t], prefix="train1", after_batch=True)
        ],
    )
    main_loop.extensions[0].main_loop = main_loop
    assert len(main_loop.algorithm.updates) == 0
    main_loop.extensions[0].do('before_training')
    assert len(main_loop.algorithm.updates) > 0
Example #13
    def test_adds_batch_to_axis_labels(self):
        stream = DataStream(
            IterableDataset(
                {'features': [1, 2, 3, 4, 5]},
                axis_labels={'features': ('index',)}))
        transformer = Batch(stream, ConstantScheme(2), strictness=0)
        assert_equal(transformer.axis_labels, {'features': ('batch', 'index')})
Example #14
def test_perclass_accuracy_monitor():
    features = [numpy.array(f, dtype=floatX) for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)
    label_i_to_c = {0: "a", 1: "b", 2: "c"}
    test_probs = shared_floatx(
        numpy.array([
            [0.0, 0.0, 1.0],
            [0.75, 0.25, 0.0],
            [0.0, 0.75, 0.25],
            [0.25, 0.75, 0.0],
        ],
                    dtype=floatX))
    targets = shared_floatx(
        numpy.array([[2.0], [0.0], [1.0], [2.0]], dtype=floatX))
    perclass_accuracy_monitor = PerClassAccuracyMonitor(
        datastream,
        prediction=numpy.argmax(test_probs, axis=1),
        targets=targets.ravel(),
        label_i_to_c=label_i_to_c)
    perclass_accuracy_monitor.main_loop = setup_mainloop([])
    perclass_accuracy_monitor.do('after_batch')

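    # argmax over test_probs gives predictions [2, 0, 1, 1] against
    # targets [2, 0, 1, 2]: classes "a" and "b" are always predicted
    # correctly, class "c" only once out of two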
    assert perclass_accuracy_monitor.main_loop.log[0][
        'perclass accuracy_a'] == 1.0
    assert perclass_accuracy_monitor.main_loop.log[0][
        'perclass accuracy_b'] == 1.0
    assert perclass_accuracy_monitor.main_loop.log[0][
        'perclass accuracy_c'] == 0.5
Example #15
    def setUp(self):
        dataset = IterableDataset(
            OrderedDict([('features', [1, 2, 3]), ('targets', [0, 1, 0])]),
            # note the trailing commas: each label must be a one-element
            # tuple, not a bare string
            axis_labels={'features': ('batch',), 'targets': ('batch',)})
        self.stream = DataStream(dataset)
        self.wrapper = ScaleAndShift(
            self.stream, 2, -1, which_sources=('targets',))
Example #16
def get_dev_stream_with_context_features(val_context_features=None,
                                         val_set=None, src_vocab=None,
                                         src_vocab_size=30000, unk_id=1,
                                         **kwargs):
    """Setup development set stream if necessary."""

    def _get_np_array(filename):
        return numpy.load(filename)['arr_0']

    dev_stream = None
    if val_set is not None and src_vocab is not None:
        src_vocab = _ensure_special_tokens(
            src_vocab if isinstance(src_vocab, dict) else
            cPickle.load(open(src_vocab)),
            bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)

        dev_dataset = TextFile([val_set], src_vocab, None)

        # now add the source with the image features
        # create the image datastream (iterate over a file line-by-line)
        con_features = _get_np_array(val_context_features)
        con_feature_dataset = IterableDataset(con_features)
        valid_image_stream = DataStream(con_feature_dataset)

        dev_stream = Merge([dev_dataset.get_example_stream(),
                            valid_image_stream], ('source', 'initial_context'))

    return dev_stream
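_ensure_special_tokens is assumed from the surrounding machine-translation code. A plausible sketch (hypothetical; the original may handle id collisions more carefully):

def _ensure_special_tokens(vocab, bos_idx=0, eos_idx=0, unk_idx=1):
    """Pin the <S>, </S> and <UNK> tokens to fixed vocabulary ids."""
    for token, idx in [('<S>', bos_idx), ('</S>', eos_idx),
                       ('<UNK>', unk_idx)]:
        vocab[token] = idx
    return vocab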
Example #17
def test_shared_variable_modifier_two_parameters():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 4], [5, 6]]]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    step_rule = Scale(0.001)
    sgd = GradientDescent(cost=cost, parameters=[W],
                          step_rule=step_rule)
    modifier = SharedVariableModifier(
        step_rule.learning_rate,
        lambda _, val: numpy.cast[theano.config.floatX](val * 0.2))
    main_loop = MainLoop(
        model=None, data_stream=dataset.get_example_stream(),
        algorithm=sgd,
        extensions=[FinishAfter(after_n_epochs=1), modifier])

    main_loop.run()

    new_value = step_rule.learning_rate.get_value()
    assert_allclose(new_value,
                    0.001 * 0.2 ** n_batches,
                    atol=1e-5)
Example #18
def test_confusion_matrix():
    features = [numpy.array(f, dtype=floatX) for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)
    label_i_to_c = {0: "a", 1: "b", 2: "c"}
    test_probs = shared_floatx(
        numpy.array([[0.75, 0.0, 0.0], [0.75, 0.0, 0.0], [0.0, 0.0, 0.75],
                     [0.0, 0.0, 0.75], [0.75, 0.0, 0.0], [0.0, 0.0, 0.75]],
                    dtype=floatX))
    targets = shared_floatx(
        numpy.array([[2.0], [0.0], [2.0], [2.0], [0.0], [1.0]], dtype=floatX))
    d = DirectoryCreator(directory="confusionMatrixTest")
    extension = ConfusionMatrixMonitor(datastream,
                                       prediction=numpy.argmax(test_probs,
                                                               axis=1),
                                       targets=targets.ravel(),
                                       dest_directory="confusionMatrixTest",
                                       every_n_batches=3)
    main_loop = setup_mainloop([d, extension])

    main_loop.run()
    path = 'confusionMatrixTest/confusion_iterations_3.npz'
    expected = numpy.array(
        [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0], [(1.0 / 3.0), 0.0, (2.0 / 3.0)]],
        dtype=floatX)
    assert_allclose(numpy.load(path), expected)
    shutil.rmtree('confusionMatrixTest')
Example #19
def setup_mainloop(extensions):
    """Create a MainLoop, register the given extension, supply it with a
        DataStream and a minimal model/cost to optimize.
    """
    features = [numpy.array(f, dtype=floatX) for f in [[1, 2], [3, 4], [5, 6]]]
    dataset = IterableDataset(dict(features=features))
    datastream = DataStream(dataset)

    W = shared_floatx([0, 0], name='W')
    add_role(W, PARAMETER)
    x = tensor.vector('features')
    cost = tensor.sum((x - W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost,
                                parameters=[W],
                                step_rule=Scale(1e-3))

    main_loop = MainLoop(model=Model(cost),
                         data_stream=datastream,
                         algorithm=algorithm,
                         extensions=[
                             FinishAfter(after_n_epochs=1),
                         ] + extensions)

    return main_loop
Example #20
def test_floatx():
    x = [numpy.array(d, dtype="float64") for d in [[1, 2], [3, 4], [5, 6]]]
    y = [numpy.array(d, dtype="int64") for d in [1, 2, 3]]
    dataset = IterableDataset(OrderedDict([("x", x), ("y", y)]))
    data = next(ForceFloatX(DataStream(dataset)).get_epoch_iterator())
    assert str(data[0].dtype) == floatX
    assert str(data[1].dtype) == "int64"
Example #21
    def test_filter_examples(self):
        data = [1, 2, 3]
        data_filtered = [1, 3]
        stream = DataStream(IterableDataset(data))
        wrapper = Filter(stream, lambda d: d[0] % 2 == 1)
        assert_equal(list(wrapper.get_epoch_iterator()),
                     list(zip(data_filtered)))
Example #22
    def test_add_sources(self):
        stream = DataStream(IterableDataset(self.data))
        transformer = Mapping(stream, lambda d: ([2 * i for i in d[0]],),
                              add_sources=('doubled',))
        assert_equal(transformer.sources, ('data', 'doubled'))
        assert_equal(list(transformer.get_epoch_iterator()),
                     list(zip(self.data, [[2, 4, 6], [4, 6, 2], [6, 4, 2]])))
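The self.data fixture is set elsewhere in the test class; from the expected 'doubled' output it has to be the following (a reconstruction):

    def setUp(self):
        self.data = [[1, 2, 3], [2, 3, 1], [3, 2, 1]]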
Example #23
def setup_mainloop(extension):
    """Set up a simple main loop for progress bar tests.

    Create a MainLoop, register the given extension, supply it with a
    DataStream and a minimal model/cost to optimize.

    """
    # Since progressbar2 3.6.0, the `maxval` kwarg has been replaced by
    # `max_value`, which has a default value of 100. If we're still using
    # `maxval` by accident, this test should fail complaining that
    # the progress bar has received a value out of range.
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2]] * 101]
    dataset = IterableDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x-W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))

    main_loop = MainLoop(
        model=None, data_stream=dataset.get_example_stream(),
        algorithm=algorithm,
        extensions=[
            FinishAfter(after_n_epochs=1),
            extension])

    return main_loop
Example #24
    def test_two_sources(self):
        transformer = Padding(Batch(
            DataStream(
                IterableDataset(
                    dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
            ConstantScheme(2)))
        assert len(next(transformer.get_epoch_iterator())) == 4
Example #25
    def test_value_error_on_request(self):
        transformer = Padding(Batch(
            DataStream(
                IterableDataset(
                    dict(features=[[1], [2, 3]], targets=[[4, 5, 6], [7]]))),
            ConstantScheme(2)))
        assert_raises(ValueError, transformer.get_data, [0, 1])
Example #26
def setup_mainloop(extension):
    """Set up a simple main loop for progress bar tests.

    Create a MainLoop, register the given extension, supply it with a
    DataStream and a minimal model/cost to optimize.

    """
    features = [
        numpy.array(f, dtype=theano.config.floatX)
        for f in [[1, 2], [3, 4], [5, 6]]
    ]
    dataset = IterableDataset(dict(features=features))

    W = shared_floatx([0, 0], name='W')
    x = tensor.vector('features')
    cost = tensor.sum((x - W)**2)
    cost.name = "cost"

    algorithm = GradientDescent(cost=cost, parameters=[W],
                                step_rule=Scale(1e-3))

    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_example_stream(),
                         algorithm=algorithm,
                         extensions=[FinishAfter(after_n_epochs=1), extension])

    return main_loop
Example #27
def get_data_stream(iterable):
    dataset = IterableDataset({'numbers': iterable})
    data_stream = Mapping(dataset.get_example_stream(),
                          _data_sqrt,
                          add_sources=('roots', ))
    data_stream = Mapping(data_stream, _array_tuple)
    return Batch(data_stream, ConstantScheme(20))
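The two mapping helpers are defined elsewhere in the source module; plausible sketches (hypothetical bodies, names kept as in the snippet):

def _data_sqrt(data):
    # `data` is a one-element tuple holding the current number
    return (numpy.sqrt(data[0]),)

def _array_tuple(data):
    # turn every source into a numpy array so Batch can stack them
    return tuple(numpy.asarray(d) for d in data)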
Example #28
def test_training_data_monitoring():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [
        numpy.array(f, dtype=theano.config.floatX)
        for f in [[1, 2], [3, 4], [5, 6]]
    ]
    targets = [(weights * f).sum() for f in features]
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = named_copy(W.sum(), 'W_sum')
    cost = ((x * W).sum() - y)**2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):
        def before_batch(self, data):
            self.main_loop.log.current_row['true_cost'] = ((
                (W.get_value() * data["features"]).sum() - data["targets"])**2)

    main_loop = MainLoop(model=None,
                         data_stream=dataset.get_example_stream(),
                         algorithm=GradientDescent(cost=cost,
                                                   parameters=[W],
                                                   step_rule=Scale(0.001)),
                         extensions=[
                             FinishAfter(after_n_epochs=1),
                             TrainingDataMonitoring([W_sum, cost, V],
                                                    prefix="train1",
                                                    after_batch=True),
                             TrainingDataMonitoring(
                                 [aggregation.mean(W_sum), cost],
                                 prefix="train2",
                                 after_epoch=True),
                             TrueCostExtension()
                         ])

    main_loop.run()

    # Check monitoring of a shared variable
    assert_allclose(main_loop.log.current_row['train1_V'], 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, whereas the extension writes after the batch is
        # processed. This is why the iteration numbers differ here.
        assert_allclose(main_loop.log[i]['true_cost'],
                        main_loop.log[i + 1]['train1_cost'])
    assert_allclose(
        main_loop.log[n_batches]['train2_cost'],
        sum([main_loop.log[i]['true_cost']
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches]['train2_W_sum'],
        sum([
            main_loop.log[i]['train1_W_sum'] for i in range(1, n_batches + 1)
        ]) / n_batches)
Example #29
def test_ngram_stream():
    sentences = [
        list(numpy.random.randint(10, size=sentence_length))
        for sentence_length in [3, 5, 7]
    ]
    stream = IterableDataset(sentences).get_example_stream()
    ngrams = NGrams(4, stream)
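    # each sentence of length L yields max(L - 4, 0) ngram/target pairs,
    # so lengths 3, 5 and 7 should give 0 + 1 + 3 = 4 examples in total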
    assert len(list(ngrams.get_epoch_iterator())) == 4
Example #30
def test_ngram_stream_raises_error_on_request():
    sentences = [
        list(numpy.random.randint(10, size=sentence_length))
        for sentence_length in [3, 5, 7]
    ]
    stream = DataStream(IterableDataset(sentences))
    ngrams = NGrams(4, stream)
    assert_raises(ValueError, ngrams.get_data, [0, 1])