Example #1
def _test_mean_like_aggregator(scheme, func):
    """Common test function for both Mean and Perplexity"""
    features = numpy.array([[0, 3], [2, 9], [2, 4], [5, 1], [6, 7]],
                           dtype=theano.config.floatX)
    num_examples = features.shape[0]
    batch_size = 2

    dataset = IndexableDataset(OrderedDict([('features', features)]))

    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 num_examples, batch_size))

    x = tensor.matrix('features')
    y = (x**0.5).sum(axis=0)
    y.name = 'y'
    z = y.sum()
    z.name = 'z'

    y.tag.aggregation_scheme = scheme(y, x.shape[0])
    z.tag.aggregation_scheme = scheme(z, x.shape[0])

    y_desired = func((features**0.5).mean(axis=0))
    z_desired = func((features**0.5).sum(axis=1).mean(axis=0))

    assert_allclose(
        DatasetEvaluator([y]).evaluate(data_stream)['y'],
        numpy.array(y_desired, dtype=theano.config.floatX))
    assert_allclose(
        DatasetEvaluator([z]).evaluate(data_stream)['z'],
        numpy.array(z_desired, dtype=theano.config.floatX))
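Note: the snippets on this page are excerpts from larger modules, so their import lines are omitted. Below is a minimal import preamble that most of them appear to assume. The module paths follow Fuel's and Blocks' public layout, but names such as OneHotEncoding, StructuredOneHotEncoding, IntStream, Corpus, and DataStreamEvaluator come from the snippets' own projects and are not part of core Fuel; individual snippets also use aliases such as np, T, and pd, plus project-specific helpers. Treat this preamble as a sketch rather than an exact match.

# Hypothetical import preamble inferred from the snippets; adjust per project.
from collections import OrderedDict

import numpy
import theano
from theano import tensor
from numpy.testing import assert_allclose, assert_equal, assert_raises

from fuel.datasets import IndexableDataset, IterableDataset
from fuel.streams import DataStream
from fuel.schemes import (ConstantScheme, SequentialExampleScheme,
                          SequentialScheme, ShuffledExampleScheme,
                          ShuffledScheme)
from fuel.transformers import (Batch, Filter, Flatten, Mapping, Padding,
                               Rename, SingleMapping, SortMapping, Unpack)
from fuel.transformers.text import NGrams
from blocks.monitoring.aggregation import Maximum, Mean, Minimum
from blocks.monitoring.evaluators import DatasetEvaluator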
Example #2
def test_flatten_batches(self):
    wrapper = Flatten(DataStream(IndexableDataset(self.data),
                                 iteration_scheme=SequentialScheme(4, 2)),
                      which_sources=('features', ))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones((2, 4)), numpy.array([[0], [1]])),
                  (numpy.ones((2, 4)), numpy.array([[0], [1]]))])
Example #3
def test_flatten_examples(self):
    wrapper = Flatten(DataStream(
        IndexableDataset(self.data),
        iteration_scheme=SequentialExampleScheme(4)),
                      which_sources=('features', ))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones(4), 0), (numpy.ones(4), 1)] * 2)
Example #4
def test_mean_aggregator():
    num_examples = 4
    batch_size = 2

    features = numpy.array([[0, 3], [2, 9], [2, 4], [5, 1]],
                           dtype=theano.config.floatX)

    dataset = IndexableDataset(OrderedDict([('features', features)]))

    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 num_examples, batch_size))

    x = tensor.matrix('features')
    y = (x**2).mean(axis=0)
    y.name = 'y'
    z = y.sum()
    z.name = 'z'

    y.tag.aggregation_scheme = Mean(y, 1.)
    z.tag.aggregation_scheme = Mean(z, 1.)

    # Mean of squares over all four examples, per column:
    # (0 + 4 + 4 + 25) / 4 = 8.25 and (9 + 81 + 16 + 1) / 4 = 26.75.
    assert_allclose(
        DatasetEvaluator([y]).evaluate(data_stream)['y'],
        numpy.array([8.25, 26.75], dtype=theano.config.floatX))
    assert_allclose(
        DatasetEvaluator([z]).evaluate(data_stream)['z'],
        numpy.array([35], dtype=theano.config.floatX))
Example #5
def test_one_hot_batches_invalid_input(self):
    wrapper = OneHotEncoding(DataStream(IndexableDataset(self.data),
                                        iteration_scheme=SequentialScheme(
                                            4, 2)),
                             num_classes=2,
                             which_sources=('targets', ))
    assert_raises(ValueError, list, wrapper.get_epoch_iterator())
Example #6
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    A = numpy.load(
        os.path.join(path,
                     ('valid_x_raw.npy' if valid else 'train_x_raw.npy')))
    B = numpy.load(
        os.path.join(path, ('valid_phn.npy' if valid else 'train_phn.npy')))
    C = numpy.load(
        os.path.join(
            path,
            ('valid_seq_to_phn.npy' if valid else 'train_seq_to_phn.npy')))

    # For each (start, end) row in C, slice column 2 of B over that span
    # (the per-utterance phoneme label sequence).
    D = [B[x[0]:x[1], 2] for x in C]

    ds = IndexableDataset({'input': A, 'output': D})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(A)))

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)

    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(A)))
    stream = Padding(stream, mask_sources=['input', 'output'])

    return ds, stream
Example #7
def get_stream_raw(dataset, which_set, mini_batch_size):
    data = get_data(dataset)

    # dataset is a 3D array of shape: Time X Batch X Features
    dataset = data[which_set]
    time, batch, features = dataset.shape
    nb_mini_batches = batch // mini_batch_size  # integer division; '/' yields a float on Python 3 and breaks the reshape below
    dataset = dataset[:, :nb_mini_batches * mini_batch_size, :]

    # Create the target_dataset
    targets_dataset = dataset[1:, :, :]

    # Cut the dataset into several minibatches
    # dataset is now 4D (nb_mini_batches X Time X mini_batch_size X Features)
    dataset = np.swapaxes(dataset, 0, 1)
    targets_dataset = np.swapaxes(targets_dataset, 0, 1)
    dataset = np.reshape(dataset,
                         (nb_mini_batches, mini_batch_size, time, features))
    targets_dataset = np.reshape(
        targets_dataset,
        (nb_mini_batches, mini_batch_size, time - 1, features))
    dataset = np.swapaxes(dataset, 1, 2)
    targets_dataset = np.swapaxes(targets_dataset, 1, 2)

    # Create fuel dataset
    dataset = IndexableDataset({
        'features': dataset,
        'targets': targets_dataset
    })
    stream = DataStream(
        dataset, iteration_scheme=SequentialExampleScheme(nb_mini_batches))
    return stream
Example #8
def test_filter_batches(self):
    data = [1, 2, 3, 4]
    data_filtered = [([3, 4],)]
    stream = DataStream(IndexableDataset(data),
                        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
    assert_equal(list(wrapper.get_epoch_iterator()), data_filtered)
Example #9
def setUp(self):
    self.string_data = [b'Hello', b'World!']
    self.dataset = IndexableDataset(
        indexables={
            'words':
            # numpy.frombuffer replaces the deprecated numpy.fromstring
            [numpy.frombuffer(s, dtype='uint8') for s in self.string_data]
        },
        axis_labels={'words': ('batch', 'bytes')})
Example #10
def test_ngram_stream_raises_error_on_batch_stream():
    sentences = [
        list(numpy.random.randint(10, size=sentence_length))
        for sentence_length in [3, 5, 7]
    ]
    stream = DataStream(IndexableDataset(sentences),
                        iteration_scheme=SequentialScheme(3, 1))
    assert_raises(ValueError, NGrams, 4, stream)
Example #11
def test_single_mapping_value_error_on_request():
    class IdentitySingleMapping(SingleMapping):
        def mapping(self, source):
            return source

    data_stream = DataStream(IndexableDataset([0, 1, 2]))
    transformer = IdentitySingleMapping(data_stream)
    assert_raises(ValueError, transformer.get_data, [0, 1])
Example #12
def test_axis_labels_are_passed_through(self):
    stream = DataStream(
        IndexableDataset(
            {'features': [1, 2, 3, 4]},
            axis_labels={'features': ('batch',)}),
        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
    assert_equal(wrapper.axis_labels, stream.axis_labels)
Example #13
def setUp(self):
    self.stream = DataStream(
        IndexableDataset(
            OrderedDict([('features', numpy.ones((4, 2, 2))),
                         ('targets', numpy.array([0, 1, 0, 1]))]),
            axis_labels={'features': ('batch', 'width', 'height'),
                         'targets': ('batch',)}),
        iteration_scheme=SequentialScheme(4, 2))
Example #14
    def load_data(self, data_path):
        logging.info("Loading: " + data_path)

        data = pd.read_csv(data_path, sep="\t", header=None)
        data.columns = ['rel', 'head', 'tail', 'score']
        assert (not data.empty)
        self.N = len(data)
        return IndexableDataset(data.to_dict('list'))
Example #15
def get_dev_stream(valid_file, **kwargs):
    # Open in binary mode, as pickle files require, and close it afterwards.
    with open(valid_file, 'rb') as f:
        valid_data = cPickle.load(f)
    images = [example[0] for example in valid_data]
    targets = [example[1] for example in valid_data]
    dataset = IndexableDataset(
        OrderedDict([('input', images), ('output', targets)]))
    return DataStream(dataset,
                      iteration_scheme=SequentialExampleScheme(len(images)))
Example #16
    def _construct_dataset(self, dataset):
        '''Construct a Fuel IndexableDataset.

        Each field is named after the corresponding entry of
        self.provide_sources.

        :param dataset: A tuple of data, one item per source
        :return: An IndexableDataset wrapping the data
        '''
        return IndexableDataset(
            indexables=OrderedDict(zip(self.provide_sources, dataset)))
Example #17
def test_flatten():
    stream = DataStream(IndexableDataset(
        OrderedDict([('features', numpy.ones((4, 2, 2))),
                     ('targets', numpy.array([0, 1, 0, 1]))])),
                        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Flatten(stream, which_sources=('features', ))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones((2, 4)), numpy.array([0, 1])),
                  (numpy.ones((2, 4)), numpy.array([0, 1]))])
Example #18
def test_axis_labels_on_flatten_examples(self):
    wrapper = Flatten(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4),
                   axis_labels={'features': ('batch', 'width', 'height'),
                                'targets': ('batch', 'index')}),
        which_sources=('features',))
    assert_equal(wrapper.axis_labels, {'features': ('feature',),
                                       'targets': ('index',)})
Example #19
def test_axis_labels_on_flatten_batches_with_none(self):
    wrapper = Flatten(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialScheme(4, 2),
                   axis_labels={'features': None,
                                'targets': ('batch', 'index')}),
        which_sources=('features',))
    assert_equal(wrapper.axis_labels, {'features': None,
                                       'targets': ('batch', 'index')})
Example #20
def test_min_max_aggregators():
    num_examples = 4
    batch_size = 2

    features = numpy.array([[2, 3], [2, 9], [2, 4], [5, 1]],
                           dtype=theano.config.floatX)

    dataset = IndexableDataset(OrderedDict([('features', features)]))

    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 num_examples, batch_size))

    x = tensor.matrix('features')
    y = (x**2).sum(axis=0)
    y.name = 'y'
    z = y.min()
    z.name = 'z'

    y.tag.aggregation_scheme = Maximum(y)
    z.tag.aggregation_scheme = Minimum(z)

    # Per-batch sums of squares are [8, 90] and [29, 17]; Maximum
    # aggregates elementwise to [29, 90], and Minimum of y.min() gives 8.
    assert_allclose(
        DatasetEvaluator([y]).evaluate(data_stream)['y'],
        numpy.array([29, 90], dtype=theano.config.floatX))
    assert_allclose(
        DatasetEvaluator([z]).evaluate(data_stream)['z'],
        numpy.array([8], dtype=theano.config.floatX))

    # Make sure accumulators are reset.
    features = numpy.array([[2, 1], [1, 3], [1, -1], [2.5, 1]],
                           dtype=theano.config.floatX)

    dataset = IndexableDataset(OrderedDict([('features', features)]))

    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 num_examples, batch_size))
    # With fresh accumulators the per-batch sums are [5, 10] and [7.25, 2],
    # so the maxima are [7.25, 10] and the minimum is 2.
    assert_allclose(
        DatasetEvaluator([y]).evaluate(data_stream)['y'],
        numpy.array([7.25, 10], dtype=theano.config.floatX))
    assert_allclose(
        DatasetEvaluator([z]).evaluate(data_stream)['z'],
        numpy.array([2], dtype=theano.config.floatX))
Example #21
    def replaceTestData(self, testNodes, maxNeighbors=1000, maskNames=['x']):
        if self.batchesInferences:
            batch_size = self.batch_size
        else:
            batch_size = 1

        testing, testIDs = encode_data_VarLen(self.G,
                                              testNodes,
                                              self.attrKey,
                                              maxNeighbors,
                                              useActualLabs=self.useActualLabs,
                                              useInputX2=self.useInputX2,
                                              onlyLabs=self.onlyLabs,
                                              lastH=self.lastHH,
                                              nodeIDs=True)
        dataset_test = IndexableDataset(testing)
        self.stream_test = DataStream(dataset=dataset_test,
                                      iteration_scheme=SequentialScheme(
                                          examples=dataset_test.num_examples,
                                          batch_size=batch_size))
        # Add masks; sources must be padded individually to avoid an
        # "all dimensions must be equal" error.
        # TODO: write our own padding transformer; the built-in one is limited.
        self.stream_test = Padding(self.stream_test, mask_sources=maskNames)
        # Transpose for RNN input.
        self.stream_test = Mapping(self.stream_test, self.transpose_streamTest)
        self.num_examples_test = dataset_test.num_examples

        # Replace the shared data with the test_all data.
        self.test_all, names = self.iterateShared(self.stream_test,
                                                  makeShared=False,
                                                  name="test")

        # If we are running inference in batches:
        if self.batchesInferences:
            for key in self.test_all:
                totalTestBatches = len(self.test_all[key])
                if key != 'nodeID':
                    for i in range(0, totalTestBatches):
                        # If the test data has more batches than before,
                        # append new entries to the shared data list;
                        # otherwise just reset the existing values.
                        if i >= self.totalBatches:
                            newKey = key + '_myinput'
                            self.sharedData[key].append(
                                shared(self.test_all[key][i],
                                       name=self.sharedName + '_' + newKey +
                                       '_test_' + str(i)))
                        else:
                            self.sharedData[key][i].set_value(
                                self.test_all[key][i], borrow=True)

                    self.sharedBatch[key].set_value(
                        self.sharedData[key][0].get_value(borrow=True),
                        borrow=True)

            self.stream_test_int = IntStream(0, totalTestBatches, 1,
                                             'int_stream')
Example #22
def test_one_hot_examples(self):
    wrapper = OneHotEncoding(DataStream(
        IndexableDataset(self.data),
        iteration_scheme=SequentialExampleScheme(4)),
                             num_classes=4,
                             which_sources=('targets', ))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones((2, 2)), numpy.array([[1, 0, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 0, 1]]))])
Example #23
def get_stream(trainXY, batch_size=100):
    #trainXY=genSynXY()
    dataset_train = IndexableDataset(trainXY)
    stream_train_1 = DataStream(dataset=dataset_train,
                                iteration_scheme=ShuffledScheme(
                                    examples=dataset_train.num_examples,
                                    batch_size=batch_size))
    stream_train_2 = Padding(stream_train_1)
    #stream_train_1.sources=('x_mask_o', 'y_mask_o', 'x', 'y')
    stream_train_3 = Mapping(stream_train_2, transpose_stream)

    return (stream_train_3, dataset_train.num_examples)
Example #24
def test_rename():
    stream = DataStream(IndexableDataset(
        OrderedDict([('X', numpy.ones((4, 2, 2))),
                     ('y', numpy.array([0, 1, 0, 1]))])),
                        iteration_scheme=SequentialScheme(4, 2))
    transformer = Rename(stream, {'X': 'features', 'y': 'targets'})
    assert_equal(transformer.sources, ('features', 'targets'))
    assert_equal(list(transformer.get_epoch_iterator()),
                 [(numpy.ones((2, 2, 2)), numpy.array([0, 1])),
                  (numpy.ones((2, 2, 2)), numpy.array([0, 1]))])
    assert_raises(ValueError, transformer.get_data, [0, 1])
    assert_raises(KeyError, Rename, stream, {'Z': 'features'})
Example #25
def createDataset(corpus=None, sequence_length=25, repeat=1):
    if not corpus:
        with open("corpus.txt") as f:
            corpus = Corpus(f.read())
    vocab_size = corpus.vocab_size()
    in_splits = corpus.get_splits(seq_len=sequence_length, repeat=repeat)
    out_splits = corpus.get_splits(seq_len=sequence_length, repeat=repeat,
                                   shifted=True)

    df = IndexableDataset({
        'inchar': in_splits.astype(np.uint8),
        'outchar': out_splits.astype(np.uint8)
    })

    return df, vocab_size
Example #26
    def load_data(self, data_path):
        data_path = os.path.join(self.data_dir, data_path)
        if not os.path.isabs(data_path):
            data_path = os.path.join(DATA_DIR, data_path)

        logging.info("Loading: " + data_path)

        data = pd.read_csv(data_path, sep="\t", header=None)
        data.columns = ['rel', 'head', 'tail', 'score']
        assert (not data.empty)

        dataset = IndexableDataset(data.to_dict('list'))
        return dataset
Example #27
    def test_ignore_groups(self):
        stream_example = StructuredOneHotEncoding(DataStream(
            IndexableDataset(self.data),
            iteration_scheme=SequentialExampleScheme(4)),
                                                  num_classes=self.num_classes,
                                                  ignore_groups=[0, 2],
                                                  which_sources=('targets', ))

        assert_equal(list(stream_example.get_epoch_iterator()),
                     [(numpy.ones((2, 2)), numpy.array([[0, 1]])),
                      (numpy.ones((2, 2)), numpy.array([[1, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 1]])),
                      (numpy.ones((2, 2)), numpy.array([[1, 0]]))])

        stream_example2 = StructuredOneHotEncoding(
            DataStream(IndexableDataset(self.data),
                       iteration_scheme=SequentialExampleScheme(4)),
            num_classes=self.num_classes,
            ignore_groups=[1, 2],
            which_sources=('targets', ))

        assert_equal(list(stream_example2.get_epoch_iterator()),
                     [(numpy.ones((2, 2)), numpy.array([[1, 0, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                      (numpy.ones((2, 2)), numpy.array([[0, 0, 1]]))])

        stream_batch = StructuredOneHotEncoding(DataStream(
            IndexableDataset(self.data),
            iteration_scheme=SequentialScheme(4, 2)),
                                                num_classes=self.num_classes,
                                                ignore_groups=[0, 2],
                                                which_sources=('targets', ))

        assert_equal(list(stream_batch.get_epoch_iterator()), [
            (numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]])),
            (numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]])),
        ])
Example #28
def test_batch_iteration_scheme_with_lists(self):
    """Batch schemes should work with more than ndarrays."""
    data = IndexableDataset(
        OrderedDict([('foo', list(range(50))), ('bar', list(range(1,
                                                                  51)))]))
    stream = DataStream(data,
                        iteration_scheme=ShuffledScheme(
                            data.num_examples, 5))
    returned = [
        sum(batches, [])
        for batches in zip(*list(stream.get_epoch_iterator()))
    ]
    assert set(returned[0]) == set(range(50))
    assert set(returned[1]) == set(range(1, 51))
Example #29
def test_datastream_evaluator():
    stream = IndexableDataset(indexables=OrderedDict([
        ("data", np.ones((10, 4, 9), dtype="float32")),
    ])).get_example_stream()

    x = T.matrix("data")
    mon = x.sum(axis=1)
    mon.name = "mon"

    evaluator = DataStreamEvaluator([mon])
    results = evaluator.evaluate(stream)
    assert set(results.keys()) == set(['mon'])

    assert_allclose(results['mon'], np.ones((4 * 10)) * 9)
Example #30
def test_num_examples():
    assert_raises(ValueError, IterableDataset,
                  {'features': range(10), 'targets': range(7)})
    dataset = IterableDataset({'features': range(7),
                               'targets': range(7)})
    assert dataset.num_examples == 7
    dataset = IterableDataset(repeat(1))
    assert numpy.isnan(dataset.num_examples)
    x = numpy.random.rand(5, 3)
    y = numpy.random.rand(5, 4)
    dataset = IndexableDataset({'features': x, 'targets': y})
    assert dataset.num_examples == 5
    assert_raises(ValueError, IndexableDataset,
                  {'features': x, 'targets': y[:4]})
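For reference, the core pattern every example above builds on can be reduced to a few lines: construct an IndexableDataset from equal-length indexables, then read it either directly via get_data or through a DataStream with an iteration scheme. A minimal sketch using only core Fuel API (the data values here are illustrative, not from any example above):

from collections import OrderedDict

import numpy
from fuel.datasets import IndexableDataset
from fuel.schemes import SequentialScheme
from fuel.streams import DataStream

features = numpy.arange(8, dtype='float32').reshape(4, 2)
targets = numpy.array([0, 1, 0, 1])
dataset = IndexableDataset(OrderedDict([('features', features),
                                        ('targets', targets)]))
assert dataset.num_examples == 4

# Random access without a stream: the request is a list of example indices.
feats, targs = dataset.get_data(request=[0, 2])

# Sequential batched access through a DataStream.
stream = DataStream(dataset,
                    iteration_scheme=SequentialScheme(dataset.num_examples, 2))
for features_batch, targets_batch in stream.get_epoch_iterator():
    print(features_batch.shape, targets_batch)  # (2, 2) [0 1], twice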