def _test_mean_like_aggregator(scheme, func):
    """Common test function for both Mean and Perplexity."""
    features = numpy.array([[0, 3], [2, 9], [2, 4], [5, 1], [6, 7]],
                           dtype=theano.config.floatX)
    num_examples = features.shape[0]
    batch_size = 2
    dataset = IndexableDataset(OrderedDict([('features', features)]))
    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(num_examples,
                                                               batch_size))
    x = tensor.matrix('features')
    y = (x**0.5).sum(axis=0)
    y.name = 'y'
    z = y.sum()
    z.name = 'z'
    y.tag.aggregation_scheme = scheme(y, x.shape[0])
    z.tag.aggregation_scheme = scheme(z, x.shape[0])
    y_desired = func((features**0.5).mean(axis=0))
    z_desired = func((features**0.5).sum(axis=1).mean(axis=0))
    assert_allclose(DatasetEvaluator([y]).evaluate(data_stream)['y'],
                    numpy.array(y_desired, dtype=theano.config.floatX))
    assert_allclose(DatasetEvaluator([z]).evaluate(data_stream)['z'],
                    numpy.array(z_desired, dtype=theano.config.floatX))
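# A minimal usage sketch, not part of the original file: the docstring says
# the helper covers both Mean and Perplexity, which suggests callers like the
# following. The identity function for Mean and `numpy.exp` for Perplexity
# are assumptions about how `func` mirrors each aggregation scheme.
def test_mean_like_aggregators():
    _test_mean_like_aggregator(Mean, lambda x: x)
    _test_mean_like_aggregator(Perplexity, lambda x: numpy.exp(x))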
def test_flatten_batches(self):
    wrapper = Flatten(DataStream(IndexableDataset(self.data),
                                 iteration_scheme=SequentialScheme(4, 2)),
                      which_sources=('features',))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones((2, 4)), numpy.array([[0], [1]])),
                  (numpy.ones((2, 4)), numpy.array([[0], [1]]))])
def test_flatten_examples(self):
    wrapper = Flatten(DataStream(
        IndexableDataset(self.data),
        iteration_scheme=SequentialExampleScheme(4)),
        which_sources=('features',))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones(4), 0), (numpy.ones(4), 1)] * 2)
def test_mean_aggregator():
    num_examples = 4
    batch_size = 2
    features = numpy.array([[0, 3], [2, 9], [2, 4], [5, 1]],
                           dtype=theano.config.floatX)
    dataset = IndexableDataset(OrderedDict([('features', features)]))
    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(num_examples,
                                                               batch_size))
    x = tensor.matrix('features')
    y = (x**2).mean(axis=0)
    y.name = 'y'
    z = y.sum()
    z.name = 'z'
    y.tag.aggregation_scheme = Mean(y, 1.)
    z.tag.aggregation_scheme = Mean(z, 1.)
    assert_allclose(DatasetEvaluator([y]).evaluate(data_stream)['y'],
                    numpy.array([8.25, 26.75], dtype=theano.config.floatX))
    assert_allclose(DatasetEvaluator([z]).evaluate(data_stream)['z'],
                    numpy.array([35], dtype=theano.config.floatX))
def test_one_hot_batches_invalid_input(self):
    wrapper = OneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialScheme(4, 2)),
        num_classes=2, which_sources=('targets',))
    assert_raises(ValueError, list, wrapper.get_epoch_iterator())
def setup_datastream(path, batch_size, sort_batch_count, valid=False):
    prefix = 'valid' if valid else 'train'
    # Raw input frames, frame-level phoneme labels and the mapping from
    # each utterance to its span of phoneme rows.
    raw_x = numpy.load(os.path.join(path, prefix + '_x_raw.npy'))
    phonemes = numpy.load(os.path.join(path, prefix + '_phn.npy'))
    seq_to_phn = numpy.load(os.path.join(path, prefix + '_seq_to_phn.npy'))
    # Phoneme label sequence for each utterance.
    phoneme_seqs = [phonemes[span[0]:span[1], 2] for span in seq_to_phn]

    ds = IndexableDataset({'input': raw_x, 'output': phoneme_seqs})
    stream = DataStream(ds, iteration_scheme=ShuffledExampleScheme(len(raw_x)))
    # Group several batches worth of examples, sort them by input length and
    # re-batch, so each batch needs less padding.
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size *
                                                   sort_batch_count))
    comparison = _balanced_batch_helper(stream.sources.index('input'))
    stream = Mapping(stream, SortMapping(comparison))
    stream = Unpack(stream)
    stream = Batch(stream,
                   iteration_scheme=ConstantScheme(batch_size,
                                                   num_examples=len(raw_x)))
    stream = Padding(stream, mask_sources=['input', 'output'])
    return ds, stream
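# `_balanced_batch_helper` is referenced above but not defined in this
# snippet. A minimal sketch of what `SortMapping` needs here, assuming the
# helper keys each example by the length of the source at the given index
# (the behaviour is inferred from common Fuel recipes, not confirmed):
class _balanced_batch_helper(object):
    def __init__(self, key):
        self.key = key

    def __call__(self, data):
        # Sort examples by the length of the sequence at source index `key`.
        return len(data[self.key])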
def get_stream_raw(dataset, which_set, mini_batch_size):
    data = get_data(dataset)

    # dataset is a 3D array of shape: Time X Batch X Features
    dataset = data[which_set]
    time, batch, features = dataset.shape
    # Use integer division so nb_mini_batches is an int under Python 3 too.
    nb_mini_batches = batch // mini_batch_size
    dataset = dataset[:, :nb_mini_batches * mini_batch_size, :]

    # Create the target dataset: the features shifted by one time step.
    targets_dataset = dataset[1:, :, :]

    # Cut the dataset into several mini-batches.
    # dataset is now 4D (nb_mini_batches X Time X mini_batch_size X Features)
    dataset = np.swapaxes(dataset, 0, 1)
    targets_dataset = np.swapaxes(targets_dataset, 0, 1)
    dataset = np.reshape(dataset, (nb_mini_batches, mini_batch_size,
                                   time, features))
    targets_dataset = np.reshape(targets_dataset,
                                 (nb_mini_batches, mini_batch_size,
                                  time - 1, features))
    dataset = np.swapaxes(dataset, 1, 2)
    targets_dataset = np.swapaxes(targets_dataset, 1, 2)

    # Create the fuel dataset.
    dataset = IndexableDataset({'features': dataset,
                                'targets': targets_dataset})
    stream = DataStream(dataset,
                        iteration_scheme=SequentialExampleScheme(
                            nb_mini_batches))
    return stream
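# A tiny self-contained check of the reshape logic above (illustrative only;
# the shapes are made up): a batch axis of 8 with mini_batch_size=4 yields 2
# mini-batches, each restored to (time, mini_batch_size, features) order.
import numpy as np

_demo = np.arange(3 * 8 * 2).reshape(3, 8, 2)         # (time, batch, feat)
_demo = np.swapaxes(_demo, 0, 1).reshape(2, 4, 3, 2)  # cut into mini-batches
_demo = np.swapaxes(_demo, 1, 2)                      # (nb, time, mb, feat)
assert _demo.shape == (2, 3, 4, 2)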
def test_filter_batches(self):
    data = [1, 2, 3, 4]
    data_filtered = [([3, 4],)]
    stream = DataStream(IndexableDataset(data),
                        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
    assert_equal(list(wrapper.get_epoch_iterator()), data_filtered)
def setUp(self):
    self.string_data = [b'Hello', b'World!']
    self.dataset = IndexableDataset(
        indexables={'words': [numpy.fromstring(s, dtype='uint8')
                              for s in self.string_data]},
        axis_labels={'words': ('batch', 'bytes')})
def test_ngram_stream_raises_error_on_batch_stream():
    sentences = [list(numpy.random.randint(10, size=sentence_length))
                 for sentence_length in [3, 5, 7]]
    stream = DataStream(IndexableDataset(sentences),
                        iteration_scheme=SequentialScheme(3, 1))
    assert_raises(ValueError, NGrams, 4, stream)
def test_single_mapping_value_error_on_request():
    class IdentitySingleMapping(SingleMapping):
        def mapping(self, source):
            return source

    data_stream = DataStream(IndexableDataset([0, 1, 2]))
    transformer = IdentitySingleMapping(data_stream)
    assert_raises(ValueError, transformer.get_data, [0, 1])
def test_axis_labels_are_passed_through(self):
    stream = DataStream(
        IndexableDataset({'features': [1, 2, 3, 4]},
                         axis_labels={'features': ('batch',)}),
        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Filter(stream, lambda d: d[0][0] % 3 == 0)
    assert_equal(wrapper.axis_labels, stream.axis_labels)
def setUp(self):
    self.stream = DataStream(
        IndexableDataset(
            OrderedDict([('features', numpy.ones((4, 2, 2))),
                         ('targets', numpy.array([0, 1, 0, 1]))]),
            axis_labels={'features': ('batch', 'width', 'height'),
                         'targets': ('batch',)}),
        iteration_scheme=SequentialScheme(4, 2))
def load_data(self, data_path):
    logging.info("Loading: " + data_path)
    data = pd.read_csv(data_path, sep="\t", header=None)
    data.columns = ['rel', 'head', 'tail', 'score']
    assert not data.empty
    self.N = len(data)
    return IndexableDataset(data.to_dict('list'))
def get_dev_stream(valid_file, **kwargs):
    # Open in binary mode: pickle files are binary data, and the context
    # manager ensures the file handle is closed.
    with open(valid_file, 'rb') as f:
        valid_data = cPickle.load(f)
    images = [example[0] for example in valid_data]
    targets = [example[1] for example in valid_data]
    dataset = IndexableDataset(
        OrderedDict([('input', images), ('output', targets)]))
    return DataStream(dataset,
                      iteration_scheme=SequentialExampleScheme(len(images)))
def _construct_dataset(self, dataset):
    '''Construct a Fuel indexable dataset.

    Every field corresponds to a name in self.provide_sources.

    :param dataset: A tuple of data
    :return: An IndexableDataset wrapping the given data
    '''
    return IndexableDataset(
        indexables=OrderedDict(zip(self.provide_sources, dataset)))
def test_flatten():
    stream = DataStream(
        IndexableDataset(OrderedDict([('features', numpy.ones((4, 2, 2))),
                                      ('targets', numpy.array([0, 1, 0, 1]))])),
        iteration_scheme=SequentialScheme(4, 2))
    wrapper = Flatten(stream, which_sources=('features',))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones((2, 4)), numpy.array([0, 1])),
                  (numpy.ones((2, 4)), numpy.array([0, 1]))])
def test_axis_labels_on_flatten_examples(self):
    wrapper = Flatten(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4),
                   axis_labels={'features': ('batch', 'width', 'height'),
                                'targets': ('batch', 'index')}),
        which_sources=('features',))
    assert_equal(wrapper.axis_labels, {'features': ('feature',),
                                       'targets': ('index',)})
def test_axis_labels_on_flatten_batches_with_none(self):
    wrapper = Flatten(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialScheme(4, 2),
                   axis_labels={'features': None,
                                'targets': ('batch', 'index')}),
        which_sources=('features',))
    assert_equal(wrapper.axis_labels, {'features': None,
                                       'targets': ('batch', 'index')})
def test_min_max_aggregators():
    num_examples = 4
    batch_size = 2
    features = numpy.array([[2, 3], [2, 9], [2, 4], [5, 1]],
                           dtype=theano.config.floatX)
    dataset = IndexableDataset(OrderedDict([('features', features)]))
    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(num_examples,
                                                               batch_size))
    x = tensor.matrix('features')
    y = (x**2).sum(axis=0)
    y.name = 'y'
    z = y.min()
    z.name = 'z'
    y.tag.aggregation_scheme = Maximum(y)
    z.tag.aggregation_scheme = Minimum(z)
    assert_allclose(DatasetEvaluator([y]).evaluate(data_stream)['y'],
                    numpy.array([29, 90], dtype=theano.config.floatX))
    assert_allclose(DatasetEvaluator([z]).evaluate(data_stream)['z'],
                    numpy.array([8], dtype=theano.config.floatX))

    # Make sure accumulators are reset between evaluations.
    features = numpy.array([[2, 1], [1, 3], [1, -1], [2.5, 1]],
                           dtype=theano.config.floatX)
    dataset = IndexableDataset(OrderedDict([('features', features)]))
    data_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(num_examples,
                                                               batch_size))
    assert_allclose(DatasetEvaluator([y]).evaluate(data_stream)['y'],
                    numpy.array([7.25, 10], dtype=theano.config.floatX))
    assert_allclose(DatasetEvaluator([z]).evaluate(data_stream)['z'],
                    numpy.array([2], dtype=theano.config.floatX))
def replaceTestData(self, testNodes, maxNeighbors=1000, maskNames=['x']):
    if self.batchesInferences:
        batch_size = self.batch_size
    else:
        batch_size = 1
    testing, testIDs = encode_data_VarLen(self.G, testNodes, self.attrKey,
                                          maxNeighbors,
                                          useActualLabs=self.useActualLabs,
                                          useInputX2=self.useInputX2,
                                          onlyLabs=self.onlyLabs,
                                          lastH=self.lastHH, nodeIDs=True)
    dataset_test = IndexableDataset(testing)
    self.stream_test = DataStream(
        dataset=dataset_test,
        iteration_scheme=SequentialScheme(
            examples=dataset_test.num_examples, batch_size=batch_size))
    # Add masks. Sources are masked individually to avoid the
    # "all dimensions must be equal" error raised by Padding.
    # TODO: replace Padding with a custom padding transformer.
    self.stream_test = Padding(self.stream_test, mask_sources=maskNames)
    # Transpose the batches into the time-major layout the RNN expects.
    self.stream_test = Mapping(self.stream_test, self.transpose_streamTest)
    self.num_examples_test = dataset_test.num_examples
    # Replace the shared data with the test data.
    self.test_all, names = self.iterateShared(self.stream_test,
                                              makeShared=False,
                                              name="test")
    # If we are running inference in batches.
    if self.batchesInferences:
        for key in self.test_all:
            totalTestBatches = len(self.test_all[key])
            if key != 'nodeID':
                for i in range(0, totalTestBatches):
                    # If the test data has more batches than the shared data
                    # list, append new shared variables; otherwise reset the
                    # existing ones in place.
                    if i >= self.totalBatches:
                        newKey = key + '_myinput'
                        self.sharedData[key].append(
                            shared(self.test_all[key][i],
                                   name=self.sharedName + '_' + newKey +
                                   '_test_' + str(i)))
                    else:
                        self.sharedData[key][i].set_value(
                            self.test_all[key][i], borrow=True)
                self.sharedBatch[key].set_value(
                    self.sharedData[key][0].get_value(borrow=True),
                    borrow=True)
        self.stream_test_int = IntStream(0, totalTestBatches, 1,
                                         'int_stream')
def test_one_hot_examples(self):
    wrapper = OneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=4, which_sources=('targets',))
    assert_equal(list(wrapper.get_epoch_iterator()),
                 [(numpy.ones((2, 2)), numpy.array([[1, 0, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 0, 1]]))])
def get_stream(trainXY, batch_size=100):
    # trainXY = genSynXY()
    dataset_train = IndexableDataset(trainXY)
    stream_train_1 = DataStream(
        dataset=dataset_train,
        iteration_scheme=ShuffledScheme(
            examples=dataset_train.num_examples, batch_size=batch_size))
    stream_train_2 = Padding(stream_train_1)
    # stream_train_1.sources = ('x_mask_o', 'y_mask_o', 'x', 'y')
    stream_train_3 = Mapping(stream_train_2, transpose_stream)
    return (stream_train_3, dataset_train.num_examples)
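# `transpose_stream` is used above but not defined in this snippet. A minimal
# sketch of a mapping that moves batches from batch-major to time-major
# layout, as RNN code typically expects (the exact behaviour is an
# assumption, not confirmed by the original source):
def transpose_stream(data):
    # Swap the batch and time axes of every multi-dimensional source;
    # scalar and 1-D sources pass through unchanged.
    return tuple(source.swapaxes(0, 1) if source.ndim > 1 else source
                 for source in data)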
def test_rename():
    stream = DataStream(
        IndexableDataset(OrderedDict([('X', numpy.ones((4, 2, 2))),
                                      ('y', numpy.array([0, 1, 0, 1]))])),
        iteration_scheme=SequentialScheme(4, 2))
    transformer = Rename(stream, {'X': 'features', 'y': 'targets'})
    assert_equal(transformer.sources, ('features', 'targets'))
    assert_equal(list(transformer.get_epoch_iterator()),
                 [(numpy.ones((2, 2, 2)), numpy.array([0, 1])),
                  (numpy.ones((2, 2, 2)), numpy.array([0, 1]))])
    assert_raises(ValueError, transformer.get_data, [0, 1])
    assert_raises(KeyError, Rename, stream, {'Z': 'features'})
def createDataset(corpus=None, sequence_length=25, repeat=1):
    if not corpus:
        corpus = Corpus(open("corpus.txt").read())
    vocab_size = corpus.vocab_size()
    in_splits = corpus.get_splits(seq_len=sequence_length, repeat=repeat)
    out_splits = corpus.get_splits(seq_len=sequence_length, repeat=repeat,
                                   shifted=True)
    df = IndexableDataset({'inchar': in_splits.astype(np.uint8),
                           'outchar': out_splits.astype(np.uint8)})
    return df, vocab_size
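# A minimal usage sketch (assumed; `Corpus` and its methods are defined
# elsewhere): the 'outchar' source is the input shifted by one character, so
# the dataset can feed a next-character language model directly.
# dataset, vocab_size = createDataset(sequence_length=25)
# stream = DataStream(dataset,
#                     iteration_scheme=ShuffledScheme(dataset.num_examples,
#                                                     batch_size=32))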
def load_data(self, data_path):
    data_path = os.path.join(self.data_dir, data_path)
    if not os.path.isabs(data_path):
        data_path = os.path.join(DATA_DIR, data_path)
    logging.info("Loading: " + data_path)
    data = pd.read_csv(data_path, sep="\t", header=None)
    data.columns = ['rel', 'head', 'tail', 'score']
    assert not data.empty
    dataset = IndexableDataset(data.to_dict('list'))
    return dataset
def test_ignore_groups(self):
    stream_example = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        ignore_groups=[0, 2],
        which_sources=('targets',))
    assert_equal(list(stream_example.get_epoch_iterator()),
                 [(numpy.ones((2, 2)), numpy.array([[0, 1]])),
                  (numpy.ones((2, 2)), numpy.array([[1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1]])),
                  (numpy.ones((2, 2)), numpy.array([[1, 0]]))])

    stream_example2 = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialExampleScheme(4)),
        num_classes=self.num_classes,
        ignore_groups=[1, 2],
        which_sources=('targets',))
    assert_equal(list(stream_example2.get_epoch_iterator()),
                 [(numpy.ones((2, 2)), numpy.array([[1, 0, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 1, 0]])),
                  (numpy.ones((2, 2)), numpy.array([[0, 0, 1]]))])

    stream_batch = StructuredOneHotEncoding(
        DataStream(IndexableDataset(self.data),
                   iteration_scheme=SequentialScheme(4, 2)),
        num_classes=self.num_classes,
        ignore_groups=[0, 2],
        which_sources=('targets',))
    assert_equal(list(stream_batch.get_epoch_iterator()),
                 [(numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]])),
                  (numpy.ones((2, 2, 2)), numpy.array([[0, 1], [1, 0]]))])
def test_batch_iteration_scheme_with_lists(self):
    """Batch schemes should work with more than just ndarrays."""
    data = IndexableDataset(
        OrderedDict([('foo', list(range(50))),
                     ('bar', list(range(1, 51)))]))
    stream = DataStream(data,
                        iteration_scheme=ShuffledScheme(data.num_examples, 5))
    returned = [sum(batches, []) for batches in
                zip(*list(stream.get_epoch_iterator()))]
    assert set(returned[0]) == set(range(50))
    assert set(returned[1]) == set(range(1, 51))
def test_datastream_evaluator():
    stream = IndexableDataset(indexables=OrderedDict([
        ("data", np.ones((10, 4, 9), dtype="float32")),
    ])).get_example_stream()
    x = T.matrix("data")
    mon = x.sum(axis=1)
    mon.name = "mon"
    evaluator = DataStreamEvaluator([mon])
    results = evaluator.evaluate(stream)
    assert set(results.keys()) == set(['mon'])
    assert_allclose(results['mon'], np.ones(4 * 10) * 9)
def test_num_examples():
    assert_raises(ValueError, IterableDataset,
                  {'features': range(10), 'targets': range(7)})
    dataset = IterableDataset({'features': range(7), 'targets': range(7)})
    assert dataset.num_examples == 7
    dataset = IterableDataset(repeat(1))
    assert numpy.isnan(dataset.num_examples)
    x = numpy.random.rand(5, 3)
    y = numpy.random.rand(5, 4)
    dataset = IndexableDataset({'features': x, 'targets': y})
    assert dataset.num_examples == 5
    assert_raises(ValueError, IndexableDataset,
                  {'features': x, 'targets': y[:4]})