def test_ngram_delta_small_threshold_tf():
    """Test to verify that a small threshold works in ngrams."""
    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        ids = range(0, 99, 5)
        create_test_dataset(tmp_url, ids)

        fields = {
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        # Consecutive ids are 5 apart, so a delta_threshold of 1 can never be
        # satisfied and the reader should produce no ngrams at all.
        ngram = NGram(fields=fields, delta_threshold=1, timestamp_field=TestSchema.id)
        reader = Reader(
            schema_fields=ngram,
            dataset_url=tmp_url,
            reader_pool=DummyPool(),
        )

        with tf.Session() as sess:
            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))

        reader.stop()
        reader.join()
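# A tiny worked example of the gap check the test above relies on (illustrative
# only; assumes delta_threshold is an upper bound on the gap between consecutive
# timestamps in an ngram):
def _example_delta_threshold_gaps():
    timestamps = [0, 5, 10]  # same 5-wide spacing as the ids above
    delta_threshold = 1
    valid_pairs = [(a, b) for a, b in zip(timestamps, timestamps[1:])
                   if b - a <= delta_threshold]
    # No pair of consecutive timestamps is within the threshold, so no
    # length-2 ngram can be formed, matching the OutOfRangeError above.
    assert valid_pairs == []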
def _test_noncontinuous_ngram_tf(ngram_fields, synthetic_dataset):
    """Test non-continuous ngram in tf of a certain length. Non-continuous here
    means that the reader will not necessarily return consecutive ngrams,
    because the dataset has more than one partition and rows are shuffled."""
    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    reader = Reader(
        schema_fields=ngram,
        dataset_url=synthetic_dataset.url,
        reader_pool=ThreadPool(1),
    )

    readout_examples = tf_tensors(reader)

    # Make sure we have static shape info for all fields
    for timestep in readout_examples:
        for field in readout_examples[timestep]:
            assert field.get_shape().dims is not None

    # Read a bunch of entries from the dataset and compare the data to reference
    with tf.Session() as sess:
        for _ in range(5):
            actual = sess.run(readout_examples)
            expected_ngram = _get_named_tuple_from_ngram(
                ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
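# _get_named_tuple_from_ngram and _assert_equal_ngram are helpers defined
# elsewhere in this module. A minimal sketch of what _assert_equal_ngram might
# look like, assuming each ngram maps a timestep index to a namedtuple of field
# values (hypothetical; the real helper may differ):
def _assert_equal_ngram_sketch(actual_ngram, expected_ngram):
    # Both ngrams must cover exactly the same timesteps.
    np.testing.assert_equal(sorted(actual_ngram.keys()), sorted(expected_ngram.keys()))
    for timestep in expected_ngram:
        actual_fields = actual_ngram[timestep]._asdict()
        expected_fields = expected_ngram[timestep]._asdict()
        # Every field at every timestep must match the reference data.
        np.testing.assert_equal(sorted(actual_fields.keys()), sorted(expected_fields.keys()))
        for name in expected_fields:
            np.testing.assert_equal(actual_fields[name], expected_fields[name])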
def _read_from_tf_tensors(self, count, shuffling_queue_capacity, min_after_dequeue, ngram):
    """Used by several test cases. Reads 'count' rows using the reader.

    The reader is configured without row-group shuffling and guarantees a
    deterministic order of rows, up to the TF results-queue shuffling controlled
    by the 'shuffling_queue_capacity' and 'min_after_dequeue' arguments.

    Returns a tuple of: (actual data read from the dataset, the TF tensor
    returned by the reader).
    """
    # Nullable fields can not be read by tensorflow (what would be the dimension of a tensor for null data?)
    fields = set(TestSchema.fields.values()) - {
        TestSchema.matrix_nullable, TestSchema.string_array_nullable
    }
    schema_fields = (fields if ngram is None else ngram)

    reader = Reader(schema_fields=schema_fields,
                    dataset_url=self._dataset_url,
                    reader_pool=DummyPool(),
                    shuffle=False)

    row_tensors = tf_tensors(reader,
                             shuffling_queue_capacity=shuffling_queue_capacity,
                             min_after_dequeue=min_after_dequeue)

    # Read a bunch of entries from the dataset and compare the data to reference
    with tf.Session() as sess:
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer()
        ])
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord, start=True)

        # Collect all the data we need from 'count' number of reads
        rows_data = [sess.run(row_tensors) for _ in range(count)]

        coord.request_stop()
        coord.join(threads)

    reader.stop()
    reader.join()

    return rows_data, row_tensors
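# A hedged usage sketch for _read_from_tf_tensors (parameter values are
# illustrative assumptions, not values the real tests necessarily use). Per the
# docstring above, with shuffling_queue_capacity=0 no TF shuffling queue is
# attached, so two identical reads should come back in the same order:
def _example_deterministic_read(test_case):
    first_run, _ = test_case._read_from_tf_tensors(
        count=10, shuffling_queue_capacity=0, min_after_dequeue=0, ngram=None)
    second_run, _ = test_case._read_from_tf_tensors(
        count=10, shuffling_queue_capacity=0, min_after_dequeue=0, ngram=None)
    # Deterministic order: matching positions should hold matching ids.
    for first, second in zip(first_run, second_run):
        np.testing.assert_equal(first.id, second.id)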
def test_partition_multi_node(synthetic_dataset):
    """Tests that two readers on the same partition consistently return the same
    one-fifth shard of the data, and that different partitions do not overlap."""
    reader = Reader(synthetic_dataset.url, reader_pool=DummyPool(),
                    training_partition=0, num_training_partitions=5)
    reader_2 = Reader(synthetic_dataset.url, reader_pool=DummyPool(),
                      training_partition=0, num_training_partitions=5)
    results_1 = []
    expected = []
    for row in reader:
        actual = dict(row._asdict())
        results_1.append(actual)
        expected.append(
            next(d for d in synthetic_dataset.data if d['id'] == actual['id']))
    results_2 = [dict(row._asdict()) for row in reader_2]

    # Since order is non-deterministic, we need to sort results by id
    results_1.sort(key=lambda x: x['id'])
    results_2.sort(key=lambda x: x['id'])
    expected.sort(key=lambda x: x['id'])

    np.testing.assert_equal(expected, results_1)
    np.testing.assert_equal(results_1, results_2)
    assert len(results_1) < len(synthetic_dataset.data)

    # Test that separate partitions also have no overlap by checking ids
    id_set = {item['id'] for item in results_1}
    for partition in range(1, 5):
        with Reader(synthetic_dataset.url, reader_pool=DummyPool(),
                    training_partition=partition,
                    num_training_partitions=5) as reader_other:
            for row in reader_other:
                assert dict(row._asdict())['id'] not in id_set

    reader.stop()
    reader.join()
    reader_2.stop()
    reader_2.join()
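# The disjointness checked above is what a deterministic assignment of row
# groups to partitions would guarantee. A toy sketch of one such scheme
# (hypothetical; not necessarily how Reader actually shards the data):
def _example_shard_row_groups(num_row_groups, training_partition, num_training_partitions):
    # Each row group lands in exactly one partition, so any two distinct
    # partitions are disjoint by construction.
    return [i for i in range(num_row_groups)
            if i % num_training_partitions == training_partition]

# For example, with 10 row groups and 5 partitions, partition 0 would read
# row groups [0, 5] and partition 1 would read [1, 6].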
def test_ngram_length_1_tf(synthetic_dataset):
    """Test to verify that ngram generalizes to support length 1"""
    dataset_dicts = synthetic_dataset.data
    fields = {0: [TestSchema.id, TestSchema.id2]}
    ngram = NGram(fields=fields, delta_threshold=0.012, timestamp_field=TestSchema.id)
    reader = Reader(synthetic_dataset.url,
                    schema_fields=ngram,
                    shuffle_options=ShuffleOptions(True, 5),
                    reader_pool=DummyPool())

    with tf.Session() as sess:
        for _ in range(10):
            actual = sess.run(tf_tensors(reader))
            expected_ngram = _get_named_tuple_from_ngram(
                ngram, dataset_dicts, actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
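# For a length-1 ngram, the dict returned by sess.run(tf_tensors(reader))
# should hold the single timestep key 0, so the ngram comparison above
# degenerates to a single-row check. A minimal sketch of that invariant
# (hypothetical helper, following the tests above):
def _check_length_1_ngram_keys(actual):
    assert set(actual.keys()) == {0}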