def test_ngram_length_1_tf(synthetic_dataset):
    """Test to verify that ngram generalize to support length 1.

    Reads ten ngrams through the TF tensor path and checks each against the
    expected named tuple built from the synthetic dataset dictionaries.
    """
    dataset_dicts = synthetic_dataset.data
    fields = {0: [TestSchema.id, TestSchema.id2]}
    ngram = NGram(fields=fields, delta_threshold=0.012, timestamp_field=TestSchema.id)
    # Use the reader as a context manager (as the other tests in this file do)
    # so it is stopped and joined even when an assertion below fails; the
    # previous trailing stop()/join() calls were skipped on any error.
    with Reader(synthetic_dataset.url, schema_fields=ngram,
                shuffle_options=ShuffleOptions(True, 5), reader_pool=DummyPool()) as reader:
        with tf.Session() as sess:
            for _ in range(10):
                actual = sess.run(tf_tensors(reader))
                # The smallest key is the first timestep of the ngram; its id
                # anchors the expected ngram in the dataset dictionaries.
                expected_ngram = _get_named_tuple_from_ngram(
                    ngram, dataset_dicts, actual[min(actual.keys())].id)
                _assert_equal_ngram(actual, expected_ngram)
def _test_continuous_ngram(ngram_fields, dataset_num_files_1):
    """Check that consecutive ngrams of the given shape are returned in order.

    'Continuous' means the reader yields consecutive ngrams because shuffling
    is disabled and the dataset has a single partition.
    """
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_num_files_1.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:
        # One iteration per ngram position; ids start at 0 and increase by 1.
        for expected_id in range(ngram.length):
            expected = _get_named_tuple_from_ngram(
                ngram, dataset_num_files_1.data, expected_id)
            np.testing.assert_equal(next(reader), expected)
def test_ngram_validation():
    """Test to verify that ngram validation work as expected."""
    fields = {
        0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }

    # Each entry is a set of constructor kwargs that must be rejected.
    invalid_kwargs = [
        # delta threshold must be an int
        dict(fields=fields, delta_threshold='abc', timestamp_field=TestSchema.id),
        # timestamp_field must be a field
        dict(fields=fields, delta_threshold=5, timestamp_field=5),
        # Fields must be a dict
        dict(fields=[], delta_threshold=5, timestamp_field=TestSchema.id),
        # Each value in fields must be an array
        dict(fields={0: 'test'}, delta_threshold=5, timestamp_field=TestSchema.id),
        # timestamp_overlap must be bool
        dict(fields=fields, delta_threshold=0.5, timestamp_field=TestSchema.id,
             timestamp_overlap=2),
    ]
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            NGram(**kwargs)

    # Positive cases: both float and Decimal thresholds are accepted.
    NGram(fields=fields, delta_threshold=0.5, timestamp_field=TestSchema.id)
    NGram(fields=fields, delta_threshold=Decimal('0.5'), timestamp_field=TestSchema.id)
def test_ngram_shuffle_drop_ratio(synthetic_dataset, reader_factory):
    """Test to verify the shuffle drop ratio work as expected."""
    fields = {
        -2: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -1: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        0: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        2: [TestSchema.id, TestSchema.id2],
    }
    ngram = NGram(fields=fields, delta_threshold=10, timestamp_field=TestSchema.id)

    def _first_step_ids(**reader_kwargs):
        # Collect the id of the first ngram step for every ngram produced.
        with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                            **reader_kwargs) as reader:
            return [ngram_row[0].id for ngram_row in reader]

    unshuffled = _first_step_ids(shuffle_row_groups=False)
    shuffled = _first_step_ids(shuffle_row_groups=True, shuffle_row_drop_partitions=6)

    # Dropping partitions must not lose ngrams, only reorder them.
    assert len(unshuffled) == len(shuffled)
    assert unshuffled != shuffled
def test_ngram_basic_longer_no_overlap(synthetic_dataset, reader_factory):
    """Tests basic ngram with no delta threshold with no overlaps of timestamps."""
    fields = {
        -5: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -4: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        -3: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        -2: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        -1: [TestSchema.id, TestSchema.id2],
    }
    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=fields, delta_threshold=10,
                  timestamp_field=TestSchema.id, timestamp_overlap=False)

    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_row_groups=False) as reader:
        seen_ids = set()
        for actual in reader:
            first_step = actual[min(actual.keys())]
            expected = _get_named_tuple_from_ngram(ngram, dataset_dicts, first_step.id)
            np.testing.assert_equal(actual, expected)

            # With timestamp_overlap=False no timestamp may appear twice,
            # neither across ngrams nor within one.
            for step in actual.values():
                assert step.id not in seen_ids
                seen_ids.add(step.id)
def _test_continuous_ngram_returns(ngram_fields, ts_field, dataset_num_files_1, reader_factory):
    """Check consecutive ngrams of a given shape and return the NGram object.

    'Continuous' means the reader yields consecutive ngrams because shuffling
    is disabled and the dataset has a single partition.
    """
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=ts_field)
    with reader_factory(dataset_num_files_1.url, schema_fields=ngram,
                        shuffle_row_groups=False) as reader:
        # One iteration per ngram position; ids start at 0 and increase by 1.
        for expected_id in range(ngram.length):
            expected = _get_named_tuple_from_ngram(
                ngram, dataset_num_files_1.data, expected_id)
            np.testing.assert_equal(next(reader), expected)
    return ngram
def test_ngram_delta_threshold(dataset_0_3_8_10_11_20_23):
    """Test to verify that delta threshold work as expected in one partition in the same ngram
    and between consecutive ngrams. delta threshold here refers that each ngram must not be
    more than delta threshold apart for the field specified by timestamp_field."""
    fields = {
        0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields, delta_threshold=4, timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram,
                dataset_url=dataset_0_3_8_10_11_20_23.url,
                reader_pool=ThreadPool(1),
                shuffle_options=ShuffleOptions(False)) as reader:
        # NGrams expected: (0, 3), (8, 10), (10, 11) — i.e. the ngrams
        # anchored at dataset indices 0, 3 and 5.
        for anchor_index in (0, 3, 5):
            expected = _get_named_tuple_from_ngram(
                ngram, dataset_0_3_8_10_11_20_23.data, anchor_index)
            np.testing.assert_equal(next(reader), expected)

        # No further pair of ids lies within the delta threshold.
        with pytest.raises(StopIteration):
            next(reader)
def test_ngram_delta_small_threshold():
    """Test to verify that a small threshold work in ngrams."""
    with temporary_directory() as tmp_dir:
        tmp_url = 'file://{}'.format(tmp_dir)
        # Ids are 5 apart, so with delta_threshold=1 no ngram can be formed.
        create_test_dataset(tmp_url, range(0, 99, 5))

        fields = {
            0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
            1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        }
        ngram = NGram(fields=fields, delta_threshold=1, timestamp_field=TestSchema.id)
        with Reader(schema_fields=ngram, dataset_url=tmp_url,
                    reader_pool=ThreadPool(10)) as reader:
            with pytest.raises(StopIteration):
                next(reader)
def test_ngram_shuffle_drop_ratio(synthetic_dataset):
    """Test to verify the shuffle drop ratio work as expected.

    NOTE(review): a function with this same name but a different signature
    appears earlier; if both live in the same module, pytest only collects the
    later definition — confirm whether both variants are intended.
    """
    fields = {
        -2: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -1: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        0: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        2: [TestSchema.id, TestSchema.id2],
    }
    ngram = NGram(fields=fields, delta_threshold=10, timestamp_field=TestSchema.id)

    with Reader(synthetic_dataset.url, schema_fields=ngram,
                shuffle_options=ShuffleOptions(False), reader_pool=DummyPool()) as reader:
        ordered_ids = [ngram_row[0].id for ngram_row in reader]
    with Reader(synthetic_dataset.url, schema_fields=ngram,
                shuffle_options=ShuffleOptions(True, 6), reader_pool=DummyPool()) as reader:
        shuffled_ids = [ngram_row[0].id for ngram_row in reader]

    # Dropping partitions must not lose ngrams, only reorder them.
    assert len(ordered_ids) == len(shuffled_ids)
    assert ordered_ids != shuffled_ids
def test_dataset_on_ngram_not_supported(synthetic_dataset, reader_factory):
    """make_petastorm_dataset must raise for a reader configured with an NGram schema."""
    ngram_fields = {0: list(_EXCLUDE_FIELDS), 1: [TestSchema.id]}
    ngram = NGram(fields=ngram_fields, delta_threshold=100, timestamp_field=TestSchema.id)
    with reader_factory(synthetic_dataset.url, schema_fields=ngram) as reader:
        with pytest.raises(NotImplementedError):
            make_petastorm_dataset(reader)