def test_normalize_shuffle_partitions(synthetic_dataset):
    dataset = pq.ParquetDataset(synthetic_dataset.path)

    shuffle_options = ShuffleOptions(True, 2)
    Reader._normalize_shuffle_options(shuffle_options, dataset)
    assert shuffle_options.shuffle_row_drop_partitions == 2

    shuffle_options = ShuffleOptions(True, 1000)
    Reader._normalize_shuffle_options(shuffle_options, dataset)
    assert shuffle_options.shuffle_row_drop_partitions == 10
def compute_correlation_distribution(dataset_url, id_column, shuffle_options, num_corr_samples=100):
    """Computes the correlation distribution of given shuffle_options on an existing dataset.

    Use this to compare two different shuffling options. It is encouraged to use a dataset generated
    by generate_shuffle_analysis_dataset for this analysis.

    :param dataset_url: Dataset url to compute correlation distribution of
    :param id_column: Column where an integer or string id can be found
    :param shuffle_options: Shuffle options to test correlation against
    :param num_corr_samples: How many correlation samples to take to compute the distribution
    :return: (mean, standard deviation) of the computed distribution
    """
    # Read the dataset in order, without any shuffling (need to use a dummy pool for this).
    with Reader(dataset_url, shuffle_options=ShuffleOptions(False), reader_pool=DummyPool()) as reader:
        unshuffled = [row[id_column] for row in reader]

    correlations = []
    for _ in range(num_corr_samples):
        with Reader(dataset_url, shuffle_options=shuffle_options) as reader:
            shuffled = [row[id_column] for row in reader]
            correlations.append(abs(np.corrcoef(unshuffled, shuffled)[0, 1]))

    mean = np.mean(correlations)
    std_dev = np.std(correlations)

    return mean, std_dev
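
# A minimal usage sketch (not a test): compares the shuffle quality of two
# ShuffleOptions configurations. The dataset URL and id column below are
# hypothetical; substitute a dataset produced by generate_shuffle_analysis_dataset.
def _example_compare_shuffle_options():
    light_shuffle = ShuffleOptions(True, 1)
    heavy_shuffle = ShuffleOptions(True, 5)

    mean_light, std_light = compute_correlation_distribution(
        'file:///tmp/shuffle_analysis_dataset', 'id', light_shuffle)
    mean_heavy, std_heavy = compute_correlation_distribution(
        'file:///tmp/shuffle_analysis_dataset', 'id', heavy_shuffle)

    # A lower mean absolute correlation with the unshuffled order indicates a
    # more thorough shuffle.
    print('drop partitions=1: mean=%f, std=%f' % (mean_light, std_light))
    print('drop partitions=5: mean=%f, std=%f' % (mean_heavy, std_heavy))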
def test_ngram_basic_longer_no_overlap(synthetic_dataset, reader_factory):
    """Tests a basic ngram with a delta threshold and no overlap of timestamps between ngrams."""
    fields = {
        -5: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -4: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        -3: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        -2: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        -1: [TestSchema.id, TestSchema.id2]
    }

    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=fields, delta_threshold=10, timestamp_field=TestSchema.id,
                  timestamp_overlap=False)
    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_options=ShuffleOptions(False)) as reader:
        timestamps_seen = set()
        for actual in reader:
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts,
                                                         actual[min(actual.keys())].id)
            np.testing.assert_equal(actual, expected_ngram)
            for step in actual.values():
                timestamp = step.id
                assert timestamp not in timestamps_seen
                timestamps_seen.add(timestamp)
def test_predicate_on_multiple_fields(synthetic_dataset, reader_factory):
    expected_values = {'id': 11, 'id2': 1}
    with reader_factory(synthetic_dataset.url, shuffle_options=ShuffleOptions(False),
                        predicate=EqualPredicate(expected_values)) as reader:
        actual = next(reader)
        assert actual.id == expected_values['id']
        assert actual.id2 == expected_values['id2']
def _test_continuous_ngram_tf(ngram_fields, dataset_num_files_1, reader_factory):
    """Tests continuous ngram of a certain length in tf.

    Continuous here means the reader always returns consecutive ngrams, because shuffling is
    disabled and the dataset has a single partition.
    """
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    with reader_factory(dataset_num_files_1.url, schema_fields=ngram,
                        shuffle_options=ShuffleOptions(False)) as reader:

        readout_examples = tf_tensors(reader)

        # Make sure we have static shape info for all fields
        for timestep in readout_examples:
            for field in readout_examples[timestep]:
                assert field.get_shape().dims is not None

        # Read a bunch of entries from the dataset and compare the data to reference
        expected_id = 0
        with tf.Session() as sess:
            for _ in range(5):
                actual = sess.run(readout_examples)
                expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_num_files_1.data,
                                                             expected_id)
                _assert_equal_ngram(actual, expected_ngram)
                expected_id = expected_id + 1
def readout_all_ids(shuffle, drop_ratio):
    with Reader(dataset_url=synthetic_dataset.url,
                reader_pool=DummyPool(),
                shuffle_options=ShuffleOptions(shuffle, drop_ratio)) as reader:
        ids = [row.id for row in reader]
    return ids
def test_ngram_delta_threshold_tf(dataset_0_3_8_10_11_20_23):
    """Verifies that delta_threshold works as expected within one partition, both inside a single
    ngram and between consecutive ngrams.

    delta_threshold here means that consecutive timesteps of an ngram must not be more than
    delta_threshold apart in the field specified by timestamp_field.
    """
    fields = {
        0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields, delta_threshold=4, timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram, dataset_url=dataset_0_3_8_10_11_20_23.url,
                reader_pool=DummyPool(), shuffle_options=ShuffleOptions(False)) as reader:
        # Ngrams expected: (0, 3), (8, 10), (10, 11)
        with tf.Session() as sess:
            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            first_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 0)
            _assert_equal_ngram(first_item, expected_item)

            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            second_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 3)
            _assert_equal_ngram(second_item, expected_item)

            readout = tf_tensors(reader)
            for timestep in readout:
                for field in readout[timestep]:
                    assert field.get_shape().dims is not None
            third_item = sess.run(readout)
            expected_item = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 5)
            _assert_equal_ngram(third_item, expected_item)

            with pytest.raises(OutOfRangeError):
                sess.run(tf_tensors(reader))
def test_ngram_shuffle_drop_ratio(synthetic_dataset, reader_factory):
    """Verifies that the shuffle drop ratio works as expected."""
    fields = {
        -2: [TestSchema.id, TestSchema.id2, TestSchema.matrix],
        -1: [TestSchema.id, TestSchema.id2, TestSchema.image_png],
        0: [TestSchema.id, TestSchema.id2, TestSchema.decimal],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
        2: [TestSchema.id, TestSchema.id2]
    }
    ngram = NGram(fields=fields, delta_threshold=10, timestamp_field=TestSchema.id)
    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_options=ShuffleOptions(False)) as reader:
        unshuffled = [row[0].id for row in reader]

    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_options=ShuffleOptions(True, 6)) as reader:
        shuffled = [row[0].id for row in reader]

    assert len(unshuffled) == len(shuffled)
    assert unshuffled != shuffled
def test_predicate_with_invalid_fields(synthetic_dataset):
    """Try passing an invalid field name from a predicate to the reader. An error should be raised."""
    TEST_CASES = [
        {'invalid_field_name': 1},
        dict(),
        {'invalid_field_name': 1, 'id': 11},
        {'invalid_field_name': 1, 'invalid_field_name_2': 11},
    ]

    for predicate_spec in TEST_CASES:
        with Reader(synthetic_dataset.url, shuffle_options=ShuffleOptions(False),
                    predicate=EqualPredicate(predicate_spec), reader_pool=ThreadPool(1)) as reader:
            with pytest.raises(ValueError):
                next(reader)
def test_ngram_length_1(synthetic_dataset, reader_factory):
    """Verifies that ngrams generalize to support length 1."""
    dataset_dicts = synthetic_dataset.data
    fields = {0: [TestSchema.id, TestSchema.id2]}
    ngram = NGram(fields=fields, delta_threshold=0.012, timestamp_field=TestSchema.id)
    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_options=ShuffleOptions(True, 3)) as reader:
        for _ in range(10):
            actual = next(reader)
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts,
                                                         actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)
def test_stable_pieces_order(synthetic_dataset):
    """Tests that an unshuffled reader returns rows in the same, stable order across reruns."""
    RERUN_THE_TEST_COUNT = 20
    baseline_run = None
    for _ in range(RERUN_THE_TEST_COUNT):
        with Reader(synthetic_dataset.url, schema_fields=[TestSchema.id],
                    shuffle_options=ShuffleOptions(False), reader_pool=DummyPool()) as reader:
            this_run = [row.id for row in reader]

        if baseline_run:
            assert this_run == baseline_run

        baseline_run = this_run
def _test_noncontinuous_ngram(ngram_fields, synthetic_dataset, reader_factory):
    """Tests noncontinuous ngram of a certain length.

    Noncontinuous here means the reader will not necessarily return consecutive ngrams, because
    shuffling is enabled and there is more than one partition.
    """
    dataset_dicts = synthetic_dataset.data
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    with reader_factory(synthetic_dataset.url, schema_fields=ngram,
                        shuffle_options=ShuffleOptions(True, 5)) as reader:
        for _ in range(10):
            actual = next(reader)
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts,
                                                         actual[min(actual.keys())].id)
            np.testing.assert_equal(actual, expected_ngram)
def _test_continuous_ngram(ngram_fields, dataset_num_files_1, reader_factory):
    """Tests continuous ngram of a certain length.

    Continuous here means the reader always returns consecutive ngrams, because shuffling is
    disabled and the dataset has a single partition.
    """
    ngram = NGram(fields=ngram_fields, delta_threshold=10, timestamp_field=TestSchema.id)
    with reader_factory(dataset_num_files_1.url, schema_fields=ngram,
                        shuffle_options=ShuffleOptions(False)) as reader:
        expected_id = 0
        for _ in range(ngram.length):
            actual = next(reader)
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_num_files_1.data,
                                                         expected_id)
            np.testing.assert_equal(actual, expected_ngram)
            expected_id = expected_id + 1
def test_invalid_schema_field(synthetic_dataset):
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. We would expect to get a reasonable error message.
    BogusSchema = Unischema('BogusSchema', [
        UnischemaField('partition_key', np.string_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
               shuffle_options=ShuffleOptions(False), predicate=EqualPredicate(expected_values),
               reader_pool=ThreadPool(1))

    assert 'bogus_key' in str(e)
def test_ngram_length_1_tf(synthetic_dataset):
    """Verifies that ngrams generalize to support length 1."""
    dataset_dicts = synthetic_dataset.data
    fields = {0: [TestSchema.id, TestSchema.id2]}
    ngram = NGram(fields=fields, delta_threshold=0.012, timestamp_field=TestSchema.id)
    reader = Reader(synthetic_dataset.url, schema_fields=ngram,
                    shuffle_options=ShuffleOptions(True, 5), reader_pool=DummyPool())
    with tf.Session() as sess:
        for _ in range(10):
            actual = sess.run(tf_tensors(reader))
            expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_dicts,
                                                         actual[min(actual.keys())].id)
            _assert_equal_ngram(actual, expected_ngram)

    reader.stop()
    reader.join()
def test_ngram_delta_threshold(dataset_0_3_8_10_11_20_23):
    """Verifies that delta_threshold works as expected within one partition, both inside a single
    ngram and between consecutive ngrams.

    delta_threshold here means that consecutive timesteps of an ngram must not be more than
    delta_threshold apart in the field specified by timestamp_field.
    """
    fields = {
        0: [TestSchema.id, TestSchema.id2, TestSchema.image_png, TestSchema.matrix],
        1: [TestSchema.id, TestSchema.id2, TestSchema.sensor_name],
    }
    ngram = NGram(fields=fields, delta_threshold=4, timestamp_field=TestSchema.id)
    with Reader(schema_fields=ngram, dataset_url=dataset_0_3_8_10_11_20_23.url,
                reader_pool=ThreadPool(1), shuffle_options=ShuffleOptions(False)) as reader:
        # NGrams expected: (0, 3), (8, 10), (10, 11)

        first_item = next(reader)
        expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 0)
        np.testing.assert_equal(first_item, expected_ngram)

        second_item = next(reader)
        expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 3)
        np.testing.assert_equal(second_item, expected_ngram)

        third_item = next(reader)
        expected_ngram = _get_named_tuple_from_ngram(ngram, dataset_0_3_8_10_11_20_23.data, 5)
        np.testing.assert_equal(third_item, expected_ngram)

        with pytest.raises(StopIteration):
            next(reader)
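
# Illustrative sketch (not part of the test suite) of the delta_threshold gating
# rule the tests above rely on: a length-2 ngram over timestamps is valid only if
# its two timesteps are at most delta_threshold apart. _valid_length_2_ngrams is a
# hypothetical helper, not a petastorm API. Note that (20, 23) satisfies the
# threshold here but is absent from the expected ngrams above, presumably because
# those rows fall into different row groups in the fixture and the reader does not
# assemble ngrams across row group boundaries.
def _valid_length_2_ngrams(timestamps, delta_threshold):
    pairs = zip(timestamps, timestamps[1:])
    return [(a, b) for a, b in pairs if b - a <= delta_threshold]

assert _valid_length_2_ngrams([0, 3, 8, 10, 11, 20, 23], 4) == \
    [(0, 3), (8, 10), (10, 11), (20, 23)]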
def readout_all_ids(shuffle):
    with Reader(synthetic_dataset.url, shuffle_options=ShuffleOptions(shuffle),
                reader_pool=ThreadPool(1)) as reader:
        ids = [row.id for row in reader]
    return ids