def test_predicate_on_multiple_fields(synthetic_dataset, reader_factory):
    """Check that an equality predicate over two fields ('id' and 'id2') is honored by the first row read."""
    wanted = {'id': 11, 'id2': 1}
    predicate = EqualPredicate(wanted)
    with reader_factory(synthetic_dataset.url, shuffle_row_groups=False, predicate=predicate) as reader:
        first_row = next(reader)
        # Both fields named in the predicate must carry the requested values.
        assert first_row.id == wanted['id']
        assert first_row.id2 == wanted['id2']
def test_two_column_predicate(synthetic_dataset, reader_factory):
    """Test querying two columns ('id2' and 'partition_key') with an equality predicate on both of them."""
    predicate = EqualPredicate({'id2': 1, 'partition_key': 'p_2'})
    requested_fields = [TestSchema.id2, TestSchema.partition_key]
    with reader_factory(synthetic_dataset.url, schema_fields=requested_fields, predicate=predicate) as reader:
        rows = list(reader)
        # Gather each column into an array so the whole result set is checked at once.
        id2_values = np.array([row.id2 for row in rows])
        partition_keys = np.array([row.partition_key for row in rows])
        assert (id2_values == 1).all()
        assert (partition_keys == 'p_2').all()
def test_single_column_predicate(synthetic_dataset):
    """Test querying a single column with an equality predicate on that same column.

    Every row produced by the reader must carry the id requested by the predicate and must agree with the
    matching reference entry in ``synthetic_dataset.data``. At least one row is expected to be returned.
    """
    expected_id = 1
    with Reader(synthetic_dataset.url, schema_fields=[TestSchema.id], predicate=EqualPredicate({'id': expected_id}),
                reader_pool=ThreadPool(1)) as reader:
        rows_seen = 0
        # Read the entries from the dataset and compare the data to reference
        for row in reader:
            actual = dict(row._asdict())
            # Looking the reference row up by the actual id and then comparing ids is tautological on its own,
            # so first verify that the predicate really filtered the rows.
            np.testing.assert_equal(actual['id'], expected_id)
            expected = next(d for d in synthetic_dataset.data if d['id'] == actual['id'])
            np.testing.assert_equal(expected['id'], actual['id'])
            rows_seen += 1

    # Without this check the test would pass vacuously if the reader returned nothing.
    assert rows_seen > 0
def test_predicate_with_invalid_fields(synthetic_dataset, reader_factory):
    """Reading with a predicate spec that is empty or refers to an invalid field name must raise ValueError."""
    bad_predicate_specs = (
        {'invalid_field_name': 1},
        {},
        {'invalid_field_name': 1, 'id': 11},
        {'invalid_field_name': 1, 'invalid_field_name_2': 11},
    )
    for spec in bad_predicate_specs:
        # The error is expected when the first row is requested, not when the reader is created.
        with reader_factory(synthetic_dataset.url, shuffle_row_groups=False,
                            predicate=EqualPredicate(spec)) as reader, pytest.raises(ValueError):
            next(reader)
def test_invalid_schema_field(synthetic_dataset):
    """Constructing a Reader with schema fields that include 'bogus_key' must raise a ValueError naming it."""
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        # np.bytes_ is the same type as the np.string_ alias, which was removed in NumPy 2.0.
        UnischemaField('partition_key', np.bytes_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError) as e:
        Reader(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(),
               shuffle_options=ShuffleOptions(False),
               predicate=EqualPredicate(expected_values), reader_pool=ThreadPool(1))

    # Inspect the raised exception itself; str() of the ExceptionInfo wrapper is not the error message.
    assert 'bogus_key' in str(e.value)
def test_predicate_with_invalid_fields(synthetic_dataset):
    """Reading with a predicate spec that is empty or refers to an invalid field name must raise ValueError."""
    bad_predicate_specs = (
        {'invalid_field_name': 1},
        {},
        {'invalid_field_name': 1, 'id': 11},
        {'invalid_field_name': 1, 'invalid_field_name_2': 11},
    )
    for spec in bad_predicate_specs:
        # The error is expected when the first row is requested, not when the Reader is created.
        with Reader(synthetic_dataset.url, shuffle_options=ShuffleOptions(False),
                    predicate=EqualPredicate(spec), reader_pool=ThreadPool(1)) as reader, \
                pytest.raises(ValueError):
            next(reader)
def test_invalid_schema_field(synthetic_dataset, reader_factory):
    """Creating a reader with schema fields that include 'bogus_key' must raise a ValueError mentioning it."""
    # Let's assume we are selecting columns using a schema which is different from the one
    # stored in the dataset. Would expect to get a reasonable error message
    BogusSchema = Unischema('BogusSchema', [
        # np.bytes_ is the same type as the np.string_ alias, which was removed in NumPy 2.0.
        UnischemaField('partition_key', np.bytes_, (), ScalarCodec(StringType()), False),
        UnischemaField('id', np.int64, (), ScalarCodec(LongType()), False),
        UnischemaField('bogus_key', np.int32, (), ScalarCodec(ShortType()), False)
    ])

    expected_values = {'bogus_key': 11, 'id': 1}
    with pytest.raises(ValueError, match='bogus_key'):
        reader_factory(synthetic_dataset.url, schema_fields=BogusSchema.fields.values(), shuffle_row_groups=False,
                       predicate=EqualPredicate(expected_values))
def test_single_column_predicate(synthetic_dataset, reader_factory):
    """Test querying a single column with an equality predicate on that same column."""
    id_is_one = EqualPredicate({'id': 1})
    with reader_factory(synthetic_dataset.url, schema_fields=[TestSchema.id], predicate=id_is_one) as reader:
        rows = list(reader)
        # Exactly one row is expected, and it must carry the requested id.
        assert len(rows) == 1
        assert rows[0].id == 1