# Example 1
def test_rowgroup_selector_partition_key(synthetic_dataset, reader_factory):
    """Reading with a single-index selector on the partition key returns only
    the rows belonging to partition 'p_1'."""
    selector = SingleIndexSelector(TestSchema.partition_key.name, ['p_1'])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        total_rows = 0
        for _ in reader:
            total_rows += 1
        # One partition's worth of rows is expected.
        assert total_rows == 10
# Example 2
def test_rowgroup_selector_wrong_index_name(synthetic_dataset):
    """Constructing a Reader with a selector that names a non-existent dataset
    index must raise ValueError."""
    bad_selector = SingleIndexSelector('WrongIndexName', ['some_value'])
    with pytest.raises(ValueError):
        Reader(synthetic_dataset.url, rowgroup_selector=bad_selector, reader_pool=DummyPool())
# Example 3
def test_rowgroup_selector_string_field(synthetic_dataset, reader_factory):
    """Select row groups to read via a dataset index on a string field."""
    selector = SingleIndexSelector(TestSchema.sensor_name.name, ['test_sensor'])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        row_total = sum(1 for _ in reader)

        # All sensors in the synthetic dataset share the same name, so the
        # selector matches every row group and the whole dataset comes back
        # (100 rows per the assertion; a prior comment said 1000, which looks
        # stale — NOTE(review): confirm against the dataset generator).
        assert row_total == 100
# Example 4
def test_rowgroup_selector_multiple_fields_union(synthetic_dataset, reader_factory):
    """A UnionIndexSelector returns rows matching either of its child selectors."""
    selector = UnionIndexSelector([
        SingleIndexSelector(TestSchema.sensor_name.name, ['test_sensor']),
        SingleIndexSelector(TestSchema.id.name, [2, 18]),
    ])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        seen_id_2 = False
        seen_id_18 = False
        seen_sensor = False
        total_rows = 0
        for sample in reader:
            seen_id_2 = seen_id_2 or sample.id == 2
            seen_id_18 = seen_id_18 or sample.id == 18
            seen_sensor = seen_sensor or sample.sensor_name == 'test_sensor'
            total_rows += 1
        # Every value targeted by either child selector must appear.
        assert seen_id_2 and seen_id_18 and seen_sensor
        # The sensor_name selector alone matches all row groups (100 rows).
        assert total_rows == 100
# Example 5
def test_rowgroup_selector_nullable_array_field(synthetic_dataset, reader_factory):
    """Select row groups via a dataset index on a nullable string-array field."""
    selector = SingleIndexSelector(TestSchema.string_array_nullable.name, ['100'])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        matched_rows = sum(1 for _ in reader)

        # The field is generated from the row id roughly as:
        #   None if id % 5 == 0 else np.asarray([], dtype=np.string_) if id % 4 == 0 else
        #   np.asarray([str(i+id) for i in xrange(2)], dtype=np.string_)
        # so '100' could appear in row id 99 (as 99+1) and row id 100 (as 100+0),
        # but row 100 is nulled out by the 'id % 5 == 0' branch. Only the single
        # row group containing row 99 should therefore be selected.
        assert matched_rows == 10
# Example 6
def test_rowgroup_selector_integer_field(synthetic_dataset, reader_factory):
    """Select row groups to read via a dataset index on an integer field."""
    selector = SingleIndexSelector(TestSchema.id.name, [2, 18])
    with reader_factory(synthetic_dataset.url, rowgroup_selector=selector) as reader:
        found = {2: False, 18: False}
        n_rows = 0
        for sample in reader:
            if sample.id in found:
                found[sample.id] = True
            n_rows += 1
        # Both requested id values must show up in the reader output.
        assert all(found.values())
        # Two row groups are selected; the assertion implies 10 rows per group
        # (a prior comment said 100 per group, which conflicts with the expected
        # total of 20 — NOTE(review): confirm the per-group row count).
        assert n_rows == 20
# Example 7
def test_regenerate_using_row_group_summary_metadata(synthetic_dataset, tmpdir):
    """Regenerating metadata with --use-summary-metadata keeps the dataset
    readable, including rowgroup-selector reads."""
    dataset_copy = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, dataset_copy)

    # Sanity check: the copied dataset is readable before regeneration.
    _check_reader(dataset_copy)

    # Regenerate metadata; schema information comes from the existing
    # _common_metadata file.
    petastorm_generate_metadata._main(
        ['--dataset_url', 'file://{}'.format(dataset_copy), '--use-summary-metadata'])

    # Summary metadata must be present on the regenerated dataset.
    assert pq.ParquetDataset(dataset_copy).metadata

    # Rowgroup-selector reads depend on indexes carried in the metadata, so
    # they must still work after regeneration.
    _check_reader(dataset_copy, SingleIndexSelector(TestSchema.id.name, [2, 18]))
# Example 8
def test_regenerate_row_group_metadata(synthetic_dataset, tmpdir):
    """Deleting the _metadata file breaks reading; regenerating it via
    petastorm_generate_metadata restores rowgroup-selector reads."""
    dataset_copy = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, dataset_copy)

    # Sanity check: the copied dataset is readable before any tampering.
    _check_reader(dataset_copy)

    # Remove only the summary metadata file.
    os.remove(pq.ParquetDataset(dataset_copy).metadata_path)

    # With the metadata gone, reading must fail with ValueError.
    with pytest.raises(ValueError):
        _check_reader(dataset_copy)

    # Regenerate metadata; schema information comes from the existing
    # _common_metadata file.
    petastorm_generate_metadata._main(['--dataset_url', 'file://{}'.format(dataset_copy)])

    # Rowgroup-selector reads should work again since the indexes were part
    # of the original metadata.
    _check_reader(dataset_copy, SingleIndexSelector(TestSchema.id.name, [2, 18]))