示例#1
0
def test_regenerate_metadata(synthetic_dataset, tmpdir):
    """Metadata can be regenerated with an explicit unischema class.

    A copy of the synthetic dataset has the file at ``common_metadata_path``
    removed, reading it is expected to fail with ``PetastormMetadataError``,
    and the metadata generator is then run with ``--unischema_class`` before
    the copy is read once more.
    """
    dataset_copy = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, dataset_copy)

    # Sanity check: the untouched copy is readable.
    _check_reader(dataset_copy)

    # Remove the common metadata file from the copy (only that one file).
    common_metadata_file = pq.ParquetDataset(dataset_copy).common_metadata_path
    os.remove(common_metadata_file)

    # Reading is now expected to fail with PetastormMetadataError.
    with pytest.raises(PetastormMetadataError):
        _check_reader(dataset_copy)

    # Rebuild the metadata, naming the unischema class by its dotted path.
    generator_args = [
        '--dataset_url', 'file://{}'.format(dataset_copy),
        '--unischema_class', 'petastorm.tests.test_common.TestSchema',
    ]
    petastorm_generate_metadata._main(generator_args)

    # The copy should be readable again; no row-group selector is used here.
    _check_reader(dataset_copy)
示例#2
0
def test_regenerate_using_row_group_summary_metadata(synthetic_dataset, tmpdir):
    """Regenerating with ``--use-summary-metadata`` leaves a readable dataset.

    No metadata file is deleted in this test: the generator is run directly
    against a copy of the synthetic dataset, the reopened dataset is asserted
    to have truthy ``metadata``, and the copy is then read with a
    ``SingleIndexSelector``.
    """
    dataset_copy = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, dataset_copy)

    # Sanity check: the untouched copy is readable.
    _check_reader(dataset_copy)

    # Run the generator in summary-metadata mode; no unischema class is passed.
    generator_args = ['--dataset_url', 'file://{}'.format(dataset_copy), '--use-summary-metadata']
    petastorm_generate_metadata._main(generator_args)

    # Reopen the dataset: its ``metadata`` attribute must be present (truthy).
    reopened = pq.ParquetDataset(dataset_copy)
    assert reopened.metadata

    # Reading with a row-group selector on the id field should work.
    id_selector = SingleIndexSelector(TestSchema.id.name, [2, 18])
    _check_reader(dataset_copy, id_selector)
示例#3
0
def test_cannot_find_unischema(synthetic_dataset, tmpdir):
    """Regeneration fails when no unischema is available.

    Both files reported by ``metadata_path`` and ``common_metadata_path`` are
    removed from a copy of the synthetic dataset. Reading the copy and running
    the metadata generator without ``--unischema_class`` are each expected to
    raise ``ValueError``.
    """
    dataset_copy = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, dataset_copy)

    # Sanity check: the untouched copy is readable.
    _check_reader(dataset_copy)

    # Remove both metadata files from the copy.
    parquet_dataset = pq.ParquetDataset(dataset_copy)
    summary_file = parquet_dataset.metadata_path
    os.remove(summary_file)
    common_file = parquet_dataset.common_metadata_path
    os.remove(common_file)

    # Reading the stripped copy must fail.
    with pytest.raises(ValueError):
        _check_reader(dataset_copy)

    # The generator is given only the dataset URL, so it must fail as well.
    generator_args = ['--dataset_url', 'file://{}'.format(dataset_copy)]
    with pytest.raises(ValueError):
        petastorm_generate_metadata._main(generator_args)
示例#4
0
def test_regenerate_row_group_metadata(synthetic_dataset, tmpdir):
    """The metadata file can be regenerated while common metadata is kept.

    Only the file at ``metadata_path`` is removed from a copy of the synthetic
    dataset. Reading is expected to raise ``ValueError`` until the generator
    has been run with just the dataset URL; afterwards the copy is read with a
    ``SingleIndexSelector``.
    """
    dataset_copy = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, dataset_copy)

    # Sanity check: the untouched copy is readable.
    _check_reader(dataset_copy)

    # Remove only the metadata file; the common metadata file is left in place.
    metadata_file = pq.ParquetDataset(dataset_copy).metadata_path
    os.remove(metadata_file)

    # Reading must fail while the metadata file is missing.
    with pytest.raises(ValueError):
        _check_reader(dataset_copy)

    # Regenerate without naming a unischema class.
    generator_args = ['--dataset_url', 'file://{}'.format(dataset_copy)]
    petastorm_generate_metadata._main(generator_args)

    # Reading with a row-group selector on the id field should work again.
    id_selector = SingleIndexSelector(TestSchema.id.name, [2, 18])
    _check_reader(dataset_copy, id_selector)