def test_regenerate_metadata(synthetic_dataset, tmpdir):
    """Regenerate metadata from an explicit unischema class after the common metadata file is removed.

    All destructive steps run against a copy of ``synthetic_dataset`` created under ``tmpdir``.
    """
    copy_root = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, copy_root)

    # Sanity check: the freshly copied dataset is readable
    _check_reader(copy_root)

    # Remove only the common metadata file of the copy
    parquet_dataset = pq.ParquetDataset(copy_root)
    os.remove(parquet_dataset.common_metadata_path)

    # With that file gone, reading is expected to fail with PetastormMetadataError
    with pytest.raises(PetastormMetadataError):
        _check_reader(copy_root)

    # Rebuild the metadata, passing the unischema class explicitly
    regenerate_args = [
        '--dataset_url', 'file://{}'.format(copy_root),
        '--unischema_class', 'petastorm.tests.test_common.TestSchema',
    ]
    petastorm_generate_metadata._main(regenerate_args)

    # The copy must be readable again. No row group selector is exercised here
    # (original note: selectors are not expected to work after the metadata was removed).
    _check_reader(copy_root)
def test_regenerate_using_row_group_summary_metadata(synthetic_dataset, tmpdir):
    """Regenerate metadata with ``--use-summary-metadata`` and read the result through a row group selector.

    The regeneration runs against a copy of ``synthetic_dataset`` created under ``tmpdir``;
    no metadata file is deleted beforehand.
    """
    copy_root = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, copy_root)

    # Sanity check: the freshly copied dataset is readable
    _check_reader(copy_root)

    # Regenerate in summary-metadata mode; no unischema class is passed on the command line
    regenerate_args = ['--dataset_url', 'file://{}'.format(copy_root), '--use-summary-metadata']
    petastorm_generate_metadata._main(regenerate_args)

    # After regeneration the reopened dataset must expose (truthy) metadata
    regenerated = pq.ParquetDataset(copy_root)
    assert regenerated.metadata

    # Reading through a row group selector on the id field must work as well
    id_selector = SingleIndexSelector(TestSchema.id.name, [2, 18])
    _check_reader(copy_root, id_selector)
def test_cannot_find_unischema(synthetic_dataset, tmpdir):
    """Metadata regeneration without a unischema class raises ValueError once both metadata files are removed.

    All destructive steps run against a copy of ``synthetic_dataset`` created under ``tmpdir``.
    """
    copy_root = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, copy_root)

    # Sanity check: the freshly copied dataset is readable
    _check_reader(copy_root)

    # Remove both the metadata file and the common metadata file of the copy
    parquet_dataset = pq.ParquetDataset(copy_root)
    os.remove(parquet_dataset.metadata_path)
    os.remove(parquet_dataset.common_metadata_path)

    # Reading the stripped copy is expected to raise ValueError
    with pytest.raises(ValueError):
        _check_reader(copy_root)

    # Regeneration is given only the dataset url (no --unischema_class) and is expected to raise ValueError too
    regenerate_args = ['--dataset_url', 'file://{}'.format(copy_root)]
    with pytest.raises(ValueError):
        petastorm_generate_metadata._main(regenerate_args)
def test_regenerate_row_group_metadata(synthetic_dataset, tmpdir):
    """Regenerate the metadata file while the common metadata file is still in place.

    All destructive steps run against a copy of ``synthetic_dataset`` created under ``tmpdir``.
    """
    copy_root = tmpdir.join('moved').strpath
    copytree(synthetic_dataset.path, copy_root)

    # Sanity check: the freshly copied dataset is readable
    _check_reader(copy_root)

    # Remove only the metadata file; the common metadata file is left untouched
    parquet_dataset = pq.ParquetDataset(copy_root)
    os.remove(parquet_dataset.metadata_path)

    # Reading the copy is now expected to raise ValueError
    with pytest.raises(ValueError):
        _check_reader(copy_root)

    # Regenerate with the dataset url only; no unischema class is passed on the command line
    regenerate_args = ['--dataset_url', 'file://{}'.format(copy_root)]
    petastorm_generate_metadata._main(regenerate_args)

    # Reading through a row group selector on the id field must work again
    id_selector = SingleIndexSelector(TestSchema.id.name, [2, 18])
    _check_reader(copy_root, id_selector)