def test_raises_on_invalid_input(store_factory, bound_update_dataset):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions, store=store_factory, dataset_uuid=dataset_uuid
    )

    with pytest.raises(Exception):
        new_partitions = [({"stuff"}, [("something", {1, 2, 3})])]  # invalid format
        bound_update_dataset(
            new_partitions, store=store_factory, dataset_uuid=dataset_uuid
        )

    # Check that no new partitions have been written to storage
    mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid)
    assert len(mps) == len(dataset.partitions)
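
# For reference (illustrative only, not part of the test): a structurally
# valid update payload mirrors the partition dicts passed to
# `store_dataframes_as_dataset` above, e.g.:
#
#     new_partitions = [
#         {"label": "cluster_3", "data": [("core", pd.DataFrame({"p": [5, 6]}))]}
#     ]
#
# The exact set of accepted shorthands is determined by
# `parse_input_to_metapartition`.
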
def test_add_column_to_existing_index(
    store_factory, metadata_version, bound_build_dataset_indices
):
    dataset_uuid = "dataset_uuid"
    partitions = [
        {
            "label": "cluster_1",
            "data": [("core", pd.DataFrame({"p": [1, 2], "x": [100, 4500]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={1: ["cluster_1"], 2: ["cluster_1"]}
                )
            },
        },
        {
            "label": "cluster_2",
            "data": [("core", pd.DataFrame({"p": [4, 3], "x": [500, 10]}))],
            "indices": {
                "p": ExplicitSecondaryIndex(
                    "p", index_dct={4: ["cluster_2"], 3: ["cluster_2"]}
                )
            },
        },
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=metadata_version,
    )
    assert dataset.load_all_indices(store=store_factory()).indices.keys() == {"p"}

    # Create indices
    bound_build_dataset_indices(store_factory, dataset_uuid, columns=["x"])

    # Assert indices are properly created
    mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid)
    for column_name in ["p", "x"]:
        assert all(mp.indices[column_name] for mp in mps)

    dataset_factory = DatasetFactory(dataset_uuid, store_factory, load_all_indices=True)
    assert dataset_factory.indices.keys() == {"p", "x"}
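
# Usage sketch (not part of the test): once the "x" index exists, reads can
# push predicates down against it. Assuming `read_table` from
# `kartothek.io.eager` is in scope:
#
#     df = read_table(
#         store=store_factory,
#         dataset_uuid=dataset_uuid,
#         predicates=[[("x", "==", 100)]],
#     )
#
# should only load partitions whose "x" index contains the value 100.
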
def test_metadata_version(
    store_factory,
    bound_update_dataset,
    mock_default_metadata_version,
    backend_identifier,
):
    if backend_identifier in ("dask.dataframe", "dask.delayed"):
        pytest.skip()  # TODO: fix `io.dask.*.test_update._update_dataset`

    dataset_uuid = "dataset_uuid"
    partitions = [
        {"label": "cluster_1", "data": [("core", pd.DataFrame({"p": [1, 2]}))]},
        {"label": "cluster_2", "data": [("core", pd.DataFrame({"p": [2, 3]}))]},
    ]
    dataset = store_dataframes_as_dataset(
        dfs=partitions,
        store=store_factory,
        dataset_uuid=dataset_uuid,
        metadata_version=DEFAULT_METADATA_VERSION,
    )

    with pytest.raises(AssertionError, match="Traversed through mock"):
        # Try to commit data to the dataset using a different metadata version
        # and a different data format (the format is mocked).
        # This does not raise when the `parse_input_to_metapartition`
        # argument is `default_metadata_version` instead of `metadata_version`.
        new_partitions = ("core", pd.DataFrame({"p": [2, 3]}))
        bound_update_dataset(
            new_partitions,
            store=store_factory,
            dataset_uuid=dataset_uuid,
            default_metadata_version=mock_default_metadata_version,
        )

    mps = read_dataset_as_metapartitions(store=store_factory, dataset_uuid=dataset_uuid)
    assert len(mps) == len(dataset.partitions)