import datetime
from collections import OrderedDict

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
from pandas.testing import assert_frame_equal

from kartothek.core.common_metadata import make_meta, read_schema_metadata
from kartothek.core.dataset import DatasetMetadata
from kartothek.io.eager import (
    commit_dataset,
    create_empty_dataset_header,
    read_table,
    write_single_partition,
)
from kartothek.io_components.metapartition import MetaPartition


def test_commit_dataset_from_nested_metapartition(store):
    """
    Check it is possible to use `commit_dataset` with nested metapartitions as input.

    Original issue: https://github.com/JDASoftwareGroup/kartothek/issues/40
    """
    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})

    create_empty_dataset_header(
        store=store,
        dataset_uuid="uuid",
        schema=make_meta(df, "table", ["a"]),
        partition_on=["a"],
    )

    partitions = []
    for x in range(2):
        partitions.append(
            write_single_partition(
                store=store, dataset_uuid="uuid", data=df, partition_on=["a"]
            )
        )

    # Each write returns a (possibly nested) MetaPartition; collect the labels
    # of all sub-partitions to compare against the committed dataset.
    partition_labels = {mp_.label for mp in partitions for mp_ in mp}

    dm = commit_dataset(
        store=store, dataset_uuid="uuid", new_partitions=partitions, partition_on=["a"]
    )
    assert dm.partitions.keys() == partition_labels

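# The tests in this file rely on fixtures provided by the surrounding test
# suite's conftest (`store`, `store_factory`, `mock_uuid`, `metadata_version`,
# `metadata_storage_format`, `frozen_time`). The sketch below is NOT the actual
# conftest; it only illustrates one way the store fixtures could be provided,
# assuming storefact is available for creating simplekv-compatible stores.

import pytest
import storefact  # assumed dependency for this sketch only


@pytest.fixture
def store_factory(tmpdir):
    # A factory that always returns a handle to the same on-disk store, so
    # repeated `store_factory()` calls within one test see identical contents.
    path = str(tmpdir)

    def factory():
        return storefact.get_store_from_url("hfs://{}".format(path))

    return factory


@pytest.fixture
def store(store_factory):
    # A single store instance, for tests that take `store` directly.
    return store_factory()
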
def test_write_single_partition(store_factory, mock_uuid, metadata_version):
    create_empty_dataset_header(
        store=store_factory(),
        table_meta={
            "table1": pd.DataFrame({"col": [1]}),
            "table2": pd.DataFrame({"other_col": ["a"]}),
        },
        dataset_uuid="some_dataset",
        metadata_version=metadata_version,
    )

    new_data = {
        "data": {
            "table1": pd.DataFrame({"col": [1, 2]}),
            "table2": pd.DataFrame({"other_col": ["a", "b"]}),
        }
    }
    keys_in_store = set(store_factory().keys())
    new_mp = write_single_partition(
        store=store_factory, dataset_uuid="some_dataset", data=new_data
    )

    keys_in_store.add("some_dataset/table1/auto_dataset_uuid.parquet")
    keys_in_store.add("some_dataset/table2/auto_dataset_uuid.parquet")
    assert set(store_factory().keys()) == keys_in_store

    expected_mp = MetaPartition(
        label="auto_dataset_uuid",  # this will be a hash of the input
        files={
            "table1": "some_dataset/table1/auto_dataset_uuid.parquet",
            "table2": "some_dataset/table2/auto_dataset_uuid.parquet",
        },
        metadata_version=4,
        table_meta={
            "table1": make_meta(pd.DataFrame({"col": [1, 2]}), origin="table1"),
            "table2": make_meta(pd.DataFrame({"other_col": ["a", "b"]}), origin="table2"),
        },
    )

    assert new_mp == expected_mp

    with pytest.raises(ValueError):
        # col is an integer column, so this is incompatible.
        new_data["data"]["table1"] = pd.DataFrame({"col": [datetime.date(2010, 1, 1)]})
        write_single_partition(
            store=store_factory, dataset_uuid="some_dataset", data=new_data
        )

def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        table_meta={"core": make_meta(df, origin="1")},
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False

    new_data = {"data": {"core": df}}
    new_metapartition = write_single_partition(
        store=store, dataset_uuid=dataset.uuid, data=new_data
    )
    new_partition = [{"label": new_metapartition.label, "data": [("core", None)]}]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True

    actual = read_table(store=store, table="core", dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))
    assert_frame_equal(df_expected, actual)

def test_create_dataset_header(store, metadata_storage_format, frozen_time):
    table_meta = {"table": make_meta(pd.DataFrame({"col": [1]}), origin="1")}
    new_dataset = create_empty_dataset_header(
        store=store,
        table_meta=table_meta,
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

    expected_dataset = DatasetMetadata(
        uuid="new_dataset_uuid",
        metadata_version=4,
        explicit_partitions=False,
        table_meta=table_meta,
    )
    assert new_dataset == expected_dataset

    storage_keys = list(store.keys())
    assert len(storage_keys) == 2

    loaded = DatasetMetadata.load_from_store(store=store, uuid="new_dataset_uuid")
    assert loaded == expected_dataset

    # If this read succeeds, the schema was written to the store.
    read_schema_metadata(dataset_uuid=new_dataset.uuid, store=store, table="table")

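# `metadata_storage_format` and `frozen_time` above are likewise conftest
# fixtures. A plausible sketch follows (not the actual kartothek conftest): the
# former parametrizes over the serialization formats used for the dataset
# metadata file, the latter pins the clock so that creation timestamps are
# deterministic. The parameter values and the freezegun dependency are
# assumptions.

import freezegun


@pytest.fixture(params=["json", "msgpack"])
def metadata_storage_format(request):
    return request.param


@pytest.fixture
def frozen_time():
    with freezegun.freeze_time("2000-01-01 01:01:01"):
        yield
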
def test_create_dataset_header_minimal_version(store, metadata_storage_format):
    with pytest.raises(NotImplementedError):
        create_empty_dataset_header(
            store=store,
            table_meta={"table": pd.DataFrame({"col": [1]})},
            dataset_uuid="new_dataset_uuid",
            metadata_storage_format=metadata_storage_format,
            metadata_version=3,
        )

    create_empty_dataset_header(
        store=store,
        table_meta={"table": pd.DataFrame({"col": [1]})},
        dataset_uuid="new_dataset_uuid",
        metadata_storage_format=metadata_storage_format,
        metadata_version=4,
    )

def test_initial_commit(store):
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        schema=make_meta(df, origin="1"),
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    assert dataset.explicit_partitions is False

    new_metapartition = write_single_partition(
        store=store, dataset_uuid=dataset.uuid, data=df
    )
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        # FIXME: is this breaking and if so, is it expected?
        new_partitions=[new_metapartition],
        delete_scope=None,
        partition_on=None,
    )
    assert updated_dataset.explicit_partitions is True

    actual = read_table(store=store, dataset_uuid=updated_dataset.uuid)
    df_expected = pd.DataFrame(OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))
    assert_frame_equal(df_expected, actual)

def test_create_empty_header_from_pyarrow_schema(store_factory):
    # GH228
    df = pd.DataFrame(
        [{"part": 1, "id": 1, "col1": "abc"}, {"part": 2, "id": 2, "col1": np.nan}]
    )
    dataset_uuid = "sample_ds"
    schema = pa.Schema.from_pandas(df)
    dm = create_empty_dataset_header(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_meta={"table": schema},
        partition_on=["part"],
    )
    new_partitions = [
        write_single_partition(
            store=store_factory,
            dataset_uuid=dataset_uuid,
            data=[{"table": df.loc[df["part"] == 1]}],
            partition_on=["part"],
        )
    ]
    assert len(dm.partitions) == 0

    dm = commit_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        new_partitions=new_partitions,
        partition_on=["part"],
    )
    assert len(dm.partitions) == 1
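

# Hypothetical follow-up (not part of the original test): after the commit, the
# partition column "part" should reappear as a regular column when the dataset
# is read back. This uses the multi-table `read_table` signature exercised
# elsewhere in this file; the helper name and the deliberately loose assertions
# are illustrative only.
def _read_back_sample_ds(store_factory):
    df_roundtrip = read_table(
        store=store_factory, dataset_uuid="sample_ds", table="table"
    )
    assert "part" in df_roundtrip.columns
    assert len(df_roundtrip) == 1  # only the `part == 1` partition was written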