def test_write_single_partition(store_factory, mock_uuid, metadata_version):
    """Write one partition against a pre-created two-table header and verify
    both the new store keys and the returned ``MetaPartition``."""
    # Create the (empty) dataset header the partition will be written against.
    create_empty_dataset_header(
        store=store_factory(),
        table_meta={
            "table1": pd.DataFrame({"col": [1]}),
            "table2": pd.DataFrame({"other_col": ["a"]}),
        },
        dataset_uuid="some_dataset",
        metadata_version=metadata_version,
    )
    payload = {
        "data": {
            "table1": pd.DataFrame({"col": [1, 2]}),
            "table2": pd.DataFrame({"other_col": ["a", "b"]}),
        }
    }
    expected_keys = set(store_factory().keys())
    mp = write_single_partition(
        store=store_factory, dataset_uuid="some_dataset", data=payload
    )
    # Exactly one new parquet file per table should have appeared.
    expected_keys |= {
        "some_dataset/table1/auto_dataset_uuid.parquet",
        "some_dataset/table2/auto_dataset_uuid.parquet",
    }
    assert set(store_factory().keys()) == expected_keys
    reference_mp = MetaPartition(
        # this will be a hash of the input
        label="auto_dataset_uuid",
        files={
            "table1": "some_dataset/table1/auto_dataset_uuid.parquet",
            "table2": "some_dataset/table2/auto_dataset_uuid.parquet",
        },
        metadata_version=4,
        table_meta={
            "table1": make_meta(pd.DataFrame({"col": [1, 2]}), origin="table1"),
            "table2": make_meta(
                pd.DataFrame({"other_col": ["a", "b"]}), origin="table2"
            ),
        },
    )
    assert mp == reference_mp
    with pytest.raises(ValueError):
        # col is an integer column so this is incompatible.
        payload["data"]["table1"] = pd.DataFrame(
            {"col": [datetime.date(2010, 1, 1)]}
        )
        write_single_partition(
            store=store_factory, dataset_uuid="some_dataset", data=payload
        )
def test_initial_commit_table_meta(store):
    """First commit into an empty dataset header created via ``table_meta``.

    NOTE(review): renamed from ``test_initial_commit`` — a later test in this
    module reuses that exact name, and Python's last-definition-wins shadowing
    meant pytest silently never collected this one.
    """
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    dataset = create_empty_dataset_header(
        store=store,
        table_meta={"core": make_meta(df, origin="1")},
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    # A freshly created header carries no partitions yet.
    assert dataset.explicit_partitions is False
    new_data = {"data": {"core": df}}
    new_metapartition = write_single_partition(
        store=store, dataset_uuid=dataset.uuid, data=new_data
    )
    new_partition = [{"label": new_metapartition.label, "data": [("core", None)]}]
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    # Commit must flip the dataset into the explicit-partitions state.
    assert updated_dataset.explicit_partitions is True
    actual = read_table(store=store, table="core", dataset_uuid=updated_dataset.uuid)
    # read_table returns columns in alphabetical order.
    df_expected = pd.DataFrame(OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))
    assert_frame_equal(df_expected, actual)
def test_commit_dataset_from_nested_metapartition(store):
    """
    Check it is possible to use `commit_dataset` with nested metapartitions as
    input.

    Original issue: https://github.com/JDASoftwareGroup/kartothek/issues/40
    """
    df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]})
    create_empty_dataset_header(
        store=store,
        dataset_uuid="uuid",
        schema=make_meta(df, "table", ["a"]),
        partition_on=["a"],
    )
    # Write the same frame twice; every call yields a nested metapartition.
    partitions = [
        write_single_partition(
            store=store,
            dataset_uuid="uuid",
            data=df,
            partition_on=["a"],
        )
        for _ in range(2)
    ]
    # Flatten one nesting level to collect every inner partition label.
    partition_labels = {inner.label for outer in partitions for inner in outer}
    dm = commit_dataset(
        store=store,
        dataset_uuid="uuid",
        new_partitions=partitions,
        partition_on=["a"],
    )
    assert dm.partitions.keys() == partition_labels
def test_initial_commit(store):
    """First commit of a single metapartition into an empty dataset header."""
    dataset_uuid = "dataset_uuid"
    df = pd.DataFrame(OrderedDict([("P", [5]), ("L", [5]), ("TARGET", [5])]))
    empty_dataset = create_empty_dataset_header(
        store=store,
        schema=make_meta(df, origin="1"),
        dataset_uuid=dataset_uuid,
        metadata_version=4,
    )
    # Before any commit the header must report no explicit partitions.
    assert empty_dataset.explicit_partitions is False
    mp = write_single_partition(
        store=store, dataset_uuid=empty_dataset.uuid, data=df
    )
    committed = commit_dataset(
        store=store,
        dataset_uuid=empty_dataset.uuid,
        # FIXME: is this breaking and if so, is it expected?
        new_partitions=[mp],
        delete_scope=None,
        partition_on=None,
    )
    assert committed.explicit_partitions is True
    actual = read_table(store=store, dataset_uuid=committed.uuid)
    # Columns come back in alphabetical order.
    expected = pd.DataFrame(OrderedDict([("L", [5]), ("P", [5]), ("TARGET", [5])]))
    assert_frame_equal(expected, actual)
def test_create_empty_header_from_pyarrow_schema(store_factory):
    """A header created from a raw pyarrow schema accepts committed partitions."""
    # GH228
    frame = pd.DataFrame(
        [{"part": 1, "id": 1, "col1": "abc"}, {"part": 2, "id": 2, "col1": np.nan}]
    )
    dataset_uuid = "sample_ds"
    dm = create_empty_dataset_header(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        table_meta={"table": pa.Schema.from_pandas(frame)},
        partition_on=["part"],
    )
    first_part = frame.loc[frame["part"] == 1]
    new_partitions = [
        write_single_partition(
            store=store_factory,
            dataset_uuid=dataset_uuid,
            data=[{"table": first_part}],
            partition_on=["part"],
        )
    ]
    # The header alone holds no partitions; only the commit registers them.
    assert len(dm.partitions) == 0
    dm = commit_dataset(
        store=store_factory,
        dataset_uuid=dataset_uuid,
        new_partitions=new_partitions,
        partition_on=["part"],
    )
    assert len(dm.partitions) == 1
def test_commit_dataset_from_metapartition(dataset_function, store):
    """Append a freshly written partition to an existing dataset and verify that
    the committed metadata is persisted and the new rows are readable."""
    new_data = [
        pd.DataFrame(
            OrderedDict([
                ("P", [5]),
                ("L", [5]),
                ("TARGET", [5]),
                ("DATE", [datetime.date(2016, 3, 23)]),
            ]))
    ]
    # Write the partition's files; this must NOT yet alter the dataset header.
    new_partition = write_single_partition(store=store,
                                           dataset_uuid=dataset_function.uuid,
                                           data=new_data)
    pre_commit_dataset = DatasetMetadata.load_from_store(
        uuid=dataset_function.uuid, store=store)
    # Cannot assert equal since the metadata is differently ordered
    # NOTE(review): the comment above contradicts the equality assertion below;
    # presumably the intent is that writing alone leaves the stored header
    # unchanged — confirm and clean up the wording.
    assert pre_commit_dataset == dataset_function
    updated_dataset = commit_dataset(
        store=store,
        dataset_uuid=dataset_function.uuid,
        new_partitions=new_partition,
        delete_scope=None,
        partition_on=None,
    )
    # The commit produces a new dataset state with exactly one extra partition.
    assert updated_dataset != dataset_function
    assert updated_dataset.uuid == dataset_function.uuid
    assert len(
        updated_dataset.partitions) == len(dataset_function.partitions) + 1
    # ensure that the new dataset is actually the one on disc
    loaded_dataset = DatasetMetadata.load_from_store(uuid=updated_dataset.uuid,
                                                     store=store)
    assert loaded_dataset == updated_dataset
    # Read the data and check whether the rows above are included.
    # This checks whether all necessary informations were updated in the header
    # (e.g. files attributes of the partitions)
    actual = read_table(store=store, dataset_uuid=dataset_function.uuid)
    df_expected = pd.DataFrame(
        OrderedDict([
            (
                "DATE",
                [
                    datetime.date(2016, 3, 23),
                    datetime.date(2010, 1, 1),
                    datetime.date(2009, 12, 31),
                ],
            ),
            ("L", [5, 1, 2]),
            ("P", [5, 1, 2]),
            ("TARGET", [5, 1, 2]),
        ]))
    # Sort to make the row order deterministic before comparing.
    actual = actual.sort_values("DATE", ascending=False).reset_index(drop=True)
    assert_frame_equal(df_expected, actual)
def test_write_single_partition_different_partitioning(store_factory):
    """``write_single_partition`` must reject a ``partition_on`` that conflicts
    with the dataset's existing partitioning, and default to it when omitted."""
    df = pd.DataFrame(
        OrderedDict([("location", ["0", "1", "2"]), ("other", ["a", "a", "a"])])
    )
    input_ = [
        {
            "label": "label",
            "data": [("order_proposals", df)],
            "indices": {
                "location": {k: ["label"] for k in df["location"].unique()}
            },
        }
    ]
    dataset = store_dataframes_as_dataset(
        dfs=input_,
        store=store_factory,
        dataset_uuid="dataset_uuid",
        metadata_version=4,
        partition_on=["other"],
    )
    new_data = {
        "data": {
            "order_proposals": pd.DataFrame(
                OrderedDict(
                    [("other", ["b", "b", "b"]), ("location", ["0", "1", "2"])]
                )
            )
        }
    }

    def key_count():
        # Count every object currently present in the backing store.
        return len(list(store_factory().keys()))

    initial_keys = key_count()
    # A conflicting partition column is rejected and leaves the store untouched.
    with pytest.raises(ValueError):
        write_single_partition(
            store=store_factory,
            dataset_uuid=dataset.uuid,
            data=new_data,
            partition_on="location",
        )
    assert key_count() == initial_keys
    # The matching partition column writes exactly one new file.
    write_single_partition(
        store=store_factory,
        dataset_uuid=dataset.uuid,
        data=new_data,
        partition_on=["other"],
    )
    assert key_count() == initial_keys + 1
    new_data["label"] = "some_other_label"
    # If no partitioning is given, it will be determined based on the existing dataset
    write_single_partition(
        store=store_factory, dataset_uuid=dataset.uuid, data=new_data
    )
    assert key_count() == initial_keys + 2