def test_partition_on_raises_no_cols_left(empty):
    original_df = pd.DataFrame({"test": [1, 2, 3]})
    if empty:
        original_df = original_df.loc[[]]
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    with pytest.raises(ValueError) as e:
        mp.partition_on(["test"])
    assert str(e.value) == "No data left to save outside partition columns"
def test_partition_on_raises_pocols_missing(empty):
    original_df = pd.DataFrame({"test": [1, 2, 3]})
    if empty:
        original_df = original_df.loc[[]]
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    with pytest.raises(ValueError) as e:
        mp.partition_on(["test", "foo", "bar"])
    assert str(e.value) == "Partition column(s) missing: bar, foo"
def test_partition_on_scalar_intermediate(df_not_nested):
    """
    Test against a bug where grouping leaves a scalar value
    """
    assert len(df_not_nested) == 1
    mp = MetaPartition(
        label="somelabel", data={"table": df_not_nested}, metadata_version=4
    )
    for col in df_not_nested:
        if col == "byte":
            with pytest.raises(UnicodeDecodeError):
                mp.partition_on(col)
        else:
            new_mp = mp.partition_on(col)
            assert len(new_mp) == 1
def test_partition_on_one_level_ts():
    original_df = pd.DataFrame(
        {
            "test": [
                pd.Timestamp("2001-01-01"),
                pd.Timestamp("2001-01-02"),
                pd.Timestamp("2001-01-03"),
            ],
            "some_values": [1, 2, 3],
        }
    )
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    new_mp = mp.partition_on(["test"])
    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view
        # try to be agnostic about the order
        assert len(df) == 1
        assert "test" not in df

    expected_labels = set(
        [
            "test=2001-01-01%2000%3A00%3A00/label_1",
            "test=2001-01-02%2000%3A00%3A00/label_1",
            "test=2001-01-03%2000%3A00%3A00/label_1",
        ]
    )
    assert labels == expected_labels
def test_partition_urlencode():
    original_df = pd.DataFrame({"ÖŒå": [1, 2, 3], "some_values": [1, 2, 3]})
    mp = MetaPartition(label="label_1", data=original_df, metadata_version=4)
    new_mp = mp.partition_on(["ÖŒå"])
    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view
        # try to be agnostic about the order
        assert len(df) == 1
        assert "ÖŒå" not in df

    expected_labels = set(
        [
            "%C3%96%C5%92%C3%A5=1/label_1",
            "%C3%96%C5%92%C3%A5=2/label_1",
            "%C3%96%C5%92%C3%A5=3/label_1",
        ]
    )
    assert labels == expected_labels
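# Hedged aside: the expected labels in the two tests above are plain
# percent-encoded UTF-8. A standalone stdlib check of that assumption follows
# (kartothek's own quoting helper is assumed to behave like urllib here);
# the underscore prefix keeps pytest from collecting the sketch.
def _sketch_partition_label_percent_encoding():
    from urllib.parse import quote

    # "ÖŒå" is three non-ASCII characters, each two bytes in UTF-8
    assert quote("ÖŒå", safe="") == "%C3%96%C5%92%C3%A5"
    # timestamps render with encoded spaces (%20) and colons (%3A)
    assert quote("2001-01-01 00:00:00", safe="") == "2001-01-01%2000%3A00%3A00"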
def test_partition_on_one_level():
    original_df = pd.DataFrame({"test": [1, 2, 3], "some_values": [1, 2, 3]})
    mp = MetaPartition(
        label="label_1",
        files={"core": "file"},
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )
    new_mp = mp.partition_on(["test"])
    assert len(new_mp.metapartitions) == 3

    labels = set()
    for mp in new_mp:
        labels.add(mp.label)
        assert len(mp.data) == 1
        assert "core" in mp.data
        df = mp.data["core"]
        assert df._is_view
        # try to be agnostic about the order
        assert len(df) == 1
        assert "test" not in df

    expected_labels = set(["test=1/label_1", "test=2/label_1", "test=3/label_1"])
    assert labels == expected_labels
def test_partition_on_roundtrip(store):
    original_df = pd.DataFrame(
        OrderedDict([("test", [1, 2, 3]), ("some_values", [1, 2, 3])])
    )
    mp = MetaPartition(label="label_1", data=original_df, metadata_version=4)

    new_mp = mp.partition_on(["test"])
    new_mp = new_mp.store_dataframes(store=store, dataset_uuid="some_uuid")
    store_schema_metadata(new_mp.schema, "some_uuid", store)
    # Test immediately after dropping and later once with new metapartition
    # to check table meta reloading
    new_mp = new_mp.load_dataframes(store=store)
    assert len(new_mp.metapartitions) == 3
    dfs = []
    for internal_mp in new_mp:
        dfs.append(internal_mp.data)
    actual_df = pd.concat(dfs).sort_values(by="test").reset_index(drop=True)
    pdt.assert_frame_equal(original_df, actual_df)

    for i in range(1, 4):
        # Check with fresh metapartitions
        new_mp = MetaPartition(
            label=f"test={i}/label_1",
            file=f"some_uuid/table/test={i}/label_1.parquet",
            metadata_version=4,
        )
        new_mp = new_mp.load_dataframes(store=store)

        actual_df = new_mp.data
        expected_df = pd.DataFrame(OrderedDict([("test", [i]), ("some_values", [i])]))
        pdt.assert_frame_equal(expected_df, actual_df)
def test_partition_on_with_primary_index(df_not_nested):
    mp = MetaPartition(
        label="pkey=1/base_label",
        data=df_not_nested,
        partition_keys=["pkey"],
        metadata_version=4,
    )
    new = mp.partition_on(["pkey", "int64"])
    split_label = new.label.split("/")
    assert len(split_label) == 3
    assert split_label[0] == "pkey=1"
    assert split_label[1] == "int64=1"
    assert split_label[2] == "base_label"

    # partitioning on the existing primary key is a no-op
    assert mp == mp.partition_on(["pkey"])
def test_partition_on_scalar_intermediate(df_not_nested, col):
    """
    Test against a bug where grouping leaves a scalar value
    """
    assert len(df_not_nested) == 1
    mp = MetaPartition(label="somelabel", data=df_not_nested, metadata_version=4)
    new_mp = mp.partition_on(col)
    assert len(new_mp) == 1
def test_partition_on_explicit_index():
    original_df = pd.DataFrame(
        {
            "level1": [1, 2, 1, 2, 1, 2],
            "level2": [1, 1, 1, 2, 2, 2],
            "explicit_index_col": np.arange(0, 6),
        }
    )
    mp = MetaPartition(
        label="label_1",
        file="file",
        data=original_df,
        indices={
            "explicit_index_col": {value: ["label_1"] for value in np.arange(0, 6)}
        },
        metadata_version=4,
    )
    new_mp = mp.partition_on(["level1", "level2"])
    assert len(new_mp) == 4

    expected_indices = {
        "explicit_index_col": ExplicitSecondaryIndex(
            "explicit_index_col",
            {0: ["level1=1/level2=1/label_1"], 2: ["level1=1/level2=1/label_1"]},
        )
    }
    assert expected_indices == new_mp["level1=1/level2=1/label_1"].indices

    expected_indices = {
        "explicit_index_col": ExplicitSecondaryIndex(
            "explicit_index_col", {4: ["level1=1/level2=2/label_1"]}
        )
    }
    assert expected_indices == new_mp["level1=1/level2=2/label_1"].indices

    expected_indices = {
        "explicit_index_col": ExplicitSecondaryIndex(
            "explicit_index_col", {1: ["level1=2/level2=1/label_1"]}
        )
    }
    assert expected_indices == new_mp["level1=2/level2=1/label_1"].indices

    expected_indices = {
        "explicit_index_col": ExplicitSecondaryIndex(
            "explicit_index_col",
            {3: ["level1=2/level2=2/label_1"], 5: ["level1=2/level2=2/label_1"]},
        )
    }
    assert expected_indices == new_mp["level1=2/level2=2/label_1"].indices
def test_partition_on_valid_schemas():
    """
    Ensure that partitioning is possible even if the output schemas of the
    sub partitions may be different
    """
    df = pd.DataFrame({"partition_col": [0, 1], "values": [None, "str"]})
    mp = MetaPartition(label="base_label", data=df, metadata_version=4)
    mp = mp.partition_on(["partition_col"])
    assert len(mp) == 2

    expected_meta = make_meta(df, origin="1", partition_keys="partition_col")
    assert mp.schema == expected_meta
def test_partition_on_keeps_table_name():
    mp = MetaPartition(
        label="label_1",
        data=pd.DataFrame({"P": [1, 2, 1, 2], "L": [1, 1, 2, 2]}),
        table_name="non-default-name",
    )
    repartitioned_mp = mp.partition_on(["P"])
    assert repartitioned_mp.table_name == "non-default-name"
def test_partition_on_nested():
    original_df = pd.DataFrame(
        {
            "level1": [1, 2, 3, 1, 2, 3],
            "level2": [1, 1, 1, 2, 2, 2],
            "no_index_col": np.arange(0, 6),
        }
    )
    mp = MetaPartition(
        label="label_1",
        files={"core": "file"},
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )
    mp2 = MetaPartition(
        label="label_2",
        files={"core": "file"},
        data={"core": original_df},
        dataset_metadata={"dataset": "metadata"},
        metadata_version=4,
    )
    mp = mp.add_metapartition(mp2)
    new_mp = mp.partition_on(["level1", "level2"])
    assert len(new_mp.metapartitions) == 12

    labels = []
    for mp in new_mp:
        labels.append(mp.label)
        assert len(mp.data) == 1
        assert "core" in mp.data
        df = mp.data["core"]
        assert df._is_view
        # try to be agnostic about the order
        assert len(df) == 1
        assert "level1" not in df
        assert "level2" not in df
        assert "no_index_col" in df

    expected_labels = [
        "level1=1/level2=1/label_1",
        "level1=1/level2=2/label_1",
        "level1=2/level2=1/label_1",
        "level1=2/level2=2/label_1",
        "level1=3/level2=1/label_1",
        "level1=3/level2=2/label_1",
        "level1=1/level2=1/label_2",
        "level1=1/level2=2/label_2",
        "level1=2/level2=1/label_2",
        "level1=2/level2=2/label_2",
        "level1=3/level2=1/label_2",
        "level1=3/level2=2/label_2",
    ]
    assert sorted(labels) == sorted(expected_labels)
def store_dataframes_as_dataset(
    store,
    dataset_uuid,
    dfs,
    metadata=None,
    partition_on=None,
    df_serializer=None,
    overwrite=False,
    metadata_storage_format=DEFAULT_METADATA_STORAGE_FORMAT,
    metadata_version=DEFAULT_METADATA_VERSION,
):
    """
    Utility function to store a list of dataframes as a partitioned dataset
    with multiple tables (files).

    Useful for very small datasets where all data fits into memory.

    Parameters
    ----------
    dfs : dict of pd.DataFrame or pd.DataFrame
        The dataframe(s) to be stored. If only a single dataframe is passed,
        it will be stored as the `core` table.

    Returns
    -------
    dataset: kartothek.core.dataset.DatasetMetadata
        The stored dataset.
    """
    if dataset_uuid is None:
        dataset_uuid = gen_uuid()

    if isinstance(dfs, dict):
        dfs = {"data": [(table, df) for table, df in dfs.items()]}

    if not overwrite:
        raise_if_dataset_exists(dataset_uuid=dataset_uuid, store=store)

    mp = parse_input_to_metapartition(dfs, metadata_version)

    if partition_on:
        mp = MetaPartition.partition_on(mp, partition_on)

    mps = mp.store_dataframes(
        store=store, dataset_uuid=dataset_uuid, df_serializer=df_serializer
    )

    return store_dataset_from_partitions(
        partition_list=mps,
        dataset_uuid=dataset_uuid,
        store=store,
        dataset_metadata=metadata,
        metadata_storage_format=metadata_storage_format,
    )
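# Hedged usage sketch for store_dataframes_as_dataset: an in-memory simplekv
# store is assumed, and the dataset/column names below are made up for
# illustration; exact partition label format depends on the kartothek version.
def _sketch_store_dataframes_as_dataset_usage():
    import pandas as pd
    from simplekv.memory import DictStore

    store = DictStore()  # any simplekv-style store works
    df = pd.DataFrame({"region": ["eu", "us", "eu"], "value": [1, 2, 3]})

    dataset = store_dataframes_as_dataset(
        store=store,
        dataset_uuid="tiny_dataset",
        dfs=[df],  # a list of dataframes; a dict maps table name -> df
        partition_on=["region"],  # one file per distinct region value
    )
    # partition labels now look like "region=eu/<generated-uuid>"
    assert len(dataset.partitions) == 2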
def test_partition_on_stable_order():
    """
    Assert that the partition_on algo is stable wrt to row ordering
    """
    unique_values = 3
    total_values = 20
    random_index = np.repeat(
        np.arange(unique_values), int(np.ceil(total_values / unique_values))
    )[:total_values]
    np.random.shuffle(random_index)
    df = pd.DataFrame(
        {"partition_key": random_index, "sorted_col": range(total_values)}
    )
    mp = MetaPartition(label="label_1", data=df, metadata_version=4)
    new_mp = mp.partition_on("partition_key")
    for sub_mp in new_mp:
        sub_df = sub_mp.data
        assert sub_df.sorted_col.is_monotonic
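# The stability asserted above is the property of order-preserving grouping.
# A standalone illustration with pandas only (independent of MetaPartition,
# and assuming nothing about kartothek's internals beyond what the test
# checks): groupby keeps the original row order within each group, so a
# column that was monotonic before partitioning stays monotonic inside
# every partition.
def _sketch_groupby_preserves_row_order():
    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"partition_key": np.random.randint(0, 3, 20), "sorted_col": range(20)}
    )
    for _, group in df.groupby("partition_key"):
        assert group.sorted_col.is_monotonic_increasing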
def test_partition_on_multiple_tables_empty_table():
    original_df = pd.DataFrame({"level1": [1, 2, 3], "no_index_col": np.arange(0, 3)})
    mp = MetaPartition(
        label="label_1",
        data=OrderedDict(
            [
                ("core", original_df),
                ("empty_table", pd.DataFrame(columns=["level1", "another_col"])),
            ]
        ),
        metadata_version=4,
    )
    new_mp = mp.partition_on("level1")
    labels = []
    for mp in new_mp:
        labels.append(mp.label)
        assert "empty_table" in mp.data
        assert mp.data["empty_table"].empty
        assert set(mp.data["empty_table"].columns) == {"another_col"}
def test_partition_two_level():
    original_df = pd.DataFrame(
        {
            "level1": [1, 2, 3, 1, 2, 3],
            "level2": [1, 1, 1, 2, 2, 2],
            "no_index_col": np.arange(0, 6),
        }
    )
    mp = MetaPartition(
        label="label_1", file="file", data=original_df, metadata_version=4
    )
    new_mp = mp.partition_on(["level1", "level2"])
    assert len(new_mp.metapartitions) == 6

    labels = []
    for mp in new_mp:
        labels.append(mp.label)
        assert len(mp.data) == 1
        assert mp.data is not None
        df = mp.data
        assert df._is_view
        # try to be agnostic about the order
        assert len(df) == 1
        assert "level1" not in df
        assert "level2" not in df
        assert "no_index_col" in df

    expected_labels = [
        "level1=1/level2=1/label_1",
        "level1=1/level2=2/label_1",
        "level1=2/level2=1/label_1",
        "level1=2/level2=2/label_1",
        "level1=3/level2=1/label_1",
        "level1=3/level2=2/label_1",
    ]
    assert sorted(labels) == sorted(expected_labels)
def prepare_data_for_ktk(
    df, ktk_cube_dataset_id, cube, existing_payload, partition_on, consume_df=False
):
    """
    Prepare data so it can be handed over to Kartothek.

    Some checks will be applied to the data to ensure it is sane.

    Parameters
    ----------
    df: pandas.DataFrame
        DataFrame to be passed to Kartothek.
    ktk_cube_dataset_id: str
        Ktk_cube dataset UUID (w/o cube prefix).
    cube: kartothek.core.cube.cube.Cube
        Cube specification.
    existing_payload: Set[str]
        Existing payload columns.
    partition_on: Iterable[str]
        Partition-on attribute for given dataset.
    consume_df: bool
        Whether the incoming DataFrame can be destroyed while processing it.

    Returns
    -------
    mp: kartothek.io_components.metapartition.MetaPartition
        Kartothek-ready MetaPartition, may be sentinel (aka empty and w/o label).

    Raises
    ------
    ValueError
        In case anything is fishy.
    """
    check_user_df(ktk_cube_dataset_id, df, cube, existing_payload, partition_on)

    if (df is None) or df.empty:
        # fast-path for empty DF
        return MetaPartition(
            label=None,
            metadata_version=KTK_CUBE_METADATA_VERSION,
            partition_keys=list(partition_on),
        )

    # TODO: find a more elegant solution that works w/o copy
    df_orig = df
    df = df.copy()
    if consume_df:
        # the original df is still referenced in the parent scope, so drop it
        df_orig.drop(columns=df_orig.columns, index=df_orig.index, inplace=True)
    df_columns = list(df.columns)
    df_columns_set = set(df_columns)

    # normalize value order and reset index
    sort_keys = [
        col
        for col in itertools.chain(cube.partition_columns, cube.dimension_columns)
        if col in df_columns_set
    ]
    df = sort_dataframe(df=df, columns=sort_keys)

    # check duplicate cells
    _check_duplicates(ktk_cube_dataset_id, df, sort_keys, cube)

    # check+convert column names to unicode strings
    df.rename(columns={c: converter_str(c) for c in df_columns}, inplace=True)

    # create MetaPartition object for easier handling
    mp = MetaPartition(
        label=gen_uuid(),
        data=df,
        metadata_version=KTK_CUBE_METADATA_VERSION,
    )
    del df

    # partition data
    mp = mp.partition_on(list(partition_on))

    # reset indices again (because partition_on breaks it)
    for mp2 in mp:
        mp2.data.reset_index(drop=True, inplace=True)
        del mp2

    # calculate indices
    indices_to_build = set(cube.index_columns) & df_columns_set
    if ktk_cube_dataset_id == cube.seed_dataset:
        indices_to_build |= set(cube.dimension_columns) - set(cube.suppress_index_on)
    indices_to_build -= set(partition_on)

    mp = mp.build_indices(indices_to_build)

    return mp
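# Hedged call sketch for prepare_data_for_ktk: the Cube fields follow
# kartothek.core.cube.cube.Cube; the column names and data are made up for
# illustration and the partition count assumes one distinct "p" value.
def _sketch_prepare_data_for_ktk_usage():
    import pandas as pd
    from kartothek.core.cube.cube import Cube

    cube = Cube(
        dimension_columns=["x"],
        partition_columns=["p"],
        uuid_prefix="my_cube",
    )
    df = pd.DataFrame({"x": [1, 0], "p": [0, 0], "v": [10, 20]})

    mp = prepare_data_for_ktk(
        df=df,
        ktk_cube_dataset_id=cube.seed_dataset,  # "seed" by default
        cube=cube,
        existing_payload=set(),
        partition_on=cube.partition_columns,
    )
    # one sub-partition per value of "p", rows sorted by (p, x), and a
    # secondary index built for the dimension column "x"
    assert len(mp) == 1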
def test_partition_on_with_primary_index_invalid(df_not_nested):
    mp = MetaPartition(
        label="pkey=1/pkey2=2/base_label",
        data=df_not_nested,
        partition_keys=["pkey", "pkey2"],
        metadata_version=4,
    )
    # repartitioning is only allowed if the existing primary index
    # (pkey, pkey2) is a prefix of the new partitioning
    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on("int64")
    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on(["int64", "pkey"])
    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on(["pkey", "int64"])
    with pytest.raises(ValueError, match="Incompatible"):
        mp.partition_on(["pkey2", "pkey", "int64"])

    mp.partition_on(["pkey", "pkey2"])
    mp.partition_on(["pkey", "pkey2", "int64"])