Example #1
def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    with pytest.raises(ValueError, match="Cannot store empty datasets"):
        update_dataset_from_ddf(
            dask.dataframe.from_delayed([], meta=(("a", int), )),
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        ).compute()
Example #2
def test_update_dataset_from_ddf_empty(store_factory, shuffle):
    with pytest.raises(ValueError) as exc_info:
        update_dataset_from_ddf(
            dask.dataframe.from_delayed([], meta=(("a", int), )),
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=shuffle,
            partition_on=["a"],
        ).compute()
    assert str(exc_info.value) in [
        "Cannot store empty datasets",  # dask <= 2021.5.0
        "Cannot store empty datasets, partition_list must not be empty if in store mode.",  # dask > 2021.5.0 + shuffle == True
        "No data left to save outside partition columns",  # dask > 2021.5.0 + shuffle == False
    ]
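
Note: the two tests above only exercise the empty-input error paths. For contrast, a minimal successful call looks roughly like the sketch below; the import paths and the storefact-based store URL are assumptions, while the positional and keyword arguments mirror the calls used throughout these examples.

# Minimal, non-empty write (sketch only). The import paths and the "hfs://"
# store URL are assumptions; the call signature follows the examples above.
from functools import partial

import dask.dataframe as dd
import pandas as pd
from kartothek.io.dask.dataframe import update_dataset_from_ddf  # assumed import path
from storefact import get_store_from_url  # assumed store backend

store_factory = partial(get_store_from_url, "hfs:///tmp/ktk_example")  # callable store factory
ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=1)

update_dataset_from_ddf(
    ddf,
    store_factory,
    dataset_uuid="output_dataset_uuid",
    table="core",
    partition_on=["a"],
).compute()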
Example #3
    def partition_filt(self, filt, chunk_dfs=True):
        """Write partitioned dataset using kartothek
        """
        if chunk_dfs:
            for i, df in enumerate(self.iter_df_chunks(filt)):
                if df is not None:
                    print(f"... ...ktk repartitioning {self.dataset} ({filt}, chunk {i + 1})")
                    graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
                    graph.compute()
        else:
            df = self.get_df(self.dataIds_by_filter[filt], self.filenames_by_filter[filt])

            if df is not None:
                print(f"... ...ktk repartitioning {self.dataset} ({filt})")
                graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
                graph.compute()
Example #4
    def partition_filt(self, filt, chunk_dfs=False):
        """Write partitioned dataset using kartothek
        """
        for i, df in enumerate(self.iter_df_chunks(filt, chunk_dfs=chunk_dfs)):
            if df is not None:
                print(f"... ...ktk repartitioning {self.dataset} ({filt}, chunk {i + 1})")
                graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
                graph.compute()
Example #5
    def partition(self, chunk_by_filter=True, chunk_dfs=True):
        if chunk_by_filter:
            for filt in self.filters:
                self.partition_filt(filt, chunk_dfs=chunk_dfs)
        else:
            df = self.get_df(self.dataIds, self.filenames)
            print(f"... ...ktk repartitioning {self.dataset}")
            graph = update_dataset_from_ddf(df, **self.ktk_kwargs)
            graph.compute()
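
The `iter_df_chunks`, `get_df`, and `ktk_kwargs` attributes used by the methods in Examples #3 to #5 are not shown. Judging from the call signatures in these examples and the docstring in Example #7, `ktk_kwargs` plausibly bundles the store factory and dataset settings; the sketch below is hypothetical and every concrete value in it is illustrative.

# Hypothetical ktk_kwargs bundle for the methods above; all values are illustrative.
from functools import partial

from storefact import get_store_from_url  # assumed store backend

ktk_kwargs = {
    "store": partial(get_store_from_url, "hfs:///tmp/ktk_output"),  # kartothek expects a callable store factory
    "dataset_uuid": "my_dataset",   # illustrative dataset name
    "table": "core",
    "partition_on": ["filter"],     # illustrative partition column
    "shuffle": True,
    "num_buckets": 8,               # illustrative bucket count
}
# Then, as in the methods above: update_dataset_from_ddf(df, **ktk_kwargs).compute()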
Example #6
def test_update_shuffle_no_partition_on(store_factory, bucket_by):
    df = pd.DataFrame(
        {
            "range": np.arange(10),
            "range_duplicated": np.repeat(np.arange(2), 5),
            "random": np.random.randint(0, 100, 10),
        }
    )
    ddf = dd.from_pandas(df, npartitions=10)

    with pytest.raises(
        ValueError, match="``num_buckets`` must not be None when shuffling data."
    ):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="table",
            shuffle=True,
            num_buckets=None,
            bucket_by=bucket_by,
        ).compute()

    res_default = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid_default",
        table="table",
        shuffle=True,
        bucket_by=bucket_by,
    ).compute()
    assert len(res_default.partitions) == 1

    res = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="table",
        shuffle=True,
        num_buckets=2,
        bucket_by=bucket_by,
    ).compute()

    assert len(res.partitions) == 2
Example #7
    def update_dataset_from_ddf(self, ddf, **kwargs):
        """Thin wrapper around kartothek's update_dataset_from_ddf.

        Typical kwargs:
            partition_on=["c_date"],
            num_buckets=num_buckets,
            shuffle=True,
            delete_scope=delete_scope
        """
        return update_dataset_from_ddf(ddf,
                                       store=lambda: self.writable_store,
                                       dataset_uuid=self.dataset_uuid,
                                       table=self.table,
                                       **kwargs)
Example #8
def test_delayed_as_delete_scope(store_factory, df_all_types):
    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        dd.from_pandas(df_all_types, npartitions=1),
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        delete_scope=dask.delayed(_return_none)(),
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    tasks.compute()
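
The test above only checks that a delayed object is accepted as `delete_scope`. Outside of tests, kartothek's `delete_scope` is a list of dicts mapping partition columns to the values whose existing partitions should be removed as part of the update. A hedged sketch, reusing the `store_factory` fixture and the module imports (`pd`, `dd`) from the test above:

# Sketch only: drop existing partitions where the (hypothetical) partition
# column "a" equals 1 while writing new data; assumes the dataset is
# partitioned on "a". `store_factory` is the fixture used in the test above.
new_df = pd.DataFrame({"a": [2, 2], "b": [0.1, 0.2]})  # illustrative data
update_dataset_from_ddf(
    dd.from_pandas(new_df, npartitions=1),
    store_factory,
    dataset_uuid="output_dataset_uuid",
    table="core",
    partition_on=["a"],
    delete_scope=[{"a": 1}],
).compute()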
Example #9
def _update_dataset(partitions, secondary_indices=None, *args, **kwargs):
    if any(partitions):
        table_name = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        table_name = "core"
        partitions = None
    return update_dataset_from_ddf(partitions,
                                   *args,
                                   table=table_name,
                                   secondary_indices=secondary_indices,
                                   **kwargs).compute()
Example #10
def _update_dataset(partitions, *args, **kwargs):
    if any(partitions):
        table_name = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        table_name = "core"
        partitions = None
    ddf = update_dataset_from_ddf(partitions, *args, table=table_name, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)

    return ddf.compute()
Example #11
def _update_dataset(partitions, *args, **kwargs):
    # TODO: Simplify once parse_input_to_metapartition is removed / obsolete

    if isinstance(partitions, pd.DataFrame):
        partitions = dd.from_pandas(partitions, npartitions=1)
    elif partitions is not None:
        delayed_partitions = [dask.delayed(_id)(part) for part in partitions]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        partitions = None

    ddf = update_dataset_from_ddf(partitions, *args, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)

    return ddf.compute()
Example #12
def _update_dataset(partitions, *args, **kwargs):
    # TODO: fix the parsing below to adapt for all supported formats (see: parse_input_to_metapartition)
    if any(partitions):
        table_name = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        table_name = "core"
        partitions = None
    ddf = update_dataset_from_ddf(partitions,
                                  *args,
                                  table=table_name,
                                  **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)

    return ddf.compute()
Example #13
def _update_dataset(partitions, *args, **kwargs):
    # TODO: Simplify once parse_input_to_metapartition is removed / obsolete
    if isinstance(partitions, pd.DataFrame):
        if "table" not in kwargs.keys():
            kwargs["table"] = "core"
        partitions = dd.from_pandas(partitions, npartitions=1)
    elif any(partitions):
        kwargs["table"] = next(iter(dict(partitions[0]["data"]).keys()))
        delayed_partitions = [
            dask.delayed(_unwrap_partition)(part) for part in partitions
        ]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        kwargs["table"] = "core"
        partitions = None
    ddf = update_dataset_from_ddf(partitions, *args, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)

    return ddf.compute()
Example #14
def _update_dataset(partitions, *args, **kwargs):
    # TODO: Simplify once parse_input_to_metapartition is removed / obsolete

    if isinstance(partitions, pd.DataFrame):
        partitions = dd.from_pandas(partitions, npartitions=1)
    elif partitions is not None:
        delayed_partitions = [dask.delayed(_id)(part) for part in partitions]
        partitions = dd.from_delayed(delayed_partitions)
    else:
        partitions = None

    # Replace `table_name` with `table` keyword argument to enable shared test code
    # via `bound_update_dataset` fixture
    if "table_name" in kwargs:
        kwargs["table"] = kwargs["table_name"]
        del kwargs["table_name"]

    ddf = update_dataset_from_ddf(partitions, *args, **kwargs)

    s = pickle.dumps(ddf, pickle.HIGHEST_PROTOCOL)
    ddf = pickle.loads(s)

    return ddf.compute()
Example #15
def dataset(store, dataset_uuid):
    df = pd.DataFrame({
        "A": np.array([1, 2, 3, 4], dtype="int32"),
        "B": [
            pd.Timestamp("2002-01-01"),
            pd.Timestamp("2002-01-02"),
            pd.Timestamp("2002-01-03"),
            pd.Timestamp("2002-01-04"),
        ],
        "C": pd.Series(1, index=list(range(4)), dtype="double"),
        "D": ["test", "train", "test", "prod"],
    })
    ddf = dd.from_pandas(df, npartitions=2)
    delayed = update_dataset_from_ddf(ddf,
                                      store=lambda: store,
                                      dataset_uuid=dataset_uuid,
                                      table='table',
                                      partition_on=["B"])
    delayed.compute()
    yield
    for k in store.keys(prefix=dataset_uuid):
        store.delete(k)
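
To read the fixture's dataset back, the iterator used in Example #16 below can be applied here as well; a sketch, where the import path is an assumption:

# Sketch: iterate over the partitions written by the fixture above.
# The import path is an assumption; the call mirrors Example #16.
from kartothek.io.iter import read_dataset_as_dataframes__iterator  # assumed import path

for data_dct in read_dataset_as_dataframes__iterator(
        dataset_uuid=dataset_uuid, store=lambda: store):
    df_part = data_dct["table"]  # one pandas DataFrame per partition of table 'table'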
Example #16
def test_update_shuffle_buckets(
    store_factory,
    metadata_version,
    unique_primaries,
    unique_secondaries,
    num_buckets,
    repartition,
    npartitions,
    bucket_by,
):
    """
    Assert that certain properties are always given for the output dataset
    no matter how the input data distribution looks like

    Properties to assert:
    * All partitions have a unique value for its correspondent primary key
    * number of partitions is at least one per unique partition value, at
      most ``num_buckets`` per primary partition value.
    * If we demand a column to be sorted it is per partition monotonic
    """
    primaries = np.arange(unique_primaries)
    secondary = np.arange(unique_secondaries)
    num_rows = 100
    primaries = np.repeat(primaries,
                          np.ceil(num_rows / unique_primaries))[:num_rows]
    secondary = np.repeat(secondary,
                          np.ceil(num_rows / unique_secondaries))[:num_rows]
    # ensure that there is an unsorted column uncorrelated
    # to the primary and secondary columns which can be sorted later on per partition
    unsorted_column = np.repeat(np.arange(100 / 10), 10)
    np.random.shuffle(unsorted_column)
    np.random.shuffle(primaries)
    np.random.shuffle(secondary)

    df = pd.DataFrame({
        "primary": primaries,
        "secondary": secondary,
        "sorted_column": unsorted_column
    })
    secondary_indices = ["secondary"]
    expected_num_indices = 2  # one primary index (partition_on) + one secondary index

    # used for the assertions later on
    if bucket_by:
        secondary_indices.append(bucket_by)
        expected_num_indices = 3

    # shuffle all rows. properties of result should be reproducible
    df = df.sample(frac=1).reset_index(drop=True)
    ddf = dd.from_pandas(df, npartitions=npartitions)

    dataset_comp = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        secondary_indices=secondary_indices,
        shuffle=True,
        bucket_by=bucket_by,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        partition_on=["primary"],
    )

    s = pickle.dumps(dataset_comp, pickle.HIGHEST_PROTOCOL)
    dataset_comp = pickle.loads(s)

    dataset = dataset_comp.compute()
    dataset = dataset.load_all_indices(store_factory())

    assert len(dataset.partitions) <= num_buckets * unique_primaries
    assert len(dataset.partitions) >= unique_primaries

    assert len(dataset.indices) == expected_num_indices

    assert set(dataset.indices["primary"].index_dct.keys()) == set(
        range(unique_primaries))
    assert [
        len(bucket) for bucket in dataset.indices["primary"].index_dct.values()
    ] <= [num_buckets] * unique_primaries

    assert set(dataset.indices["secondary"].index_dct.keys()) == set(
        range(unique_secondaries))

    assert set(dataset.table_meta["core"].names) == {
        "primary",
        "secondary",
        "sorted_column",
    }

    factory = DatasetFactory("output_dataset_uuid", store_factory)
    factory.load_all_indices()

    if bucket_by:
        ind_df = factory.get_indices_as_dataframe(["primary", bucket_by])

        assert not ind_df.duplicated().any()

    for data_dct in read_dataset_as_dataframes__iterator(
            dataset_uuid=dataset.uuid, store=store_factory):
        df = data_dct["core"]
        assert len(df.primary.unique()) == 1
        assert df.sorted_column.is_monotonic

    # update the dataset
    # do not use partition_on since it should be inferred from the existing dataset
    tasks = update_dataset_from_ddf(
        ddf,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    updated_dataset = tasks.compute()

    assert len(updated_dataset.partitions) == 2 * len(dataset.partitions)

    # Not allowed to use different partition_on
    with pytest.raises(
            ValueError,
            match="Incompatible set of partition keys encountered."):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            repartition_ratio=repartition,
            partition_on=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Not allowed to update with indices which do not yet exist in dataset
    with pytest.raises(ValueError, match="indices"):
        update_dataset_from_ddf(
            ddf,
            store_factory,
            dataset_uuid="output_dataset_uuid",
            table="core",
            shuffle=True,
            partition_on=["primary"],
            repartition_ratio=repartition,
            secondary_indices=["sorted_column"],
            num_buckets=num_buckets,
            sort_partitions_by="sorted_column",
            default_metadata_version=metadata_version,
        )

    # Check that delayed objects are allowed as delete scope.
    tasks = update_dataset_from_ddf(
        None,
        store_factory,
        dataset_uuid="output_dataset_uuid",
        table="core",
        shuffle=True,
        repartition_ratio=repartition,
        num_buckets=num_buckets,
        sort_partitions_by="sorted_column",
        default_metadata_version=metadata_version,
        delete_scope=dask.delayed(_return_none)(),
        bucket_by=bucket_by,
    )

    s = pickle.dumps(tasks, pickle.HIGHEST_PROTOCOL)
    tasks = pickle.loads(s)

    tasks.compute()