def concat(dfs: List[DataframeLike], engine: Engine):
    """Concatenate dataframes row-wise using the backend selected by `engine`.

    :param dfs: dataframes, all of the backend type matching `engine`
    :param engine: which dataframe backend to dispatch to
    :returns: a single concatenated dataframe with a fresh 0..n-1 row index
    :raises NotImplementedError: if `engine` is not a recognized backend
    """
    if engine == Engine.PANDAS:
        return pd.concat(dfs, ignore_index=True, sort=False)

    if engine == Engine.DASK:
        import dask.dataframe
        return dask.dataframe.concat(dfs).reset_index(drop=True)
        # NOTE(review): a second, unreachable `Engine.DASK` branch that called
        # dd.concat(dfs) without resetting the index was removed here.

    if engine == Engine.CUDF:
        import cudf
        try:
            return cudf.concat(dfs, ignore_index=True)
        except TypeError as e:
            # cudf raises TypeError on incompatible column dtypes; log each
            # frame's dtypes so the caller can see which column to coerce.
            logger.warning(
                'Failed to concat, likely due to column type issue, try converting to a string; columns'
            )
            for df in dfs:
                logger.warning('df types :: %s', df.dtypes)
            raise e

    if engine == Engine.DASK_CUDF:
        import dask_cudf
        return dask_cudf.concat(dfs)

    raise NotImplementedError('Unknown engine')
def test_empty_partition(self, local_cuda_cluster: LocalCUDACluster) -> None:
    """Training with an extra zero-row partition must yield the same
    predictions as training on the non-empty partitions alone."""
    import dask_cudf
    import cudf
    import cupy

    with Client(local_cuda_cluster) as client:
        repeat = 100
        base = cudf.DataFrame({
            "a": [1, 2, 3, 4, 5.1] * repeat,
            "b": [10, 15, 29.3, 30, 31] * repeat,
            "y": [10, 20, 30, 40., 50] * repeat,
        })
        params = {"tree_method": "gpu_hist", "debug_synchronize": True}

        # Variant 1: prepend an empty partition to the training data.
        zero_rows = base.iloc[:0]
        with_empty = dask_cudf.concat(
            [dask_cudf.from_cudf(zero_rows, npartitions=1)] +
            [dask_cudf.from_cudf(base, npartitions=3)] +
            [dask_cudf.from_cudf(base, npartitions=3)])
        feats = with_empty[with_empty.columns.difference(["y"])]
        label = with_empty[["y"]]
        dtrain = dxgb.DaskDeviceQuantileDMatrix(client, feats, label)
        booster_empty = xgb.dask.train(client, params, dtrain,
                                       evals=[(dtrain, "train")])
        predt_empty = dxgb.predict(client, booster_empty,
                                   feats).compute().values

        # Variant 2: identical data without the empty partition.
        without_empty = dask_cudf.concat(
            [dask_cudf.from_cudf(base, npartitions=3)] +
            [dask_cudf.from_cudf(base, npartitions=3)])
        feats = without_empty[without_empty.columns.difference(["y"])]
        label = without_empty[["y"]]
        dtrain = dxgb.DaskDeviceQuantileDMatrix(client, feats, label)
        booster = xgb.dask.train(client, params, dtrain,
                                 evals=[(dtrain, "train")])
        predt = dxgb.predict(client, booster, feats).compute().values

        cupy.testing.assert_allclose(predt, predt_empty)
def concat(dfs: List[DataframeLike], engine: Engine, debug=False):
    """Concatenate dataframes row-wise using the backend selected by `engine`.

    :param dfs: dataframes, all of the backend type matching `engine`
    :param engine: which dataframe backend to dispatch to
    :param debug: when True and there are multiple frames, pre-check them for
        missing columns and dtype mismatches, logging any discrepancies
    :returns: a single concatenated dataframe with a fresh 0..n-1 row index
    :raises NotImplementedError: if `engine` is not a recognized backend
    """
    if debug and len(dfs) > 1:
        df0 = dfs[0]
        # Compare every later frame against the first: warn on columns that
        # are missing on either side or whose dtypes disagree.
        for c in df0:
            logger.debug('checking df0: %s :: %s', c, df0[c].dtype)
            for df_i in dfs[1:]:
                if c not in df_i:
                    logger.warning('missing df0[%s]::%s in df_i', c, df0[c].dtype)
                # FIX: guard the dtype comparison — the original compared
                # df_i[c].dtype even when c was absent, raising KeyError
                # instead of just logging the missing column.
                elif df0[c].dtype != df_i[c].dtype:
                    logger.warning(
                        'mismatching df0[c]::%s vs df_i[c]::%s for %s',
                        df0[c].dtype, df_i[c].dtype, c)
        for df_i in dfs[1:]:
            for c in df_i:
                logger.debug('checking df_i: %s', c)
                if c not in df0:
                    logger.warning('missing df_i[%s]::%s in df0', c, df_i[c].dtype)
        logger.debug('all checked!')

    if engine == Engine.PANDAS:
        return pd.concat(dfs, ignore_index=True, sort=False)

    if engine == Engine.DASK:
        import dask.dataframe
        return dask.dataframe.concat(dfs).reset_index(drop=True)
        # NOTE(review): a second, unreachable `Engine.DASK` branch that called
        # dd.concat(dfs) without resetting the index was removed here.

    if engine == Engine.CUDF:
        import cudf
        try:
            return cudf.concat(dfs, ignore_index=True)
        except TypeError as e:
            # cudf raises TypeError on incompatible column dtypes; log each
            # frame's dtypes so the caller can see which column to coerce.
            logger.warning(
                'Failed to concat, likely due to column type issue, try converting to a string; columns'
            )
            for df in dfs:
                logger.warning('df types :: %s', df.dtypes)
            raise e

    if engine == Engine.DASK_CUDF:
        import dask_cudf
        return dask_cudf.concat(dfs)

    raise NotImplementedError('Unknown engine')
def test_concat():
    """Concatenating dataframe fragments must reproduce the original frame."""
    np.random.seed(0)
    n = 1000
    df = pd.DataFrame({
        "x": np.random.randint(0, 5, size=n),
        "y": np.random.normal(size=n)
    })
    gdf = cudf.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    # Combine with concat
    concated = dgd.concat(frags)
    # FIX: dgd.concat returns a lazy dask_cudf collection; materialize it and
    # move it to host before the pandas comparison, as test_series_concat
    # already does — assert_frame_equal expects two pandas DataFrames.
    assert_frame_equal(df, concated.compute().to_pandas())
def test_series_concat():
    """Concatenating series fragments must round-trip to the original column."""
    np.random.seed(0)
    size = 1000
    pdf = pd.DataFrame({
        "x": np.random.randint(0, 5, size=size),
        "y": np.random.normal(size=size)
    })
    fragments = _fragmented_gdf(cudf.DataFrame.from_pandas(pdf), nsplit=13)
    # Keep only the "x" column of each fragment so we concatenate Series.
    series_parts = [frag.x for frag in fragments]

    result = dgd.concat(series_parts).compute().to_pandas()

    assert isinstance(result, pd.Series)
    np.testing.assert_array_equal(result, pdf.x)
def main(client):
    """Build per-customer recency/frequency/amount features from store and
    web sales, then cluster customers with KMeans via get_clusters."""
    import dask_cudf

    ss_ddf, ws_ddf, datedim_ddf = read_tables()

    datedim_ddf = datedim_ddf.map_partitions(convert_datestring_to_days)
    cutoff = np.datetime64(Q25_DATE, "D").astype(int)

    # Restrict to dates after the cutoff, and to sales rows that actually
    # have a customer key.
    recent_dates = datedim_ddf[datedim_ddf["d_date"] > cutoff].reset_index(
        drop=True)
    store_sales = ss_ddf[ss_ddf["ss_customer_sk"].notnull()].reset_index(
        drop=True)
    web_sales = ws_ddf[ws_ddf["ws_bill_customer_sk"].notnull()].reset_index(
        drop=True)

    # Attach the date dimension to each sales channel.
    store_joined = store_sales.merge(recent_dates,
                                     left_on="ss_sold_date_sk",
                                     right_on="d_date_sk",
                                     how="inner")
    web_joined = web_sales.merge(recent_dates,
                                 left_on="ws_sold_date_sk",
                                 right_on="d_date_sk",
                                 how="inner")

    # Per-customer rollup of store sales; agg_count_distinct simulates a
    # COUNT(DISTINCT ticket) for the purchase frequency.
    store_rollup = store_joined.groupby("ss_customer_sk").agg({
        "ss_sold_date_sk": "max",
        "ss_net_paid": "sum"
    })
    store_rollup["frequency"] = agg_count_distinct(
        store_joined, "ss_customer_sk", "ss_ticket_number", client=client)

    # Identical rollup for web sales, keyed on order number instead.
    web_rollup = web_joined.groupby("ws_bill_customer_sk").agg({
        "ws_sold_date_sk": "max",
        "ws_net_paid": "sum"
    })
    web_rollup["frequency"] = agg_count_distinct(
        web_joined, "ws_bill_customer_sk", "ws_order_number", client=client)

    store_rollup = store_rollup.reset_index()
    web_rollup = web_rollup.reset_index()

    # Give both channels a shared schema, then stack them.
    shared_columns = ["cid", "most_recent_date", "amount", "frequency"]
    store_rollup.columns = shared_columns
    web_rollup.columns = shared_columns
    combined = dask_cudf.concat([store_rollup, web_rollup])

    # Collapse the two channels into one row per customer.
    rfm_ddf = combined.groupby("cid").agg({
        "most_recent_date": "max",
        "frequency": "sum",
        "amount": "sum"
    })

    # "Recent" == last purchase within 60 days of day 37621.
    # NOTE(review): 37621 presumably encodes the benchmark's reference
    # "today" as a day number — confirm against the query spec.
    rfm_ddf["recency"] = (37621 - rfm_ddf["most_recent_date"]) < 60

    # Match the column order of the reference implementation.
    rfm_ddf = rfm_ddf[["recency", "frequency", "amount"]]

    # KMeans needs numeric dtypes; cast, then pin the input in memory.
    rfm_ddf["recency"] = rfm_ddf["recency"].astype("int64")
    rfm_ddf["amount"] = rfm_ddf["amount"].astype("float64")
    rfm_ddf = rfm_ddf.persist()

    results_dict = get_clusters(client=client, ml_input_df=rfm_ddf)
    return results_dict
def concat(self, dfs, **kwargs):
    """Concatenate `dfs` by delegating to dask_cudf.concat.

    Any extra keyword arguments are forwarded unchanged.
    """
    combined = dask_cudf.concat(dfs, **kwargs)
    return combined