# NOTE(review): this span is the tail of a function whose `def` line lies outside
# the visible chunk, followed by the script entry guard. The original line breaks
# were lost; statement boundaries and the 4-space body indent are restored from
# syntax alone -- confirm against the upstream file.
    print("---" * 20)
    # len() is evaluated on grouped_df before the aggregation below.
    print("grouping complete ={}".format(len(grouped_df)))
    # Sum per i_item_sk, then move the group key back into a regular column.
    grouped_df = grouped_df.groupby(["i_item_sk" ]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    # The sort runs inside each partition separately (map_partitions); no
    # cross-partition ordering is established here.
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

    # Rename to the output schema and tag every row with the queried item
    # (q03_purchased_item_IN is defined outside the visible chunk).
    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    # NOTE(review): head() after a per-partition sort yields the overall top rows
    # only if result_df has a single partition at this point -- confirm.
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df


if __name__ == "__main__":
    # These imports run only when the file is executed as a script.
    # cudf and dask_cudf are not referenced in the visible code; presumably they
    # are used as module globals elsewhere -- confirm before removing.
    from xbb_tools.cluster_startup import attach_to_cluster
    import cudf
    import dask_cudf

    # cli_args, run_dask_cudf_query and main are defined outside the visible chunk.
    client = attach_to_cluster(cli_args)
    run_dask_cudf_query(cli_args=cli_args, client=client, query_func=main)
# NOTE(review): this span starts inside a parenthesized expression whose opening
# "(" and assignment target lie outside the visible chunk, continues to the end of
# the enclosing function, and is followed by the script entry guard. The original
# line breaks were lost; layout is restored from syntax alone -- confirm against
# the upstream file.
        (
            # True where the value exceeds the column mean ...
            cust_and_clicks_ddf["clicks_in_category"]
            > cust_and_clicks_ddf["clicks_in_category"].mean()
        )
        # ... then the index is discarded and the mask is cast to int64.
        .reset_index(drop=True)
        .astype(np.int64)
    )
    # Converting the dataframe to float64 as cuml logistic reg requires this
    ml_input_df = cust_and_clicks_ddf.astype("float64")
    ml_input_df = ml_input_df.persist()

    # One delayed build_and_predict_model call per element of to_delayed().
    ml_tasks = [delayed(build_and_predict_model)(df) for df in ml_input_df.to_delayed()]
    # NOTE(review): star-unpacking passes each task as a separate positional
    # argument while sync is also given by keyword; this looks like it only works
    # when ml_tasks holds exactly one task -- confirm the partition count upstream.
    results_dict = client.compute(*ml_tasks, sync=True)

    return results_dict


if __name__ == "__main__":
    # These imports run only when the file is executed as a script.
    # cudf, dask_cudf and cuml are not referenced in the visible code; presumably
    # they are used as module globals elsewhere -- confirm before removing.
    from xbb_tools.cluster_startup import attach_to_cluster
    import cudf
    import dask_cudf
    import cuml

    # cli_args, run_dask_cudf_query, main and write_result are defined outside
    # the visible chunk.
    client = attach_to_cluster(cli_args)
    run_dask_cudf_query(
        cli_args=cli_args,
        client=client,
        query_func=main,
        write_func=write_result,
    )