WHERE wcs_item_sk = {q02_item_sk} )
SELECT sd.wcs_item_sk as item_sk_1, count(sd.wcs_item_sk) as cnt
FROM session_df sd
INNER JOIN item_df id ON sd.wcs_user_sk = id.wcs_user_sk
    AND sd.session_id = id.session_id
    AND sd.wcs_item_sk <> {q02_item_sk}
GROUP BY sd.wcs_item_sk
ORDER BY cnt desc
LIMIT {q02_limit}
"""
    # Execute the co-occurrence query on the SQL context: for every other
    # item viewed in the same session as the target item, count the co-views
    # and keep the `q02_limit` most frequent companions.
    result = bc.sql(last_query)

    # The query only produced item_sk_1/cnt; attach the (constant) target
    # item as item_sk_2 and order the columns as the benchmark expects.
    result["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result = result[result_order]

    # Drop the local reference and unregister the SQL table so the context
    # does not keep the session dataframe alive after the query finishes.
    del session_df
    bc.drop_table("session_df")
    return result


if __name__ == "__main__":
    # Script entry point: attach to the cluster with a BlazingSQL context
    # and run this query through the shared benchmark harness.
    config = gpubdb_argparser()
    client, bc = attach_to_cluster(config, create_blazing_context=True)
    run_query(config=config, client=client, query_func=main, blazing_context=bc)
grouped_df = grouped_df[ grouped_df["cnt"] > q01_viewed_together_count].reset_index(drop=True) ### 2017 rows after filteration at sf-100 ### should scale till sf-100k grouped_df = grouped_df.repartition(npartitions=1).persist() ## converting to strings because of issue # https://github.com/rapidsai/gpu-bdb/issues/36 grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("str") grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("str") grouped_df = grouped_df.map_partitions(lambda df: df.sort_values( by=["cnt", "item_sk_1", "item_sk_2"], ascending=[False, True, True])) grouped_df = grouped_df.reset_index(drop=True) ### below is just 100 rows so should fit on `cudf` context grouped_df = grouped_df.head(q01_limit) ### writing to int to ensure same values grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("int32") grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("int32") return grouped_df if __name__ == "__main__": from bdb_tools.cluster_startup import attach_to_cluster import cudf import dask_cudf config = gpubdb_argparser() client, bc = attach_to_cluster(config) run_query(config=config, client=client, query_func=main)
    # Run the BlazingSQL-backed queries first (if enabled and any exist).
    if include_blazing and len(bsql_qnums) > 0:
        print("Blazing Queries")
        for qnum, q_func in bsql_queries.items():
            print(qnum)
            qpath = f"{base_path}/queries/q{qnum}/"
            os.chdir(qpath)

            # Record the currently running query number in a marker file,
            # presumably read by external tooling -- confirm against readers.
            # NOTE(review): open(..., "w") already truncates, so the
            # exists/remove pair below looks redundant.
            if os.path.exists("current_query_num.txt"):
                os.remove("current_query_num.txt")
            with open("current_query_num.txt", "w") as fp:
                fp.write(qnum)

            # Repeat each query N_REPEATS times, forcing garbage collection
            # on the workers, the scheduler, and this client process, then
            # sleeping 3 seconds between runs.
            for r in range(N_REPEATS):
                run_query(
                    config=config,
                    client=client,
                    query_func=q_func,
                    blazing_context=bc,
                )
                client.run(gc.collect)
                client.run_on_scheduler(gc.collect)
                gc.collect()
                time.sleep(3)

    # Run Pure Dask Queries
    if len(dask_qnums) > 0:
        print("Pure Dask Queries")
        for qnum, q_func in dask_queries.items():
            print(qnum)
            qpath = f"{base_path}/queries/q{qnum}/"
            os.chdir(qpath)