示例#1
0
    grouped_df = grouped_df[
        grouped_df["cnt"] > q01_viewed_together_count].reset_index(drop=True)

    ### 2017 rows after filtering at sf-100
    ### should scale till sf-100k
    grouped_df = grouped_df.repartition(npartitions=1).persist()
    ## converting to strings because of issue
    # https://github.com/rapidsai/gpu-bdb/issues/36

    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("str")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("str")
    grouped_df = grouped_df.map_partitions(lambda df: df.sort_values(
        by=["cnt", "item_sk_1", "item_sk_2"], ascending=[False, True, True]))
    grouped_df = grouped_df.reset_index(drop=True)
    ### below is just 100 rows so should fit on `cudf` context
    grouped_df = grouped_df.head(q01_limit)
    ### writing to int to ensure same values
    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("int32")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("int32")
    return grouped_df


if __name__ == "__main__":
    # These imports only happen when the file is executed as a script,
    # not when it is imported as a module.
    # NOTE(review): cudf / dask_cudf are not referenced in this block;
    # presumably the query code elsewhere in the file reads them as
    # module globals -- confirm before removing or reordering.
    from bdb_tools.cluster_startup import attach_to_cluster
    import cudf
    import dask_cudf

    # Parse the benchmark configuration, connect to the cluster, then hand
    # the query entry point (`main`) to the shared runner.
    config = gpubdb_argparser()
    # Only `client` is forwarded to the runner; `bc` is unpacked but not
    # passed on by this script.
    client, bc = attach_to_cluster(config)
    run_query(
        query_func=main,
        config=config,
        client=client,
    )
            WHERE wcs_item_sk = {q02_item_sk}
        )
        SELECT sd.wcs_item_sk as item_sk_1,
            count(sd.wcs_item_sk) as cnt
        FROM session_df sd
        INNER JOIN item_df id
        ON sd.wcs_user_sk = id.wcs_user_sk
        AND sd.session_id = id.session_id
        AND sd.wcs_item_sk <> {q02_item_sk}
        GROUP BY sd.wcs_item_sk
        ORDER BY cnt desc
        LIMIT {q02_limit}
    """
    result = bc.sql(last_query)
    result["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result = result[result_order]

    del session_df
    bc.drop_table("session_df")
    return result


if __name__ == "__main__":
    # Script entry point: build the configuration, attach to the cluster
    # while also requesting a blazing context, and run `main` through the
    # shared query runner.
    config = gpubdb_argparser()
    client, bc = attach_to_cluster(config, create_blazing_context=True)
    # The second value returned by attach_to_cluster is forwarded to the
    # runner as `blazing_context`.
    run_query(
        query_func=main,
        config=config,
        client=client,
        blazing_context=bc,
    )