def main(client, config):
    import cudf
    import dask_cudf
    import numpy as np

    from dask.distributed import wait

    # benchmark, read_tables, left_semi_join, find_relevant_reviews,
    # create_sentences_from_reviews, create_words_from_sentences, EOL_CHAR
    # and the q18_* constants are defined elsewhere in this module / its utilities.
    store_sales, date_dim, store, product_reviews = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    ### adding a wait call slows this down by 3-4 seconds, removing it for now

    ### Make TEMP_TABLE1
    # filter the date table to the [q18_startDate, q18_endDate] window
    q18_startDate_int = np.datetime64(q18_startDate, "ms").astype(int)
    q18_endDate_int = np.datetime64(q18_endDate, "ms").astype(int)

    date_dim_filtered = date_dim.loc[
        (date_dim.d_date.astype("datetime64[ms]").astype("int") >= q18_startDate_int)
        & (date_dim.d_date.astype("datetime64[ms]").astype("int") <= q18_endDate_int)
    ].reset_index(drop=True)

    # build the regression_analysis table
    ss_date_dim_join = left_semi_join(
        store_sales,
        date_dim_filtered,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )

    temp = (
        ss_date_dim_join.groupby(["ss_store_sk", "ss_sold_date_sk"])
        .agg({"ss_net_paid": "sum"})
        .reset_index()
    )
    temp["xx"] = temp.ss_sold_date_sk * temp.ss_sold_date_sk
    temp["xy"] = temp.ss_sold_date_sk * temp.ss_net_paid

    # x = sold date key, y = net paid; xx and xy feed the least-squares slope below
    temp.columns = ["ss_store_sk", "x", "y", "xx", "xy"]

    regression_analysis = (
        temp.groupby(["ss_store_sk"])
        .agg({"x": ["count", "sum"], "xy": "sum", "y": "sum", "xx": "sum"})
        .reset_index(drop=False)
    )

    # slope = (n * sum(xy) - sum(x) * sum(y)) / (n * sum(xx) - sum(x)^2)
    regression_analysis["slope"] = (
        regression_analysis[("x", "count")] * regression_analysis[("xy", "sum")]
        - regression_analysis[("x", "sum")] * regression_analysis[("y", "sum")]
    ) / (
        regression_analysis[("x", "count")] * regression_analysis[("xx", "sum")]
        - regression_analysis[("x", "sum")] * regression_analysis[("x", "sum")]
    )
    regression_analysis = regression_analysis[["ss_store_sk", "slope"]]
    regression_analysis.columns = ["ss_store_sk", "slope"]

    regression_analysis["ss_store_sk"] = regression_analysis["ss_store_sk"].astype(
        "int32"
    )
    store["s_store_sk"] = store["s_store_sk"].astype("int32")

    # keep only stores whose sales trend downward (slope <= 0)
    temp_table1 = store.merge(
        regression_analysis[["ss_store_sk", "slope"]]
        .query("slope <= 0")
        .reset_index(drop=True),
        left_on="s_store_sk",
        right_on="ss_store_sk",
    )
    temp_table1 = temp_table1[["s_store_sk", "s_store_name"]]

    # repartition this table to be one partition, since it's only 192 rows at SF1000
    temp_table1 = temp_table1.repartition(npartitions=1)
    temp_table1 = temp_table1.persist()

    ### Make TEMP_TABLE2
    stores_with_regression = temp_table1
    pr = product_reviews

    # known to be small: very few relevant stores (169) at SF1000
    targets = (
        stores_with_regression.s_store_name.str.lower().unique().compute().tolist()
    )
    n_targets = len(targets)

    no_nulls = pr[~pr.pr_review_content.isnull()].reset_index(drop=True)
    no_nulls["pr_review_sk"] = no_nulls["pr_review_sk"].astype("int32")

    ### persisting because no_nulls is used twice
    no_nulls = no_nulls.reset_index(drop=True).persist()

    temp_table2_meta_empty_df = cudf.DataFrame(
        {
            "word": ["a"],
            "pr_review_sk": np.ones(1, dtype=np.int64),
            "pr_review_date": ["a"],
        }
    ).head(0)

    ### get relevant reviews
    combined = no_nulls.map_partitions(
        find_relevant_reviews,
        targets,
        meta=temp_table2_meta_empty_df,
    )

    stores_with_regression["store_ID"] = stores_with_regression.s_store_sk.astype(
        "str"
    ).str.cat(stores_with_regression.s_store_name, sep="_")
    stores_with_regression[
        "s_store_name"
    ] = stores_with_regression.s_store_name.str.lower()

    # Keep this commented line to illustrate that we could exactly match Spark
    # temp_table2 = temp_table2[['store_ID', 'pr_review_date', 'pr_review_content']]
    temp_table2 = combined.merge(
        stores_with_regression,
        how="inner",
        left_on=["word"],
        right_on=["s_store_name"],
    )
    temp_table2 = temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]]
    temp_table2 = temp_table2.persist()

    ### REAL QUERY (PART THREE)
    # split reviews into sentences by replacing sentence terminators with EOL_CHAR
    no_nulls["pr_review_content"] = no_nulls.pr_review_content.str.replace(
        [". ", "? ", "! "], [EOL_CHAR], regex=False
    )
    sentences = no_nulls.map_partitions(create_sentences_from_reviews)

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    # This file comes from the official TPCx-BB kit
    # We extracted it from bigbenchqueriesmr.jar
    sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] + ["sentiment_files"])
    with open(f"{sentiment_dir}/negativeSentiment.txt") as fh:
        negativeSentiment = list(map(str.strip, fh.readlines()))
        # dedupe for one extra record in the source file
        negativeSentiment = list(set(negativeSentiment))

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )
    sent_df = cudf.DataFrame({"word": negativeSentiment})
    sent_df["sentiment"] = "NEG"
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    word_sentence_sentiment["sentence_idx_global_pos"] = word_sentence_sentiment[
        "sentence_idx_global_pos"
    ].astype("int64")
    sentences["sentence_tokenized_global_pos"] = sentences[
        "sentence_tokenized_global_pos"
    ].astype("int64")

    word_sentence_sentiment_with_sentence_info = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    temp_table2["pr_review_sk"] = temp_table2["pr_review_sk"].astype("int32")

    final = word_sentence_sentiment_with_sentence_info.merge(
        temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]],
        how="inner",
        left_on="review_idx_global_pos",
        right_on="pr_review_sk",
    )

    keepcols = ["store_ID", "pr_review_date", "sentence", "sentiment", "word"]
    final = final[keepcols]
    final.columns = ["s_name", "r_date", "r_sentence", "sentiment", "sentiment_word"]
    final = final.persist()
    wait(final)

    final = final.sort_values(["s_name", "r_date", "r_sentence", "sentiment_word"])
    final = final.persist()
    wait(final)
    print(len(final))
    return final
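
# The left_semi_join helper used above comes from this project's shared utilities
# and is not shown here. The sketch below is only an illustration (an assumption,
# not the project's actual implementation) of the semantics the queries rely on:
# keep each left-hand row at most once when its join key appears in the right
# table, and return only the left-hand columns.
def _left_semi_join_sketch(left, right, left_on, right_on):
    # Deduplicate the right-hand keys so the inner merge cannot multiply left rows.
    right_keys = right[right_on].drop_duplicates()
    joined = left.merge(right_keys, left_on=left_on, right_on=right_on, how="inner")
    # Keep only the columns that came from the left table.
    return joined[list(left.columns)]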
def main(client, config):
    import cudf

    # benchmark, read_tables, left_semi_join, store_sales_cols and the
    # q17_* constants are defined elsewhere in this module / its utilities.
    (
        store_sales_df,
        item_df,
        customer_df,
        store_df,
        date_dim_df,
        customer_address_df,
        promotion_df,
    ) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    # store_sales ss LEFT SEMI JOIN date_dim dd ON ss.ss_sold_date_sk = dd.d_date_sk
    # AND dd.d_year = ${hiveconf:q17_year} AND dd.d_moy = ${hiveconf:q17_month}
    filtered_date_df = date_dim_df.query(
        f"d_year == {q17_year} and d_moy == {q17_month}", meta=date_dim_df._meta
    ).reset_index(drop=True)

    ss_date_join = left_semi_join(
        store_sales_df,
        filtered_date_df,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )
    ss_date_join = ss_date_join[store_sales_cols]

    # LEFT SEMI JOIN item i ON ss.ss_item_sk = i.i_item_sk
    # AND i.i_category IN (${hiveconf:q17_i_category_IN})
    filtered_item_df = item_df.loc[
        item_df["i_category"].isin(q17_i_category_IN)
    ].reset_index(drop=True)

    ss_date_item_join = left_semi_join(
        ss_date_join,
        filtered_item_df,
        left_on=["ss_item_sk"],
        right_on=["i_item_sk"],
    )

    # LEFT SEMI JOIN store s ON ss.ss_store_sk = s.s_store_sk
    # AND s.s_gmt_offset = ${hiveconf:q17_gmt_offset}
    filtered_store_df = store_df.query(
        f"s_gmt_offset == {q17_gmt_offset}", meta=store_df._meta
    ).reset_index(drop=True)

    ss_date_item_store_join = left_semi_join(
        ss_date_item_join,
        filtered_store_df,
        left_on=["ss_store_sk"],
        right_on=["s_store_sk"],
    )

    # (SELECT c.c_customer_sk FROM customer c LEFT SEMI JOIN customer_address ca
    #  ON c.c_current_addr_sk = ca.ca_address_sk AND ca.ca_gmt_offset = ${hiveconf:q17_gmt_offset}
    # ) sub_c
    filtered_customer_address = customer_address_df.query(
        f"ca_gmt_offset == {q17_gmt_offset}"
    ).reset_index(drop=True)

    sub_c = left_semi_join(
        customer_df,
        filtered_customer_address,
        left_on=["c_current_addr_sk"],
        right_on=["ca_address_sk"],
    )

    # sub_c ON ss.ss_customer_sk = sub_c.c_customer_sk
    ss_date_item_store_customer_join = left_semi_join(
        ss_date_item_store_join,
        sub_c,
        left_on=["ss_customer_sk"],
        right_on=["c_customer_sk"],
    )

    # JOIN promotion p ON ss.ss_promo_sk = p.p_promo_sk
    ss_date_item_store_customer_promotion_join = ss_date_item_store_customer_join.merge(
        promotion_df, left_on="ss_promo_sk", right_on="p_promo_sk", how="inner"
    )

    final_df = ss_date_item_store_customer_promotion_join

    # SELECT p_channel_email, p_channel_dmail, p_channel_tv,
    # CASE WHEN (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y')
    # THEN SUM(ss_ext_sales_price) ELSE 0 END as promotional,
    # SUM(ss_ext_sales_price) total
    # ...
    # GROUP BY p_channel_email, p_channel_dmail, p_channel_tv

    ### filling na because `pandas` and `cudf` ignore nulls when grouping
    final_df["p_channel_email"] = final_df["p_channel_email"].fillna("None")
    final_df["p_channel_dmail"] = final_df["p_channel_dmail"].fillna("None")
    final_df["p_channel_tv"] = final_df["p_channel_tv"].fillna("None")

    # SELECT sum(promotional) as promotional, sum(total) as total,
    # CASE WHEN sum(total) > 0 THEN 100*sum(promotional)/sum(total)
    # ELSE 0.0 END as promo_percent
    group_cols = ["p_channel_email", "p_channel_dmail", "p_channel_tv"]

    ### the maximum number of groups is 27 (3*3*3), i.e. [N, Y, None] per channel,
    ### so computing here is fine
    grouped_df = final_df.groupby(by=group_cols).agg({"ss_ext_sales_price": "sum"})
    grouped_df = grouped_df.compute()

    gr_df = grouped_df.reset_index()
    gr_df = gr_df.rename(columns={"ss_ext_sales_price": "total"})

    prom_flag = (
        (gr_df["p_channel_dmail"] == "Y")
        | (gr_df["p_channel_email"] == "Y")
        | (gr_df["p_channel_tv"] == "Y")
    )

    ### CASE WHEN (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y')
    ### THEN SUM(ss_ext_sales_price) ELSE 0 END as promotional
    gr_df["promotional"] = gr_df["total"].where(prom_flag, 0)

    total_sum = gr_df["total"].sum()
    prom_sum = gr_df["promotional"].sum()

    prom_per = 0
    if prom_sum != 0:
        prom_per = prom_sum / total_sum * 100

    print("Prom SUM = {}".format(prom_sum))
    print("Prom Per = {}".format(prom_per))
    print("Total SUM = {}".format(total_sum))

    return cudf.DataFrame(
        {
            "promotional": [prom_sum],
            "total": [total_sum],
            "promo_percent": [prom_per],
        }
    )
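
# A tiny, self-contained illustration (toy data, not part of the query) of the
# promotional / total / promo_percent arithmetic performed at the end of main()
# above. Wrapped in a helper so it never runs as part of the query itself.
def _promo_percent_toy_example():
    import cudf

    # One row per (email, dmail, tv) channel combination, as produced by the groupby.
    toy = cudf.DataFrame(
        {
            "p_channel_email": ["Y", "N", "None"],
            "p_channel_dmail": ["N", "N", "Y"],
            "p_channel_tv": ["N", "N", "N"],
            "total": [100.0, 50.0, 25.0],
        }
    )

    # CASE WHEN any channel = 'Y' THEN SUM(ss_ext_sales_price) ELSE 0 END
    prom_flag = (
        (toy["p_channel_dmail"] == "Y")
        | (toy["p_channel_email"] == "Y")
        | (toy["p_channel_tv"] == "Y")
    )
    toy["promotional"] = toy["total"].where(prom_flag, 0)

    prom_sum = toy["promotional"].sum()  # 125.0
    total_sum = toy["total"].sum()       # 175.0
    promo_percent = 100 * prom_sum / total_sum if total_sum > 0 else 0.0
    print(prom_sum, total_sum, promo_percent)  # 125.0 175.0 ~71.43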