Example #1
def main(client, config):
    import cudf
    import dask_cudf
    import numpy as np
    from dask.distributed import wait

    # benchmark, read_tables, left_semi_join, find_relevant_reviews,
    # create_sentences_from_reviews, create_words_from_sentences, EOL_CHAR and
    # the q18_* date constants are assumed to come from the surrounding
    # gpu-bdb query module.

    store_sales, date_dim, store, product_reviews = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    ### adding a wait call slows this down by 3-4 seconds, removing it for now
    ### Make TEMP_TABLE1

    # filter date table
    q18_startDate_int = np.datetime64(q18_startDate, "ms").astype(int)
    q18_endDate_int = np.datetime64(q18_endDate, "ms").astype(int)

    date_dim_filtered = date_dim.loc[
        (date_dim.d_date.astype("datetime64[ms]").astype("int") >= q18_startDate_int)
        & (date_dim.d_date.astype("datetime64[ms]").astype("int") <= q18_endDate_int)
    ].reset_index(drop=True)

    # build the regression_analysis table
    ss_date_dim_join = left_semi_join(
        store_sales,
        date_dim_filtered,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )

    temp = (
        ss_date_dim_join.groupby(["ss_store_sk", "ss_sold_date_sk"])
        .agg({"ss_net_paid": "sum"})
        .reset_index()
    )

    temp["xx"] = temp.ss_sold_date_sk * temp.ss_sold_date_sk
    temp["xy"] = temp.ss_sold_date_sk * temp.ss_net_paid
    # after the rename: x = ss_sold_date_sk, y = ss_net_paid
    temp.columns = ["ss_store_sk", "x", "y", "xx", "xy"]

    regression_analysis = (
        temp.groupby(["ss_store_sk"])
        .agg({"x": ["count", "sum"], "xy": "sum", "y": "sum", "xx": "sum"})
        .reset_index(drop=False)
    )

    regression_analysis["slope"] = (
        regression_analysis[("x", "count")] * regression_analysis[
            ("xy", "sum")] -
        regression_analysis[("x", "sum")] * regression_analysis[("y", "sum")]
    ) / (regression_analysis[("x", "count")] * regression_analysis[
        ("xx", "sum")] -
         regression_analysis[("x", "sum")] * regression_analysis[("x", "sum")])
    regression_analysis = regression_analysis[["ss_store_sk", "slope"]]
    regression_analysis.columns = ["ss_store_sk", "slope"]

    regression_analysis["ss_store_sk"] = regression_analysis[
        "ss_store_sk"].astype("int32")
    store["s_store_sk"] = store["s_store_sk"].astype("int32")
    temp_table1 = store.merge(
        regression_analysis[["ss_store_sk", "slope"
                             ]].query("slope <= 0").reset_index(drop=True),
        left_on="s_store_sk",
        right_on="ss_store_sk",
    )
    temp_table1 = temp_table1[["s_store_sk", "s_store_name"]]

    # repartition this table to a single partition, since it's only 192 rows at SF1000
    temp_table1 = temp_table1.repartition(npartitions=1)
    temp_table1 = temp_table1.persist()
    ### Make TEMP_TABLE2
    stores_with_regression = temp_table1
    pr = product_reviews

    # known to be small. very few relevant stores (169) at SF1000
    targets = (
        stores_with_regression.s_store_name.str.lower().unique().compute().tolist()
    )
    n_targets = len(targets)

    no_nulls = pr[~pr.pr_review_content.isnull()].reset_index(drop=True)
    no_nulls["pr_review_sk"] = no_nulls["pr_review_sk"].astype("int32")

    ### persisting because no_nulls is used twice
    no_nulls = no_nulls.reset_index(drop=True).persist()

    temp_table2_meta_empty_df = cudf.DataFrame(
        {
            "word": ["a"],
            "pr_review_sk": np.ones(1, dtype=np.int64),
            "pr_review_date": ["a"],
        }
    ).head(0)

    ### get relevant reviews
    ### (a hedged sketch of find_relevant_reviews is given after this function)
    combined = no_nulls.map_partitions(
        find_relevant_reviews,
        targets,
        meta=temp_table2_meta_empty_df,
    )

    stores_with_regression["store_ID"] = stores_with_regression.s_store_sk.astype(
        "str"
    ).str.cat(stores_with_regression.s_store_name, sep="_")
    stores_with_regression["s_store_name"] = (
        stores_with_regression.s_store_name.str.lower()
    )

    # Keep this commented line to illustrate that we could exactly match Spark
    # temp_table2 = temp_table2[['store_ID', 'pr_review_date', 'pr_review_content']]
    temp_table2 = combined.merge(stores_with_regression,
                                 how="inner",
                                 left_on=["word"],
                                 right_on=["s_store_name"])

    temp_table2 = temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]]
    temp_table2 = temp_table2.persist()

    ### REAL QUERY (PART THREE)
    no_nulls["pr_review_content"] = no_nulls.pr_review_content.str.replace(
        [". ", "? ", "! "], [EOL_CHAR], regex=False)
    sentences = no_nulls.map_partitions(create_sentences_from_reviews)

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    # This file comes from the official TPCx-BB kit
    # We extracted it from bigbenchqueriesmr.jar
    sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] +
                             ["sentiment_files"])
    with open(f"{sentiment_dir}/negativeSentiment.txt") as fh:
        negativeSentiment = list(map(str.strip, fh.readlines()))
        # dedupe for one extra record in the source file
        negativeSentiment = list(set(negativeSentiment))

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )
    sent_df = cudf.DataFrame({"word": negativeSentiment})
    sent_df["sentiment"] = "NEG"
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    word_sentence_sentiment["sentence_idx_global_pos"] = word_sentence_sentiment[
        "sentence_idx_global_pos"
    ].astype("int64")
    sentences["sentence_tokenized_global_pos"] = sentences[
        "sentence_tokenized_global_pos"
    ].astype("int64")

    word_sentence_sentiment_with_sentence_info = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    temp_table2["pr_review_sk"] = temp_table2["pr_review_sk"].astype("int32")

    final = word_sentence_sentiment_with_sentence_info.merge(
        temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]],
        how="inner",
        left_on="review_idx_global_pos",
        right_on="pr_review_sk",
    )

    keepcols = ["store_ID", "pr_review_date", "sentence", "sentiment", "word"]
    final = final[keepcols]
    final.columns = [
        "s_name", "r_date", "r_sentence", "sentiment", "sentiment_word"
    ]
    final = final.persist()
    wait(final)
    final = final.sort_values(
        ["s_name", "r_date", "r_sentence", "sentiment_word"])
    final = final.persist()
    wait(final)
    print(len(final))
    return final
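
# A tiny worked example (hypothetical numbers, plain Python) of the per-store
# least-squares slope formula used above,
#   slope = (n*sum(xy) - sum(x)*sum(y)) / (n*sum(xx) - sum(x)**2),
# for points (x, y) = (ss_sold_date_sk, daily ss_net_paid sum).
def _slope_demo():
    xs = [1, 2, 3, 4]           # stand-ins for ss_sold_date_sk
    ys = [10.0, 9.0, 8.0, 7.0]  # stand-ins for the per-day ss_net_paid sums
    n = len(xs)
    sx, sy = sum(xs), sum(ys)
    sxy = sum(x * y for x, y in zip(xs, ys))
    sxx = sum(x * x for x in xs)
    return (n * sxy - sx * sy) / (n * sxx - sx * sx)
# _slope_demo() returns -1.0: steadily declining sales, exactly the kind of
# store the `slope <= 0` filter above selects.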
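
# A minimal sketch (an assumption, not the benchmark's actual helper) of what
# find_relevant_reviews could do for a single cudf partition: scan
# pr_review_content for each lower-cased store name in `targets` and emit one
# row per (matching store name, review), matching the schema of
# temp_table2_meta_empty_df above.
def find_relevant_reviews_sketch(df, targets):
    import cudf

    lowered = df["pr_review_content"].str.lower()
    frames = []
    for target in targets:
        # substring match, no regex, against the lower-cased review text
        hits = df[lowered.str.contains(target, regex=False)]
        if len(hits):
            frame = hits[["pr_review_sk", "pr_review_date"]].copy()
            frame["word"] = target
            frames.append(frame[["word", "pr_review_sk", "pr_review_date"]])
    if not frames:
        # empty partition result with the same schema as the meta DataFrame
        return cudf.DataFrame(
            {
                "word": cudf.Series([], dtype="str"),
                "pr_review_sk": cudf.Series([], dtype="int64"),
                "pr_review_date": cudf.Series([], dtype="str"),
            }
        )
    return cudf.concat(frames, ignore_index=True)
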
Example #2
def main(client, config):
    import cudf

    # benchmark, read_tables, left_semi_join, store_sales_cols and the q17_*
    # constants are assumed to come from the surrounding gpu-bdb query module.

    (
        store_sales_df,
        item_df,
        customer_df,
        store_df,
        date_dim_df,
        customer_address_df,
        promotion_df,
    ) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    # store_sales ss LEFT SEMI JOIN date_dim dd ON ss.ss_sold_date_sk = dd.d_date_sk AND dd.d_year = ${hiveconf:q17_year} AND dd.d_moy = ${hiveconf:q17_month}
    filtered_date_df = date_dim_df.query(
        f"d_year == {q17_year} and d_moy == {q17_month}", meta=date_dim_df._meta
    ).reset_index(drop=True)
    ss_date_join = left_semi_join(
        store_sales_df,
        filtered_date_df,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )
    ss_date_join = ss_date_join[store_sales_cols]

    # LEFT SEMI JOIN item i ON ss.ss_item_sk = i.i_item_sk AND i.i_category IN (${hiveconf:q17_i_category_IN})
    filtered_item_df = item_df.loc[
        item_df["i_category"].isin(q17_i_category_IN)
    ].reset_index(drop=True)
    ss_date_item_join = left_semi_join(
        ss_date_join, filtered_item_df, left_on=["ss_item_sk"], right_on=["i_item_sk"]
    )

    # LEFT SEMI JOIN store s ON ss.ss_store_sk = s.s_store_sk AND s.s_gmt_offset = ${hiveconf:q17_gmt_offset}
    filtered_store_df = store_df.query(
        f"s_gmt_offset == {q17_gmt_offset}", meta=store_df._meta
    ).reset_index(drop=True)

    ss_date_item_store_join = left_semi_join(
        ss_date_item_join,
        filtered_store_df,
        left_on=["ss_store_sk"],
        right_on=["s_store_sk"],
    )

    #    (SELECT c.c_customer_sk FROM customer c LEFT SEMI JOIN customer_address ca
    # ON c.c_current_addr_sk = ca.ca_address_sk AND ca.ca_gmt_offset = ${hiveconf:q17_gmt_offset}
    # ) sub_c

    filtered_customer_address = customer_address_df.query(
        f"ca_gmt_offset == {q17_gmt_offset}"
    ).reset_index(drop=True)

    sub_c = left_semi_join(
        customer_df,
        filtered_customer_address,
        left_on=["c_current_addr_sk"],
        right_on=["ca_address_sk"],
    )

    # sub_c ON ss.ss_customer_sk = sub_c.c_customer_sk

    ss_date_item_store_customer_join = left_semi_join(
        ss_date_item_store_join,
        sub_c,
        left_on=["ss_customer_sk"],
        right_on=["c_customer_sk"],
    )

    # JOIN promotion p ON ss.ss_promo_sk = p.p_promo_sk
    ss_date_item_store_customer_promotion_join = ss_date_item_store_customer_join.merge(
        promotion_df, left_on="ss_promo_sk", right_on="p_promo_sk", how="inner"
    )

    final_df = ss_date_item_store_customer_promotion_join

    # SELECT p_channel_email, p_channel_dmail, p_channel_tv,
    # CASE WHEN (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y')
    # THEN SUM(ss_ext_sales_price) ELSE 0 END as promotional,
    # SUM(ss_ext_sales_price) total
    # ...
    # GROUP BY p_channel_email, p_channel_dmail, p_channel_tv

    ### filling na because `pandas` and `cudf` ignore nulls when grouping stuff
    final_df["p_channel_email"] = final_df["p_channel_email"].fillna("None")
    final_df["p_channel_dmail"] = final_df["p_channel_dmail"].fillna("None")
    final_df["p_channel_tv"] = final_df["p_channel_tv"].fillna("None")

    # SELECT sum(promotional) as promotional, sum(total) as total,
    #   CASE WHEN sum(total) > 0 THEN 100*sum(promotional)/sum(total)
    #                            ELSE 0.0 END as promo_percent

    group_cols = ["p_channel_email", "p_channel_dmail", "p_channel_tv"]

    ### at most 27 (3*3*3) groups across [N, Y, None],
    ### so computing the result is fine
    grouped_df = final_df.groupby(by=group_cols).agg({"ss_ext_sales_price": "sum"})
    grouped_df = grouped_df.compute()
    grouped_df = grouped_df.compute()

    gr_df = grouped_df.reset_index()
    gr_df = gr_df.rename(columns={"ss_ext_sales_price": "total"})
    prom_flag = (
        (gr_df["p_channel_dmail"] == "Y")
        | (gr_df["p_channel_email"] == "Y")
        | (gr_df["p_channel_tv"] == "Y")
    )

    ### CASE WHEN (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y')
    ### THEN SUM(ss_ext_sales_price) ELSE 0 END as promotional,
    gr_df["promotional"] = 0
    gr_df["promotional"][prom_flag] = gr_df["total"][prom_flag]

    total_sum = gr_df["total"].sum()
    prom_sum = gr_df["promotional"].sum()
    # match the CASE WHEN sum(total) > 0 guard above to avoid division by zero
    prom_per = 0.0
    if total_sum > 0:
        prom_per = prom_sum / total_sum * 100

    print("Prom SUM = {}".format(prom_sum))
    print("Prom Per = {}".format(prom_per))
    print("Total SUM = {}".format(total_sum))

    return cudf.DataFrame(
        {"promotional": [prom_sum], "total": [total_sum], "promo_percent": [prom_per]}
    )
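
# A minimal sketch (an assumption, not the benchmark's actual utility) of the
# left_semi_join helper both examples rely on: keep every left-hand row that
# has at least one key match on the right, without duplicating left rows or
# pulling right-hand columns into the result. Works for cudf and dask_cudf.
def left_semi_join_sketch(left, right, left_on, right_on):
    # de-duplicate the right-hand keys so each left row matches at most once
    right_keys = right[right_on].drop_duplicates()
    joined = left.merge(
        right_keys, left_on=left_on, right_on=right_on, how="inner"
    )
    # drop any right-hand key columns the merge may have added
    return joined[list(left.columns)]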