Example #1
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_distinct = """
        SELECT DISTINCT i_category_id, ws_order_number
        FROM web_sales ws, item i
        WHERE ws.ws_item_sk = i.i_item_sk
        AND i.i_category_id IS NOT NULL
    """
    result_distinct = bc.sql(query_distinct)

    result_distinct = result_distinct.persist()
    wait(result_distinct)
    bc.create_table('distinct_table', result_distinct)

    query = f"""
        SELECT category_id_1, category_id_2, COUNT (*) AS cnt
        FROM
        (
            SELECT CAST(t1.i_category_id as BIGINT) AS category_id_1,
                CAST(t2.i_category_id as BIGINT) AS category_id_2
            FROM distinct_table t1
            INNER JOIN distinct_table t2
            ON t1.ws_order_number = t2.ws_order_number
            WHERE t1.i_category_id < t2.i_category_id
        )
        GROUP BY category_id_1, category_id_2
        ORDER BY cnt DESC, category_id_1, category_id_2
        LIMIT {q29_limit}
    """
    result = bc.sql(query)

    bc.drop_table("distinct_table")
    return result
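
Note: Example #1 materializes the DISTINCT result with persist()/wait() before registering it as a table for the self-join. Below is a minimal standalone sketch of that persist/wait pattern using plain dask.dataframe and a local client; the dict standing in for bc.create_table and the column values are illustrative assumptions, not benchmark code.

import pandas as pd
import dask.dataframe as dd
from dask.distributed import Client, wait

client = Client(processes=False)  # local, threaded client for the sketch
ddf = dd.from_pandas(
    pd.DataFrame({"i_category_id": [1, 1, 2], "ws_order_number": [10, 10, 11]}),
    npartitions=2,
)
distinct_ddf = ddf.drop_duplicates()
distinct_ddf = distinct_ddf.persist()      # kick off computation in the background
wait(distinct_ddf)                         # block until every partition is materialized
tables = {"distinct_table": distinct_ddf}  # stand-in for bc.create_table("distinct_table", ...)
client.close()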
Example #2
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """
        WITH p AS
        (
            SELECT
                pr_item_sk,
                count(pr_item_sk) AS r_count,
                AVG( CAST(pr_review_rating AS DOUBLE) ) avg_rating  
            FROM product_reviews
            WHERE pr_item_sk IS NOT NULL
            GROUP BY pr_item_sk
        ), s AS
        (
            SELECT
                ws_item_sk
            FROM web_sales ws
            INNER JOIN date_dim d ON ws.ws_sold_date_sk = d.d_date_sk
            WHERE ws_item_sk IS NOT null
            AND CAST(d.d_date AS DATE) >= DATE '2003-01-02'
            AND CAST(d.d_date AS DATE) <= DATE '2003-02-02'
            GROUP BY ws_item_sk
        )
        SELECT p.r_count    AS x,
            p.avg_rating AS y
        FROM s INNER JOIN p ON p.pr_item_sk = s.ws_item_sk
    """

    result = bc.sql(query)
    sales_corr = result["x"].corr(result["y"]).compute()
    result_df = cudf.DataFrame([sales_corr])
    result_df.columns = ["corr(CAST(reviews_count AS DOUBLE), avg_rating)"]
    return result_df
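
Note: result["x"].corr(result["y"]) above computes a Pearson correlation across the Dask partitions. A small plain-Python sketch of the same arithmetic, on made-up values, for reference:

import math

x = [3, 5, 2, 8]          # e.g. review counts (illustrative values)
y = [4.0, 4.5, 3.0, 4.8]  # e.g. average ratings (illustrative values)
n = len(x)
mean_x, mean_y = sum(x) / n, sum(y) / n
cov = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
std_x = math.sqrt(sum((xi - mean_x) ** 2 for xi in x))
std_y = math.sqrt(sum((yi - mean_y) ** 2 for yi in y))
print(cov / (std_x * std_y))  # Pearson r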
Example #3
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT *
        FROM
        (
            SELECT
                cat,
                ( (count(x) * SUM(xy) - SUM(x) * SUM(y)) / (count(x) * SUM(xx) - SUM(x) * SUM(x)) )  AS slope,
                (SUM(y) - ((count(x) * SUM(xy) - SUM(x) * SUM(y)) / (count(x) * SUM(xx) - SUM(x)*SUM(x)) ) * SUM(x)) / count(x) AS intercept
            FROM
            (
                SELECT
                    i.i_category_id AS cat,
                    s.ss_sold_date_sk AS x,
                    CAST(SUM(s.ss_net_paid) AS DOUBLE) AS y,
                    CAST(s.ss_sold_date_sk * SUM(s.ss_net_paid) AS DOUBLE) AS xy,
                    CAST(s.ss_sold_date_sk * s.ss_sold_date_sk AS DOUBLE) AS xx
                FROM store_sales s
                INNER JOIN item i ON s.ss_item_sk = i.i_item_sk
                INNER JOIN date_dim d ON s.ss_sold_date_sk = d.d_date_sk
                WHERE s.ss_store_sk = {q15_store_sk}
                AND i.i_category_id IS NOT NULL
                AND CAST(d.d_date AS DATE) >= DATE '{q15_startDate}'
                AND   CAST(d.d_date AS DATE) <= DATE '{q15_endDate}'
                GROUP BY i.i_category_id, s.ss_sold_date_sk
            ) temp
            GROUP BY cat
        ) regression
        WHERE slope <= 0.0
        ORDER BY cat
    """
    result = bc.sql(query)
    return result
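
Note: the regression subquery in Example #3 is an ordinary least-squares fit of daily net-paid totals (y) against the sold-date key (x), computed per category from SUM/COUNT aggregates. A minimal sketch of the same slope/intercept arithmetic in plain Python, with assumed example values:

x = [1.0, 2.0, 3.0, 4.0]   # e.g. ss_sold_date_sk values (illustrative)
y = [2.1, 3.9, 6.2, 7.8]   # e.g. summed ss_net_paid per day (illustrative)
n = len(x)
sum_x, sum_y = sum(x), sum(y)
sum_xy = sum(xi * yi for xi, yi in zip(x, y))
sum_xx = sum(xi * xi for xi in x)

slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x)
intercept = (sum_y - slope * sum_x) / n
print(slope, intercept)    # categories with slope <= 0 would pass the outer filter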
Example #4
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_distinct = f"""
        SELECT DISTINCT ss_item_sk, ss_ticket_number
        FROM store_sales s, item i
        WHERE s.ss_item_sk = i.i_item_sk
        AND i.i_category_id IN ({q01_i_category_id_IN})
        AND s.ss_store_sk IN ({q01_ss_store_sk_IN})
    """
    result_distinct = bc.sql(query_distinct)

    bc.create_table("distinct_table", result_distinct)

    query = f"""
        SELECT item_sk_1, item_sk_2, COUNT(*) AS cnt
        FROM
        (
            SELECT CAST(t1.ss_item_sk as BIGINT) AS item_sk_1,
                CAST(t2.ss_item_sk AS BIGINT) AS item_sk_2
            FROM distinct_table t1
            INNER JOIN distinct_table t2
            ON t1.ss_ticket_number = t2.ss_ticket_number
            WHERE t1.ss_item_sk < t2.ss_item_sk
        )
        GROUP BY item_sk_1, item_sk_2
        HAVING  COUNT(*) > {q01_viewed_together_count}
        ORDER BY cnt DESC, CAST(item_sk_1 AS VARCHAR),
                 CAST(item_sk_2 AS VARCHAR)
        LIMIT {q01_limit}
    """
    result = bc.sql(query)
    return result
Example #5
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT DISTINCT wcs_user_sk
        FROM
        (
            SELECT DISTINCT
                wcs_user_sk,
                wcs_click_date_sk
            FROM web_clickstreams, item
            WHERE wcs_click_date_sk BETWEEN 37134 AND 37164
            AND i_category IN ({q12_i_category_IN})
            AND wcs_item_sk = i_item_sk
            AND wcs_user_sk IS NOT NULL
            AND wcs_sales_sk IS NULL
        ) webInRange,
        (
            SELECT DISTINCT
                ss_customer_sk,
                ss_sold_date_sk
            FROM store_sales, item
            WHERE ss_sold_date_sk BETWEEN 37134 AND 37224
            AND i_category IN ({q12_i_category_IN}) -- filter given category
            AND ss_item_sk = i_item_sk
            AND ss_customer_sk IS NOT NULL
        ) storeInRange
        WHERE wcs_user_sk = ss_customer_sk
        AND wcs_click_date_sk < ss_sold_date_sk
        ORDER BY wcs_user_sk
    """
    result = bc.sql(query)
    return result
Example #6
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    date = datetime.datetime(2001, 3, 16)
    start = (date + timedelta(days=-30)).strftime("%Y-%m-%d")
    end = (date + timedelta(days=30)).strftime("%Y-%m-%d")
    mid = date.strftime("%Y-%m-%d")

    date_query = f"""
        SELECT d_date_sk 
        FROM date_dim 
        WHERE CAST(d_date as DATE) IN (DATE '{start}', DATE '{mid}', DATE '{end}') 
        ORDER BY CAST(d_date as date) ASC
    """

    dates = bc.sql(date_query)

    cpu_dates = dates["d_date_sk"].compute().to_pandas()
    cpu_dates.index = list(range(0, cpu_dates.shape[0]))

    last_query = f"""
        SELECT w_state, i_item_id,
        SUM
        (
            CASE WHEN ws_sold_date_sk < {str(cpu_dates[1])}
            THEN ws_sales_price - COALESCE(wr_refunded_cash,0)
            ELSE 0.0 END
        ) AS sales_before,
        SUM
        (
            CASE WHEN ws_sold_date_sk >= {str(cpu_dates[1])}
            THEN ws_sales_price - COALESCE(wr_refunded_cash,0)
            ELSE 0.0 END
        ) AS sales_after
        FROM 
        (
            SELECT ws_item_sk, 
                ws_warehouse_sk, 
                ws_sold_date_sk, 
                ws_sales_price, 
                wr_refunded_cash
            FROM web_sales ws
            LEFT OUTER JOIN web_returns wr ON 
            (
                ws.ws_order_number = wr.wr_order_number
                AND ws.ws_item_sk = wr.wr_item_sk
            )
            WHERE ws_sold_date_sk BETWEEN {str(cpu_dates[0])}
            AND {str(cpu_dates[2])}
        ) a1
        JOIN item i ON a1.ws_item_sk = i.i_item_sk
        JOIN warehouse w ON a1.ws_warehouse_sk = w.w_warehouse_sk
        GROUP BY w_state,i_item_id 
        ORDER BY w_state,i_item_id
        LIMIT 100
    """

    result = bc.sql(last_query)
    return result
Example #7
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_1 = """
        SELECT i_item_sk,
            CAST(i_category_id AS TINYINT) AS i_category_id
        FROM item
    """
    item_df = bc.sql(query_1)

    item_df = item_df.persist()
    wait(item_df)
    bc.create_table("item_df", item_df)

    query_2 = """
        SELECT wcs_user_sk,
            (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec,
            i_category_id
        FROM web_clickstreams wcs, item_df i
        WHERE wcs.wcs_item_sk = i.i_item_sk
        AND i.i_category_id IS NOT NULL
        AND wcs.wcs_user_sk IS NOT NULL
        ORDER BY wcs.wcs_user_sk, tstamp_inSec, i_category_id
    """
    merged_df = bc.sql(query_2)

    bc.drop_table("item_df")
    del item_df

    distinct_session_df = merged_df.map_partitions(
        get_distinct_sessions,
        keep_cols=["wcs_user_sk", "i_category_id"],
        time_out=q30_session_timeout_inSec)

    del merged_df
    pair_df = distinct_session_df.map_partitions(get_pairs,
                                                 pair_col="i_category_id",
                                                 output_col_1="category_id_1",
                                                 output_col_2="category_id_2")
    del distinct_session_df

    pair_df = pair_df.persist()
    wait(pair_df)
    bc.create_table('pair_df', pair_df)

    last_query = f"""
        SELECT CAST(category_id_1 AS BIGINT) AS category_id_1,
            CAST(category_id_2 AS BIGINT) AS category_id_2,
            COUNT(category_id_2) AS cnt
        FROM pair_df
        GROUP BY category_id_1, category_id_2
        ORDER BY cnt desc
        LIMIT {q30_limit}
    """
    result = bc.sql(last_query)

    bc.drop_table("pair_df")
    return result
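
Note: get_pairs above is a helper defined elsewhere in the benchmark. The following is only a hypothetical, simplified pandas stand-in that illustrates the idea (self-merge on the session key, keep each ordered category pair once); it is not the benchmark's implementation.

import pandas as pd

def make_pairs(df, session_col="session_id", item_col="i_category_id"):
    # self-merge within each session, then keep each unordered pair once
    pairs = df.merge(df, on=session_col, suffixes=("_1", "_2"))
    pairs = pairs[pairs[item_col + "_1"] < pairs[item_col + "_2"]]
    return pairs[[item_col + "_1", item_col + "_2"]].rename(
        columns={item_col + "_1": "category_id_1", item_col + "_2": "category_id_2"})

example = pd.DataFrame({"session_id": [1, 1, 1, 2], "i_category_id": [3, 5, 7, 3]})
print(make_pairs(example))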
Example #8
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_web_page = """
        SELECT wp_type, wp_web_page_sk
        FROM web_page_wo_categorical
    """
    wp = bc.sql(query_web_page)

    # Convert wp_type to categorical and get the category codes of the dynamic and order types
    wp["wp_type"] = wp["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = wp["wp_type"].compute().cat.categories.to_pandas()
    DYNAMIC_CAT_CODE = cpu_categories.get_loc("dynamic")
    ORDER_CAT_CODE = cpu_categories.get_loc("order")

    # ### cast to minimum viable dtype
    import cudf
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))
    wp["wp_type_codes"] = wp["wp_type"].cat.codes.astype(codes_min_signed_type)
    wp["wp_type"] = wp["wp_type"].cat.codes.astype(codes_min_signed_type)
    cols_2_keep = ["wp_web_page_sk", "wp_type_codes"]
    wp = wp[cols_2_keep]

    wp = wp.persist()
    wait(wp)
    bc.create_table('web_page', wp)

    query = """
        SELECT
            c.wcs_user_sk,
            w.wp_type_codes,
            (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec
        FROM web_clickstreams c, web_page w
        WHERE c.wcs_web_page_sk = w.wp_web_page_sk
        AND   c.wcs_web_page_sk IS NOT NULL
        AND   c.wcs_user_sk     IS NOT NULL
        AND   c.wcs_sales_sk    IS NULL --abandoned implies: no sale
        ORDER BY wcs_user_sk, tstamp_inSec
    """
    merged_df = bc.sql(query)

    keep_cols = ["wcs_user_sk", "wp_type_codes", "tstamp_inSec"]
    result_df = merged_df.map_partitions(reduction_function, keep_cols,
                                         DYNAMIC_CAT_CODE, ORDER_CAT_CODE)

    result = result_df["pagecount"].sum() / result_df["count"].sum()
    # Persist before computing to ensure scalar transfer only on compute
    result = result.persist()

    result = result.compute()
    result_df = cudf.DataFrame({"sum(pagecount)/count(*)": [result]})
    bc.drop_table("web_page")
    return result_df
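
Note: a minimal pandas sketch of the categorical-codes trick used above (convert the string column to a category dtype, look up the integer codes of specific labels, keep only the compact codes). The cudf API mirrors this; the values below are made up.

import pandas as pd

wp = pd.DataFrame({"wp_type": ["review", "dynamic", "order", "dynamic"]})
wp["wp_type"] = wp["wp_type"].astype("category")
categories = wp["wp_type"].cat.categories
DYNAMIC_CAT_CODE = categories.get_loc("dynamic")
ORDER_CAT_CODE = categories.get_loc("order")
# int8 is enough here; the snippet above picks the smallest signed type that fits
wp["wp_type_codes"] = wp["wp_type"].cat.codes.astype("int8")
print(DYNAMIC_CAT_CODE, ORDER_CAT_CODE)
print(wp[["wp_type_codes"]])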
Example #9
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        WITH concat_table AS
        (
            (
                SELECT
                    ss_customer_sk AS cid,
                    count(distinct ss_ticket_number) AS frequency,
                    max(ss_sold_date_sk) AS most_recent_date,
                    CAST( SUM(ss_net_paid) AS DOUBLE) AS amount
                FROM store_sales ss
                JOIN date_dim d ON ss.ss_sold_date_sk = d.d_date_sk
                WHERE CAST(d.d_date AS DATE) > DATE '{q25_date}'
                AND ss_customer_sk IS NOT NULL
                GROUP BY ss_customer_sk
            ) union all
            (
                SELECT
                    ws_bill_customer_sk AS cid,
                    count(distinct ws_order_number) AS frequency,
                    max(ws_sold_date_sk)   AS most_recent_date,
                    CAST( SUM(ws_net_paid) AS DOUBLE) AS amount
                FROM web_sales ws
                JOIN date_dim d ON ws.ws_sold_date_sk = d.d_date_sk
                WHERE CAST(d.d_date AS DATE) > DATE '{q25_date}'
                AND ws_bill_customer_sk IS NOT NULL
                GROUP BY ws_bill_customer_sk
            )
        )
        SELECT
            cid AS cid,
            CASE WHEN 37621 - max(most_recent_date) < 60 THEN 1.0
                ELSE 0.0 END AS recency, -- 37621 == 2003-01-02
            CAST( SUM(frequency) AS BIGINT) AS frequency, --total frequency
            CAST( SUM(amount) AS DOUBLE)    AS amount --total amount
        FROM concat_table
        GROUP BY cid
        ORDER BY cid
    """
    cluster_input_ddf = bc.sql(query)

    # Prepare df for KMeans clustering
    cluster_input_ddf["recency"] = cluster_input_ddf["recency"].astype("int64")

    cluster_input_ddf = cluster_input_ddf.repartition(npartitions=1)
    cluster_input_ddf = cluster_input_ddf.persist()
    cluster_input_ddf = cluster_input_ddf.set_index('cid')
    results_dict = get_clusters(client=client, ml_input_df=cluster_input_ddf)

    return results_dict
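
Note: get_clusters is a helper defined elsewhere; in the benchmark it runs GPU k-means over the recency/frequency/amount features. The sketch below is only a hypothetical CPU stand-in using scikit-learn, with made-up feature values and an assumed n_clusters=2, to show the shape of that step.

import pandas as pd
from sklearn.cluster import KMeans

features = pd.DataFrame({
    "recency":   [1, 0, 1, 0],
    "frequency": [5, 2, 9, 1],
    "amount":    [120.0, 30.5, 400.0, 9.99],
})
model = KMeans(n_clusters=2, random_state=0, n_init=10).fit(features)
print(model.labels_, model.inertia_)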
Example #10
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_1 = """
        SELECT i_item_sk,
            CAST(i_category_id AS TINYINT) AS i_category_id
        FROM item
    """
    item_df = bc.sql(query_1)

    bc.create_table("item_df", item_df)

    query_2 = """
        SELECT CAST(w.wcs_user_sk AS INTEGER) as wcs_user_sk,
            wcs_click_date_sk * 86400 + wcs_click_time_sk AS tstamp,
            CAST(w.wcs_item_sk AS INTEGER) as wcs_item_sk,
            COALESCE(w.wcs_sales_sk, 0) as wcs_sales_sk,
            i.i_category_id
        FROM web_clickstreams AS w
        INNER JOIN item_df AS i ON w.wcs_item_sk = i.i_item_sk
        WHERE w.wcs_user_sk IS NOT NULL
        AND w.wcs_item_sk IS NOT NULL
        ORDER BY w.wcs_user_sk
    """
    merged_df = bc.sql(query_2)

    query_3 = f"""
        SELECT i_item_sk, i_category_id
        FROM item_df
        WHERE i_category_id IN ({q03_purchased_item_category_IN})
    """
    item_df_filtered = bc.sql(query_3)

    product_view_results = merged_df.map_partitions(
        apply_find_items_viewed, item_mappings=item_df_filtered
    )
    del merged_df

    bc.create_table('product_result', product_view_results)

    last_query = f"""
        SELECT CAST({q03_purchased_item_IN} AS BIGINT) AS purchased_item,
            i_item_sk AS lastviewed_item,
            COUNT(i_item_sk) AS cnt
        FROM product_result
        GROUP BY i_item_sk
        ORDER BY purchased_item, cnt desc, lastviewed_item
        LIMIT {q03_limit}
    """
    result = bc.sql(last_query)
    return result
Example #11
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_1 = """
        SELECT
            CAST(wcs_user_sk AS INTEGER) AS wcs_user_sk,
            CAST(wcs_item_sk AS INTEGER) AS wcs_item_sk,
            (wcs_click_date_sk * 86400 + wcs_click_time_sk) AS tstamp_inSec
        FROM web_clickstreams
        WHERE wcs_item_sk IS NOT NULL
        AND   wcs_user_sk IS NOT NULL
        ORDER BY wcs_user_sk
    """
    wcs_result = bc.sql(query_1)

    session_df = wcs_result.map_partitions(
        get_distinct_sessions,
        keep_cols=["wcs_user_sk", "wcs_item_sk"],
        time_out=q02_session_timeout_inSec,
    )
    del wcs_result

    session_df = session_df.persist()
    wait(session_df)
    bc.create_table('session_df', session_df)

    last_query = f"""
        WITH item_df AS (
            SELECT wcs_user_sk, session_id
            FROM session_df
            WHERE wcs_item_sk = {q02_item_sk}
        )
        SELECT sd.wcs_item_sk as item_sk_1,
            count(sd.wcs_item_sk) as cnt
        FROM session_df sd
        INNER JOIN item_df id
        ON sd.wcs_user_sk = id.wcs_user_sk
        AND sd.session_id = id.session_id
        AND sd.wcs_item_sk <> {q02_item_sk}
        GROUP BY sd.wcs_item_sk
        ORDER BY cnt desc
        LIMIT {q02_limit}
    """
    result = bc.sql(last_query)
    result["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result = result[result_order]

    del session_df
    bc.drop_table("session_df")
    return result
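
Note: get_distinct_sessions is defined elsewhere in the benchmark. A hypothetical, simplified sessionization sketch in the same spirit (start a new session when the gap between consecutive clicks of a user exceeds the timeout); the 3600 s timeout and the click data are illustrative only.

import pandas as pd

def sessionize(df, time_out=3600):
    df = df.sort_values(["wcs_user_sk", "tstamp_inSec"])
    new_user = df["wcs_user_sk"].ne(df["wcs_user_sk"].shift())
    big_gap = df["tstamp_inSec"].diff().gt(time_out)
    df["session_id"] = (new_user | big_gap).cumsum()  # new id at each boundary
    return df

clicks = pd.DataFrame({"wcs_user_sk": [1, 1, 1, 2],
                       "wcs_item_sk": [10, 11, 12, 10],
                       "tstamp_inSec": [0, 100, 10000, 50]})
print(sessionize(clicks))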
Example #12
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """
		WITH temp_table as 
		(
			SELECT
				i_item_sk, 
				imp_sk,
				(imp_competitor_price - i_current_price) / i_current_price AS price_change,
				imp_start_date, 
				(imp_end_date - imp_start_date) AS no_days_comp_price
			FROM item i ,item_marketprices imp 
			WHERE i.i_item_sk = imp.imp_item_sk
			AND i.i_item_sk = 10000
			ORDER BY i_item_sk, imp_sk, imp_start_date
		)
		SELECT ws_item_sk,
		-- avg ( (current_ss_quant + current_ws_quant - prev_ss_quant - prev_ws_quant) / ((prev_ss_quant + prev_ws_quant) * ws.price_change) ) -- single node
			sum( (current_ss_quant+current_ws_quant-prev_ss_quant-prev_ws_quant) / (prev_ss_quant*ws.price_change+prev_ws_quant*ws.price_change) ) 
			/ count( (current_ss_quant + current_ws_quant - prev_ss_quant - prev_ws_quant) / ((prev_ss_quant + prev_ws_quant) * ws.price_change) ) AS cross_price_elasticity
		FROM
		( 
			SELECT
				ws_item_sk,
				imp_sk,
				price_change,
				SUM( CASE WHEN ( (ws_sold_date_sk >= c.imp_start_date) AND (ws_sold_date_sk < (c.imp_start_date + c.no_days_comp_price))) THEN ws_quantity ELSE 0 END ) AS current_ws_quant,
				SUM( CASE WHEN ( (ws_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price)) AND (ws_sold_date_sk < c.imp_start_date)) THEN ws_quantity ELSE 0 END ) AS prev_ws_quant
			FROM web_sales ws
			JOIN temp_table c ON ws.ws_item_sk = c.i_item_sk
			GROUP BY ws_item_sk, imp_sk, price_change
		) ws JOIN
		( 
			SELECT
				ss_item_sk,
				imp_sk,
				price_change,
				SUM( CASE WHEN ((ss_sold_date_sk >= c.imp_start_date) AND (ss_sold_date_sk < (c.imp_start_date + c.no_days_comp_price))) THEN ss_quantity ELSE 0 END) AS current_ss_quant,
				SUM( CASE WHEN ((ss_sold_date_sk >= (c.imp_start_date - c.no_days_comp_price)) AND (ss_sold_date_sk < c.imp_start_date)) THEN ss_quantity ELSE 0 END) AS prev_ss_quant
			FROM store_sales ss
			JOIN temp_table c ON c.i_item_sk = ss.ss_item_sk
			GROUP BY ss_item_sk, imp_sk, price_change
		) ss
		ON (ws.ws_item_sk = ss.ss_item_sk and ws.imp_sk = ss.imp_sk)
		GROUP BY  ws.ws_item_sk
  	"""

    result = bc.sql(query)
    return result
Example #13
def main(client, config):
    q_st = time.time()
    product_reviews_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    product_reviews_df = product_reviews_df[
        product_reviews_df["pr_review_content"].notnull()
    ]

    # 90/10 train/test split
    train_data, test_data = product_reviews_df.random_split([0.9, 0.10])

    train_data = train_data.reset_index(drop=True)
    test_data = test_data.reset_index(drop=True)
    del product_reviews_df

    final_data, acc, prec, cmat = post_etl_processing(
        client=client, train_data=train_data, test_data=test_data
    )
    payload = {
        "df": final_data,
        "acc": acc,
        "prec": prec,
        "cmat": cmat,
        "output_type": "supervised",
    }
    return payload
Example #14
def main(client, config):

    import cudf

    model_path = os.path.join(config["data_dir"], "../../q27_model_dir")
    product_reviews_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    product_reviews_df = product_reviews_df[product_reviews_df.pr_item_sk ==
                                            q27_pr_item_sk].persist()

    meta_d = {
        "review_sk": np.ones(1, dtype=np.int64),
        "item_sk": np.ones(1, dtype=np.int64),
        "company_name": "",
        "review_sentence": "",
    }
    meta_df = cudf.DataFrame(meta_d)
    output_df = product_reviews_df.map_partitions(run_single_part_workflow,
                                                  model_path,
                                                  meta=meta_df)
    output_df = output_df.persist()
    wait(output_df)
    client.run(del_model_attribute)
    return output_df
Example #15
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    # 10% of data
    query1 = """
        SELECT
            pr_review_sk,
            pr_review_rating,
            pr_review_content
        FROM product_reviews
        WHERE mod(pr_review_sk, 10) IN (0)
        AND pr_review_content IS NOT NULL
        -- in the near future we want to use ORDER BY again
        --ORDER BY pr_review_sk
    """
    test_data = bc.sql(query1)
    # in the near future we want to reuse ORDER BY instead of bc.partition()
    test_data = bc.partition(test_data, by=["pr_review_sk"])

    # 90% of data
    query2 = """
        SELECT
            pr_review_sk,
            pr_review_rating,
            pr_review_content
        FROM product_reviews
        WHERE mod(pr_review_sk, 10) IN (1,2,3,4,5,6,7,8,9)
        AND pr_review_content IS NOT NULL
        --ORDER BY pr_review_sk
    """
    train_data = bc.sql(query2)
    # in the near future we want to reuse ORDER BY instead of bc.partition()
    train_data = bc.partition(train_data, by=["pr_review_sk"])

    final_data, acc, prec, cmat = post_etl_processing(client=client,
                                                      train_data=train_data,
                                                      test_data=test_data)

    payload = {
        "df": final_data,
        "acc": acc,
        "prec": prec,
        "cmat": cmat,
        "output_type": "supervised",
    }

    return payload
Example #16
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query_date = f"""
        select min(d_date_sk) as min_d_date_sk,
            max(d_date_sk) as max_d_date_sk
        from date_dim
        where d_year = {q17_year}
        and d_moy = {q17_month}
    """
    dates_result = bc.sql(query_date).compute()

    min_date_sk_val = dates_result["min_d_date_sk"][0]
    max_date_sk_val = dates_result["max_d_date_sk"][0]

    query = f"""
        SELECT sum(promotional) as promotional,
            sum(total) as total,
            CASE WHEN sum(total) > 0.0 THEN (100.0 * sum(promotional)) / sum(total)
                ELSE 0.0 END as promo_percent
        FROM
        (
            SELECT p_channel_email,
                p_channel_dmail,
                p_channel_tv,
                SUM( CAST(ss_ext_sales_price AS DOUBLE) ) total,
                CASE WHEN (p_channel_dmail = 'Y' OR p_channel_email = 'Y' OR p_channel_tv = 'Y')
                    THEN SUM(CAST(ss_ext_sales_price AS DOUBLE)) ELSE 0 END as promotional
            FROM store_sales ss
            INNER JOIN promotion p ON ss.ss_promo_sk = p.p_promo_sk
            inner join item i on ss.ss_item_sk = i.i_item_sk
            inner join store s on ss.ss_store_sk = s.s_store_sk
            inner join customer c on c.c_customer_sk = ss.ss_customer_sk
            inner join customer_address ca
            on c.c_current_addr_sk = ca.ca_address_sk
            WHERE i.i_category IN ({q17_i_category_IN})
            AND s.s_gmt_offset = {q17_gmt_offset}
            AND ca.ca_gmt_offset = {q17_gmt_offset}
            AND ss.ss_sold_date_sk >= {min_date_sk_val}
            AND ss.ss_sold_date_sk <= {max_date_sk_val}
            GROUP BY p_channel_email, p_channel_dmail, p_channel_tv
        ) sum_promotional
        -- we don't need an 'ON' join condition; the result is just two numbers.
    """
    result = bc.sql(query)
    return result
Example #17
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT
            w_warehouse_name,
            i_item_id,
            SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp))
                / 1000000 < 0 THEN inv_quantity_on_hand ELSE 0 END) AS inv_before,
            SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp))
                / 1000000 >= 0 THEN inv_quantity_on_hand ELSE 0 END) AS inv_after
        FROM
            inventory inv,
            item i,
            warehouse w,
            date_dim d
        WHERE i_current_price BETWEEN {q22_i_current_price_min} AND {q22_i_current_price_max}
        AND i_item_sk        = inv_item_sk
        AND inv_warehouse_sk = w_warehouse_sk
        AND inv_date_sk      = d_date_sk
        AND timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 >= -30
        AND timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 <= 30
        GROUP BY w_warehouse_name, i_item_id
        HAVING SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp))
            / 1000000 < 0 THEN inv_quantity_on_hand ELSE 0 END) > 0
        AND
        (
            CAST(
            SUM (CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 >= 0 THEN inv_quantity_on_hand ELSE 0 END) AS DOUBLE)
            / CAST( SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 < 0 THEN inv_quantity_on_hand ELSE 0 END)
            AS DOUBLE) >= 0.666667
        )
        AND
        (
            CAST(
            SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 >= 0 THEN inv_quantity_on_hand ELSE 0 END) AS DOUBLE)
            / CAST ( SUM(CASE WHEN timestampdiff(DAY, timestamp '{q22_date} 00:00:00', CAST(d_date || ' 00:00:00' AS timestamp)) / 1000000 < 0 THEN inv_quantity_on_hand ELSE 0 END)
         AS DOUBLE) <= 1.50
        )
        ORDER BY w_warehouse_name, i_item_id
        LIMIT 100
    """
    result = bc.sql(query)
    return result
Example #18
def main(client, config):
    import cudf

    wp, wcs_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    ### downcasting the column in line with q03
    wcs_df["wcs_user_sk"] = wcs_df["wcs_user_sk"].astype("int32")

    f_wcs_df = wcs_df[wcs_df["wcs_web_page_sk"].notnull()
                      & wcs_df["wcs_user_sk"].notnull()
                      & wcs_df["wcs_sales_sk"].isnull()].reset_index(drop=True)

    f_wcs_df["tstamp_inSec"] = (f_wcs_df["wcs_click_date_sk"] * 24 * 60 * 60 +
                                f_wcs_df["wcs_click_time_sk"])
    keep_cols = ["wcs_user_sk", "tstamp_inSec", "wcs_web_page_sk"]
    f_wcs_df = f_wcs_df[keep_cols]

    f_wcs_df = f_wcs_df.repartition(columns=["wcs_user_sk"])

    # Convert wp_type to categorical and get the category codes of the dynamic and order types
    wp["wp_type"] = wp["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = wp["wp_type"].compute().cat.categories.to_pandas()
    DYNAMIC_CAT_CODE = cpu_categories.get_loc("dynamic")
    ORDER_CAT_CODE = cpu_categories.get_loc("order")
    # ### cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))
    wp["wp_type_codes"] = wp["wp_type"].cat.codes.astype(codes_min_signed_type)
    cols_2_keep = ["wp_web_page_sk", "wp_type_codes"]
    wp = wp[cols_2_keep]

    # Continue remaining workflow with wp_type as category codes
    merged_df = f_wcs_df.merge(wp,
                               left_on="wcs_web_page_sk",
                               right_on="wp_web_page_sk",
                               how="inner")
    merged_df = merged_df[["wcs_user_sk", "tstamp_inSec", "wp_type_codes"]]

    keep_cols = ["wcs_user_sk", "wp_type_codes", "tstamp_inSec"]
    result_df = merged_df.map_partitions(reduction_function, keep_cols,
                                         DYNAMIC_CAT_CODE, ORDER_CAT_CODE)

    result = result_df["pagecount"].sum() / result_df["count"].sum()
    # Persist before computing to ensure scalar transfer only on compute
    result = result.persist()

    result = result.compute()
    result_df = cudf.DataFrame({"sum(pagecount)/count(*)": [result]})
    return result_df
Example #19
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """
		WITH temp_table as 
		(
			SELECT k.i_item_sk
			FROM item k,
			(
				SELECT i_category, 
					SUM(j.i_current_price) / COUNT(j.i_current_price) * 1.2 AS avg_price
				FROM item j
				GROUP BY j.i_category
			) avgCategoryPrice
			WHERE avgCategoryPrice.i_category = k.i_category
			AND k.i_current_price > avgCategoryPrice.avg_price 
		)
		SELECT ca_state, COUNT(*) AS cnt
		FROM
			customer_address a,
			customer c,
			store_sales s,
			temp_table highPriceItems
		WHERE a.ca_address_sk = c.c_current_addr_sk
		AND c.c_customer_sk = s.ss_customer_sk
		AND ca_state IS NOT NULL
		AND ss_item_sk = highPriceItems.i_item_sk
		AND s.ss_sold_date_sk IN
		( 
			SELECT d_date_sk
			FROM date_dim
			WHERE d_year = 2004
			AND d_moy = 7
		)
		GROUP BY ca_state
		HAVING COUNT(*) >= 10
		ORDER BY cnt DESC, ca_state
		LIMIT 10
	"""

    result = bc.sql(query)
    return result
Example #20
def main(client, config):

    item_df, ws_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    ### shuffle on ws_order_number so all rows for an order land in the same partition
    ws_df = ws_df.shuffle(on=["ws_order_number"])
    ### at sf-100k we will have at most ~17M rows; 17M rows with 2 columns fit comfortably in 1 partition
    item_df = item_df.repartition(npartitions=1)

    # SELECT DISTINCT i_category_id, ws_order_number
    # FROM web_sales ws, item i
    # WHERE ws.ws_item_sk = i.i_item_sk
    # AND i.i_category_id IS NOT NULL

    f_item_df = item_df[item_df["i_category_id"].notnull()]
    ### done this way to retain the `ws_order_number` partition boundary after the merge
    ws_item_join = ws_df.merge(f_item_df,
                               left_on=["ws_item_sk"],
                               right_on=["i_item_sk"])
    ws_item_join = ws_item_join[["i_category_id", "ws_order_number"]]
    ws_item_join = ws_item_join.map_partitions(lambda df: df.drop_duplicates())

    ### do pair inner join
    ### pair_df = get_pairs(ws_item_join)
    ### because of the shuffle above we can do it per partition with map_partitions,
    ### which has better memory and scaling properties at larger scale factors
    pair_df = ws_item_join.map_partitions(get_pairs)

    # SELECT category_id_1, category_id_2, COUNT (*) AS cnt
    # FROM (
    #  ...
    #    )
    # GROUP BY category_id_1, category_id_2
    # ORDER BY cnt DESC, category_id_1, category_id_2
    # LIMIT {q29_limit}
    grouped_df = pair_df.groupby(["category_id_1",
                                  "category_id_2"]).size().persist()

    ### 36 rows after filtration at sf-100
    ### should scale till sf-100k
    grouped_df = grouped_df.reset_index().compute()
    grouped_df.columns = ["category_id_1", "category_id_2", "cnt"]
    grouped_df["category_id_1"] = grouped_df["category_id_1"]
    grouped_df["category_id_2"] = grouped_df["category_id_2"]
    grouped_df = grouped_df.sort_values(
        by=["cnt", "category_id_1", "category_id_2"],
        ascending=[False, True, True]).reset_index(drop=True)
    grouped_df = grouped_df.head(q29_limit)

    return grouped_df
Example #21
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = f"""
        SELECT
            ss.ss_customer_sk AS cid,
            CAST( count(CASE WHEN i.i_class_id=1  THEN 1 ELSE NULL END) AS DOUBLE ) AS id1,
            CAST( count(CASE WHEN i.i_class_id=2  THEN 1 ELSE NULL END) AS DOUBLE ) AS id2,
            CAST( count(CASE WHEN i.i_class_id=3  THEN 1 ELSE NULL END) AS DOUBLE ) AS id3,
            CAST( count(CASE WHEN i.i_class_id=4  THEN 1 ELSE NULL END) AS DOUBLE ) AS id4,
            CAST( count(CASE WHEN i.i_class_id=5  THEN 1 ELSE NULL END) AS DOUBLE ) AS id5,
            CAST( count(CASE WHEN i.i_class_id=6  THEN 1 ELSE NULL END) AS DOUBLE ) AS id6,
            CAST( count(CASE WHEN i.i_class_id=7  THEN 1 ELSE NULL END) AS DOUBLE ) AS id7,
            CAST( count(CASE WHEN i.i_class_id=8  THEN 1 ELSE NULL END) AS DOUBLE ) AS id8,
            CAST( count(CASE WHEN i.i_class_id=9  THEN 1 ELSE NULL END) AS DOUBLE ) AS id9,
            CAST( count(CASE WHEN i.i_class_id=10 THEN 1 ELSE NULL END) AS DOUBLE ) AS id10,
            CAST( count(CASE WHEN i.i_class_id=11 THEN 1 ELSE NULL END) AS DOUBLE ) AS id11,
            CAST( count(CASE WHEN i.i_class_id=12 THEN 1 ELSE NULL END) AS DOUBLE ) AS id12,
            CAST( count(CASE WHEN i.i_class_id=13 THEN 1 ELSE NULL END) AS DOUBLE ) AS id13,
            CAST( count(CASE WHEN i.i_class_id=14 THEN 1 ELSE NULL END) AS DOUBLE ) AS id14,
            CAST( count(CASE WHEN i.i_class_id=15 THEN 1 ELSE NULL END) AS DOUBLE ) AS id15
        FROM store_sales ss
        INNER JOIN item i
        ON
        (
            ss.ss_item_sk = i.i_item_sk
            AND i.i_category IN ('{q26_i_category_IN}')
            AND ss.ss_customer_sk IS NOT NULL
        )
        GROUP BY ss.ss_customer_sk
        HAVING count(ss.ss_item_sk) > {q26_count_ss_item_sk}
        ORDER BY cid
    """
    result = bc.sql(query)
    result = result.repartition(npartitions=1)
    result_ml = result.set_index('cid')
    ml_result_dict = get_clusters(client=client, kmeans_input_df=result_ml)
    return ml_result_dict
Example #22
def main(client, config):

    wcs_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    ### filter nulls
    # SELECT
    #  wcs_user_sk,
    #  wcs_item_sk,
    #  (wcs_click_date_sk * 24 * 60 * 60 + wcs_click_time_sk) AS tstamp_inSec
    # FROM web_clickstreams
    # WHERE wcs_item_sk IS NOT NULL
    # AND   wcs_user_sk IS NOT NULL

    f_wcs_df = wcs_df.map_partitions(pre_repartition_task)
    f_wcs_df = f_wcs_df.shuffle(on=["wcs_user_sk"])

    ### Main Query
    # SELECT
    #  item_sk_1,${hiveconf:q02_item_sk} AS item_sk_2, COUNT (*) AS cnt
    # FROM
    # (
    # )
    # GROUP BY item_sk_1
    # ORDER BY
    #  cnt DESC,
    #  item_sk_1
    # LIMIT ${hiveconf:q02_limit};

    # q02_limit=30
    grouped_df = f_wcs_df.map_partitions(reduction_function,
                                         q02_session_timeout_inSec)
    items_value_counts = grouped_df.groupby(["i_item_sk"]).cnt.sum()

    items_value_counts = items_value_counts.map_partitions(
        lambda ser: ser.sort_values(ascending=False))

    ### final calculation on 30 values
    result_df = items_value_counts.reset_index(drop=False)
    result_df.columns = ["item_sk_1", "cnt"]
    result_df = result_df.head(q02_limit)
    result_df["item_sk_2"] = q02_item_sk
    result_order = ["item_sk_1", "item_sk_2", "cnt"]
    result_df = result_df[result_order]
    return result_df
Example #23
def main(data_dir, client, bc, config):
    benchmark(read_tables, data_dir, bc, dask_profile=config["dask_profile"])

    query = """ 
		SELECT CASE WHEN pmc > 0.0 THEN CAST (amc AS DOUBLE) / CAST (pmc AS DOUBLE) ELSE -1.0 END AS am_pm_ratio
		FROM 
		(
			SELECT SUM(amc1) AS amc, SUM(pmc1) AS pmc
			FROM
			(
				SELECT
					CASE WHEN t_hour BETWEEN 7 AND 8 THEN COUNT(1) ELSE 0 END AS amc1,
					CASE WHEN t_hour BETWEEN 19 AND 20 THEN COUNT(1) ELSE 0 END AS pmc1
				FROM web_sales ws
				JOIN household_demographics hd ON (hd.hd_demo_sk = ws.ws_ship_hdemo_sk and hd.hd_dep_count = 5)
				JOIN web_page wp ON (wp.wp_web_page_sk = ws.ws_web_page_sk and wp.wp_char_count BETWEEN 5000 AND 6000)
				JOIN time_dim td ON (td.t_time_sk = ws.ws_sold_time_sk and td.t_hour IN (7,8,19,20))
				GROUP BY t_hour
			) cnt_am_pm
		) sum_am_pm
	"""

    result = bc.sql(query)
    return result
Example #24
def main(client, config):
    import cudf

    ss_ddf, items_ddf = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    items_filtered = items_ddf[items_ddf.i_category ==
                               Q26_CATEGORY].reset_index(drop=True)
    items_filtered = items_filtered[["i_item_sk", "i_class_id"]]

    f_ss_ddf = ss_ddf[ss_ddf["ss_customer_sk"].notnull()].reset_index(
        drop=True)
    merged_ddf = f_ss_ddf.merge(items_filtered,
                                left_on="ss_item_sk",
                                right_on="i_item_sk",
                                how="inner")
    keep_cols = ["ss_customer_sk", "i_class_id"]
    merged_ddf = merged_ddf[keep_cols]

    # One-Hot-Encode i_class_id
    merged_ddf = merged_ddf.map_partitions(
        cudf.DataFrame.one_hot_encoding,
        column="i_class_id",
        prefix="id",
        cats=[i for i in range(1, 16)],
        prefix_sep="",
        dtype="float32",
    )
    merged_ddf["total"] = 1.0  # Will keep track of total count
    all_categories = ["total"] + ["id%d" % i for i in range(1, 16)]

    # Aggregate using agg to get sorted ss_customer_sk
    agg_dict = dict.fromkeys(all_categories, "sum")
    rollup_ddf = merged_ddf.groupby("ss_customer_sk").agg(agg_dict)
    rollup_ddf = rollup_ddf[rollup_ddf.total > Q26_ITEM_COUNT][
        all_categories[1:]]

    # Prepare data for KMeans clustering
    rollup_ddf = rollup_ddf.astype("float64")

    kmeans_input_df = rollup_ddf.persist()

    results_dict = get_clusters(client=client, kmeans_input_df=kmeans_input_df)
    return results_dict
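
Note: the one-hot-encode-then-sum pattern in Example #24 reproduces the per-class counts that Example #21 expresses with SQL CASE WHEN counting. A small pandas sketch of the same idea, using pd.get_dummies as a stand-in for the older cudf one_hot_encoding API; the rows are made up.

import pandas as pd

df = pd.DataFrame({"ss_customer_sk": [1, 1, 2], "i_class_id": [3, 3, 5]})
dummies = pd.get_dummies(df["i_class_id"], prefix="id", prefix_sep="").astype("float32")
encoded = pd.concat([df[["ss_customer_sk"]], dummies], axis=1)
encoded["total"] = 1.0                       # tracks the per-customer row count
per_customer = encoded.groupby("ss_customer_sk").sum()
print(per_customer)                          # id3/id5 counts plus total per customer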
Example #25
def main(client, config):
    date_df, inv_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    expr = (
        f"d_year == {q23_year} and (d_moy >= {q23_month} and d_moy <= {q23_month + 1})"
    )
    selected_dates_df = date_df.query(expr)

    merged_inv_dates = inv_df.merge(selected_dates_df,
                                    left_on="inv_date_sk",
                                    right_on="d_date_sk",
                                    how="inner")
    n_workers = len(client.scheduler_info()["workers"])
    iteration1_df = get_iteration1(merged_inv_dates, n_workers)

    # Select only the columns we are interested in
    iteration1_df = iteration1_df[[
        "inv_warehouse_sk", "inv_item_sk", "d_moy", "qty_cov"
    ]].repartition(npartitions=1)  # iteration1_df has 40k rows at sf-100

    expr_1 = f"d_moy == {q23_month}"
    inv1_df = iteration1_df.query(expr_1)  # inv1_df has 13k rows at sf-100

    expr_2 = f"d_moy == {q23_month + 1}"
    inv2_df = iteration1_df.query(expr_2)  # 31k rows at sf-100

    result_df = inv1_df.merge(inv2_df, on=["inv_warehouse_sk", "inv_item_sk"])
    result_df = result_df.rename(
        columns={
            "d_moy_x": "d_moy",
            "d_moy_y": "inv2_d_moy",
            "qty_cov_x": "cov",
            "qty_cov_y": "inv2_cov",
        })

    result_df = result_df.persist()

    result_df = result_df.sort_values(by=["inv_warehouse_sk", "inv_item_sk"])
    result_df = result_df.reset_index(drop=True)

    result_df = result_df.persist()
    wait(result_df)
    return result_df
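
Note: get_iteration1 is defined elsewhere; its qty_cov column appears to be a coefficient of variation (sample standard deviation divided by mean) of inventory quantity per warehouse, item and month. A hedged pandas sketch of that computation on illustrative rows:

import pandas as pd

inv = pd.DataFrame({
    "inv_warehouse_sk":     [1, 1, 1, 2, 2],
    "inv_item_sk":          [7, 7, 7, 7, 7],
    "d_moy":                [1, 1, 1, 1, 1],
    "inv_quantity_on_hand": [10, 20, 30, 5, 15],
})
grouped = inv.groupby(["inv_warehouse_sk", "inv_item_sk", "d_moy"])["inv_quantity_on_hand"]
qty_cov = (grouped.std() / grouped.mean()).rename("qty_cov").reset_index()
print(qty_cov)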
Example #26
def main(client, config):

    ws_df, item_df, imp_df, ss_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    ## helper table
    item_imp_join_df = get_helper_query_table(imp_df, item_df)

    r_ss = get_ss(ss_df, item_imp_join_df)

    r_ws = get_ws(ws_df, item_imp_join_df)

    result_df = r_ws.merge(
        r_ss,
        left_on=["ws_item_sk", "imp_sk"],
        right_on=["ss_item_sk", "imp_sk"],
        how="inner",
        suffixes=("ws", "ss"),
    )

    result_df["cross_price_elasticity"] = (result_df["current_ss_quant"] +
                                           result_df["current_ws_quant"] -
                                           result_df["prev_ss_quant"] -
                                           result_df["prev_ws_quant"])
    result_df[
        "cross_price_elasticity"] = result_df["cross_price_elasticity"] / (
            (result_df["prev_ss_quant"] + result_df["prev_ws_quant"]) *
            result_df["price_change"])
    final_cols_2_keep = ["ws_item_sk", "cross_price_elasticity"]
    result_df = result_df[final_cols_2_keep]
    result_df = result_df.groupby(["ws_item_sk"
                                   ]).agg({"cross_price_elasticity": "mean"})
    result_df = result_df.reset_index(drop=False)
    wait(result_df)
    return result_df
Example #27
def main(client, config):

    item_df, ss_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    # SELECT DISTINCT ss_item_sk,ss_ticket_number
    # FROM store_sales s, item i
    # -- Only products in certain categories sold in specific stores are considered,
    # WHERE s.ss_item_sk = i.i_item_sk
    # AND i.i_category_id IN ({q01_i_category_id_IN})
    # AND s.ss_store_sk IN ({q01_ss_store_sk_IN})

    f_ss_df = ss_df.loc[ss_df["ss_store_sk"].isin(q01_ss_store_sk_IN)][
        ["ss_item_sk", "ss_ticket_number"]
    ].reset_index(drop=True)

    f_item_df = item_df.loc[item_df["i_category_id"].isin(q01_i_category_id_IN)][
        ["i_item_sk"]
    ].reset_index(drop=True)

    ss_item_join = f_item_df.merge(
        f_ss_df, left_on=["i_item_sk"], right_on=["ss_item_sk"]
    )
    ss_item_join = ss_item_join[["ss_item_sk", "ss_ticket_number"]]

    ## keep to a single partition
    ## We only have 41,910,265 rows in the dataframe at sf-10k and don't need to split_out.
    ss_item_join = ss_item_join.drop_duplicates()

    ### do pair inner join
    pair_df = get_pairs(ss_item_join)

    # SELECT item_sk_1, item_sk_2, COUNT(*) AS cnt
    # FROM
    # (
    #    ...
    # )
    # GROUP BY item_sk_1, item_sk_2
    # -- 'frequently'
    # HAVING cnt > {q01_viewed_together_count}
    # ORDER BY cnt DESC, item_sk_1, item_sk_2

    grouped_df = (
        pair_df.groupby(["item_sk_1", "item_sk_2"])
        .size()
        .reset_index()
        .rename(columns={0: "cnt"})
    )
    grouped_df = grouped_df[grouped_df["cnt"] > q01_viewed_together_count].reset_index(
        drop=True
    )

    ### 2017 rows after filtration at sf-100
    ### should scale till sf-100k
    grouped_df = grouped_df.repartition(npartitions=1).persist()
    ## converting to strings because of issue
    # https://github.com/rapidsai/tpcx-bb/issues/36

    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("str")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("str")
    grouped_df = grouped_df.map_partitions(
        lambda df: df.sort_values(
            by=["cnt", "item_sk_1", "item_sk_2"], ascending=[False, True, True]
        )
    )
    grouped_df = grouped_df.reset_index(drop=True)
    ### below is just 100 rows, so it fits comfortably in a single `cudf` DataFrame
    grouped_df = grouped_df.head(q01_limit)
    ### cast back to int to ensure the same values
    grouped_df["item_sk_1"] = grouped_df["item_sk_1"].astype("int32")
    grouped_df["item_sk_2"] = grouped_df["item_sk_2"].astype("int32")
    return grouped_df
Example #28
def main(client, config):
    import dask_cudf
    import cudf

    item_df = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    wcs_tstamp_min = get_wcs_minima(config)

    item_df["i_item_sk"] = item_df["i_item_sk"].astype("int32")
    item_df["i_category_id"] = item_df["i_category_id"].astype("int8")

    # we eventually will only care about these categories, so we can filter now
    item_df_filtered = item_df.loc[item_df.i_category_id.isin(
        q03_purchased_item_category_IN)].reset_index(drop=True)

    # The main idea is that we don't fuse the filtration task with the reading
    # task yet: fusing them causes more memory pressure, because we read (and
    # spill) the whole thing at once and only then filter.

    ### The PR below has a dashboard snapshot that makes the problem clear
    ### https://github.com/rapidsai/tpcx-bb-internal/pull/496#issue-399946141

    web_clickstream_flist = glob.glob(
        os.path.join(config["data_dir"], "web_clickstreams/*.parquet"))
    task_ls = [
        delayed(pre_repartition_task)(fn, item_df.to_delayed()[0],
                                      wcs_tstamp_min)
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int32),
        "tstamp": np.ones(1, dtype=np.int32),
        "wcs_item_sk": np.ones(1, dtype=np.int32),
        "wcs_sales_sk": np.ones(1, dtype=np.int32),
        "i_category_id": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)

    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.shuffle(on="wcs_user_sk")

    meta_d = {
        "i_item_sk": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
        "cnt": np.ones(1, dtype=merged_df["wcs_item_sk"].dtype),
    }
    meta_df = cudf.DataFrame(meta_d)

    grouped_df = merged_df.map_partitions(reduction_function,
                                          item_df_filtered.to_delayed()[0],
                                          meta=meta_df)

    ### todo: check if this has any impact on stability
    grouped_df = grouped_df.persist(priority=10000)
    ### todo: remove this later after more testing
    wait(grouped_df)
    print("---" * 20)
    print("grouping complete ={}".format(len(grouped_df)))
    grouped_df = grouped_df.groupby(["i_item_sk"
                                     ]).sum(split_every=2).reset_index()
    grouped_df.columns = ["i_item_sk", "cnt"]
    result_df = grouped_df.map_partitions(
        lambda df: df.sort_values(by=["cnt"], ascending=False))

    result_df.columns = ["lastviewed_item", "cnt"]
    result_df["purchased_item"] = q03_purchased_item_IN
    cols_order = ["purchased_item", "lastviewed_item", "cnt"]
    result_df = result_df[cols_order]
    result_df = result_df.persist()
    ### todo: remove this later after more testing
    wait(result_df)
    print(len(result_df))
    result_df = result_df.head(q03_limit)
    print("result complete")
    print("---" * 20)
    return result_df
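
Note: a minimal sketch of the delayed-per-file plus from_delayed(meta=...) pattern used above, written with plain pandas/dask instead of cudf/dask_cudf; the per-file task and its columns are illustrative assumptions.

import pandas as pd
import dask
import dask.dataframe as dd

@dask.delayed
def load_one(i):
    # stand-in for reading and pre-filtering one parquet file
    return pd.DataFrame({"wcs_user_sk": [i], "cnt": [i * 10]})

meta = pd.DataFrame({"wcs_user_sk": pd.Series(dtype="int64"),
                     "cnt": pd.Series(dtype="int64")})
ddf = dd.from_delayed([load_one(i) for i in range(4)], meta=meta)
print(ddf.compute())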
Example #29
def main(client, config):
    import cudf
    import dask_cudf

    (date_dim_df, web_page_df, web_sales_df) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )

    date_dim_cov_df = date_dim_df.map_partitions(convert_datestring_to_days)
    q08_start_dt = np.datetime64(q08_STARTDATE, "D").astype(int)
    q08_end_dt = np.datetime64(q08_ENDDATE, "D").astype(int)
    filtered_date_df = date_dim_cov_df.query(
        f"d_date >= {q08_start_dt} and d_date <= {q08_end_dt}",
        meta=date_dim_cov_df._meta,
    ).reset_index(drop=True)

    # Convert wp_type to categorical and get the category code of the review type
    # see https://github.com/rapidsai/cudf/issues/4093 for more info
    web_page_df = web_page_df.persist()

    # map_partitions is a bit faster than ddf[col].astype('category')
    web_page_df["wp_type"] = web_page_df["wp_type"].map_partitions(
        lambda ser: ser.astype("category"))
    cpu_categories = web_page_df["wp_type"].compute().cat.categories.to_pandas(
    )
    REVIEW_CAT_CODE = cpu_categories.get_loc("review")

    # cast to minimum viable dtype
    codes_min_signed_type = cudf.utils.dtypes.min_signed_type(
        len(cpu_categories))

    web_page_df["wp_type_codes"] = web_page_df["wp_type"].cat.codes.astype(
        codes_min_signed_type)
    web_page_newcols = ["wp_web_page_sk", "wp_type_codes"]
    web_page_df = web_page_df[web_page_newcols]

    web_clickstream_flist = glob.glob(config["data_dir"] +
                                      "web_clickstreams/*.parquet")

    task_ls = [
        delayed(etl_wcs)(fn, filtered_date_df.to_delayed()[0],
                         web_page_df.to_delayed()[0])
        for fn in web_clickstream_flist
    ]

    meta_d = {
        "wcs_user_sk": np.ones(1, dtype=np.int64),
        "tstamp_inSec": np.ones(1, dtype=np.int64),
        "wcs_sales_sk": np.ones(1, dtype=np.int64),
        "wp_type_codes": np.ones(1, dtype=np.int8),
    }
    meta_df = cudf.DataFrame(meta_d)
    merged_df = dask_cudf.from_delayed(task_ls, meta=meta_df)

    merged_df = merged_df.repartition(columns=["wcs_user_sk"])
    reviewed_sales = merged_df.map_partitions(
        reduction_function,
        REVIEW_CAT_CODE,
        meta=cudf.DataFrame({"wcs_sales_sk": np.ones(1, dtype=np.int64)}),
    )
    reviewed_sales = reviewed_sales.persist()
    wait(reviewed_sales)
    del merged_df

    all_sales_in_year = filtered_date_df.merge(web_sales_df,
                                               left_on=["d_date_sk"],
                                               right_on=["ws_sold_date_sk"],
                                               how="inner")
    all_sales_in_year = all_sales_in_year[["ws_net_paid", "ws_order_number"]]

    all_sales_in_year = all_sales_in_year.persist()
    wait(all_sales_in_year)

    # note: switch to mainline
    # once https://github.com/dask/dask/pull/6066
    # lands

    q08_reviewed_sales = hash_merge(
        lhs=all_sales_in_year,
        rhs=reviewed_sales,
        left_on=["ws_order_number"],
        right_on=["wcs_sales_sk"],
        how="inner",
    )

    q08_reviewed_sales_sum = q08_reviewed_sales["ws_net_paid"].sum()
    q08_all_sales_sum = all_sales_in_year["ws_net_paid"].sum()

    q08_reviewed_sales_sum, q08_all_sales_sum = client.compute(
        [q08_reviewed_sales_sum, q08_all_sales_sum])
    q08_reviewed_sales_sum, q08_all_sales_sum = (
        q08_reviewed_sales_sum.result(),
        q08_all_sales_sum.result(),
    )

    no_q08_review_sales_amount = q08_all_sales_sum - q08_reviewed_sales_sum

    final_result_df = cudf.DataFrame()
    final_result_df["q08_review_sales_amount"] = [q08_reviewed_sales_sum]
    final_result_df["q08_review_sales_amount"] = final_result_df[
        "q08_review_sales_amount"].astype("int")
    final_result_df["no_q08_review_sales_amount"] = [
        no_q08_review_sales_amount
    ]
    final_result_df["no_q08_review_sales_amount"] = final_result_df[
        "no_q08_review_sales_amount"].astype("int")

    return final_result_df
Example #30
def main(client, config):
    import cudf
    import dask_cudf

    store_sales, date_dim, store, product_reviews = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    ### adding a wait call slows this down by 3-4 seconds, removing it for now
    ### Make TEMP_TABLE1

    # filter date table
    q18_startDate_int = np.datetime64(q18_startDate, "ms").astype(int)
    q18_endDate_int = np.datetime64(q18_endDate, "ms").astype(int)

    date_dim_filtered = date_dim.loc[
        (date_dim.d_date.astype("datetime64[ms]").astype("int") >=
         q18_startDate_int)
        & (date_dim.d_date.astype("datetime64[ms]").astype("int") <=
           q18_endDate_int)].reset_index(drop=True)

    # build the regression_analysis table
    ss_date_dim_join = left_semi_join(
        store_sales,
        date_dim_filtered,
        left_on=["ss_sold_date_sk"],
        right_on=["d_date_sk"],
    )

    temp = (ss_date_dim_join.groupby(["ss_store_sk", "ss_sold_date_sk"], ).agg(
        {
            "ss_net_paid": "sum"
        }).reset_index())

    temp["xx"] = temp.ss_sold_date_sk * temp.ss_sold_date_sk
    temp["xy"] = temp.ss_sold_date_sk * temp.ss_net_paid
    temp.columns = ["ss_store_sk", "x", "y", "xx", "xy"]

    regression_analysis = (temp.groupby(["ss_store_sk"]).agg({
        "x": ["count", "sum"],
        "xy":
        "sum",
        "y":
        "sum",
        "xx":
        "sum"
    }).reset_index(drop=False))

    regression_analysis["slope"] = (
        regression_analysis[("x", "count")] * regression_analysis[
            ("xy", "sum")] -
        regression_analysis[("x", "sum")] * regression_analysis[("y", "sum")]
    ) / (regression_analysis[("x", "count")] * regression_analysis[
        ("xx", "sum")] -
         regression_analysis[("x", "sum")] * regression_analysis[("x", "sum")])
    regression_analysis = regression_analysis[["ss_store_sk", "slope"]]
    regression_analysis.columns = ["ss_store_sk", "slope"]

    regression_analysis["ss_store_sk"] = regression_analysis[
        "ss_store_sk"].astype("int32")
    store["s_store_sk"] = store["s_store_sk"].astype("int32")
    temp_table1 = store.merge(
        regression_analysis[["ss_store_sk", "slope"
                             ]].query("slope <= 0").reset_index(drop=True),
        left_on="s_store_sk",
        right_on="ss_store_sk",
    )
    temp_table1 = temp_table1[["s_store_sk", "s_store_name"]]

    # repartition this table to be one partition, since it's only 192 rows at SF1000
    temp_table1 = temp_table1.repartition(npartitions=1)
    temp_table1 = temp_table1.persist()
    ### Make TEMP_TABLE2
    stores_with_regression = temp_table1
    pr = product_reviews

    # known to be small. very few relevant stores (169) at SF1000
    targets = (stores_with_regression.s_store_name.str.lower().unique().
               compute().tolist())
    n_targets = len(targets)

    no_nulls = pr[~pr.pr_review_content.isnull()].reset_index(drop=True)
    no_nulls["pr_review_sk"] = no_nulls["pr_review_sk"].astype("int32")

    ### persisting because no_nulls is used twice
    no_nulls = no_nulls.reset_index(drop=True).persist()

    temp_table2_meta_empty_df = cudf.DataFrame({
        "word": ["a"],
        "pr_review_sk":
        np.ones(1, dtype=np.int64),
        "pr_review_date": ["a"],
    }).head(0)

    ### get relevant reviews
    combined = no_nulls.map_partitions(
        find_relevant_reviews,
        targets,
        meta=temp_table2_meta_empty_df,
    )

    stores_with_regression[
        "store_ID"] = stores_with_regression.s_store_sk.astype("str").str.cat(
            stores_with_regression.s_store_name, sep="_")
    stores_with_regression[
        "s_store_name"] = stores_with_regression.s_store_name.str.lower()

    # Keep this commented line to illustrate that we could exactly match Spark
    # temp_table2 = temp_table2[['store_ID', 'pr_review_date', 'pr_review_content']]
    temp_table2 = combined.merge(stores_with_regression,
                                 how="inner",
                                 left_on=["word"],
                                 right_on=["s_store_name"])

    temp_table2 = temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]]
    temp_table2 = temp_table2.persist()

    ### REAL QUERY (PART THREE)
    no_nulls["pr_review_content"] = no_nulls.pr_review_content.str.replace(
        [". ", "? ", "! "], [EOL_CHAR], regex=False)
    sentences = no_nulls.map_partitions(create_sentences_from_reviews)

    # need the global position in the sentence tokenized df
    sentences["x"] = 1
    sentences["sentence_tokenized_global_pos"] = sentences.x.cumsum()
    del sentences["x"]

    # This file comes from the official TPCx-BB kit
    # We extracted it from bigbenchqueriesmr.jar
    sentiment_dir = "/".join(config["data_dir"].split("/")[:-3] +
                             ["sentiment_files"])
    with open(f"{sentiment_dir}/negativeSentiment.txt") as fh:
        negativeSentiment = list(map(str.strip, fh.readlines()))
        # dedupe for one extra record in the source file
        negativeSentiment = list(set(negativeSentiment))

    word_df = sentences.map_partitions(
        create_words_from_sentences,
        global_position_column="sentence_tokenized_global_pos",
    )
    sent_df = cudf.DataFrame({"word": negativeSentiment})
    sent_df["sentiment"] = "NEG"
    sent_df = dask_cudf.from_cudf(sent_df, npartitions=1)

    word_sentence_sentiment = word_df.merge(sent_df, how="inner", on="word")

    word_sentence_sentiment[
        "sentence_idx_global_pos"] = word_sentence_sentiment[
            "sentence_idx_global_pos"].astype("int64")
    sentences["sentence_tokenized_global_pos"] = sentences[
        "sentence_tokenized_global_pos"].astype("int64")

    word_sentence_sentiment_with_sentence_info = word_sentence_sentiment.merge(
        sentences,
        how="left",
        left_on="sentence_idx_global_pos",
        right_on="sentence_tokenized_global_pos",
    )
    temp_table2["pr_review_sk"] = temp_table2["pr_review_sk"].astype("int32")

    final = word_sentence_sentiment_with_sentence_info.merge(
        temp_table2[["store_ID", "pr_review_date", "pr_review_sk"]],
        how="inner",
        left_on="review_idx_global_pos",
        right_on="pr_review_sk",
    )

    keepcols = ["store_ID", "pr_review_date", "sentence", "sentiment", "word"]
    final = final[keepcols]
    final.columns = [
        "s_name", "r_date", "r_sentence", "sentiment", "sentiment_word"
    ]
    final = final.persist()
    wait(final)
    final = final.sort_values(
        ["s_name", "r_date", "r_sentence", "sentiment_word"])
    final = final.persist()
    wait(final)
    print(len(final))
    return final
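
Note: a very small pandas sketch of the sentence/word/negative-sentiment join idea behind Example #30. The real pipeline works on cudf frames with global sentence positions (create_sentences_from_reviews, create_words_from_sentences), which is omitted here; all data below is illustrative.

import pandas as pd

reviews = pd.DataFrame({"pr_review_sk": [1],
                        "pr_review_content": ["bad service. great price."]})
sentences = (reviews.assign(sentence=reviews.pr_review_content.str.split(". "))
                    .explode("sentence"))
words = sentences.assign(word=sentences.sentence.str.split(" ")).explode("word")
negative = pd.DataFrame({"word": ["bad"], "sentiment": ["NEG"]})
matched = words.merge(negative, on="word", how="inner")
print(matched[["pr_review_sk", "sentence", "word", "sentiment"]])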