예제 #1
0
def q17(lineitem, part):
    t1 = time.time()
    left = lineitem.loc[:, ["L_PARTKEY", "L_QUANTITY", "L_EXTENDEDPRICE"]]
    right = part[((part["P_BRAND"] == "Brand#23") &
                  (part["P_CONTAINER"] == "MED BOX"))]
    right = right.loc[:, ["P_PARTKEY"]]
    line_part_merge = left.merge(right,
                                 left_on='L_PARTKEY',
                                 right_on='P_PARTKEY',
                                 how='inner')
    line_part_merge = line_part_merge.loc[:, [
        "L_QUANTITY", "L_EXTENDEDPRICE", "P_PARTKEY"
    ]]
    lineitem_filtered = lineitem.loc[:, ["L_PARTKEY", "L_QUANTITY"]]
    lineitem_avg = lineitem_filtered.groupby(
        ["L_PARTKEY"], as_index=False).agg(
            avg=pd.NamedAgg(column="L_QUANTITY", aggfunc="mean"))
    lineitem_avg["avg"] = 0.2 * lineitem_avg["avg"]
    lineitem_avg = lineitem_avg.loc[:, ["L_PARTKEY", "avg"]]
    total = line_part_merge.merge(lineitem_avg,
                                  left_on='P_PARTKEY',
                                  right_on='L_PARTKEY',
                                  how='inner')
    total = total[total["L_QUANTITY"] < total["avg"]]
    total = pd.DataFrame(
        {"avg_yearly": [total["L_EXTENDEDPRICE"].sum() / 7.0]})
    print(total)
    print("Q17 Execution time (s): ", time.time() - t1)
예제 #2
0
def q15(lineitem, supplier):
    t1 = time.time()
    lineitem_filtered = lineitem[
        (lineitem["L_SHIPDATE"] >= pd.Timestamp('1996-01-01'))
        & (lineitem["L_SHIPDATE"] <
           (pd.Timestamp('1996-01-01') + pd.DateOffset(months=3)))]
    lineitem_filtered["REVENUE_PARTS"] = lineitem_filtered[
        "L_EXTENDEDPRICE"] * (1.0 - lineitem_filtered["L_DISCOUNT"])
    lineitem_filtered = lineitem_filtered.loc[:,
                                              ["L_SUPPKEY", "REVENUE_PARTS"]]
    revenue_table = lineitem_filtered.groupby(
        "L_SUPPKEY", as_index=False).agg(TOTAL_REVENUE=pd.NamedAgg(
            column="REVENUE_PARTS", aggfunc="sum")).rename(
                columns={"L_SUPPKEY": "SUPPLIER_NO"}, copy=False)
    max_revenue = revenue_table["TOTAL_REVENUE"].max()
    revenue_table = revenue_table[revenue_table["TOTAL_REVENUE"] ==
                                  max_revenue]
    supplier_filtered = supplier.loc[:, [
        "S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE"
    ]]
    total = supplier_filtered.merge(revenue_table,
                                    left_on="S_SUPPKEY",
                                    right_on="SUPPLIER_NO",
                                    how="inner")
    total = total.loc[:, [
        "S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"
    ]]
    print(total)
    print("Q15 Execution time (s): ", time.time() - t1)
예제 #3
0
def q11(partsupp, supplier, nation):
    t1 = time.time()
    partsupp_filtered = partsupp.loc[:, ["PS_PARTKEY", "PS_SUPPKEY"]]
    partsupp_filtered[
        "TOTAL_COST"] = partsupp["PS_SUPPLYCOST"] * partsupp["PS_AVAILQTY"]
    supplier_filtered = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]
    ps_supp_merge = partsupp_filtered.merge(supplier_filtered,
                                            left_on='PS_SUPPKEY',
                                            right_on='S_SUPPKEY',
                                            how='inner')
    ps_supp_merge.loc[:, ["PS_PARTKEY", "S_NATIONKEY", "TOTAL_COST"]]
    nation_filtered = nation[(nation["N_NAME"] == "GERMANY")]
    nation_filtered = nation_filtered.loc[:, ["N_NATIONKEY"]]
    ps_supp_n_merge = ps_supp_merge.merge(nation_filtered,
                                          left_on='S_NATIONKEY',
                                          right_on='N_NATIONKEY',
                                          how='inner')
    ps_supp_n_merge = ps_supp_n_merge.loc[:, ["PS_PARTKEY", "TOTAL_COST"]]
    sum_val = ps_supp_n_merge["TOTAL_COST"].sum() * 0.0001
    total = ps_supp_n_merge.groupby(["PS_PARTKEY"], as_index=False).agg(
        VALUE=pd.NamedAgg(column="TOTAL_COST", aggfunc="sum"))
    total = total[total["VALUE"] > sum_val]
    total = total.sort_values("VALUE", ascending=False)
    print(total)
    print("Q11 Execution time (s): ", time.time() - t1)
def q13(customer, orders):
    t1 = time.time()
    customer_filtered = customer.loc[:, ["C_CUSTKEY"]]
    orders_filtered = orders[~orders["O_COMMENT"].str.contains("special(\S|\s)*requests")]
    orders_filtered = orders_filtered.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
    c_o_merged = customer_filtered.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='left')
    c_o_merged = c_o_merged.loc[:, ["C_CUSTKEY", "O_ORDERKEY"]]
    count_df = c_o_merged.groupby(["C_CUSTKEY"], as_index=False).agg(C_COUNT=pd.NamedAgg(column="O_ORDERKEY", aggfunc="count"))
    total = count_df.groupby(["C_COUNT"], as_index = False).size()
    total.columns = ["C_COUNT","CUSTDIST"]
    total = total.sort_values(by=["CUSTDIST","C_COUNT"], ascending=[False,False,])
    print(total)
    print("Q13 Execution time (s): ", time.time() - t1)
def q22(customer, orders):
    t1 = time.time()
    customer_filtered = customer.loc[:, ["C_ACCTBAL", "C_CUSTKEY"]]
    customer_filtered["CNTRYCODE"] = customer["C_PHONE"].str.slice(0, 2)
    customer_filtered = customer_filtered[(customer["C_ACCTBAL"] > 0.00) & customer_filtered["CNTRYCODE"].isin(["13", "31", "23", "29", "30", "18", "17"])]
    avg_value = customer_filtered["C_ACCTBAL"].mean()
    customer_filtered = customer_filtered[customer_filtered["C_ACCTBAL"] > avg_value]
    # Select only the keys that don't match by performing a left join and only selecting columns with an na value
    orders_filtered = orders.loc[:, ["O_CUSTKEY"]].drop_duplicates()
    customer_keys = customer_filtered.loc[:, ["C_CUSTKEY"]].drop_duplicates()
    customer_selected = customer_keys.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='left')
    customer_selected= customer_selected[customer_selected["O_CUSTKEY"].isna()]
    customer_selected = customer_selected.loc[:, ["C_CUSTKEY"]]
    customer_selected = customer_selected.merge(customer_filtered, on="C_CUSTKEY", how="inner")
    customer_selected = customer_selected.loc[:, ["CNTRYCODE", "C_ACCTBAL"]]
    agg1 = customer_selected.groupby(["CNTRYCODE"], as_index = False).size()
    agg1.columns = ["CNTRYCODE", "NUMCUST"]
    agg2 = customer_selected.groupby(["CNTRYCODE"], as_index = False).agg(TOTACCTBAL=pd.NamedAgg(column="C_ACCTBAL", aggfunc="sum"))
    total = agg1.merge(agg2, on="CNTRYCODE", how="inner")
    total = total.sort_values(by=["CNTRYCODE",], ascending=[True,])
    print(total)
    print("Q22 Execution time (s): ", time.time() - t1)
def q07(lineitem, supplier, orders, customer, nation):
    """ This version is faster than q07_old. Keeping the old one for reference """
    t1 = time.time()

    lineitem_filtered = lineitem[(lineitem["L_SHIPDATE"] >= pd.Timestamp("1995-01-01")) & (lineitem["L_SHIPDATE"] < pd.Timestamp("1997-01-01"))]
    lineitem_filtered["L_YEAR"] = lineitem_filtered["L_SHIPDATE"].apply(lambda x: x.year)
    lineitem_filtered["VOLUME"] = lineitem_filtered["L_EXTENDEDPRICE"] * (1.0 - lineitem_filtered["L_DISCOUNT"])
    lineitem_filtered = lineitem_filtered.loc[:, ["L_ORDERKEY", "L_SUPPKEY", "L_YEAR", "VOLUME"]]
    supplier_filtered = supplier.loc[:, ["S_SUPPKEY", "S_NATIONKEY"]]
    orders_filtered = orders.loc[:, ["O_ORDERKEY", "O_CUSTKEY"]]
    customer_filtered = customer.loc[:, ["C_CUSTKEY", "C_NATIONKEY"]]
    n1 = nation[(nation["N_NAME"] == "FRANCE")].loc[:, ["N_NATIONKEY", "N_NAME"]]
    n2 = nation[(nation["N_NAME"] == "GERMANY")].loc[:, ["N_NATIONKEY", "N_NAME"]]

    # ----- do nation 1 -----
    N1_C = customer_filtered.merge(n1, left_on='C_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N1_C = N1_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N1_C_O = N1_C.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='inner')
    N1_C_O = N1_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])

    # NOTE: this is faster than first merging lineitem with N1_C_O
    N2_S = supplier_filtered.merge(n2, left_on='S_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N2_S = N2_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N2_S_L = N2_S.merge(lineitem_filtered, left_on='S_SUPPKEY', right_on='L_SUPPKEY', how='inner')
    N2_S_L = N2_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])

    total1 = N1_C_O.merge(N2_S_L, left_on='O_ORDERKEY', right_on='L_ORDERKEY', how='inner')
    total1 = total1.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # ----- do nation 2 ----- (same as nation 1 section but with nation 2)
    N2_C = customer_filtered.merge(n2, left_on='C_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N2_C = N2_C.drop(columns=["C_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "CUST_NATION"})
    N2_C_O = N2_C.merge(orders_filtered, left_on='C_CUSTKEY', right_on='O_CUSTKEY', how='inner')
    N2_C_O = N2_C_O.drop(columns=["C_CUSTKEY", "O_CUSTKEY"])

    N1_S = supplier_filtered.merge(n1, left_on='S_NATIONKEY', right_on='N_NATIONKEY', how='inner')
    N1_S = N1_S.drop(columns=["S_NATIONKEY", "N_NATIONKEY"]).rename(columns={"N_NAME": "SUPP_NATION"})
    N1_S_L = N1_S.merge(lineitem_filtered, left_on='S_SUPPKEY', right_on='L_SUPPKEY', how='inner')
    N1_S_L = N1_S_L.drop(columns=["S_SUPPKEY", "L_SUPPKEY"])

    total2 = N2_C_O.merge(N1_S_L, left_on='O_ORDERKEY', right_on='L_ORDERKEY', how='inner')
    total2 = total2.drop(columns=["O_ORDERKEY", "L_ORDERKEY"])

    # concat results
    total = pd.concat([total1, total2])

    total = total.groupby(["SUPP_NATION", "CUST_NATION", "L_YEAR"], as_index = False).agg(REVENUE=pd.NamedAgg(column="VOLUME", aggfunc="sum"))
    total = total.sort_values(by=["SUPP_NATION","CUST_NATION","L_YEAR"], ascending=[True,True,True,])
    print(total)
    print("Q07 Execution time (s): ", time.time() - t1)