def read_tables(config):
    """Load store_sales, date_dim, store and product_reviews.

    Two readers are built from ``config["file_format"]`` and
    ``config["data_dir"]``: one with default settings for the first three
    tables, and one with ``split_row_groups=True`` for product_reviews.

    Returns:
        Tuple ``(store_sales, date_dim, store, product_reviews)`` holding the
        results of the corresponding ``read`` calls.
    """
    reader_args = {
        "data_format": config["file_format"],
        "basepath": config["data_dir"],
    }

    default_reader = build_reader(**reader_args)
    sales = default_reader.read(
        "store_sales",
        relevant_cols=["ss_store_sk", "ss_sold_date_sk", "ss_net_paid"],
    )
    dates = default_reader.read("date_dim", relevant_cols=["d_date_sk", "d_date"])
    stores = default_reader.read(
        "store", relevant_cols=["s_store_sk", "s_store_name"]
    )

    # product_reviews is read split by row groups for better parallelism.
    row_group_reader = build_reader(**reader_args, split_row_groups=True)
    reviews = row_group_reader.read(
        "product_reviews",
        relevant_cols=["pr_review_date", "pr_review_content", "pr_review_sk"],
    )

    return sales, dates, stores, reviews
def read_tables(config):
    """Load store_sales, web_sales and date_dim with ``index=False``.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(store_sales, web_sales, date_dim)`` holding the results of
        the corresponding ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # Insertion order of this mapping fixes both read order and return order.
    columns_by_table = {
        "store_sales": [
            "ss_customer_sk",
            "ss_sold_date_sk",
            "ss_ticket_number",
            "ss_net_paid",
        ],
        "web_sales": [
            "ws_bill_customer_sk",
            "ws_sold_date_sk",
            "ws_order_number",
            "ws_net_paid",
        ],
        "date_dim": ["d_date_sk", "d_date"],
    }

    return tuple(
        reader.read(table, relevant_cols=cols, index=False)
        for table, cols in columns_by_table.items()
    )
def read_tables(config):
    """Load product_reviews (as a single partition), web_sales and date_dim.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(product_reviews, web_sales, date_dim)``; the first element is
        the result of ``repartition(npartitions=1)`` on the read table.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    reviews = reader.read(
        "product_reviews",
        relevant_cols=["pr_review_rating", "pr_item_sk"],
    )
    # Only int columns are read here, so a single partition should scale up
    # to sf-10k (just 26M rows).
    reviews = reviews.repartition(npartitions=1)

    web_sales = reader.read(
        "web_sales",
        relevant_cols=["ws_sold_date_sk", "ws_net_paid", "ws_item_sk"],
    )
    dates = reader.read("date_dim", relevant_cols=["d_date_sk", "d_date"])

    return reviews, web_sales, dates
def read_tables(config):
    """Load the seven tables: store_sales, item, customer, store, date_dim,
    customer_address and promotion.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``. The column
    lists (``store_sales_cols``, ``item_cols``, ...) are not defined in this
    function; they are expected to exist at module level.

    Returns:
        Tuple of the seven ``read`` results, in the order listed above.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    fetch = reader.read

    sales = fetch("store_sales", relevant_cols=store_sales_cols)
    items = fetch("item", relevant_cols=item_cols)
    customers = fetch("customer", relevant_cols=customer_cols)
    stores = fetch("store", relevant_cols=store_cols)
    dates = fetch("date_dim", relevant_cols=date_cols)
    addresses = fetch("customer_address", relevant_cols=customer_address_cols)
    promotions = fetch("promotion", relevant_cols=promotion_cols)

    return sales, items, customers, stores, dates, addresses, promotions
def read_tables(config):
    """Load store_sales and store_returns.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(store_sales, store_returns)`` holding the results of the
        corresponding ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    sales = reader.read(
        "store_sales",
        relevant_cols=[
            "ss_customer_sk",
            "ss_ticket_number",
            "ss_item_sk",
            "ss_net_paid",
        ],
    )
    returns = reader.read(
        "store_returns",
        relevant_cols=[
            "sr_item_sk",
            "sr_customer_sk",
            "sr_ticket_number",
            "sr_return_amt",
        ],
    )

    return sales, returns
def read_tables(config):
    """Load item, store_sales, store, date_dim, customer and customer_address.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple of the six ``read`` results, in the order listed above.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # Insertion order of this mapping fixes both read order and return order.
    columns_by_table = {
        "item": ["i_item_sk", "i_current_price", "i_category"],
        "store_sales": ["ss_item_sk", "ss_customer_sk", "ss_sold_date_sk"],
        "store": ["s_store_sk"],
        "date_dim": ["d_date_sk", "d_year", "d_moy"],
        "customer": ["c_customer_sk", "c_current_addr_sk"],
        "customer_address": ["ca_address_sk", "ca_state"],
    }

    return tuple(
        reader.read(table, relevant_cols=cols)
        for table, cols in columns_by_table.items()
    )
def read_tables(config):
    """Load store_sales, customer_address, customer_demographics, date_dim
    and store.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple of the five ``read`` results, in the order listed above.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    def load(table, *cols):
        # Thin wrapper so each table reads as a single line of column names.
        return reader.read(table, relevant_cols=list(cols))

    sales = load(
        "store_sales",
        "ss_quantity",
        "ss_sold_date_sk",
        "ss_addr_sk",
        "ss_store_sk",
        "ss_cdemo_sk",
        "ss_sales_price",
        "ss_net_profit",
    )
    addresses = load("customer_address", "ca_address_sk", "ca_country", "ca_state")
    demographics = load(
        "customer_demographics",
        "cd_demo_sk",
        "cd_marital_status",
        "cd_education_status",
    )
    dates = load("date_dim", "d_year", "d_date_sk")
    stores = load("store", "s_store_sk")

    return sales, addresses, demographics, dates, stores
def read_tables(config):
    """Load the item table.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        The result of reading "item" with columns ``i_category_id`` and
        ``i_item_sk`` passed as ``relevant_cols``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    return reader.read("item", relevant_cols=["i_category_id", "i_item_sk"])
def read_tables(config):
    """Load item and store_sales.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``. The column
    lists ``item_cols`` and ``ss_cols`` are not defined in this function;
    they are expected to exist at module level.

    Returns:
        Tuple ``(item, store_sales)`` holding the results of the corresponding
        ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    items = reader.read("item", relevant_cols=item_cols)
    sales = reader.read("store_sales", relevant_cols=ss_cols)

    return items, sales
def read_tables(config):
    """Load the web_clickstreams table.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        The result of reading "web_clickstreams" with the user, item,
        click-date and click-time key columns passed as ``relevant_cols``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    return reader.read(
        "web_clickstreams",
        relevant_cols=[
            "wcs_user_sk",
            "wcs_item_sk",
            "wcs_click_date_sk",
            "wcs_click_time_sk",
        ],
    )
def read_tables(config):
    """Load the product_reviews table.

    The reader is configured from ``config["file_format"]`` and
    ``config["data_dir"]``, with ``split_row_groups`` fixed to ``True``
    regardless of config.

    Returns:
        The result of reading "product_reviews" with columns ``pr_item_sk``,
        ``pr_review_content`` and ``pr_review_sk`` passed as ``relevant_cols``.
    """
    # Split by row groups for better parallelism.
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    return reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )
def read_tables(config):
    """Load item and web_sales.

    The reader is configured from ``config["file_format"]`` and
    ``config["data_dir"]`` only; no ``split_row_groups`` argument is passed.

    Returns:
        Tuple ``(item, web_sales)`` holding the results of the corresponding
        ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )

    items = reader.read("item", relevant_cols=["i_item_sk", "i_category_id"])
    web_sales = reader.read(
        "web_sales", relevant_cols=["ws_order_number", "ws_item_sk"]
    )

    return items, web_sales
def read_tables(config):
    """Load web_sales, item, item_marketprices and store_sales.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``. The column
    lists ``ws_cols``, ``item_cols``, ``imp_cols`` and ``ss_cols`` are not
    defined in this function; they are expected to exist at module level.

    Returns:
        Tuple of the four ``read`` results, in the order listed above.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    fetch = reader.read

    web_sales = fetch("web_sales", relevant_cols=ws_cols)
    items = fetch("item", relevant_cols=item_cols)
    market_prices = fetch("item_marketprices", relevant_cols=imp_cols)
    store_sales = fetch("store_sales", relevant_cols=ss_cols)

    return web_sales, items, market_prices, store_sales
def read_tables(config):
    """Load web_sales, web_returns, date_dim, item and warehouse.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``. The column
    lists (``websale_cols``, ``web_returns_cols``, ``date_cols``,
    ``item_cols``, ``warehouse_cols``) are not defined in this function; they
    are expected to exist at module level.

    Returns:
        Tuple of the five ``read`` results, in the order listed above.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    fetch = reader.read

    sales = fetch("web_sales", relevant_cols=websale_cols)
    returns = fetch("web_returns", relevant_cols=web_returns_cols)
    dates = fetch("date_dim", relevant_cols=date_cols)
    items = fetch("item", relevant_cols=item_cols)
    warehouses = fetch("warehouse", relevant_cols=warehouse_cols)

    return sales, returns, dates, items, warehouses
def read_tables(config):
    """Load date_dim, web_page and web_sales.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(date_dim, web_page, web_sales)`` holding the results of the
        corresponding ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # (table, columns) pairs; their order fixes read order and return order.
    table_specs = (
        ("date_dim", ["d_date_sk", "d_date"]),
        ("web_page", ["wp_web_page_sk", "wp_type"]),
        ("web_sales", ["ws_net_paid", "ws_order_number", "ws_sold_date_sk"]),
    )

    return tuple(
        reader.read(table, relevant_cols=cols) for table, cols in table_specs
    )
def read_tables(config):
    """Load date_dim and inventory.

    The reader is configured from ``config["file_format"]`` and
    ``config["data_dir"]`` only; no ``split_row_groups`` argument is passed.

    Returns:
        Tuple ``(date_dim, inventory)`` holding the results of the
        corresponding ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )

    dates = reader.read("date_dim", relevant_cols=["d_date_sk", "d_year", "d_moy"])
    inventory = reader.read(
        "inventory",
        relevant_cols=[
            "inv_warehouse_sk",
            "inv_item_sk",
            "inv_date_sk",
            "inv_quantity_on_hand",
        ],
    )

    return dates, inventory
def read_tables(config):
    """Load store_sales and item with ``index=False``.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(store_sales, item)`` holding the results of the corresponding
        ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    sales = reader.read(
        "store_sales",
        relevant_cols=["ss_customer_sk", "ss_item_sk"],
        index=False,
    )
    items = reader.read(
        "item",
        relevant_cols=["i_item_sk", "i_category", "i_class_id"],
        index=False,
    )

    return (sales, items)
def read_tables(config):
    """Load web_sales, store_sales, date_dim and customer.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(web_sales, store_sales, date_dim, customer)`` holding the
        results of the corresponding ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # Insertion order of this mapping fixes both read order and return order.
    columns_by_table = {
        "web_sales": [
            "ws_bill_customer_sk",
            "ws_sold_date_sk",
            "ws_ext_list_price",
            "ws_ext_wholesale_cost",
            "ws_ext_discount_amt",
            "ws_ext_sales_price",
        ],
        "store_sales": [
            "ss_customer_sk",
            "ss_sold_date_sk",
            "ss_ext_list_price",
            "ss_ext_wholesale_cost",
            "ss_ext_discount_amt",
            "ss_ext_sales_price",
        ],
        "date_dim": ["d_date_sk", "d_year", "d_moy"],
        "customer": [
            "c_customer_sk",
            "c_customer_id",
            "c_email_address",
            "c_first_name",
            "c_last_name",
            "c_preferred_cust_flag",
            "c_birth_country",
            "c_login",
        ],
    }

    return tuple(
        reader.read(table, relevant_cols=cols)
        for table, cols in columns_by_table.items()
    )
def read_tables(config):
    """Load web_sales, household_demographics, web_page and time_dim.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple of the four ``read`` results, in the order listed above.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # (table, columns) pairs; their order fixes read order and return order.
    table_specs = (
        ("web_sales", ["ws_ship_hdemo_sk", "ws_web_page_sk", "ws_sold_time_sk"]),
        ("household_demographics", ["hd_demo_sk", "hd_dep_count"]),
        ("web_page", ["wp_web_page_sk", "wp_char_count"]),
        ("time_dim", ["t_time_sk", "t_hour"]),
    )

    return tuple(
        reader.read(table, relevant_cols=cols) for table, cols in table_specs
    )
def read_tables(config):
    """Load date_dim, customer, store_sales and web_sales.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(date_dim, customer, store_sales, web_sales)`` holding the
        results of the corresponding ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    def load(table, *cols):
        # Thin wrapper so each table reads as a single line of column names.
        return reader.read(table, relevant_cols=list(cols))

    dates = load("date_dim", "d_date_sk", "d_year")
    customers = load(
        "customer",
        "c_customer_sk",
        "c_customer_id",
        "c_first_name",
        "c_last_name",
    )
    store_sales = load(
        "store_sales", "ss_sold_date_sk", "ss_customer_sk", "ss_net_paid"
    )
    web_sales = load(
        "web_sales", "ws_sold_date_sk", "ws_bill_customer_sk", "ws_net_paid"
    )

    return dates, customers, store_sales, web_sales
def read_tables(config):
    """Load inventory, item, warehouse and date_dim.

    The reader is configured from ``config["file_format"]``,
    ``config["data_dir"]`` and ``config["split_row_groups"]``.

    Returns:
        Tuple ``(inventory, item, warehouse, date_dim)`` holding the results
        of the corresponding ``read`` calls.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    # Insertion order of this mapping fixes both read order and return order.
    columns_by_table = {
        "inventory": [
            "inv_item_sk",
            "inv_warehouse_sk",
            "inv_date_sk",
            "inv_quantity_on_hand",
        ],
        "item": ["i_item_id", "i_current_price", "i_item_sk"],
        "warehouse": ["w_warehouse_sk", "w_warehouse_name"],
        "date_dim": ["d_date_sk", "d_date"],
    }

    return tuple(
        reader.read(table, relevant_cols=cols)
        for table, cols in columns_by_table.items()
    )