def read_tables(config):
    """Read the four input tables and return them as a tuple.

    ``date_dim``, ``store_returns`` and ``web_returns`` are read through a
    reader built without a ``split_row_groups`` argument; ``product_reviews``
    is read through a second reader built with ``split_row_groups=True``.

    Args:
        config: Mapping-like object providing the ``"file_format"`` and
            ``"data_dir"`` entries that are forwarded to ``build_reader``.

    Returns:
        Tuple ``(date_dim, store_returns, web_returns, product_reviews)`` of
        the objects returned by the readers.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )
    date_dim = reader.read(
        "date_dim",
        relevant_cols=["d_week_seq", "d_date_sk", "d_date"],
    )
    store_returns = reader.read(
        "store_returns",
        relevant_cols=["sr_returned_date_sk", "sr_item_sk", "sr_return_quantity"],
    )
    web_returns = reader.read(
        "web_returns",
        relevant_cols=["wr_returned_date_sk", "wr_item_sk", "wr_return_quantity"],
    )

    ### splitting by row groups for better parallelism
    row_group_reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    reviews = row_group_reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )
    return date_dim, store_returns, web_returns, reviews
def read_tables(config):
    """Read ``store_sales``, ``date_dim``, ``store`` and ``product_reviews``.

    The first three tables share one reader; ``product_reviews`` is read
    through a separate reader built with ``split_row_groups=True``.

    Args:
        config: Mapping-like object providing the ``"file_format"`` and
            ``"data_dir"`` entries that are forwarded to ``build_reader``.

    Returns:
        Tuple ``(store_sales, date_dim, store, product_reviews)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )
    plain_tables = (
        ("store_sales", ["ss_store_sk", "ss_sold_date_sk", "ss_net_paid"]),
        ("date_dim", ["d_date_sk", "d_date"]),
        ("store", ["s_store_sk", "s_store_name"]),
    )
    # The generator is consumed in order, so the reads happen in the listed
    # sequence before the second reader is built.
    sales, dates, stores = (
        reader.read(table, relevant_cols=cols) for table, cols in plain_tables
    )

    ### splitting by row groups for better parallelism
    row_group_reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    reviews = row_group_reader.read(
        "product_reviews",
        relevant_cols=["pr_review_date", "pr_review_content", "pr_review_sk"],
    )
    return sales, dates, stores, reviews
def read_tables(config):
    """Read the seven input tables through a single reader.

    The column lists (``store_sales_cols``, ``item_cols``, ...) are not
    defined in this function; they are looked up from the enclosing scope at
    call time.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        Tuple ``(store_sales, item, customer, store, date_dim,
        customer_address, promotion)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    def load(table, cols):
        # Thin wrapper so every table is requested the same way.
        return reader.read(table, relevant_cols=cols)

    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        load("store_sales", store_sales_cols),
        load("item", item_cols),
        load("customer", customer_cols),
        load("store", store_cols),
        load("date_dim", date_cols),
        load("customer_address", customer_address_cols),
        load("promotion", promotion_cols),
    )
def read_tables():
    """Read six tables from the directory named by ``cli_args["data_dir"]``.

    ``cli_args`` is not defined in this function; it is taken from the
    enclosing scope.

    Returns:
        Tuple ``(item, store_sales, store, date_dim, customer,
        customer_address)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    # (table name, columns passed as ``relevant_cols``), in read/return order.
    table_columns = (
        ("item", ["i_item_sk", "i_current_price", "i_category"]),
        ("store_sales", ["ss_item_sk", "ss_customer_sk", "ss_sold_date_sk"]),
        ("store", ["s_store_sk"]),
        ("date_dim", ["d_date_sk", "d_year", "d_moy"]),
        ("customer", ["c_customer_sk", "c_current_addr_sk"]),
        ("customer_address", ["ca_address_sk", "ca_state"]),
    )
    return tuple(
        reader.read(table, relevant_cols=cols) for table, cols in table_columns
    )
def read_tables():
    """Read ``product_reviews``, ``web_sales`` and ``date_dim``.

    Reader settings come from ``cli_args``, which is taken from the enclosing
    scope. The ``product_reviews`` result is repartitioned to a single
    partition before the other tables are read.

    Returns:
        Tuple ``(product_reviews, web_sales, date_dim)``.
    """
    reader = build_reader(
        data_format=cli_args["file_format"],
        basepath=cli_args["data_dir"],
        split_row_groups=cli_args["split_row_groups"],
    )

    # we only read int columns here so it should scale up to sf-10k as just 26M rows
    reviews = reader.read(
        "product_reviews",
        relevant_cols=["pr_review_rating", "pr_item_sk"],
    ).repartition(npartitions=1)

    web_sales = reader.read(
        "web_sales",
        relevant_cols=["ws_sold_date_sk", "ws_net_paid", "ws_item_sk"],
    )
    dates = reader.read("date_dim", relevant_cols=["d_date_sk", "d_date"])
    return reviews, web_sales, dates
def read_tables():
    """Read ``store_sales`` and ``store_returns``.

    Reader settings come from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        Tuple ``(store_sales, store_returns)``.
    """
    reader_options = {
        "data_format": cli_args["file_format"],
        "basepath": cli_args["data_dir"],
        "split_row_groups": cli_args["split_row_groups"],
    }
    reader = build_reader(**reader_options)

    sales = reader.read(
        "store_sales",
        relevant_cols=[
            "ss_customer_sk",
            "ss_ticket_number",
            "ss_item_sk",
            "ss_net_paid",
        ],
    )
    returns = reader.read(
        "store_returns",
        relevant_cols=[
            "sr_item_sk",
            "sr_customer_sk",
            "sr_ticket_number",
            "sr_return_amt",
        ],
    )
    return sales, returns
def read_tables(config):
    """Read the five input tables through a single reader.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        Tuple ``(store_sales, customer_address, customer_demographics,
        date_dim, store)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    # (table name, columns passed as ``relevant_cols``), in read/return order.
    table_columns = (
        (
            "store_sales",
            [
                "ss_quantity",
                "ss_sold_date_sk",
                "ss_addr_sk",
                "ss_store_sk",
                "ss_cdemo_sk",
                "ss_sales_price",
                "ss_net_profit",
            ],
        ),
        ("customer_address", ["ca_address_sk", "ca_country", "ca_state"]),
        (
            "customer_demographics",
            ["cd_demo_sk", "cd_marital_status", "cd_education_status"],
        ),
        ("date_dim", ["d_year", "d_date_sk"]),
        ("store", ["s_store_sk"]),
    )
    return tuple(
        reader.read(table, relevant_cols=cols) for table, cols in table_columns
    )
def read_tables():
    """Read ``store_sales``, ``web_sales`` and ``date_dim`` with ``index=False``.

    Reader settings come from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        Tuple ``(store_sales, web_sales, date_dim)``.
    """
    reader = build_reader(
        data_format=cli_args["file_format"],
        basepath=cli_args["data_dir"],
        split_row_groups=cli_args["split_row_groups"],
    )

    def load(table, cols):
        # Every table in this function is read with ``index=False``.
        return reader.read(table, relevant_cols=cols, index=False)

    store_sales = load(
        "store_sales",
        ["ss_customer_sk", "ss_sold_date_sk", "ss_ticket_number", "ss_net_paid"],
    )
    web_sales = load(
        "web_sales",
        ["ws_bill_customer_sk", "ws_sold_date_sk", "ws_order_number", "ws_net_paid"],
    )
    dates = load("date_dim", ["d_date_sk", "d_date"])
    return store_sales, web_sales, dates
def read_tables(config):
    """Read six tables through a single reader configured from ``config``.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        Tuple ``(item, store_sales, store, date_dim, customer,
        customer_address)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    # Insertion order of this dict is the read order and the return order.
    columns_by_table = {
        "item": ["i_item_sk", "i_current_price", "i_category"],
        "store_sales": ["ss_item_sk", "ss_customer_sk", "ss_sold_date_sk"],
        "store": ["s_store_sk"],
        "date_dim": ["d_date_sk", "d_year", "d_moy"],
        "customer": ["c_customer_sk", "c_current_addr_sk"],
        "customer_address": ["ca_address_sk", "ca_state"],
    }
    return tuple(
        reader.read(table, relevant_cols=cols)
        for table, cols in columns_by_table.items()
    )
def read_tables():
    """Read the ``product_reviews`` table.

    The base path comes from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        The object returned by the reader for ``product_reviews``.
    """
    ### splitting by row groups for better parallelism
    reader = build_reader(
        basepath=cli_args["data_dir"],
        split_row_groups=True,
    )
    return reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )
def read_tables(config):
    """Read the ``web_clickstreams`` table.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        The object returned by the reader for ``web_clickstreams``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    return reader.read(
        "web_clickstreams",
        relevant_cols=[
            "wcs_user_sk",
            "wcs_item_sk",
            "wcs_click_date_sk",
            "wcs_click_time_sk",
        ],
    )
def read_tables():
    """Read ``web_sales``, ``item``, ``item_marketprices`` and ``store_sales``.

    ``cli_args`` and the column lists (``ws_cols``, ``item_cols``,
    ``imp_cols``, ``ss_cols``) are not defined in this function; they are
    looked up from the enclosing scope at call time.

    Returns:
        Tuple ``(web_sales, item, item_marketprices, store_sales)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    ### read tables
    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        reader.read("web_sales", relevant_cols=ws_cols),
        reader.read("item", relevant_cols=item_cols),
        reader.read("item_marketprices", relevant_cols=imp_cols),
        reader.read("store_sales", relevant_cols=ss_cols),
    )
def read_tables():
    """Read ``store_sales``, ``date_dim`` and ``item``.

    ``cli_args`` and the column lists (``store_sales_cols``, ``date_cols``,
    ``item_cols``) are not defined in this function; they are looked up from
    the enclosing scope at call time.

    Returns:
        Tuple ``(store_sales, date_dim, item)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        reader.read("store_sales", relevant_cols=store_sales_cols),
        reader.read("date_dim", relevant_cols=date_cols),
        reader.read("item", relevant_cols=item_cols),
    )
def read_tables():
    """Read ``store_sales`` and ``item`` with ``index=False``.

    The base path comes from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        Tuple ``(store_sales, item)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    # (table name, columns passed as ``relevant_cols``), in read/return order.
    table_columns = (
        ("store_sales", ["ss_customer_sk", "ss_item_sk"]),
        ("item", ["i_item_sk", "i_category", "i_class_id"]),
    )
    return tuple(
        reader.read(table, relevant_cols=cols, index=False)
        for table, cols in table_columns
    )
def read_tables(config):
    """Read ``item`` and ``store_sales``.

    The column lists ``item_cols`` and ``ss_cols`` are not defined in this
    function; they are looked up from the enclosing scope at call time.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        Tuple ``(item, store_sales)``.
    """
    reader_options = {
        "data_format": config["file_format"],
        "basepath": config["data_dir"],
        "split_row_groups": config["split_row_groups"],
    }
    reader = build_reader(**reader_options)
    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        reader.read("item", relevant_cols=item_cols),
        reader.read("store_sales", relevant_cols=ss_cols),
    )
def read_tables(config):
    """Read the ``item`` table.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        The object returned by the reader for ``item``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    return reader.read("item", relevant_cols=["i_category_id", "i_item_sk"])
def read_tables():
    """Read ``web_sales``, ``web_returns``, ``date_dim``, ``item``, ``warehouse``.

    ``cli_args`` and the column lists (``websale_cols``, ``web_returns_cols``,
    ``date_cols``, ``item_cols``, ``warehouse_cols``) are not defined in this
    function; they are looked up from the enclosing scope at call time.

    Returns:
        Tuple ``(web_sales, web_returns, date_dim, item, warehouse)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])

    def load(table, cols):
        # Thin wrapper so every table is requested the same way.
        return reader.read(table, relevant_cols=cols)

    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        load("web_sales", websale_cols),
        load("web_returns", web_returns_cols),
        load("date_dim", date_cols),
        load("item", item_cols),
        load("warehouse", warehouse_cols),
    )
def read_tables():
    """Read ``item``, ``store_sales`` and ``web_clickstreams``.

    ``cli_args`` and the column lists (``item_cols``, ``store_sales_cols``,
    ``web_clickstreams_cols``) are not defined in this function; they are
    looked up from the enclosing scope at call time.

    Returns:
        Tuple ``(item, store_sales, web_clickstreams)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        reader.read("item", relevant_cols=item_cols),
        reader.read("store_sales", relevant_cols=store_sales_cols),
        reader.read("web_clickstreams", relevant_cols=web_clickstreams_cols),
    )
def read_tables(config):
    """Read ``item`` and ``web_sales``.

    Args:
        config: Mapping-like object providing the ``"file_format"`` and
            ``"data_dir"`` entries that are forwarded to ``build_reader``.

    Returns:
        Tuple ``(item, web_sales)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
    )
    items = reader.read("item", relevant_cols=["i_item_sk", "i_category_id"])
    web_sales = reader.read(
        "web_sales",
        relevant_cols=["ws_order_number", "ws_item_sk"],
    )
    return items, web_sales
def read_tables():
    """Read the ``product_reviews`` table.

    The base path comes from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        The object returned by the reader for ``product_reviews``.
    """
    ### splitting by row groups for better parallelism
    reader = build_reader(
        basepath=cli_args["data_dir"],
        split_row_groups=True,
    )
    return reader.read(
        "product_reviews",
        relevant_cols=["pr_review_content", "pr_review_rating", "pr_review_sk"],
    )
def read_tables():
    """Read ``date_dim``, ``web_page`` and ``web_sales``.

    The base path comes from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        Tuple ``(date_dim, web_page, web_sales)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    # (table name, columns passed as ``relevant_cols``), in read/return order.
    table_columns = (
        ("date_dim", ["d_date_sk", "d_date"]),
        ("web_page", ["wp_web_page_sk", "wp_type"]),
        ("web_sales", ["ws_net_paid", "ws_order_number", "ws_sold_date_sk"]),
    )
    return tuple(
        reader.read(table, relevant_cols=cols) for table, cols in table_columns
    )
def read_tables():
    """Read ``item``, ``customer`` and ``customer_demographics``.

    All three tables are read with ``index=False``. ``cli_args`` and the
    column lists (``items_columns``, ``customer_columns``,
    ``customer_dem_columns``) are not defined in this function; they are
    looked up from the enclosing scope at call time.

    Returns:
        Tuple ``(item, customer, customer_demographics)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        reader.read("item", relevant_cols=items_columns, index=False),
        reader.read("customer", relevant_cols=customer_columns, index=False),
        reader.read(
            "customer_demographics",
            relevant_cols=customer_dem_columns,
            index=False,
        ),
    )
def read_tables():
    """Read ``web_sales``, ``item``, ``item_marketprices`` and ``store_sales``.

    ``cli_args`` and the column lists (``ws_cols``, ``item_cols``,
    ``imp_cols``, ``ss_cols``) are not defined in this function; they are
    looked up from the enclosing scope at call time.

    Returns:
        Tuple ``(web_sales, item, item_marketprices, store_sales)``.
    """
    reader = build_reader(
        data_format=cli_args["file_format"],
        basepath=cli_args["data_dir"],
        split_row_groups=cli_args["split_row_groups"],
    )

    def load(table, cols):
        # Thin wrapper so every table is requested the same way.
        return reader.read(table, relevant_cols=cols)

    ### read tables
    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        load("web_sales", ws_cols),
        load("item", item_cols),
        load("item_marketprices", imp_cols),
        load("store_sales", ss_cols),
    )
def read_tables(config):
    """Read ``web_sales``, ``web_returns``, ``date_dim``, ``item``, ``warehouse``.

    The column lists (``websale_cols``, ``web_returns_cols``, ``date_cols``,
    ``item_cols``, ``warehouse_cols``) are not defined in this function; they
    are looked up from the enclosing scope at call time.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        Tuple ``(web_sales, web_returns, date_dim, item, warehouse)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        reader.read("web_sales", relevant_cols=websale_cols),
        reader.read("web_returns", relevant_cols=web_returns_cols),
        reader.read("date_dim", relevant_cols=date_cols),
        reader.read("item", relevant_cols=item_cols),
        reader.read("warehouse", relevant_cols=warehouse_cols),
    )
def read_tables():
    """Read ``store_sales``, ``date_dim`` and ``item``.

    ``cli_args`` and the column lists (``store_sales_cols``, ``date_cols``,
    ``item_cols``) are not defined in this function; they are looked up from
    the enclosing scope at call time.

    Returns:
        Tuple ``(store_sales, date_dim, item)``.
    """
    reader = build_reader(
        data_format=cli_args["file_format"],
        basepath=cli_args["data_dir"],
        split_row_groups=cli_args["split_row_groups"],
    )

    def load(table, cols):
        # Thin wrapper so every table is requested the same way.
        return reader.read(table, relevant_cols=cols)

    # Tuple elements are evaluated left to right, which keeps the read order.
    return (
        load("store_sales", store_sales_cols),
        load("date_dim", date_cols),
        load("item", item_cols),
    )
def read_tables():
    """Read ``date_dim`` and ``inventory``.

    The base path comes from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        Tuple ``(date_dim, inventory)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    dates = reader.read(
        "date_dim",
        relevant_cols=["d_date_sk", "d_year", "d_moy"],
    )
    inventory = reader.read(
        "inventory",
        relevant_cols=[
            "inv_warehouse_sk",
            "inv_item_sk",
            "inv_date_sk",
            "inv_quantity_on_hand",
        ],
    )
    return dates, inventory
def read_tables(config):
    """Read the ``product_reviews`` table.

    Args:
        config: Mapping-like object providing the ``"file_format"`` and
            ``"data_dir"`` entries that are forwarded to ``build_reader``.

    Returns:
        The object returned by the reader for ``product_reviews``.
    """
    ### splitting by row groups for better parallelism
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=True,
    )
    return reader.read(
        "product_reviews",
        relevant_cols=["pr_item_sk", "pr_review_content", "pr_review_sk"],
    )
def read_tables(config):
    """Read ``date_dim``, ``web_page`` and ``web_sales``.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        Tuple ``(date_dim, web_page, web_sales)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )
    # Insertion order of this dict is the read order and the return order.
    columns_by_table = {
        "date_dim": ["d_date_sk", "d_date"],
        "web_page": ["wp_web_page_sk", "wp_type"],
        "web_sales": ["ws_net_paid", "ws_order_number", "ws_sold_date_sk"],
    }
    return tuple(
        reader.read(table, relevant_cols=cols)
        for table, cols in columns_by_table.items()
    )
def read_tables():
    """Read ``web_page`` and ``web_clickstreams``.

    The base path comes from ``cli_args``, which is taken from the enclosing
    scope.

    Returns:
        Tuple ``(web_page, web_clickstreams)``.
    """
    reader = build_reader(basepath=cli_args["data_dir"])
    web_pages = reader.read(
        "web_page",
        relevant_cols=["wp_type", "wp_web_page_sk"],
    )
    clickstreams = reader.read(
        "web_clickstreams",
        relevant_cols=[
            "wcs_user_sk",
            "wcs_click_date_sk",
            "wcs_click_time_sk",
            "wcs_web_page_sk",
            "wcs_sales_sk",
        ],
    )
    return web_pages, clickstreams
def read_tables(config):
    """Read ``store_sales`` and ``item`` with ``index=False``.

    Args:
        config: Mapping-like object providing the ``"file_format"``,
            ``"data_dir"`` and ``"split_row_groups"`` entries that are
            forwarded to ``build_reader``.

    Returns:
        Tuple ``(store_sales, item)``.
    """
    reader = build_reader(
        data_format=config["file_format"],
        basepath=config["data_dir"],
        split_row_groups=config["split_row_groups"],
    )

    def load(table, cols):
        # Both tables in this function are read with ``index=False``.
        return reader.read(table, relevant_cols=cols, index=False)

    store_sales = load("store_sales", ["ss_customer_sk", "ss_item_sk"])
    items = load("item", ["i_item_sk", "i_category", "i_class_id"])
    return store_sales, items