def assert_eq(df_c: DataFrame, df_p: pd.DataFrame, sort=False):
    if sort:
        # sort by every column
        print(df_c.sort_values(by=[*range(0, df_c.shape[1])]).to_numpy(
            order='F', zero_copy_only=False))
        print(df_p.to_numpy())
        assert np.array_equal(
            df_c.sort_values(
                by=[*range(0, df_c.shape[1])]).to_pandas().to_numpy(),
            df_p.sort_values(by=[*range(0, df_p.shape[1])]).to_numpy())
    else:
        assert np.array_equal(df_c.to_numpy(order='F', zero_copy_only=False),
                              df_p.to_numpy())
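# A hedged usage sketch (not part of the original test module): shows the two comparison
# modes of assert_eq above. The data values are made up; DataFrame, pd, and np are assumed
# to be imported at the top of this module, as the helper itself requires.
def _example_assert_eq_usage():
    data = [[3, 1, 2], [30, 10, 20]]        # two columns of three values each
    df_c = DataFrame(data)                  # Cylon: nested lists become columns
    df_p = pd.DataFrame(np.array(data).T)   # pandas: transpose to get the same layout
    assert_eq(df_c, df_p)                   # rows in the same order: exact comparison
    assert_eq(df_c, df_p, sort=True)        # order-insensitive: sorts by all columns first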
def test_df_dist_sorting():
    df1 = DataFrame(
        [random.sample(range(10, 30), 5), random.sample(range(10, 30), 5)])

    def check_sort(df, col, ascending):
        arr = df.to_pandas()[col]
        for i in range(len(arr) - 1):
            if ascending:
                assert arr[i] <= arr[i + 1]
            else:
                assert arr[i] >= arr[i + 1]

    # local sort
    df = df1.sort_values('0', ascending=True)
    check_sort(df, '0', True)

    df = df1.sort_values('0', ascending=False)
    check_sort(df, '0', False)

    # distributed sort
    env = CylonEnv(config=MPIConfig(), distributed=True)
    print("Distributed Sort", env.rank, env.world_size)

    df3 = df1.sort_values(by=[0], env=env, ascending=True)
    check_sort(df3, '0', True)

    df3 = df1.sort_values(by=[0], env=env, ascending=False)
    check_sort(df3, '0', False)
def create_high_price_items_df(env: CylonEnv, item_df: DataFrame) -> DataFrame:
    """
    grouped_item_df = (
        item_df[["i_category", "i_current_price"]]
        .groupby(["i_category"])
        .agg({"i_current_price": "mean"})
    )
    grouped_item_df = grouped_item_df.rename(columns={"i_current_price": "avg_price"})
    grouped_item_df = grouped_item_df.reset_index(drop=False)
    item_df = item_df.merge(grouped_item_df)
    item_df = item_df[
        item_df["i_current_price"] > item_df["avg_price"] * q07_HIGHER_PRICE_RATIO
    ].reset_index(drop=True)
    high_price_items_df = item_df
    del item_df
    return high_price_items_df
    """
    # item_df = ["i_item_sk", "i_current_price", "i_category"]
    grouped_item_df = item_df[["i_category", "i_current_price"]].groupby(by=0, env=env) \
        .agg({"i_current_price": 'mean'})
    # [i_category, i_current_price_mean]
    grouped_item_df.rename({"mean_i_current_price": "avg_price"})
    # [i_category, avg_price]

    # todo: this join has unbalanced work & can create empty tables
    item_df = item_df.merge(grouped_item_df, how='inner', algorithm='sort',
                            on=['i_category'], suffixes=("", "rt-"), env=env) \
        .drop(['rt-i_category'])
    item_df.rename({"rt-avg_price": "avg_price"})
    # [i_item_sk, i_current_price, i_category, avg_price]

    if len(item_df):
        item_df = item_df[item_df["i_current_price"] >
                          item_df["avg_price"] * q07_HIGHER_PRICE_RATIO]
    # [i_item_sk, i_current_price, i_category, avg_price]
    return item_df
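# Hedged usage sketch (not from the original q07 file): item_df is expected to carry exactly
# the three columns noted above. The values below are made up, and constructing the frame
# from a pandas DataFrame is an assumption based on other examples in this repository.
#
# item_pdf = pd.DataFrame({"i_item_sk": [1, 2, 3, 4],
#                          "i_current_price": [5.0, 15.0, 8.0, 2.0],
#                          "i_category": ["Books", "Books", "Music", "Music"]})
# high_price_items_df = create_high_price_items_df(env, DataFrame(item_pdf))
# # keeps only the items priced above q07_HIGHER_PRICE_RATIO times their category average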
def read(self, env: CylonEnv, table, relevant_cols=None, **kwargs) -> DataFrame:
    filepath = self.table_path_mapping[table].replace('$TABLE', table)

    names, _ = get_schema(table)

    # csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    # .with_delimiter('|')
    read_opts = ReadOptions(column_names=names, block_size=(1 << 30))
    parse_opts = ParseOptions(delimiter='|')
    convert_opts = ConvertOptions(include_columns=relevant_cols)

    # if the table is in the refresh tables list, read that table as well and concat
    # NOTE: refresh tables have the same parallelism as their data tables
    if table in REFRESH_TABLES:
        data_table = pa_read_csv(filepath, read_options=read_opts,
                                 parse_options=parse_opts, convert_options=convert_opts)
        refresh_path = filepath.replace('/data/', '/data_refresh/')
        refresh_table = pa_read_csv(refresh_path, read_options=read_opts,
                                    parse_options=parse_opts, convert_options=convert_opts)
        pa_table = pa_concat_tables([data_table, refresh_table])
    else:
        pa_table = pa_read_csv(filepath, read_options=read_opts,
                               parse_options=parse_opts, convert_options=convert_opts)

    return DataFrame(Table.from_arrow(env.context, pa_table))
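# Hedged usage sketch (hypothetical names): assuming this method sits on a loader object whose
# table_path_mapping maps table names to paths containing a '$TABLE' placeholder, a query
# would typically request only the columns it needs:
#
# loader.table_path_mapping = {"item": "/path/to/data/$TABLE.dat"}   # illustrative mapping
# item_df = loader.read(env, "item",
#                       relevant_cols=["i_item_sk", "i_current_price", "i_category"])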
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

import random

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig

df1 = DataFrame([random.sample(range(10, 100), 50),
                 random.sample(range(10, 100), 50)])

# local sort
df3 = df1.sort_values(by=[0])
print("Local Sort")
print(df3)

# distributed sort
env = CylonEnv(config=MPIConfig())

df1 = DataFrame([random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
                 random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)])
print("Distributed Sort", env.rank)
df3 = df1.sort_values(by=[0], env=env)
print(df3)
##
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

from pycylon import DataFrame

df1 = DataFrame([[0, 0, 1, 1], [1, 10, 1, 5], [10, 20, 30, 40]])

df3 = df1.groupby(by=0).agg({"1": "sum", "2": "min"})
print(df3)

df4 = df1.groupby(by=0).min()
print(df4)

df5 = df1.groupby(by=[0, 1]).max()
print(df5)
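# A hedged sketch of the same aggregation running distributed (not part of this example file):
# elsewhere in this codebase groupby also accepts a CylonEnv via `env=`, so the grouping is
# performed across MPI workers. Launch with mpirun to get more than one rank.
#
# from pycylon import CylonEnv
# from pycylon.net import MPIConfig
#
# env = CylonEnv(config=MPIConfig())
# df6 = df1.groupby(by=0, env=env).agg({"1": "sum", "2": "min"})
# print(df6)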
def create_df(data) -> Union[DataFrame, pd.DataFrame]:
    # transposing with np.array(data).T is a temporary fix: Cylon's DataFrame treats the
    # nested lists as columns, while pandas treats them as rows
    return DataFrame(data), pd.DataFrame(np.array(data).T)
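# Hedged usage sketch: create_df pairs a Cylon DataFrame with its pandas equivalent so the two
# can be checked against each other with assert_eq (assumed to be in scope in this module).
def _example_create_df_usage():
    df_c, df_p = create_df([[1, 2, 3], [10, 20, 30]])
    assert_eq(df_c, df_p, sort=True)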
def test_pd_read_csv():
    env = CylonEnv(config=MPIConfig())

    df1 = DataFrame(pd.read_csv('/tmp/user_usage_tm_1.csv'))
    df2 = DataFrame(pd.read_csv('/tmp/user_device_tm_1.csv'))

    df1 = df1.set_index([3], drop=True)
    df2 = df2.set_index([0], drop=True)

    df1.to_table().retain_memory(False)
    df2.to_table().retain_memory(False)

    df3 = df1.merge(right=df2, left_on=[3], right_on=[0], algorithm='sort', env=env)

    assert len(df3)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

from pycylon import DataFrame, read_csv, CylonEnv
from pycylon.net import MPIConfig
import sys
import pandas as pd

# using cylon native reader
df = read_csv(sys.argv[1])
print(df)

# using pandas to load csv
df = DataFrame(
    pd.read_csv(
        "http://data.un.org/_Docs/SYB/CSV/SYB63_1_202009_Population,%20Surface%20Area%20and%20Density.csv",
        skiprows=6,
        usecols=[0, 1]))
print(df)

# loading json
df = DataFrame(pd.read_json("https://api.exchangerate-api.com/v4/latest/USD"))
print(df)

# distributed loading : run in distributed mode with MPI or UCX
env = CylonEnv(config=MPIConfig())
df = read_csv(sys.argv[1], slice=True, env=env)
print(df)
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

import random

import pycylon as cn
from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig

df1 = DataFrame(
    [random.sample(range(10, 100), 5), random.sample(range(10, 100), 5)])
df2 = DataFrame(
    [random.sample(range(10, 100), 5), random.sample(range(10, 100), 5)])
df3 = DataFrame(
    [random.sample(range(10, 100), 10), random.sample(range(10, 100), 10)])

# local concat
df4 = cn.concat(axis=0, objs=[df1, df2, df3])
print("Local concat axis0")
print(df4)

df2.rename(['00', '11'])
df3.rename(['000', '111'])
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig
import random

df1 = DataFrame(
    [random.sample(range(10, 100), 50), random.sample(range(10, 100), 50)])
df2 = DataFrame(
    [random.sample(range(10, 100), 50), random.sample(range(10, 100), 50)])
df2.set_index([0], inplace=True)

# local join
df3 = df1.join(other=df2, on=[0])
print("Local Join")
print(df3)

# distributed join
env = CylonEnv(config=MPIConfig())

df1 = DataFrame([
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig
import random

df1 = DataFrame(
    [random.sample(range(10, 100), 50), random.sample(range(10, 100), 50)])
df2 = DataFrame(
    [random.sample(range(10, 100), 50), random.sample(range(10, 100), 50)])

# local merge
df3 = df1.merge(right=df2, on=[0])
print("Local Merge")
print(df3)

# distributed merge
env = CylonEnv(config=MPIConfig())

df1 = DataFrame([
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig
import random

df1 = DataFrame(
    [random.sample(range(10, 100), 50), random.sample(range(10, 100), 50)])

# local unique
df3 = df1.drop_duplicates()
print("Local Unique")
print(df3)

# distributed unique
env = CylonEnv(config=MPIConfig())

df1 = DataFrame([
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
    random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)
])
print("Distributed Unique", env.rank)
def main(env, config):
    # import cudf

    # Conf variables
    q09_year = 2001

    q09_part1_ca_country = "United States"
    # q09_part1_ca_state_IN = "KY", "GA", "NM"
    q09_part1_ca_state_IN = ["KY", "GA", "NM"]
    q09_part1_net_profit_min = 0
    q09_part1_net_profit_max = 2000
    q09_part1_education_status = "4 yr Degree"
    q09_part1_marital_status = "M"
    q09_part1_sales_price_min = 100
    q09_part1_sales_price_max = 150

    q09_part2_ca_country = "United States"
    # q09_part2_ca_state_IN = "MT", "OR", "IN"
    q09_part2_ca_state_IN = ["MT", "OR", "IN"]
    q09_part2_net_profit_min = 150
    q09_part2_net_profit_max = 3000
    q09_part2_education_status = "4 yr Degree"
    q09_part2_marital_status = "M"
    q09_part2_sales_price_min = 50
    q09_part2_sales_price_max = 200

    q09_part3_ca_country = "United States"
    # q09_part3_ca_state_IN = "WI", "MO", "WV"
    q09_part3_ca_state_IN = ["WI", "MO", "WV"]
    q09_part3_net_profit_min = 50
    q09_part3_net_profit_max = 25000
    q09_part3_education_status = "4 yr Degree"
    q09_part3_marital_status = "M"
    q09_part3_sales_price_min = 150
    q09_part3_sales_price_max = 200

    """
    (
        store_sales,
        customer_address,
        customer_demographics,
        date_dim,
        store,
    ) = benchmark(
        read_tables,
        config=config,
        compute_result=config["get_read_time"],
        dask_profile=config["dask_profile"],
    )
    """
    (
        store_sales,
        customer_address,
        customer_demographics_1part,
        date_dim_1part,
        store,
    ) = read_tables(env, config=config)

    """
    date_dim = date_dim.query(
        "d_year==@q09_year", meta=date_dim._meta, local_dict={"q09_year": q09_year}
    ).reset_index(drop=True)
    """
    date_dim_1part = date_dim_1part[date_dim_1part['d_year'] == q09_year]
    # d_year: int64
    # d_date_sk: int64

    """
    output_table = store_sales.merge(
        date_dim, left_on=["ss_sold_date_sk"], right_on=["d_date_sk"], how="inner"
    )
    """
    output_table = store_sales.merge(date_dim_1part, how="inner", algorithm='sort',
                                     left_on=["ss_sold_date_sk"], right_on=["d_date_sk"],
                                     suffixes=('', ''))  # local
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])
    # ss_quantity: int64
    # ss_sold_date_sk: int64
    # ss_addr_sk: int64
    # ss_store_sk: int64
    # ss_cdemo_sk: int64
    # ss_sales_price: double
    # ss_net_profit: double
    # d_year: int64
    # d_date_sk: int64

    """
    output_table = output_table.drop(
        columns=["d_year", "d_date_sk", "ss_sold_date_sk"]
    )  # Drop the columns that are not needed
    """
    # Drop the columns that are not needed
    output_table = output_table.drop(
        ["d_year", "d_date_sk", "ss_sold_date_sk"])

    """
    output_table = output_table.merge(
        store, left_on=["ss_store_sk"], right_on=["s_store_sk"], how="inner"
    )
    output_table = output_table.drop(columns=["ss_store_sk", "s_store_sk"])
    """
    output_table = output_table.merge(store, how="inner", algorithm='sort',
                                      left_on=["ss_store_sk"], right_on=["s_store_sk"],
                                      suffixes=('', ''), env=env)
    output_table = output_table.drop(["ss_store_sk", "s_store_sk"])
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])
    # ss_quantity: int64
    # ss_addr_sk: int64
    # ss_cdemo_sk: int64
    # ss_sales_price: double
    # ss_net_profit: double

    """
    output_table = output_table.merge(
        customer_demographics,
        left_on=["ss_cdemo_sk"],
        right_on=["cd_demo_sk"],
        how="inner",
    )
    """
    output_table = output_table.merge(customer_demographics_1part, how="inner",
                                      algorithm='sort', left_on=["ss_cdemo_sk"],
                                      right_on=["cd_demo_sk"], suffixes=('', ''))  # local
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])
    # ss_quantity: int64
    # ss_addr_sk: int64
    # ss_cdemo_sk: int64
    # ss_sales_price: double
    # ss_net_profit: double
    # cd_demo_sk: int64
    # cd_marital_status: string
    # cd_education_status: string

    """
    output_table = output_table[
        (
            (output_table["cd_marital_status"] == q09_part1_marital_status)
            & (output_table["cd_education_status"] == q09_part1_education_status)
            & (output_table["ss_sales_price"] >= q09_part1_sales_price_min)
            & (output_table["ss_sales_price"] <= q09_part1_sales_price_max)
        )
        | (
            (output_table["cd_marital_status"] == q09_part2_marital_status)
            & (output_table["cd_education_status"] == q09_part2_education_status)
            & (output_table["ss_sales_price"] >= q09_part2_sales_price_min)
            & (output_table["ss_sales_price"] <= q09_part2_sales_price_max)
        )
        | (
            (output_table["cd_marital_status"] == q09_part3_marital_status)
            & (output_table["cd_education_status"] == q09_part3_education_status)
            & (output_table["ss_sales_price"] >= q09_part3_sales_price_min)
            & (output_table["ss_sales_price"] <= q09_part3_sales_price_max)
        )
    ].reset_index(drop=True)
    output_table = output_table.drop(
        columns=[
            "ss_cdemo_sk",
            "cd_demo_sk",
            "cd_marital_status",
            "cd_education_status",
            "ss_sales_price",
        ]
    )
    """
    output_table = output_table[
        ((output_table["cd_marital_status"] == q09_part1_marital_status) &
         (output_table["cd_education_status"] == q09_part1_education_status) &
         (output_table["ss_sales_price"] >= q09_part1_sales_price_min) &
         (output_table["ss_sales_price"] <= q09_part1_sales_price_max)) |
        ((output_table["cd_marital_status"] == q09_part2_marital_status) &
         (output_table["cd_education_status"] == q09_part2_education_status) &
         (output_table["ss_sales_price"] >= q09_part2_sales_price_min) &
         (output_table["ss_sales_price"] <= q09_part2_sales_price_max)) |
        ((output_table["cd_marital_status"] == q09_part3_marital_status) &
         (output_table["cd_education_status"] == q09_part3_education_status) &
         (output_table["ss_sales_price"] >= q09_part3_sales_price_min) &
         (output_table["ss_sales_price"] <= q09_part3_sales_price_max))]
    output_table = output_table.drop([
        "ss_cdemo_sk",
        "cd_demo_sk",
        "cd_marital_status",
        "cd_education_status",
        "ss_sales_price",
    ])
    # ss_quantity: int64
    # ss_addr_sk: int64
    # ss_net_profit: double

    """
    output_table = output_table.merge(
        customer_address,
        left_on=["ss_addr_sk"],
        right_on=["ca_address_sk"],
        how="inner",
    )"""
    output_table = output_table.merge(customer_address, how="inner", algorithm='sort',
                                      left_on=["ss_addr_sk"], right_on=["ca_address_sk"],
                                      suffixes=('', ''), env=env)
    # output_table.rename([x.split('-')[1] for x in output_table.column_names])

    """
    output_table = output_table[
        (
            (output_table["ca_country"] == q09_part1_ca_country)
            & (output_table["ca_state"].str.contains("|".join(q09_part1_ca_state_IN)))
            & (output_table["ss_net_profit"] >= q09_part1_net_profit_min)
            & (output_table["ss_net_profit"] <= q09_part1_net_profit_max)
        )
        | (
            (output_table["ca_country"] == q09_part2_ca_country)
            & (output_table["ca_state"].str.contains("|".join(q09_part2_ca_state_IN)))
            & (output_table["ss_net_profit"] >= q09_part2_net_profit_min)
            & (output_table["ss_net_profit"] <= q09_part2_net_profit_max)
        )
        | (
            (output_table["ca_country"] == q09_part3_ca_country)
            & (output_table["ca_state"].str.contains("|".join(q09_part3_ca_state_IN)))
            & (output_table["ss_net_profit"] >= q09_part3_net_profit_min)
            & (output_table["ss_net_profit"] <= q09_part3_net_profit_max)
        )
    ].reset_index(drop=True)
    output_table = output_table.drop(
        columns=[
            "ss_addr_sk",
            "ca_address_sk",
            "ca_country",
            "ca_state",
            "ss_net_profit",
        ]
    )"""
    output_table = output_table[
        ((output_table["ca_country"] == q09_part1_ca_country) &
         (output_table["ca_state"].isin(q09_part1_ca_state_IN)) &
         (output_table["ss_net_profit"] >= q09_part1_net_profit_min) &
         (output_table["ss_net_profit"] <= q09_part1_net_profit_max)) |
        ((output_table["ca_country"] == q09_part2_ca_country) &
         (output_table["ca_state"].isin(q09_part2_ca_state_IN)) &
         (output_table["ss_net_profit"] >= q09_part2_net_profit_min) &
         (output_table["ss_net_profit"] <= q09_part2_net_profit_max)) |
        ((output_table["ca_country"] == q09_part3_ca_country) &
         (output_table["ca_state"].isin(q09_part3_ca_state_IN)) &
         (output_table["ss_net_profit"] >= q09_part3_net_profit_min) &
         (output_table["ss_net_profit"] <= q09_part3_net_profit_max))]
    output_table = output_table.drop([
        "ss_addr_sk",
        "ca_address_sk",
        "ca_country",
        "ca_state",
        "ss_net_profit",
    ])

    ### this is a scalar, so no need to transform
    """
    result = output_table["ss_quantity"].sum().persist()
    result = result.compute()

    result_df = cudf.DataFrame({"sum(ss_quantity)": [result]})
    """
    # print(output_table.to_arrow())
    result = DataFrame(output_table.to_table().sum("ss_quantity"))
    result.rename(["sum(ss_quantity)"])
    print(result)

    return result
def inventory_before_after(df: DataFrame, date) -> DataFrame:
    # quantity on hand before the cut-off date: zero out rows dated on/after it
    df["inv_before"] = df["inv_quantity_on_hand"].copy()
    df.loc[df["d_date"] >= date, "inv_before"] = 0

    # quantity on hand on/after the cut-off date: zero out rows dated before it
    df["inv_after"] = df["inv_quantity_on_hand"].copy()
    df.loc[df["d_date"] < date, "inv_after"] = 0
    return df
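# Hedged usage sketch (hypothetical variable names): inv_df is assumed to be inventory already
# joined with date_dim, so it carries d_date and inv_quantity_on_hand; q22_date stands in for
# the query's cut-off date parameter.
#
# inv_df = inventory_before_after(inv_df, q22_date)
# # inv_before holds the on-hand quantity for rows dated before q22_date (0 otherwise);
# # inv_after holds the quantity for rows dated on or after q22_date (0 otherwise).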