def null_handling_op(num_rows: int, num_cols: int, unique_factor: float):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    df = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor,
                       with_null=True)
    index_df = get_dataframe(num_rows=num_rows, num_cols=1, unique_factor=1.0, with_null=False)
    index_column = 'index_col'
    df[index_column] = index_df

    ct = Table.from_pandas(ctx, df)
    df.set_index(index_column, inplace=True, drop=True)
    ct.set_index(index_column, indexing_type=IndexingType.LINEAR, drop=True)

    # time pandas dropna (axis=1 drops columns containing nulls)
    pandas_time = time.time()
    df_isna = df.dropna(axis=1)
    pandas_time = time.time() - pandas_time

    # time Cylon dropna
    cylon_time = time.time()
    ct_isna = ct.dropna(axis=0)
    cylon_time = time.time() - cylon_time

    # time pandas dropna again via pd.eval
    pandas_eval_time = time.time()
    pd.eval('df.dropna(axis=1)')
    pandas_eval_time = time.time() - pandas_eval_time

    print(df_isna.shape, ct_isna.shape)
    return pandas_time, cylon_time, pandas_eval_time

def math_op(num_rows: int, num_cols: int, unique_factor: float, op=add):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")
    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor)
    filter_column = pdf.columns[0]
    filter_column_data = pdf[pdf.columns[0]]
    # pick a random value from the first column as the scalar operand
    random_index = np.random.randint(low=0, high=pdf.shape[0])
    math_value = filter_column_data.values[random_index]

    tb = Table.from_pandas(ctx, pdf)

    # time the scalar math op on the Cylon table
    cylon_math_op_time = time.time()
    tb_filter = op(tb, math_value)
    cylon_math_op_time = time.time() - cylon_math_op_time

    # time the same op on the pandas dataframe
    pandas_math_op_time = time.time()
    pdf_filter = op(pdf, math_value)
    pandas_math_op_time = time.time() - pandas_math_op_time

    # time the same op via pd.eval
    pandas_eval_math_op_time = time.time()
    pdf_filter = pd.eval("op(pdf, math_value)")
    pandas_eval_math_op_time = time.time() - pandas_eval_math_op_time

    return pandas_math_op_time, pandas_eval_math_op_time, cylon_math_op_time
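
# Hedged usage sketch for math_op. It assumes the module-level `add` used as the
# default `op` is operator.add, so any binary callable taking a (table/dataframe,
# scalar) pair can be benchmarked. The sizes below are illustrative only, not the
# values used in the published benchmark runs.
def run_math_op_example():
    from operator import add, mul
    for op in (add, mul):
        pandas_t, pandas_eval_t, cylon_t = math_op(num_rows=100_000, num_cols=4,
                                                   unique_factor=0.9, op=op)
        print(op.__name__, "pandas:", pandas_t, "pd.eval:", pandas_eval_t, "cylon:", cylon_t)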

def indexing_op(num_rows: int, num_cols: int, unique_factor: float):
    from pycylon.indexing.index import IndexingType

    ctx: cn.CylonContext = cn.CylonContext(config=None, distributed=False)
    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor)
    filter_column = pdf.columns[0]
    filter_column_data = pdf[pdf.columns[0]]
    random_index = np.random.randint(low=0, high=pdf.shape[0])
    filter_value = filter_column_data.values[random_index]
    # use the first half of the column values as the loc filter
    filter_values = filter_column_data.values.tolist()[0:pdf.shape[0] // 2]

    tb = Table.from_pandas(ctx, pdf)

    # time index creation on the Cylon table
    cylon_indexing_time = time.time()
    tb.set_index(filter_column, indexing_type=IndexingType.LINEAR, drop=True)
    cylon_indexing_time = time.time() - cylon_indexing_time

    # time index creation on the pandas dataframe
    pdf_indexing_time = time.time()
    pdf.set_index(filter_column, drop=True, inplace=True)
    pdf_indexing_time = time.time() - pdf_indexing_time

    # time loc-based filtering on the Cylon table
    cylon_filter_time = time.time()
    tb_filter = tb.loc[filter_values]
    cylon_filter_time = time.time() - cylon_filter_time

    # time loc-based filtering on the pandas dataframe
    pandas_filter_time = time.time()
    pdf_filtered = pdf.loc[filter_values]
    pandas_filter_time = time.time() - pandas_filter_time

    print(tb_filter.shape, pdf_filtered.shape)
    return pandas_filter_time, cylon_filter_time, pdf_indexing_time, cylon_indexing_time

def join_op(num_rows: int, num_cols: int, algorithm: str, unique_factor: float):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    pdf_left = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor,
                             stringify=False)
    pdf_right = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor,
                              stringify=False)

    # NOTE: sort join breaks when the data is loaded in-memory via a pandas dataframe,
    # so both tables are written to CSV and read back through the Cylon CSV reader.
    pdf_left.to_csv("/tmp/left_table.csv", index=False)
    pdf_right.to_csv("/tmp/right_table.csv", index=False)

    csv_read_options = CSVReadOptions() \
        .use_threads(True) \
        .block_size(1 << 30)  # 1 GiB read block
    tb_left = read_csv(ctx, "/tmp/left_table.csv", csv_read_options)
    tb_right = read_csv(ctx, "/tmp/right_table.csv", csv_read_options)

    join_col = tb_left.column_names[0]

    # time the Cylon join
    cylon_time = time.time()
    tb2 = tb_left.join(tb_right, join_type='inner', algorithm=algorithm, on=[join_col])
    cylon_time = time.time() - cylon_time

    # time the pandas join
    pandas_time = time.time()
    pdf2 = pdf_left.join(pdf_right, how="inner", on=join_col, lsuffix="_l", rsuffix="_r")
    pandas_time = time.time() - pandas_time

    # time the pandas join via pd.eval
    pandas_eval_time = time.time()
    pdf2 = pd.eval(
        "pdf_left.join(pdf_right, how='inner', on=join_col, lsuffix='_l', rsuffix='_r')"
    )
    pandas_eval_time = time.time() - pandas_eval_time

    ctx.finalize()
    return pandas_time, cylon_time, pandas_eval_time

def duplicate_op(num_rows: int, num_cols: int, filter_size: int, unique_factor: float):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor)
    tb = Table.from_pandas(ctx, pdf)
    # deduplicate on the first `filter_size` columns
    filter_columns = tb.column_names[0:filter_size]

    # time Cylon duplicate removal
    cylon_time = time.time()
    tb2 = tb.unique(columns=filter_columns)
    cylon_time = time.time() - cylon_time

    # time pandas duplicate removal
    pandas_time = time.time()
    pdf2 = pdf.drop_duplicates(subset=filter_columns)
    pandas_time = time.time() - pandas_time

    # time pandas duplicate removal via pd.eval
    pandas_eval_time = time.time()
    pdf2 = pd.eval("pdf.drop_duplicates(subset=filter_columns)")
    pandas_eval_time = time.time() - pandas_eval_time

    return pandas_time, cylon_time, pandas_eval_time

def tb_creation_op(num_rows: int, num_cols: int, duplication_factor: float):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")
    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=duplication_factor)

    # data row as a Python list
    data_row = [np.random.randint(num_rows)] * num_rows
    # data row as a NumPy array
    data_row_ar = np.array(data_row)
    # data row as a PyArrow array
    data_row_pa_ar = pa.array(data_row)

    data_set = [data_row for i in range(num_cols)]
    data_set_ar = [data_row_ar for i in range(num_cols)]
    data_set_pa_ar = [data_row_pa_ar for i in range(num_cols)]
    column_names = ["data_" + str(i) for i in range(num_cols)]

    # Cylon table from a pandas dataframe
    t_pandas = time.time()
    tb = Table.from_pandas(ctx, pdf)
    t_pandas = time.time() - t_pandas

    # Cylon table from Python lists
    t_list = time.time()
    tb1 = Table.from_list(ctx, column_names, data_set)
    t_list = time.time() - t_list

    # Cylon table from NumPy arrays
    t_numpy = time.time()
    tb2 = Table.from_numpy(ctx, column_names, data_set_ar)
    t_numpy = time.time() - t_numpy

    # Arrow table from PyArrow arrays
    t_arrow = time.time()
    tb3 = pa.Table.from_arrays(data_set_pa_ar, column_names)
    t_arrow = time.time() - t_arrow

    # Cylon table from an Arrow table
    t_cylon_from_arrow = time.time()
    tb4 = Table.from_arrow(ctx, tb3)
    t_cylon_from_arrow = time.time() - t_cylon_from_arrow

    return t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow
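
# Hedged usage sketch for tb_creation_op: prints the five construction paths side
# by side. The sizes are illustrative; `duplication_factor` is simply forwarded to
# get_dataframe as its unique_factor, as in the function above.
def run_tb_creation_example():
    t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow = tb_creation_op(
        num_rows=100_000, num_cols=4, duplication_factor=0.9)
    print("Table.from_pandas:      ", t_pandas)
    print("Table.from_numpy:       ", t_numpy)
    print("Table.from_list:        ", t_list)
    print("pa.Table.from_arrays:   ", t_arrow)
    print("Table.from_arrow:       ", t_cylon_from_arrow)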

def isin_op(num_rows: int, num_cols: int, filter_size: int, unique_factor: float):
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "arrow")
    df = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor)
    ct = Table.from_pandas(ctx, df)

    # comparison values drawn from a standard normal distribution
    cmp_data = np.random.randn(filter_size)
    cmp_data = cmp_data.tolist()

    # time pandas isin
    pandas_time = time.time()
    df.isin(cmp_data)
    pandas_time = time.time() - pandas_time

    # time Cylon isin
    cylon_time = time.time()
    ct.isin(cmp_data)
    cylon_time = time.time() - cylon_time

    # time pandas isin via pd.eval
    pandas_eval_time = time.time()
    pd.eval('df.isin(cmp_data)')
    pandas_eval_time = time.time() - pandas_eval_time

    return pandas_time, cylon_time, pandas_eval_time
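
# Hedged driver sketch: the benchmark entry points presumably take their sizes from
# the command line; this block only wires the ops above together with small,
# illustrative arguments so the module can be smoke-tested. algorithm="hash" is an
# assumed join-algorithm name ('sort' is the one mentioned in the note inside
# join_op); adjust both the algorithm and the sizes for real measurements.
if __name__ == "__main__":
    rows, cols, factor = 100_000, 4, 0.9
    print("null handling:", null_handling_op(rows, cols, factor))
    print("indexing:     ", indexing_op(rows, cols, factor))
    print("join:         ", join_op(rows, cols, algorithm="hash", unique_factor=factor))
    print("duplicates:   ", duplicate_op(rows, cols, filter_size=2, unique_factor=factor))
    print("isin:         ", isin_op(rows, cols, filter_size=1_000, unique_factor=factor))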