def test_loc_with_list():
    """Index a pandas frame and a Cylon table by the same integer column.

    Both containers receive an ``'index'`` column holding
    ``0 .. num_rows - 1`` which is then promoted to the index with
    ``drop=True``.  The function only prints the resulting shapes and the
    first pandas index value; it makes no assertions.
    """
    ctx = CylonContext(config=None, distributed=False)
    num_rows = 10_000
    num_columns = 2

    data = np.random.randn(num_rows)
    index_vals = list(range(num_rows))

    # Every data column is built from the same random vector.
    pdf = pd.DataFrame({'data{}'.format(i): data for i in range(num_columns)})
    index_df_col = pd.DataFrame(index_vals)
    pdf['index'] = index_df_col

    tb1 = Table.from_pandas(ctx, pdf)
    tb1['index'] = Table.from_pandas(ctx, index_df_col)

    index_column = 'index'
    tb1.set_index(index_column, drop=True)
    pdf.set_index(index_column, drop=True, inplace=True)

    print(tb1.shape, pdf.shape)
    i0 = pdf.index.values[0]
    print(type(i0), i0)
def test_rename():
    """Check ``Table.rename`` with a mapping and with a full list of names."""
    col_names = ['col1', 'col2', 'col3', 'col4']
    data_list_numeric = [
        [1, 2, 3, 4, 5],
        [6, 7, 8, 9, 10],
        [11, 12, 13, 14, 15],
        [16, 17, 18, 19, 20],
    ]
    index_values = [0, 1, 2, 3, 4]
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    # with dictionary
    cn_tb = Table.from_list(ctx, col_names, data_list_numeric)
    cn_tb.set_index(index_values)
    prev_col_names = cn_tb.column_names
    columns = {'col1': 'col-1', 'col3': 'col-3'}
    cn_tb.rename(columns)
    new_col_names = cn_tb.column_names
    # A renamed column must stay at the position it had before.
    for old_name, new_name in columns.items():
        assert prev_col_names.index(old_name) == new_col_names.index(new_name)

    # with list
    cn_tb_list = Table.from_list(ctx, col_names, data_list_numeric)
    cn_tb_list.set_index(index_values)
    prev_col_names = cn_tb_list.column_names
    new_column_names = ['col-1', 'col-2', 'col-3', 'col-4']
    cn_tb_list.rename(new_column_names)
    assert cn_tb_list.column_names == new_column_names
def shuffle():
    """Shuffle a small random table on column ``'c1'`` in a distributed context.

    Prints the table shape, then the rank together with the shapes of the
    shuffled table, the source table and the shuffled table after
    ``dropna(axis=1, how='all')``.  The context is finalized at the end.
    """
    ctx = CylonContext(config=MPIConfig(), distributed=True)
    rows = 5

    # This first table is replaced immediately by the random one below.
    tb: Table = Table.from_pydict(
        ctx,
        {
            'c1': list(range(rows)),
            'c2': [i * 2 for i in range(rows)],
            'c3': [i * 3 for i in range(rows)],
        },
    )
    random_columns = [np.random.random(size=rows) for _ in range(3)]
    tb: Table = Table.from_numpy(ctx, ['c1', 'c2', 'c3'], random_columns)
    print(tb.shape)

    tb_shuffle = tb.shuffle(['c1'])
    tb_shuffle_dna = tb_shuffle.dropna(axis=1, how='all')
    print("Rank : ", ctx.get_rank(), tb_shuffle.shape, tb.shape,
          tb_shuffle_dna.shape)

    from pycylon.io import CSVWriteOptions
    csv_write_options = CSVWriteOptions().with_delimiter(',')
    #
    # tb_shuffle.to_csv(f'/tmp/shuffle_{rows}_{ctx.get_rank()}.csv', csv_write_options)
    ctx.finalize()
def test_pdf_to_pdf_assign():
    """Print a masked column assignment in pandas next to Cylon column filtering.

    Exploratory only: nothing is asserted.  Uses the ``ctx`` defined outside
    this function.
    """
    index1 = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    index2 = [0, 1, 2, 3, 4]
    index3 = [10, 11, 12, 13, 14, 15, 16, 17, 7]

    pdf1 = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 110, 111, 112, 113],
            'b': [10, 11, 12, 13, 14, 5, 4, 3, 2],
        },
        index=index1)
    pdf2 = pd.DataFrame(
        {
            'a': [10, 20, 30, 40, 50],
            'b': [100, 101, 102, 103, 104],
        },
        index=index2)
    pdf3 = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 110, 111, 112, 113],
            'b': [1110, 1111, 1112, 1113, 1114, 115, 114, 113, 112],
        },
        index=index3)

    tb1 = Table.from_pandas(ctx, pdf1)
    tb1.set_index(index1)
    tb2 = Table.from_pandas(ctx, pdf2)
    tb2.set_index(index2)
    tb3 = Table.from_pandas(ctx, pdf3)
    tb3.set_index(index3)

    separator = "-----------"
    for frame in (pdf1, pdf2, pdf3):
        print(frame)
        print(separator)

    # pandas: select column 'b', then overwrite the rows where b < 6.
    gp = pdf1['b']
    # print(pdf1['b'] < 6)
    print(gp[pdf1['b'] < 6])
    print(gp)
    print(separator)
    gp[pdf1['b'] < 6] = pdf3['b']
    print(gp)

    # Cylon: select column 'b' and filter it with the same predicate.
    tb_gp = tb1['b']
    print(tb_gp)
    print(tb_gp.index.index_values)
    tb_sample = tb_gp[tb1['b'] < 6]
    print(tb_sample)
    print(tb_sample.index.index_values)
def do_comparison_on_pdf_and_tb(tb_filter: Table, tb_result: Table,
                                pdf_filter: DataFrame, pdf_result: DataFrame,
                                is_full_table):
    """Assert that a Cylon filter/result pair matches its pandas counterpart.

    Args:
        tb_filter: Cylon filter; compared after ``to_pandas()``.
        tb_result: Cylon filtered result; compared after ``to_pandas()``.
        pdf_filter: pandas filter to compare against.
        pdf_result: pandas filtered result to compare against.
        is_full_table: when truthy the filter values are compared as nested
            lists and both results have nulls replaced by ``0`` first;
            otherwise the Cylon filter values are flattened before the
            comparison and the results are compared as they are.

    Raises:
        AssertionError: if either comparison fails.
    """
    if is_full_table:
        observed_filter = tb_filter.to_pandas().values.tolist()
        assert observed_filter == pdf_filter.values.tolist()
        observed_result = tb_result.to_pandas().fillna(0).values.tolist()
        assert observed_result == pdf_result.fillna(0).values.tolist()
        return

    observed_filter = tb_filter.to_pandas().values.flatten().tolist()
    assert observed_filter == pdf_filter.values.tolist()
    observed_result = tb_result.to_pandas().values.tolist()
    assert observed_result == pdf_result.values.tolist()
def math_op(num_rows: int, num_cols: int, unique_factor: float, op=add):
    """Time a binary operator applied to a Cylon table, pandas and ``pd.eval``.

    Args:
        num_rows: forwarded to ``get_dataframe``.
        num_cols: forwarded to ``get_dataframe``.
        unique_factor: forwarded to ``get_dataframe``.
        op: two-argument callable applied as ``op(table_or_frame, scalar)``.

    Returns:
        Tuple ``(pandas_time, pandas_eval_time, cylon_time)`` in seconds.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")

    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols,
                        unique_factor=unique_factor)
    filter_column = pdf.columns[0]
    filter_column_data = pdf[pdf.columns[0]]
    # The scalar operand is a randomly chosen value from the first column.
    random_index = np.random.randint(low=0, high=pdf.shape[0])
    math_value = filter_column_data.values[random_index]
    tb = Table.from_pandas(ctx, pdf)

    started = time.time()
    tb_filter = op(tb, math_value)
    cylon_math_op_time = time.time() - started

    started = time.time()
    pdf_filter = op(pdf, math_value)  # pdf[filter_column] > filter_value
    pandas_math_op_time = time.time() - started

    # pd.eval looks up `op`, `pdf` and `math_value` by name in this frame,
    # so those names must not be changed.
    started = time.time()
    pdf_filter = pd.eval("op(pdf, math_value)")
    pandas_eval_math_op_time = time.time() - started

    return pandas_math_op_time, pandas_eval_math_op_time, cylon_math_op_time
def math_op_base():
    """Time ``key + 1`` in NumPy, PyCylon, pandas and PyArrow and print the results.

    The Arrow key array is extracted before the clock starts, so the
    "PyArrow Time" figure covers ``pc.add`` only and no longer includes
    Cylon-to-Arrow conversions.  Large intermediates that were built but
    never read (a full NumPy copy, a record batch, an Arrow table) are no
    longer created.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    num_rows = 10_000_000

    data = np.random.randn(num_rows)
    df = pd.DataFrame({'data{}'.format(i): data for i in range(100)})
    np_key = np.random.randint(0, 100, size=num_rows)
    df['key'] = np_key
    ct = Table.from_pandas(ctx, df)

    # Prepare the Arrow operand outside the timed region.
    ar_key = ct['key'].to_arrow().combine_chunks().columns[0].chunks[0]

    t1 = time.time()
    np_key + 1
    t2 = time.time()
    ct['key'] + 1
    t3 = time.time()
    df['key'] + 1
    t4 = time.time()
    pc.add(ar_key, 1)
    t5 = time.time()

    print(f"Numpy Time: {t2 - t1} s")
    print(f"PyCylon Time: {t3 - t2} s")
    print(f"Pandas Time: {t4 - t3} s")
    print(f"PyArrow Time: {t5 - t4} s")
def test_additions_and_maps():
    """Apply element-wise maps to Cylon columns, write them back and filter rows.

    Prints the pandas and Cylon dictionaries side by side; nothing is
    asserted.  ``ctx`` is taken from outside this function.
    """
    from pycylon import Table
    from pycylon import CylonContext
    import pandas as pd
    import numpy as np

    def is_yes(flag):
        return (flag == 'Y')

    pdf = pd.DataFrame({
        'idx': ['x', 'y', 'z'],
        'col-1': ["a", "b", "c"],
        'col-2': [10, 20, 30],
        'col-3': ['Y', 'N', 'Y'],
    })
    tb = Table.from_pandas(ctx, pdf)
    print(tb)

    # Compute all derived columns before any of them is written back.
    suffixed = tb['col-1'].applymap(lambda x: x + "_i")
    logged = tb['col-2'].applymap(lambda x: np.log10(x))
    keep_rows = tb['col-3'].applymap(is_yes)

    tb['col-1'] = suffixed
    tb['col-2'] = logged
    tb = tb[keep_rows]

    pdf = pdf[pdf['col-3'].map(is_yes)]
    print(pdf.to_dict())
    print(tb.to_pydict())
def null_handling_op(num_rows: int, num_cols: int, unique_factor: float):
    """Time ``dropna`` on pandas, on a Cylon table and through ``pd.eval``.

    Args:
        num_rows: forwarded to ``get_dataframe``.
        num_cols: forwarded to ``get_dataframe``.
        unique_factor: forwarded to ``get_dataframe``.

    Returns:
        Tuple ``(pandas_time, cylon_time, pandas_eval_time)`` in seconds.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    df = get_dataframe(num_rows=num_rows, num_cols=num_cols,
                       unique_factor=unique_factor, with_null=True)
    index_df = get_dataframe(num_rows=num_rows, num_cols=1,
                             unique_factor=1.0, with_null=False)

    index_column = 'index_col'
    df[index_column] = index_df
    ct = Table.from_pandas(ctx, df)
    df.set_index(index_column, inplace=True, drop=True)
    ct.set_index(index_column, indexing_type=IndexingType.LINEAR, drop=True)

    # NOTE(review): pandas is called with axis=1 and Cylon with axis=0 --
    # presumably the two libraries number the axes differently; confirm.
    begin = time.time()
    df_isna = df.dropna(axis=1)
    pandas_time = time.time() - begin

    begin = time.time()
    ct_isna = ct.dropna(axis=0)
    cylon_time = time.time() - begin

    # pd.eval resolves `df` by name in this frame; keep the name as is.
    begin = time.time()
    pd.eval('df.dropna(axis=1)')
    pandas_eval_time = time.time() - begin

    print(df_isna.shape, ct_isna.shape)
    return pandas_time, cylon_time, pandas_eval_time
def test_i_bitwise_ops():
    """Check in-place ``&=`` and ``|=`` on Cylon boolean filters against pandas."""
    # TODO: Improve test and functionality: https://github.com/cylondata/cylon/issues/229
    npr = np.array([[20, 2, 3, 4, 5],
                    [10, -20, -30, -40, -50],
                    [36.2, 13.2, 16.4, 12.2, 10.8]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    ## AND
    tb_lhs = cn_tb['0'] > 10
    tb_rhs = cn_tb['1'] > 2
    pd_lhs = pdf[0] > 10
    pd_rhs = pdf[1] > 2

    tb_binary = tb_lhs & tb_rhs
    tb_lhs &= tb_rhs
    pd_binary = pd_lhs & pd_rhs
    pd_lhs &= pd_rhs

    # The in-place form must agree with the binary form and with pandas.
    assert tb_binary.to_pandas().values.tolist() == tb_lhs.to_pandas().values.tolist()
    assert tb_lhs.to_pandas().values.flatten().tolist() == pd_lhs.values.tolist()

    ## OR
    tb_lhs = cn_tb['0'] > 10
    tb_rhs = cn_tb['1'] > 2
    pd_lhs = pdf[0] > 10
    pd_rhs = pdf[1] > 2

    tb_binary = tb_lhs | tb_rhs
    tb_lhs |= tb_rhs
    pd_binary = pd_lhs | pd_rhs
    pd_lhs |= pd_rhs

    assert tb_binary.to_pandas().values.tolist() == tb_lhs.to_pandas().values.tolist()
    assert tb_lhs.to_pandas().values.flatten().tolist() == pd_lhs.values.tolist()
def benchmark_map_numeric():
    """Print the time ``applymap`` takes on a two-column numeric table in Cylon and pandas."""
    N = 10_000_000
    first = pa.array(np.random.random(size=N))
    second = pa.array(np.random.random(size=N))
    tb = pa.Table.from_arrays([first, second], ['c1', 'c2'])

    pdf: pd.DataFrame = tb.to_pandas()
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cntb: Table = Table.from_arrow(ctx, tb)
    print(pdf.shape, cntb.shape)

    map_func = lambda x: x + x

    cylon_start = time.time()
    new_ct = cntb.applymap(map_func)
    cylon_end = time.time()
    new_pdf = pdf.applymap(map_func)
    pandas_end = time.time()

    print(f"Time for Cylon Apply Map {cylon_end - cylon_start} s")
    print(f"Time for Pandas Apply Map {pandas_end - cylon_end} s")
def test_math_ops_for_table_values():
    """Compare column-with-column and table-with-column arithmetic to pandas."""
    pdf = DataFrame({
        '0': [1, 2, 3, 4],
        '1': [5, 6, 7, 9],
        '2': [1., 2., 3., 4.],
    })
    ctx: CylonContext = CylonContext()
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    from operator import add, sub, mul, truediv
    ops = [add]  # , sub, mul, truediv]

    for op in ops:
        # test column division
        cn_column_result = op(cn_tb['0'], cn_tb['0'])
        pd_column_result = op(pdf['0'], pdf['0'])
        # pandas series.values returns an array, whereas dataframe.values list of lists. Hence it
        # needs to be flattened to compare
        cn_flat = cn_column_result.to_pandas().values.flatten().tolist()
        assert pd_column_result.values.tolist() == cn_flat

        # test table division
        cn_table_result = op(cn_tb, cn_tb['0'])
        # The pandas method named after the operator is applied along axis 0.
        pandas_method = getattr(pdf, op.__name__)
        pd_table_result = pandas_method(pdf['0'], axis=0)
        assert pd_table_result.values.tolist() == cn_table_result.to_pandas().values.tolist()
def test_setitem():
    """Check ``Table.__setitem__`` with another column and with scalar values."""
    npr = np.array([[1, 2, 3, 4, 5], [-1, -2, -3, -4, -5]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    def column_values(name):
        return cn_tb[name].to_pandas().values

    # replacing an existing column
    cn_tb['0'] = cn_tb['4']
    assert column_values('0').tolist() == column_values('4').tolist()

    # adding a new column at the end
    cn_tb['5'] = cn_tb['4']
    assert column_values('5').tolist() == column_values('4').tolist()

    # The first scalar creates column '6'; the later ones replace it.
    for scalar in (1, 1.0, 'aaa'):
        cn_tb['6'] = scalar
        assert np.array_equal(column_values('6').flatten(),
                              np.full(cn_tb.row_count, scalar))
def indexing_op(num_rows: int, num_cols: int, unique_factor: float):
    """Time ``set_index`` and list-based ``loc`` on a Cylon table and on pandas.

    Args:
        num_rows: forwarded to ``get_dataframe``.
        num_cols: forwarded to ``get_dataframe``.
        unique_factor: forwarded to ``get_dataframe``.

    Returns:
        Tuple ``(pandas_filter_time, cylon_filter_time, pdf_indexing_time,
        cylon_indexing_time)`` in seconds.
    """
    from pycylon.indexing.index import IndexingType

    ctx = cn.CylonContext(config=None, distributed=False)
    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols,
                        unique_factor=unique_factor)

    filter_column = pdf.columns[0]
    filter_column_data = pdf[pdf.columns[0]]
    random_index = np.random.randint(low=0, high=pdf.shape[0])
    filter_value = filter_column_data.values[random_index]
    # Look up the first half of the values of the first column.
    half = pdf.shape[0] // 2
    filter_values = filter_column_data.values.tolist()[0:half]
    tb = Table.from_pandas(ctx, pdf)

    tick = time.time()
    tb.set_index(filter_column, indexing_type=IndexingType.LINEAR, drop=True)
    cylon_indexing_time = time.time() - tick

    tick = time.time()
    pdf.set_index(filter_column, drop=True, inplace=True)
    pdf_indexing_time = time.time() - tick

    tick = time.time()
    tb_filter = tb.loc[filter_values]
    cylon_filter_time = time.time() - tick

    tick = time.time()
    pdf_filtered = pdf.loc[filter_values]
    pandas_filter_time = time.time() - tick

    print(tb_filter.shape, pdf_filtered.shape)
    return pandas_filter_time, cylon_filter_time, pdf_indexing_time, cylon_indexing_time
def test_reset_index():
    """Check the index type after ``set_index`` and again after ``reset_index``.

    Column ``'a'`` is promoted to a LINEAR index and dropped from the table.
    After ``reset_index(False)`` the table must expose the columns
    ``['index', 'b']`` and report a RANGE index.
    """
    from pycylon.indexing.index import IndexingType

    # The dtype is given as the type `np.int64`, not as a scalar instance.
    pdf = pd.DataFrame({
        'a': pd.Series([1, 4, 7, 10, 20, 23, 10], dtype=np.int64),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    indexing_type = IndexingType.LINEAR
    drop_index = True
    cn_tb.set_index('a', indexing_type, drop_index)
    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    rest_drop_index = False
    cn_tb.reset_index(rest_drop_index)
    assert cn_tb.column_names == ['index', 'b']
    assert cn_tb.get_index().get_type() == IndexingType.RANGE
def test_math_ops_for_scalar():
    """Compare column-with-scalar and table-with-scalar arithmetic to pandas.

    The column step writes its result back into the shared table and frame,
    so each operator runs on data already modified by the previous ones.
    """
    npr = np.array([[20, 2, 3, 4, 5],
                    [10, -20, -30, -40, -50],
                    [10.2, 13.2, 16.4, 12.2, 10.8]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    from operator import add, sub, mul, truediv

    for op in (add, sub, mul, truediv):
        # test column division
        cn_tb['0'] = op(cn_tb['0'], 2)
        pdf[0] = op(pdf[0], 2)
        assert pdf.values.tolist() == cn_tb.to_pandas().values.tolist()

        # test table division
        cn_whole = op(cn_tb, 2)
        pd_whole = op(pdf, 2)
        assert pd_whole.values.tolist() == cn_whole.to_pandas().values.tolist()
def test_loc_op_mode_3():
    """Compare ``loc`` on a string-indexed Cylon table against pandas.

    Covers a closed range, two open-ended ranges, the full slice and a list
    of labels combined with the full column slice.  For each selection the
    values and the index array must match the pandas result.
    """
    from pycylon.indexing.index import IndexingType
    from pycylon.indexing.index_utils import IndexUtil

    pdf_float = pd.DataFrame({
        'a': pd.Series(["1", "4", "7", "10", "20", "23", "11"]),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
        'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
        'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
        'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121], dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)
    indexing_type = IndexingType.LINEAR
    drop_index = True

    print("Before Indexing")
    print(cn_tb)
    cn_tb.set_index('a', indexing_type, drop_index)
    pdf_float = pdf_float.set_index('a')
    print("After Indexing")

    assert cn_tb.column_names == ['b', 'c', 'd', 'e']
    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    def select(key):
        # Apply the same key to both containers.
        return cn_tb.loc[key], pdf_float.loc[key]

    def verify(loc_cn, loc_pd):
        assert loc_pd.values.tolist() == loc_cn.to_pandas().values.tolist()
        assert loc_cn.get_index().get_index_array() == pa.array(loc_pd.index)

    # "7":"20" -- the index arrays of this first selection are also printed.
    loc_cn_1, loc_pd_1 = select(slice("7", "20"))
    print(loc_cn_1.get_index().get_index_array())
    print(loc_pd_1.index.values)
    verify(loc_cn_1, loc_pd_1)

    remaining_keys = (
        slice("7", None),             # "7":
        slice(None, "7"),             # :"7"
        slice(None),                  # :
        (["7", "20"], slice(None)),   # ["7", "20"], :
    )
    for key in remaining_keys:
        verify(*select(key))
def test_neg():
    """Check that unary minus on a Cylon table matches pandas."""
    npr = np.array([[1, 2, 3, 4, 5, -6, -7],
                    [-1, -2, -3, -4, -5, 6, 7]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    negated_table: Table = -cn_tb
    negated_frame = -pdf

    observed = negated_table.to_pandas().values.tolist()
    assert observed == negated_frame.values.tolist()
def fixed_filter_bench():
    """Print timings for building a ``key > 5`` mask and applying it.

    PyCylon, pandas and NumPy are each timed on two steps: building the
    mask ("filter") and using it to select rows ("assign").  The Cylon mask
    is then converted to Arrow and applied column by column to the Arrow
    table; that filtering and the final table construction are timed
    separately.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    num_rows = 10_000_000

    data = np.random.randn(num_rows)
    df = pd.DataFrame({'data{}'.format(i): data for i in range(2)})
    np_key = np.random.randint(0, 100, size=num_rows)
    # Snapshot of the data columns only, taken before 'key' is added.
    np_all = df.to_numpy()
    df['key'] = np_key
    arrow_table = pa.Table.from_pandas(df)
    ct = Table.from_pandas(ctx, df)
    print(ct.shape, df.shape)

    t1 = time.time()
    ct_filter = ct['key'] > 5
    t2 = time.time()
    df_filter = df['key'] > 5
    t3 = time.time()
    ct_res = ct[ct_filter]
    t4 = time.time()
    df_res = df[df_filter]
    t5 = time.time()
    np_filter = np_key > 5
    t6 = time.time()
    np_res = np_all[np_filter]
    t7 = time.time()

    print(f"PyCylon filter time : {t2 - t1} s")
    print(f"Pandas filter time: {t3 - t2} s")
    print(f"Numpy filter time: {t6 - t5} s")
    print(f"PyCylon assign time: {t4 - t3} s")
    print(f"Pandas assign time: {t5 - t4} s")
    print(f"Numpy assign time: {t7 - t6} s")

    # Convert the Cylon mask to a single Arrow array before timing starts.
    artb_filter = ct_filter.to_arrow().combine_chunks()
    artb_array_filter = artb_filter.columns[0].chunks[0]

    t_ar_s = time.time()
    artb = arrow_table.combine_chunks()
    res = [column.filter(artb_array_filter) for column in artb.itercolumns()]
    t_ar_e = time.time()
    pa.Table.from_arrays(res, ct.column_names)
    t_ar_e_2 = time.time()

    print(f"PyArrow Filter Time : {t_ar_e - t_ar_s}")
    print(f"PyArrow Table Creation : {t_ar_e_2 - t_ar_e}")
def test_tb_to_pydict_with_index():
    """Check ``Table.to_pydict(with_index=True)`` against ``DataFrame.to_dict()``.

    ``ctx`` is taken from outside this function.
    """
    records = {
        'idx': ['x', 'y', 'z'],
        'col-1': ["a", "b", "c"],
        'col-2': [10, 20, 30],
        'col-3': ['Y', 'N', 'Y'],
    }
    pdf = pd.DataFrame(records)
    tb = Table.from_pandas(ctx, pdf)

    observed = tb.to_pydict(with_index=True)
    assert observed == pdf.to_dict()
def test_add_suffix():
    """Check that ``Table.add_suffix`` yields the same column names as pandas."""
    npr = np.array([[20.2, 2.0, 3.2, 4.3, 5.5],
                    [10, -20, -30, -40, -50],
                    [36.8, 13.2, 16.4, 12.2, 10.8]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    suffix = "item_"
    cylon_names = cn_tb.add_suffix(suffix).column_names
    pandas_names = pdf.add_suffix(suffix).columns.tolist()

    assert pandas_names == cylon_names
def test_column_str_ops():
    """Remove ``'-'`` from column ``'d'`` in pandas and in Cylon and print the results.

    Reads a CSV from a hard-coded local path and uses the ``ctx`` defined
    outside this function.  Nothing is asserted.
    """
    file_path = '/home/vibhatha/data/cylon/none_data.csv'
    df = pd.read_csv(file_path, na_values='Nan', header=0)
    tb_cn = Table.from_pandas(ctx, df)

    # pandas: vectorised string replacement.
    print(df)
    df['d'] = df['d'].str.replace('-', '')
    print(df)

    # Cylon: the same replacement through an element-wise map.
    stripped = tb_cn['d'].applymap(lambda x: x.replace('-', ''))
    tb_cn['d'] = stripped
    print(tb_cn)
def test_string_type_filters():
    """Compare string comparison filters on column ``'A'`` between Cylon and pandas.

    All six rich-comparison operators are applied with the value ``"a"``;
    for each one the filter and the filtered result must match pandas.
    """
    ctx: CylonContext = CylonContext()
    tb: Table = Table.from_pydict(ctx, {
        "A": ['a', 'b', 'c', 'ab', 'a'],
        "B": [1, 2, 3, 4, 5],
    })
    pdf = tb.to_pandas()

    def generate_filter_and_result(op, column: str, source, comparison_value):
        # Compare one column when a name is given, otherwise the whole container.
        operand = source[column] if column else source
        mask = op(operand, comparison_value)
        return mask, source[mask]

    def do_comparison_on_pdf_and_tb(tb_filter: Table, tb_result: Table,
                                    pdf_filter: DataFrame,
                                    pdf_result: DataFrame, is_full_table):
        if is_full_table:
            observed_filter = tb_filter.to_pandas().values.tolist()
            assert observed_filter == pdf_filter.values.tolist()
            observed_result = tb_result.to_pandas().fillna(0).values.tolist()
            assert observed_result == pdf_result.fillna(0).values.tolist()
            return
        observed_filter = tb_filter.to_pandas().values.flatten().tolist()
        assert observed_filter == pdf_filter.values.tolist()
        observed_result = tb_result.to_pandas().values.tolist()
        assert observed_result == pdf_result.values.tolist()

    ops = [
        operator.eq, operator.ne, operator.lt,
        operator.gt, operator.le, operator.ge,
    ]
    value = "a"
    columns = ["A"]
    is_full_table_flags = [False]

    for column, is_full_table in zip(columns, is_full_table_flags):
        for op in ops:
            tb_mask, tb_selected = generate_filter_and_result(
                op, column, tb, value)
            pdf_mask, pdf_selected = generate_filter_and_result(
                op, column, pdf, value)
            do_comparison_on_pdf_and_tb(tb_filter=tb_mask,
                                        tb_result=tb_selected,
                                        pdf_filter=pdf_mask,
                                        pdf_result=pdf_selected,
                                        is_full_table=is_full_table)
def tb_creation_op(num_rows: int, num_cols: int, duplication_factor: float):
    """Time table construction from pandas, lists, NumPy arrays and Arrow.

    Args:
        num_rows: number of rows of the generated data.
        num_cols: number of columns of the generated data.
        duplication_factor: forwarded to ``get_dataframe`` as ``unique_factor``.

    Returns:
        Tuple ``(t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow)``
        in seconds.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")
    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols,
                        unique_factor=duplication_factor)

    # One random integer repeated num_rows times, in three representations.
    data_row = [np.random.randint(num_rows)] * num_rows
    data_row_ar = np.array(data_row)
    data_row_pa_ar = pa.array(data_row)

    # Every column refers to the same underlying row object.
    data_set = [data_row] * num_cols
    data_set_ar = [data_row_ar] * num_cols
    data_set_pa_ar = [data_row_pa_ar] * num_cols
    column_names = [f"data_{i}" for i in range(num_cols)]

    mark = time.time()
    tb = Table.from_pandas(ctx, pdf)
    t_pandas = time.time() - mark

    mark = time.time()
    tb1 = Table.from_list(ctx, column_names, data_set)
    t_list = time.time() - mark

    mark = time.time()
    tb2 = Table.from_numpy(ctx, column_names, data_set_ar)
    t_numpy = time.time() - mark

    mark = time.time()
    tb3 = pa.Table.from_arrays(data_set_pa_ar, column_names)
    t_arrow = time.time() - mark

    mark = time.time()
    tb4 = Table.from_arrow(ctx, tb3)
    t_cylon_from_arrow = time.time() - mark

    return t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow
def test_fillna():
    """Check that ``fillna`` leaves no ``None`` and inserts the fill value in every column."""
    col_names = ['col1', 'col2']
    data_list_numeric = [[1, 2, None, 4, 5], [6, 7, 8, 9, None]]
    fill_value = 0

    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb_numeric = Table.from_list(ctx, col_names, data_list_numeric)
    filled = cn_tb_numeric.fillna(fill_value)

    for column in filled.to_pydict().values():
        assert None not in column
        assert fill_value in column
def test_uno_data_load():
    """Load a tab-separated file, replace ``DRUG`` with integer codes and save it.

    Each distinct ``DRUG`` value is numbered by its position in the
    de-duplicated column; the original column is dropped and replaced by
    those numbers.  Paths are hard-coded and ``ctx`` comes from outside this
    function.
    """
    file_path = "/home/vibhatha/sandbox/UNO/Benchmarks/Data/Pilot1/"
    file_name = "combined_single_response_agg"
    save_file = "/tmp/combined_single_response_agg_enum"
    path = os.path.join(file_path, file_name)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(
        1 << 30).with_delimiter("\t")

    load_start = time.time()
    tb = read_csv(ctx, path, csv_read_options)
    load_end = time.time()
    print(load_end - load_start)
    print(tb.shape)
    print(tb.to_arrow())
    print(tb.column_names)

    def first_column_as_list(table):
        # Merge the chunks so the first column is one contiguous array.
        return table.to_arrow().combine_chunks().column(0).chunk(0).tolist()

    tb_drugs = tb['DRUG']
    tb_drug = tb.unique(columns=['DRUG'], keep='first')['DRUG']
    unique_drugs = first_column_as_list(tb_drug)
    all_drugs = first_column_as_list(tb_drugs)

    code_of = {drug: code for code, drug in enumerate(unique_drugs)}
    drug_codes = [code_of[drug] for drug in all_drugs]

    tb_enum_drug = Table.from_list(ctx, ['DRUG'], [drug_codes])
    print(tb_enum_drug.shape, tb_drugs.shape)

    tb = tb.drop(['DRUG'])
    tb['DRUG'] = tb_enum_drug
    print(tb.to_arrow())

    pdf = tb.to_pandas()
    pdf.to_csv(save_file, sep="\t")
def benchmark_table_conversion():
    """Print timings for table-to-NumPy conversion and element-wise mapping.

    Converts a wide table to NumPy with PyCylon and with pandas, applies
    ``x + 1`` to the NumPy array, the pandas frame and the Cylon table, and
    finally times splitting the mapped array into a list of columns.
    """
    N = 10_000
    num_cols = 10_000
    a_rand = np.random.random(size=N)

    # Every column shares the same random vector.
    data = [a_rand] * num_cols
    col_names = ['c_' + str(i) for i in range(num_cols)]

    tb = pa.Table.from_arrays(data, col_names)
    pdf: pd.DataFrame = tb.to_pandas()
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cntb: Table = Table.from_arrow(ctx, tb)
    print(pdf.shape, cntb.shape)

    def map_func(x):
        return x + 1

    t1 = time.time()
    npr_cn = cntb.to_numpy()
    t2 = time.time()
    npr_df = pdf.to_numpy()
    t3 = time.time()

    t4 = time.time()
    npr_map = map_func(npr_df)
    t5 = time.time()
    pdf_map = pdf.applymap(map_func)
    t6 = time.time()
    cn_map = cntb.applymap(map_func)
    t7 = time.time()

    print(f"PyCylon Time to table to Numpy {t2 - t1} s")
    print(f"Pandas Time to table to Numpy {t3 - t2} s")
    print(f"Numpy virtual map time : {t5-t4} s")
    print(f"Pandas map time : {t6 - t5} s")
    print(f"PyCylon map time : {t7 - t6} s")
    print(npr_map.shape)

    # Take column i with [:, i]. The previous [:][i] returned row i, which
    # only worked because the array happens to be square.
    tx = time.time()
    lst = [npr_map[:, i] for i in range(npr_map.shape[1])]
    ty = time.time()
    print(ty - tx)
def test_default_indexing():
    """Check that a table built from pandas has the same index values as the frame.

    ``ctx`` is taken from outside this function.
    """
    records = {
        'idx': ['x', 'y', 'z'],
        'col-1': ["a", "b", "c"],
        'col-2': [10, 20, 30],
        'col-3': ['Y', 'N', 'Y'],
    }
    pdf = pd.DataFrame(records)
    tb = Table.from_pandas(ctx, pdf)

    cylon_index = tb.index.index_values
    pandas_index = pdf.index.values.tolist()
    assert cylon_index == pandas_index
def test_non_numeric_applymap():
    """Check that ``applymap`` on string columns gives the same values as pandas."""
    first_names = pa.array(['Rayan', 'Reynolds', 'Jack', 'Mat'])
    last_names = pa.array(['Cameron', 'Selena', 'Roger', 'Murphy'])
    tb: pa.Table = pa.Table.from_arrays([first_names, last_names], ['c1', 'c2'])

    pdf: pd.DataFrame = tb.to_pandas()
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cntb: Table = Table.from_arrow(ctx, tb)

    map_func = lambda x: "Hello, " + x

    mapped_table = cntb.applymap(map_func)
    mapped_frame = pdf.applymap(map_func)

    observed = mapped_table.to_pandas().values.tolist()
    assert observed == mapped_frame.values.tolist()
def test_invert():
    """Check that ``~`` on a boolean Cylon table matches pandas."""
    # Bool Invert Test
    row = [False, True, False, True, True]
    data_list = [list(row), list(row)]
    pdf = DataFrame(data_list)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb = Table.from_pandas(ctx, pdf)

    inverted_table = ~cn_tb
    inverted_frame = ~pdf

    observed = inverted_table.to_pandas().values.tolist()
    assert observed == inverted_frame.values.tolist()