def shuffle():
    mpi_config = MPIConfig()
    ctx = CylonContext(config=mpi_config, distributed=True)

    rows = 5
    # note: this table is immediately overwritten by the random table below
    tb: Table = Table.from_pydict(ctx, {'c1': [i for i in range(rows)],
                                        'c2': [i * 2 for i in range(rows)],
                                        'c3': [i * 3 for i in range(rows)]})
    tb: Table = Table.from_numpy(ctx, ['c1', 'c2', 'c3'],
                                 [np.random.random(size=rows),
                                  np.random.random(size=rows),
                                  np.random.random(size=rows)])
    print(tb.shape)

    tb_shuffle = tb.shuffle(['c1'])
    tb_shuffle_dna = tb_shuffle.dropna(axis=1, how='all')
    print("Rank : ", ctx.get_rank(), tb_shuffle.shape, tb.shape, tb_shuffle_dna.shape)

    from pycylon.io import CSVWriteOptions
    csv_write_options = CSVWriteOptions().with_delimiter(',')
    # tb_shuffle.to_csv(f'/tmp/shuffle_{rows}_{ctx.get_rank()}.csv', csv_write_options)

    ctx.finalize()
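# A minimal sketch of persisting the shuffled partitions, modeled on the
# commented-out to_csv call above. The output path pattern is illustrative,
# and the imports assume pycylon's top-level exports.
import numpy as np
from pycylon import CylonContext, Table
from pycylon.io import CSVWriteOptions
from pycylon.net import MPIConfig


def write_shuffled_partitions():
    ctx = CylonContext(config=MPIConfig(), distributed=True)
    tb = Table.from_numpy(ctx, ['c1', 'c2'],
                          [np.random.random(size=10), np.random.random(size=10)])
    tb_shuffle = tb.shuffle(['c1'])  # repartition rows on 'c1' across the ranks
    csv_write_options = CSVWriteOptions().with_delimiter(',')
    # one output file per rank, so writers never collide
    tb_shuffle.to_csv(f'/tmp/shuffle_part_{ctx.get_rank()}.csv', csv_write_options)
    ctx.finalize()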
def test_df_dist_sorting():
    df1 = DataFrame([random.sample(range(10, 30), 5),
                     random.sample(range(10, 30), 5)])

    def check_sort(df, col, ascending):
        arr = df.to_pandas()[col]
        for i in range(len(arr) - 1):
            if ascending:
                assert arr[i] <= arr[i + 1]
            else:
                assert arr[i] >= arr[i + 1]

    # local sort
    df = df1.sort_values('0', ascending=True)
    check_sort(df, '0', True)

    df = df1.sort_values('0', ascending=False)
    check_sort(df, '0', False)

    # distributed sort
    env = CylonEnv(config=MPIConfig(), distributed=True)
    print("Distributed Sort", env.rank, env.world_size)

    df3 = df1.sort_values(by=[0], env=env, ascending=True)
    check_sort(df3, '0', True)

    df3 = df1.sort_values(by=[0], env=env, ascending=False)
    check_sort(df3, '0', False)
def test_distributed_sort():
    import numpy as np

    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()
    assert size == 4

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    print(tb1)

    tb2 = tb1.distributed_sort(order_by='use_id')

    col_data = tb2['use_id'].to_numpy()
    col_data = np.reshape(col_data, (col_data.shape[0]))

    def is_sort_array(array):
        # verify the column is locally non-decreasing on this rank
        for i in range(array.shape[0] - 1):
            if array[i] > array[i + 1]:
                return False
        return True

    assert is_sort_array(col_data)
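# A self-contained sketch of distributed_sort on generated data, so the sort
# can be exercised without the /tmp CSV inputs; the column names and sizes
# here are illustrative.
import numpy as np
from pycylon import CylonContext, Table
from pycylon.net import MPIConfig


def distributed_sort_sketch():
    ctx = CylonContext(config=MPIConfig(), distributed=True)
    tb = Table.from_numpy(ctx, ['key', 'val'],
                          [np.random.randint(0, 100, size=20),
                           np.random.random(size=20)])
    tb_sorted = tb.distributed_sort(order_by='key')
    keys = tb_sorted['key'].to_numpy().reshape(-1)
    # each rank should end up holding a locally non-decreasing slice
    assert all(keys[i] <= keys[i + 1] for i in range(len(keys) - 1))
    ctx.finalize()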
def test_shuffle():
    # only testing the functionality;
    # accuracy was verified via the shuffle op and unique cases in the C++ test suite
    mpi_config = MPIConfig()
    ctx = cn.CylonContext(config=mpi_config, distributed=True)
    tb: cn.Table = None

    rank = ctx.get_rank()
    size = ctx.get_world_size()
    assert size == 2

    if rank == 0:
        tb = cn.Table.from_pydict(ctx, {'c1': [1, 1, 3, 3, 4, 5],
                                        'c2': [2, 2, 2, 4, 6, 6],
                                        'c3': [3, 3, 3, 5, 7, 7]})
    if rank == 1:
        tb = cn.Table.from_pydict(ctx, {'c1': [5, 1, 1, 4, 1, 10],
                                        'c2': [6, 2, 1, 5, 0, 1],
                                        'c3': [7, 3, 0, 5, 1, 5]})

    tb = tb.distributed_unique(['c1', 'c2', 'c3'])
def test_multi_process():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank, size = ctx.get_rank(), ctx.get_world_size()
    assert size == 4

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    table1_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print(tb1.column_names)
    print(tb2.column_names)

    configs = {'join_type': 'inner', 'algorithm': 'sort', 'left_col': 0, 'right_col': 0}

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0],
                                      right_on=[3])
    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['use_id'],
                                      right_on=['use_id'])
    tb5: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['use_id'])

    assert tb3.column_count == tb4.column_count == tb5.column_count == 8

    if rank == 0:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 640
    if rank == 1:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 624
    if rank == 2:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 592
    if rank == 3:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 688
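# A compact sketch of distributed_join that avoids external CSV files by
# building tiny per-rank tables with from_pydict; the column names and values
# are illustrative.
from pycylon import CylonContext, Table
from pycylon.net import MPIConfig


def join_sketch():
    ctx = CylonContext(config=MPIConfig(), distributed=True)
    left = Table.from_pydict(ctx, {'id': [1, 2, 3], 'x': [10, 20, 30]})
    right = Table.from_pydict(ctx, {'id': [2, 3, 4], 'y': [200, 300, 400]})
    joined = left.distributed_join(table=right, join_type='inner',
                                   algorithm='sort', on=['id'])
    print(ctx.get_rank(), joined.row_count, joined.column_count)
    ctx.finalize()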
def multi_process():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print(tb1.column_names)
    print(tb2.column_names)

    configs = {'join_type': 'inner', 'algorithm': 'sort', 'left_col': 0, 'right_col': 0}

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0],
                                      right_on=[3])
    tb3.show()

    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['use_id'],
                                      right_on=['use_id'])
    tb4.show()

    tb5: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['use_id'])
    tb5.show()

    # tb6: Table = tb1.distributed_join(ctx, table=tb2,
    #                                   join_type=configs['join_type'],
    #                                   algorithm=configs['algorithm'],
    #                                   on=[0])
    # tb6.show()

    ctx.finalize()
def test_pd_read_csv():
    env = CylonEnv(config=MPIConfig())

    df1 = DataFrame(pd.read_csv('/tmp/user_usage_tm_1.csv'))
    df2 = DataFrame(pd.read_csv('/tmp/user_device_tm_1.csv'))

    df1 = df1.set_index([3], drop=True)
    df2 = df2.set_index([0], drop=True)

    df1.to_table().retain_memory(False)
    df2.to_table().retain_memory(False)

    df3 = df1.merge(right=df2, left_on=[3], right_on=[0], algorithm='sort', env=env)
    assert len(df3)
def test_conversion_check():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank, size = ctx.get_rank(), ctx.get_world_size()
    assert size == 2

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    tb3: Table = tb1.distributed_join(table=tb2, join_type='inner', algorithm='sort',
                                      left_on=[3], right_on=[0])

    # pdf: pd.DataFrame = tb3.to_pandas()
    npy: np.ndarray = tb3.to_numpy(order='C')

    # Cylon table rows must equal the rows of the pandas dataframe extracted from the table
    # assert tb3.row_count == pdf.shape[0]
    # Cylon table columns must equal the columns of the pandas dataframe extracted from the table
    # assert tb3.column_count == pdf.shape[1]

    # Cylon table rows must equal the rows of the numpy ndarray extracted from the table
    assert tb3.row_count == npy.shape[0]
    # Cylon table columns must equal the columns of the numpy ndarray extracted from the table
    assert tb3.column_count == npy.shape[1]

    print(f"Rank[{ctx.get_rank()}]: Table.Rows={tb3.row_count}, "
          f"Table.Columns={tb3.column_count}, Numpy Array Shape = {npy.shape}")
    print(f"Array Config Rank[{ctx.get_rank()}], {npy.flags} {npy.dtype}")
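# A small local sketch of the table-to-numpy conversion checked above; no MPI
# launch is assumed to be needed, since the context is created with
# distributed=False.
import numpy as np
from pycylon import CylonContext, Table


def conversion_sketch():
    ctx = CylonContext(config=None, distributed=False)
    tb = Table.from_pydict(ctx, {'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})
    arr: np.ndarray = tb.to_numpy(order='C')  # row-major copy of the table
    assert arr.shape == (tb.row_count, tb.column_count)
    ctx.finalize()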
def test_distributed_run():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_device_tm_1.csv'
    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    configs = {'join_type': 'inner', 'algorithm': 'sort'}

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[3],
                                      right_on=[0])

    row_count = tb3.row_count
    column_count = tb3.column_count

    assert ctx.get_world_size() == 4
    assert column_count == 8

    rank = ctx.get_rank()
    if rank == 0:
        assert row_count == 640
    elif rank == 1:
        assert row_count == 624
    elif rank == 2:
        assert row_count == 592
    elif rank == 3:
        assert row_count == 688
    else:
        raise Exception("Parallelism not supported in this test")
def test_data_split():
    mpi_config = MPIConfig()
    env: CylonEnv = CylonEnv(config=mpi_config, distributed=True)

    rows = 100
    data_file = "/tmp/test_split.csv"

    if env.rank == 0:
        # remove the file if it already exists
        try:
            os.remove(data_file)
        except OSError:
            pass
        # rows + 1 lines, since the first line is consumed as the header on read
        data = numpy.random.randint(100, size=(rows + 1, 4))
        with open(data_file, 'w') as f:
            numpy.savetxt(f, data, delimiter=",", fmt='%1f')

    env.barrier()

    data_full = read_csv(data_file, slice=False, env=env)
    data = read_csv(data_file, slice=True, env=env)

    np_data = data.to_numpy()
    np_data_full = data_full.to_numpy()

    seg_size = int(rows / env.world_size)
    # each rank's slice must match the corresponding segment of the full read
    for i in range(0, seg_size):
        assert numpy.array_equal(np_data[i], np_data_full[(seg_size * env.rank) + i])

    env.barrier()
    if env.rank == 0:
        os.remove(data_file)
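# A brief sketch contrasting the two read modes exercised above, reusing the
# same env-aware read_csv imported for the test (the path is illustrative):
# slice=False has every rank read the whole file, while slice=True gives each
# rank one contiguous, roughly equal segment of the rows.
def read_modes_sketch(env, path="/tmp/test_split.csv"):
    full = read_csv(path, slice=False, env=env)   # all rows on every rank
    part = read_csv(path, slice=True, env=env)    # ~rows/world_size per rank
    print(env.rank, part.to_numpy().shape, full.to_numpy().shape)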
def test_distributed_ra():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()
    assert size == 4

    # note: both paths point to the same file, so tb1 and tb2 hold identical
    # data on each rank (hence the zero subtract counts asserted below)
    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print("First Hello World From Rank {}, Size {}".format(ctx.get_rank(), ctx.get_world_size()))

    tb3: Table = tb1.distributed_join(table=tb2, join_type='inner', algorithm='hash',
                                      left_on=[0], right_on=[0])
    tb4: Table = tb1.distributed_union(tb2)
    tb5: Table = tb1.distributed_subtract(tb2)
    tb6: Table = tb1.distributed_intersect(tb2)

    ctx.barrier()

    join_row_count = tb3.row_count
    join_column_count = tb3.column_count
    subtract_row_count = tb5.row_count
    subtract_column_count = tb5.column_count
    union_row_count = tb4.row_count
    union_column_count = tb4.column_count
    intersect_row_count = tb6.row_count
    intersect_column_count = tb6.column_count

    if rank == 0:
        assert join_row_count == 1424 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 112 and union_column_count == 4
        assert intersect_row_count == 112 and intersect_column_count == 4

    if rank == 1:
        assert join_row_count == 1648 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 122 and union_column_count == 4
        assert intersect_row_count == 122 and intersect_column_count == 4

    if rank == 2:
        assert join_row_count == 2704 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 102 and union_column_count == 4
        assert intersect_row_count == 102 and intersect_column_count == 4

    if rank == 3:
        assert join_row_count == 1552 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 144 and union_column_count == 4
        assert intersect_row_count == 144 and intersect_column_count == 4
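# A minimal sketch of the distributed set operations on tiny in-memory tables
# (values illustrative). In the test above tb1 and tb2 are identical, which is
# why every subtract count is zero; here the inputs differ, so the subtract
# result is expected to be non-empty.
from pycylon import CylonContext, Table
from pycylon.net import MPIConfig


def set_ops_sketch():
    ctx = CylonContext(config=MPIConfig(), distributed=True)
    ta = Table.from_pydict(ctx, {'c1': [1, 2, 3], 'c2': [4, 5, 6]})
    tb = Table.from_pydict(ctx, {'c1': [2, 3, 4], 'c2': [5, 6, 7]})
    print('union    :', ta.distributed_union(tb).row_count)
    print('subtract :', ta.distributed_subtract(tb).row_count)
    print('intersect:', ta.distributed_intersect(tb).row_count)
    ctx.barrier()
    ctx.finalize()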
import random

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig

df1 = DataFrame([random.sample(range(10, 100), 50),
                 random.sample(range(10, 100), 50)])

# local sort
df3 = df1.sort_values(by=[0])
print("Local Sort")
print(df3)

# distributed sort
env = CylonEnv(config=MPIConfig())
df1 = DataFrame([random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
                 random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)])
print("Distributed Sort", env.rank)

df3 = df1.sort_values(by=[0], env=env)
print(df3)

# distributed sort with sort options
print("Distributed Sort with sort options", env.rank)
bins = env.world_size * 2
df3 = df1.sort_values(by=[0], num_bins=bins, num_samples=bins, env=env)
print(df3)

env.finalize()
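# The script above is meant to be launched through MPI so that env.rank and
# env.world_size vary per process; the script name below is illustrative:
#
#   mpirun -n 4 python distributed_sort_example.py
#
# num_bins and num_samples are tuning knobs for the distributed sort (the
# names suggest sample-based partitioning); world_size * 2 above is just one
# possible setting.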
    # ... tail of main(env, config), which computes the AM/PM ratio ...
    pm_trues = output_table[output_table["pm"] == True]

    # get the distributed count of these tables
    am_pm_ratio = am_trues.to_table().count(0) / pm_trues.to_table().count(0)  # table API
    np_am_pm_ratio = am_pm_ratio.to_numpy()[0, 0]
    if np.isinf(np_am_pm_ratio):
        np_am_pm_ratio = -1.0
    print(np_am_pm_ratio)

    from pycylon import Table
    return Table.from_pydict(env, {"am_pm_ratio": [np_am_pm_ratio]})


if __name__ == "__main__":
    config = tpcxbb_argparser()
    mpi_config = MPIConfig()
    ctx: CylonEnv = CylonEnv(config=mpi_config, distributed=True)
    res = main(ctx, config)
    if ctx.rank == 0:
        import os
        os.makedirs(config['output_dir'], exist_ok=True)
        res.to_pandas().to_csv(f"{config['output_dir']}/q14_results.csv", index=False)
    ctx.finalize()
def test_concat_op():
    from pycylon.net import MPIConfig

    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    columns = ['c1', 'c2', 'c3']
    dataset_1 = [[1, 2, 3, 4, 5], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_2 = [[1, 20, 3, 4, 50], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_3 = [[1, 20, 3, 40, 50, 60], [21, 31, 41, 51, 50, 70], [32, 42, 52, 62, 72, 82]]

    tb1 = Table.from_list(ctx, columns, dataset_1)
    tb1 = tb1.add_prefix('d1_')

    tb2 = Table.from_list(ctx, columns, dataset_2)
    tb2 = tb2.add_prefix('d2_')

    tb3 = Table.from_list(ctx, columns, dataset_3)
    tb3 = tb3.add_prefix('d3_')

    tb4 = Table.from_list(ctx, columns, dataset_3)
    tb4 = tb4.add_prefix('d1_')

    pdf1 = tb1.to_pandas()
    pdf2 = tb2.to_pandas()
    pdf3 = tb3.to_pandas()
    pdf4 = tb4.to_pandas()

    print(tb1)
    print("-" * 80)
    print(tb2)

    tb1.set_index(tb1.column_names[0], drop=True)
    tb2.set_index(tb2.column_names[0], drop=True)
    tb3.set_index(tb3.column_names[0], drop=True)
    tb4.set_index(tb4.column_names[0], drop=True)

    print("*" * 80)
    print("Indexed table")
    print(tb1)
    print("*" * 80)

    pdf1.set_index(pdf1.columns[0], drop=True, inplace=True)
    pdf2.set_index(pdf2.columns[0], drop=True, inplace=True)
    pdf3.set_index(pdf3.columns[0], drop=True, inplace=True)
    pdf4.set_index(pdf4.columns[0], drop=True, inplace=True)

    print("=" * 80)
    print("axis=1")
    print("=" * 80)

    res_pdf_1 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_1)
    print("-" * 80)

    tables = [tb1, tb2]
    tb1_index_values = tb1.index.index_values
    tb2_index_values = tb2.index.index_values

    res_tb_1 = Table.concat(tables, join='inner', axis=1)
    print(res_tb_1)
    print("-" * 80)

    res_pdf_2 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_2)

    assert res_pdf_2.values.tolist() == res_tb_1.to_pandas().values.tolist()
    assert res_tb_1.index.index_values == res_pdf_2.index.values.tolist()

    print("-" * 80)
    print(tb1.to_arrow())
    print(tb2.to_arrow())
    print(tb1.index.index_values, tb1_index_values)
    print(tb2.index.index_values, tb2_index_values)

    # compare as sorted copies (list.sort() returns None, so the original
    # in-place form compared None == None)
    assert sorted(tb1.index.index_values) == sorted(tb1_index_values)
    assert sorted(tb2.index.index_values) == sorted(tb2_index_values)

    print("=" * 80)
    print("axis=0")
    print("=" * 80)

    res_pdf_3 = pd.concat([pdf1, pdf4], join='inner', axis=0)
    print(tb1.column_names, tb4.column_names)

    res_tb_2 = Table.concat([tb1, tb4], join='inner', axis=0)
    print(res_tb_2)
    print(res_tb_2.index.index_values)
    print(res_pdf_3)
    print(res_pdf_3.index.values.tolist())

    assert res_pdf_3.values.tolist() == res_tb_2.to_pandas().values.tolist()
    assert res_tb_2.index.index_values == res_pdf_3.index.values.tolist()
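# A pared-down sketch of row-wise Table.concat, mirroring the axis=0 branch of
# the test above: matching column names plus a set index, then an inner concat.
# Column names and values are illustrative.
from pycylon import CylonContext, Table
from pycylon.net import MPIConfig


def concat_axis0_sketch():
    ctx = CylonContext(config=MPIConfig(), distributed=True)
    # from_list takes one list per column, as in the test above
    t1 = Table.from_list(ctx, ['a', 'b'], [[1, 2], [3, 4]])
    t2 = Table.from_list(ctx, ['a', 'b'], [[5, 6], [7, 8]])
    t1.set_index(t1.column_names[0], drop=True)
    t2.set_index(t2.column_names[0], drop=True)
    stacked = Table.concat([t1, t2], join='inner', axis=0)
    print(stacked)
    ctx.finalize()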