Example #1
import numpy as np

from pycylon import CylonContext, Table
from pycylon.net import MPIConfig


def shuffle():
    mpi_config = MPIConfig()

    ctx = CylonContext(config=mpi_config, distributed=True)
    rows = 5

    # A fixed-value table could be built instead with Table.from_pydict:
    # tb: Table = Table.from_pydict(ctx, {'c1': list(range(rows)),
    #                                     'c2': [i * 2 for i in range(rows)],
    #                                     'c3': [i * 3 for i in range(rows)]})

    # Build a table of three random columns
    tb: Table = Table.from_numpy(ctx, ['c1', 'c2', 'c3'], [np.random.random(size=rows),
                                                           np.random.random(size=rows),
                                                           np.random.random(size=rows)])

    print(tb.shape)

    # Shuffle (repartition) the table across the workers on column 'c1'
    tb_shuffle = tb.shuffle(['c1'])

    # Drop fully-null data (how='all') after the shuffle
    tb_shuffle_dna = tb_shuffle.dropna(axis=1, how='all')

    print("Rank : ", ctx.get_rank(), tb_shuffle.shape, tb.shape, tb_shuffle_dna.shape)

    from pycylon.io import CSVWriteOptions

    csv_write_options = CSVWriteOptions().with_delimiter(',')
    # Optionally persist each rank's partition:
    # tb_shuffle.to_csv(f'/tmp/shuffle_{rows}_{ctx.get_rank()}.csv', csv_write_options)

    ctx.finalize()
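
# A minimal driver for the example above (a sketch: assumes an MPI setup and
# that this code is saved as, say, shuffle_example.py):
#   mpirun -n 4 python shuffle_example.py
if __name__ == '__main__':
    shuffle()
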
Example #2
import random

from pycylon import CylonEnv, DataFrame
from pycylon.net import MPIConfig


def test_df_dist_sorting():
    df1 = DataFrame(
        [random.sample(range(10, 30), 5),
         random.sample(range(10, 30), 5)])

    def check_sort(df, col, ascending):
        arr = df.to_pandas()[col]
        for i in range(len(arr) - 1):
            if ascending:
                assert arr[i] <= arr[i + 1]
            else:
                assert arr[i] >= arr[i + 1]

    # local sort
    df = df1.sort_values('0', ascending=True)
    check_sort(df, '0', True)

    df = df1.sort_values('0', ascending=False)
    check_sort(df, '0', False)

    # distributed sort
    env = CylonEnv(config=MPIConfig(), distributed=True)

    print("Distributed Sort", env.rank, env.world_size)

    df3 = df1.sort_values(by=[0], env=env, ascending=True)
    check_sort(df3, '0', True)

    df3 = df1.sort_values(by=[0], env=env, ascending=False)
    check_sort(df3, '0', False)
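
# The distributed half of this test needs multiple workers; a typical launch
# (assuming an MPI installation and that the test lives in test_sorting.py):
#   mpirun -n 2 python -m pytest -q test_sorting.py::test_df_dist_sorting
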
Example #3
import os

from pycylon import CylonContext, Table
from pycylon.io import CSVReadOptions, read_csv
from pycylon.net import MPIConfig


def test_distributed_sort():
    import numpy as np
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()

    assert size == 4

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    print(tb1)

    tb2 = tb1.distributed_sort(order_by='use_id')

    col_data = tb2['use_id'].to_numpy()
    col_data = np.reshape(col_data, (col_data.shape[0]))

    def is_sort_array(array):
        for i in range(array.shape[0] - 1):
            if array[i] > array[i + 1]:
                return False
        return True

    assert is_sort_array(col_data)
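
    # Equivalent vectorized check of sortedness (a sketch using plain NumPy)
    assert bool(np.all(col_data[:-1] <= col_data[1:]))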
Example #4
import pycylon as cn
from pycylon.net import MPIConfig


def test_shuffle():
    # only testing the functionality
    # accuracy was tested via shuffle op and unique test cases in C++ test cases
    mpi_config = MPIConfig()

    ctx = cn.CylonContext(config=mpi_config, distributed=True)

    tb: cn.Table = None

    rank = ctx.get_rank()
    size = ctx.get_world_size()

    assert size == 2

    if rank == 0:
        tb = cn.Table.from_pydict(
            ctx, {
                'c1': [1, 1, 3, 3, 4, 5],
                'c2': [2, 2, 2, 4, 6, 6],
                'c3': [3, 3, 3, 5, 7, 7]
            })

    if rank == 1:
        tb = cn.Table.from_pydict(
            ctx, {
                'c1': [5, 1, 1, 4, 1, 10],
                'c2': [6, 2, 1, 5, 0, 1],
                'c3': [7, 3, 0, 5, 1, 5]
            })

    tb = tb.distributed_unique(['c1', 'c2', 'c3'])
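
    # Only the call path is exercised here (accuracy is covered by the C++
    # tests, as noted above); as a quick sanity sketch, report what each rank
    # holds after the distributed unique.
    print(f"Rank {rank}: {tb.row_count} rows after distributed_unique")
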
Example #5
import os

from pycylon import CylonContext, Table
from pycylon.io import CSVReadOptions, read_csv
from pycylon.net import MPIConfig


def test_multi_process():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank, size = ctx.get_rank(), ctx.get_world_size()

    assert size == 4

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    table1_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print(tb1.column_names)
    print(tb2.column_names)

    configs = {'join_type': 'inner', 'algorithm': 'sort'}

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0],
                                      right_on=[3])

    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['use_id'],
                                      right_on=['use_id'])

    tb5: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['use_id'])

    assert tb3.column_count == tb4.column_count == tb5.column_count == 8

    if rank == 0:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 640
    if rank == 1:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 624
    if rank == 2:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 592
    if rank == 3:
        assert tb3.row_count == tb4.row_count == tb5.row_count == 688
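
# The three join variants above must agree; the test asserts a world size of
# four, so launch it with four workers, e.g. (assuming the test lives in
# test_join.py):
#   mpirun -n 4 python -m pytest -q test_join.py -k test_multi_process
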
Example #6
from pycylon import CylonContext, Table
from pycylon.io import CSVReadOptions, read_csv
from pycylon.net import MPIConfig


def multi_process():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    # Per-rank input paths (an assumption: the original snippet left these
    # undefined; the layout follows the other examples in this collection)
    rank = ctx.get_rank()
    table1_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print(tb1.column_names)
    print(tb2.column_names)

    configs = {'join_type': 'inner', 'algorithm': 'sort'}

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[0],
                                      right_on=[3]
                                      )

    tb3.show()

    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=['use_id'],
                                      right_on=['use_id']
                                      )

    tb4.show()

    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      on=['use_id']
                                      )

    tb4.show()

    ctx.finalize()
Example #7
import pandas as pd

from pycylon import CylonEnv, DataFrame
from pycylon.net import MPIConfig


def test_pd_read_csv():
    env = CylonEnv(config=MPIConfig())

    df1 = DataFrame(pd.read_csv('/tmp/user_usage_tm_1.csv'))
    df2 = DataFrame(pd.read_csv('/tmp/user_device_tm_1.csv'))

    df1 = df1.set_index([3], drop=True)
    df2 = df2.set_index([0], drop=True)

    df1.to_table().retain_memory(False)
    df2.to_table().retain_memory(False)

    df3 = df1.merge(right=df2,
                    left_on=[3],
                    right_on=[0],
                    algorithm='sort',
                    env=env)

    assert len(df3)
Example #8
import os

import numpy as np

from pycylon import CylonContext, Table
from pycylon.io import CSVReadOptions, read_csv
from pycylon.net import MPIConfig


def test_conversion_check():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank, size = ctx.get_rank(), ctx.get_world_size()

    assert size == 2

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_device_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type='inner',
                                      algorithm='sort',
                                      left_on=[3],
                                      right_on=[0])

    # pdf: pd.DataFrame = tb3.to_pandas()
    npy: np.ndarray = tb3.to_numpy(order='C')

    # The commented pandas checks mirror the numpy ones below (note the
    # Table attributes are row_count and column_count):
    # assert tb3.row_count == pdf.shape[0]
    # assert tb3.column_count == pdf.shape[1]
    # Cylon table rows must be equal to the rows of numpy ndarray extracted from the table
    assert tb3.row_count == npy.shape[0]
    # Cylon table columns must be equal to the columns of numpy ndarray extracted from the table
    assert tb3.column_count == npy.shape[1]

    print(
        f"Rank[{ctx.get_rank()}]: Table.Rows={tb3.row_count}, Table.Columns={tb3.column_count}, "
        f"Numpy Array Shape = {npy.shape}")

    print(f"Array Config Rank[{ctx.get_rank()}], {npy.flags} {npy.dtype}")
Example #9
import os

from pycylon import CylonContext, Table
from pycylon.io import CSVReadOptions, read_csv
from pycylon.net import MPIConfig


def test_distributed_run():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_device_tm_1.csv'

    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    configs = {'join_type': 'inner', 'algorithm': 'sort'}

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=configs['join_type'],
                                      algorithm=configs['algorithm'],
                                      left_on=[3],
                                      right_on=[0])
    row_count = tb3.row_count
    column_count = tb3.column_count

    assert ctx.get_world_size() == 4
    assert column_count == 8

    rank = ctx.get_rank()
    if rank == 0:
        assert row_count == 640
    elif rank == 1:
        assert row_count == 624
    elif rank == 2:
        assert row_count == 592
    elif rank == 3:
        assert row_count == 688
    else:
        raise Exception("Parallelism not supported in this test")
Example #10
import os

import numpy

from pycylon import CylonEnv, read_csv
from pycylon.net import MPIConfig


def test_data_split():
    mpi_config = MPIConfig()
    env: CylonEnv = CylonEnv(config=mpi_config, distributed=True)

    rows = 100

    data_file = "/tmp/test_split.csv"

    if env.rank == 0:
        # remove if the file already exists
        try:
            os.remove(data_file)
        except OSError:
            pass
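        # rows + 1 lines are written so that `rows` data rows remain after the
        # first line is consumed as the header on read-back (an assumption
        # based on how the split is verified below)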
        data = numpy.random.randint(100, size=(rows + 1, 4))

        with open(data_file, 'w') as f:
            numpy.savetxt(f, data, delimiter=",", fmt='%1f')

    env.barrier()

    data_full = read_csv(data_file, slice=False, env=env)
    data = read_csv(data_file, slice=True, env=env)

    np_data = data.to_numpy()
    np_data_full = data_full.to_numpy()

    seg_size = rows // env.world_size

    for i in range(seg_size):
        assert numpy.array_equal(np_data[i],
                                 np_data_full[(seg_size * env.rank) + i])

    env.barrier()

    if env.rank == 0:
        os.remove(data_file)
Example #11
import os

from pycylon import CylonContext, Table
from pycylon.io import CSVReadOptions, read_csv
from pycylon.net import MPIConfig


def test_distributed_ra():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()

    assert size == 4
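
    # Note: both relations deliberately point at the same file; the subtract
    # below is then expected to be empty, and union/intersect agree.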

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path)
    assert os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print("First Hello World From Rank {}, Size {}".format(
        ctx.get_rank(), ctx.get_world_size()))

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type='inner',
                                      algorithm='hash',
                                      left_on=[0],
                                      right_on=[0])

    tb4: Table = tb1.distributed_union(tb2)

    tb5: Table = tb1.distributed_subtract(tb2)

    tb6: Table = tb1.distributed_intersect(tb2)

    ctx.barrier()

    join_row_count = tb3.row_count
    join_column_count = tb3.column_count

    subtract_row_count = tb5.row_count
    subtract_column_count = tb5.column_count

    union_row_count = tb4.row_count
    union_column_count = tb4.column_count

    intersect_row_count = tb6.row_count
    intersect_column_count = tb6.column_count

    if rank == 0:
        assert join_row_count == 1424 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 112 and union_column_count == 4
        assert intersect_row_count == 112 and intersect_column_count == 4

    if rank == 1:
        assert join_row_count == 1648 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 122 and union_column_count == 4
        assert intersect_row_count == 122 and intersect_column_count == 4

    if rank == 2:
        assert join_row_count == 2704 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 102 and union_column_count == 4
        assert intersect_row_count == 102 and intersect_column_count == 4

    if rank == 3:
        assert join_row_count == 1552 and join_column_count == 8
        assert subtract_row_count == 0 and subtract_column_count == 4
        assert union_row_count == 144 and union_column_count == 4
        assert intersect_row_count == 144 and intersect_column_count == 4
Example #12
import random

from pycylon import DataFrame, CylonEnv
from pycylon.net import MPIConfig

df1 = DataFrame([random.sample(range(10, 100), 50),
                 random.sample(range(10, 100), 50)])

# local sort
df3 = df1.sort_values(by=[0])
print("Local Sort")
print(df3)

# distributed sort
env = CylonEnv(config=MPIConfig())

df1 = DataFrame([random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5),
                 random.sample(range(10 * env.rank, 15 * (env.rank + 1)), 5)])
print("Distributed Sort", env.rank)
df3 = df1.sort_values(by=[0], env=env)
print(df3)

# distributed sort
print("Distributed Sort with sort options", env.rank)
bins = env.world_size * 2
df3 = df1.sort_values(by=[0], num_bins=bins, num_samples=bins, env=env)
print(df3)

env.finalize()
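
# Run across several workers with an MPI launcher, e.g. (assuming this script
# is saved as dist_sort.py):
#   mpirun -n 4 python dist_sort.py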
Example #13
    # ... (earlier lines of this function, which build output_table and the
    # am_trues mask, are omitted in this excerpt)
    pm_trues = output_table[output_table["pm"] == True]

    # get the distributed count of these tables
    am_pm_ratio = am_trues.to_table().count(0) / pm_trues.to_table().count(0)  # Table API

    np_am_pm_ratio = am_pm_ratio.to_numpy()[0, 0]
    if np.isinf(np_am_pm_ratio):
        np_am_pm_ratio = -1.0

    print(np_am_pm_ratio)

    from pycylon import Table
    return Table.from_pydict(env, {"am_pm_ratio": [np_am_pm_ratio]})


if __name__ == "__main__":
    config = tpcxbb_argparser()

    mpi_config = MPIConfig()
    ctx: CylonEnv = CylonEnv(config=mpi_config, distributed=True)

    res = main(ctx, config)

    if ctx.rank == 0:
        import os

        os.makedirs(config['output_dir'], exist_ok=True)
        res.to_pandas().to_csv(f"{config['output_dir']}/q14_results.csv", index=False)

    ctx.finalize()
Example #14
import pandas as pd

from pycylon import CylonContext, Table
from pycylon.net import MPIConfig


def test_concat_op():
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)
    columns = ['c1', 'c2', 'c3']
    dataset_1 = [[1, 2, 3, 4, 5], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_2 = [[1, 20, 3, 4, 50], [20, 30, 40, 50, 51], [33, 43, 53, 63, 73]]
    dataset_3 = [[1, 20, 3, 40, 50, 60], [21, 31, 41, 51, 50, 70],
                 [32, 42, 52, 62, 72, 82]]

    tb1 = Table.from_list(ctx, columns, dataset_1)
    tb1 = tb1.add_prefix('d1_')

    tb2 = Table.from_list(ctx, columns, dataset_2)
    tb2 = tb2.add_prefix('d2_')

    tb3 = Table.from_list(ctx, columns, dataset_3)
    tb3 = tb3.add_prefix('d3_')

    tb4 = Table.from_list(ctx, columns, dataset_3)
    tb4 = tb4.add_prefix('d1_')

    pdf1 = tb1.to_pandas()
    pdf2 = tb2.to_pandas()
    pdf3 = tb3.to_pandas()
    pdf4 = tb4.to_pandas()

    print(tb1)
    print("-" * 80)
    print(tb2)

    tb1.set_index(tb1.column_names[0], drop=True)
    tb2.set_index(tb2.column_names[0], drop=True)
    tb3.set_index(tb3.column_names[0], drop=True)
    tb4.set_index(tb4.column_names[0], drop=True)

    print("*" * 80)
    print("Indexed table")
    print(tb1)
    print("*" * 80)

    pdf1.set_index(pdf1.columns[0], drop=True, inplace=True)
    pdf2.set_index(pdf2.columns[0], drop=True, inplace=True)
    pdf3.set_index(pdf3.columns[0], drop=True, inplace=True)
    pdf4.set_index(pdf4.columns[0], drop=True, inplace=True)

    print("=" * 80)
    print("axis=1")
    print("=" * 80)
    res_pdf_1 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_1)
    print("-" * 80)
    tables = [tb1, tb2]
    tb1_index_values = tb1.index.index_values
    tb2_index_values = tb2.index.index_values
    res_tb_1 = Table.concat(tables, join='inner', axis=1)
    print(res_tb_1)
    print("-" * 80)
    res_pdf_2 = pd.concat([pdf1, pdf2], join='inner', axis=1)
    print(res_pdf_2)
    assert res_pdf_2.values.tolist() == res_tb_1.to_pandas().values.tolist()
    assert res_tb_1.index.index_values == res_pdf_2.index.values.tolist()
    print("-" * 80)
    print(tb1.to_arrow())
    print(tb2.to_arrow())
    print(tb1.index.index_values, tb1_index_values)
    print(tb2.index.index_values, tb2_index_values)
    # list.sort() sorts in place and returns None, so compare sorted copies
    assert sorted(tb1.index.index_values) == sorted(tb1_index_values)
    assert sorted(tb2.index.index_values) == sorted(tb2_index_values)
    print("=" * 80)
    print("axis=0")
    print("=" * 80)
    res_pdf_3 = pd.concat([pdf1, pdf4], join='inner', axis=0)
    print(tb1.column_names, tb4.column_names)
    res_tb_2 = Table.concat([tb1, tb4], join='inner', axis=0)
    print(res_tb_2)
    print(res_tb_2.index.index_values)
    print(res_pdf_3)
    print(res_pdf_3.index.values.tolist())
    assert res_pdf_3.values.tolist() == res_tb_2.to_pandas().values.tolist()
    assert res_tb_2.index.index_values == res_pdf_3.index.values.tolist()
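
    # As with pandas, axis=1 concatenates along columns by matching index
    # values (an inner join of the indices here), while axis=0 stacks rows
    # over the shared column set.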