def test_isin_with_index():
    """Check that an ``isin`` row filter agrees between PyCylon and pandas.

    Both frames are indexed on column ``'a'``; rows whose ``'b'`` value is in
    a fixed list are selected, and the selected values and index values are
    compared.
    """
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    filter_isin = [20, 11, 23]
    print(tb1)

    print(pdf)

    # Index both sides on the same column so the filtered indices are comparable.
    tb1.set_index('a', drop=True)
    pdf.set_index('a', inplace=True)

    tb_res = tb1[tb1['b'].isin(filter_isin)]
    pdf_res = pdf[pdf['b'].isin(filter_isin)]

    print(tb_res)
    print(pdf_res)

    assert tb_res.to_pandas().values.tolist() == pdf_res.values.tolist()

    print(tb_res.index.values)
    print(pdf_res.index.values)

    assert tb_res.index.values.tolist() == pdf_res.index.values.tolist()
Пример #2
0
def test_df_iterrows():
    """Rebuild rows from a PyCylon table and compare them with ``DataFrame.iterrows``.

    Both frames are indexed on their first column. Rows are assembled from the
    table's column dictionary and checked, index and values, against the rows
    pandas yields.
    """
    ctx = CylonContext(config=None, distributed=False)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, csv_read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)
    print(pdf)

    for idx, row in pdf.iterrows():
        print(idx)
        print(row)

    # Named to avoid shadowing the builtin ``dict``.
    columns_by_name = tb1.to_pydict(with_index=False)
    indices = tb1.index.index_values

    # One row per index entry, taking the value at that position from each column.
    rows = [
        [column[position] for column in columns_by_name.values()]
        for position, _ in enumerate(indices)
    ]

    for index, row in zip(indices, rows):
        print(index, row)

    # zip() stops at the shorter input, so check the lengths explicitly first.
    assert len(rows) == len(pdf)

    for index1, row1, composite in zip(indices, rows, pdf.iterrows()):
        index2 = composite[0]
        row2 = composite[1].tolist()
        assert index1 == index2
        assert row1 == row2
Пример #3
0
def test_distributed_sort():
    """Sort a per-rank table with ``distributed_sort`` and check the local order.

    Requires a world size of 4. Each rank reads its own
    ``/tmp/user_usage_tm_<rank + 1>.csv`` file and verifies that the
    ``use_id`` column it holds after sorting is non-decreasing.
    """
    import numpy as np

    ctx: CylonContext = CylonContext(config=MPIConfig(), distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()
    assert size == 4

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    assert os.path.exists(table1_path)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, read_options)
    print(tb1)

    sorted_tb = tb1.distributed_sort(order_by='use_id')

    # Flatten the column to a 1-D array of length ``shape[0]``.
    keys = sorted_tb['use_id'].to_numpy()
    keys = np.reshape(keys, (keys.shape[0]))

    def is_sort_array(array):
        """Return True when no element is greater than its successor."""
        return not any(
            array[i] > array[i + 1] for i in range(array.shape[0] - 1))

    assert is_sort_array(keys)
Пример #4
0
def test_isin_column():
    """Compare the boolean mask of a column ``isin`` between PyCylon and pandas.

    Both frames are indexed on their first column before ``isin`` is applied
    to column ``'b'``.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    first_tb_column = tb1.column_names[0]
    tb1.set_index(first_tb_column, drop=True)
    first_pdf_column = pdf.columns[0]
    pdf.set_index(first_pdf_column, drop=True, inplace=True)

    print(tb1)
    print(pdf)

    candidates = [10, 20, 30, 5, 2, 8]

    cylon_mask = tb1['b'].isin(candidates)
    pandas_mask = pdf['b'].isin(candidates)

    print(cylon_mask)

    print(pandas_mask)

    # The Cylon mask is flattened after conversion to pandas so that it
    # compares as a flat list against the pandas Series values.
    tb_list = cylon_mask.to_pandas().values.flatten().tolist()
    pd_list = pandas_mask.values.tolist()

    assert tb_list == pd_list

    print(tb_list)
    print(pd_list)
Пример #5
0
def test_unique():
    """Compare the unique values of column ``'b'`` between PyCylon and pandas.

    The values are compared both as ordered lists and as sets. The context is
    finalized even when an assertion fails.
    """
    ctx = CylonContext(config=None, distributed=False)
    try:
        csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
        table_path = '/tmp/duplicate_data_0.csv'
        tb1: Table = read_csv(ctx, table_path, csv_read_options)
        pdf: pd.DataFrame = tb1.to_pandas()

        print("Original Data")
        print(pdf)

        tb2 = tb1['b'].unique()
        pdf2 = pdf['b'].unique()
        tb2.show()

        print("Unique Pdf")
        print(pdf2)
        print(type(pdf2))

        print("Unique Cylon")
        print(tb2)

        # Take the values of the first (only selected) column of the result.
        tb3_list = list(tb2.to_pydict().items())[0][1]
        pdf3_list = pdf2.tolist()

        assert tb3_list == pdf3_list

        set_pdf4 = set(pdf2)
        set_tb4 = set(tb3_list)

        assert set_tb4 == set_pdf4
    finally:
        # Release the context on failure as well as on success.
        ctx.finalize()
Пример #6
0
def test_df_with_index():
    """Convert an indexed pandas frame to an Arrow table and print each step.

    The table is loaded through PyCylon, converted to pandas, indexed on its
    first column, and passed to ``pa.Table.from_pandas`` with the index
    preserved. Nothing is asserted.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    print(pdf.columns[0])
    first_column_only = pdf[[pdf.columns[0]]]

    print(pdf)

    print(first_column_only)

    indexed_pdf = pdf.set_index(pdf.columns[0], drop=True)

    print(indexed_pdf)

    arrow_options = {
        'schema': None,
        'preserve_index': True,
        'nthreads': None,
        'columns': None,
        'safe': False,
    }
    artb = pa.Table.from_pandas(df=indexed_pdf, **arrow_options)

    print(artb)
Пример #7
0
def test_properties():
    """Check comparison operators and boolean-mask selection against pandas.

    Each of ``==, !=, <, >, <=, >=`` is applied with a fixed scalar, first to
    the ``'monthly_mb'`` column and then to the whole table. The resulting
    mask and the rows selected by it must match what pandas produces.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_2.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb: Table = read_csv(ctx, table1_path, csv_read_options)

    pdf = tb.to_pandas()

    def generate_filter_and_result(op, column, data, comparison_value):
        """Return ``(mask, data[mask])`` for ``op`` applied to a column or to all of ``data``.

        A falsy ``column`` means the operator is applied to ``data`` itself.
        """
        operand = data[column] if column else data
        mask = op(operand, comparison_value)
        return mask, data[mask]

    def do_comparison_on_pdf_and_tb(tb_filter: Table, tb_result: Table,
                                    pdf_filter: DataFrame,
                                    pdf_result: DataFrame, is_full_table):
        """Assert that the Cylon mask and result equal their pandas counterparts."""
        if is_full_table:
            assert tb_filter.to_pandas().values.tolist(
            ) == pdf_filter.values.tolist()
            # Missing values are replaced by 0 on both sides before comparing.
            assert tb_result.to_pandas().fillna(
                0).values.tolist() == pdf_result.fillna(0).values.tolist()
        else:
            # The single-column mask is flattened to compare with Series values.
            assert tb_filter.to_pandas().values.flatten().tolist(
            ) == pdf_filter.values.tolist()
            assert tb_result.to_pandas().values.tolist(
            ) == pdf_result.values.tolist()

    ops = [
        operator.__eq__, operator.__ne__, operator.__lt__, operator.__gt__,
        operator.__le__, operator.__ge__
    ]
    value = 519.12
    columns = ['monthly_mb', None]
    is_full_table_flags = [False, True]

    for column, is_full_table in zip(columns, is_full_table_flags):
        for op in ops:
            tb_filter_all, tb_filter_all_result = generate_filter_and_result(
                op, column, tb, value)

            pdf_filter_all, pdf_filter_all_result = generate_filter_and_result(
                op, column, pdf, value)

            do_comparison_on_pdf_and_tb(tb_filter=tb_filter_all,
                                        tb_result=tb_filter_all_result,
                                        pdf_filter=pdf_filter_all,
                                        pdf_result=pdf_filter_all_result,
                                        is_full_table=is_full_table)
def test_multi_process():
    """Run the same distributed inner sort-join three ways and compare shapes.

    Requires a world size of 4. The join is expressed with positional column
    indices, with ``left_on``/``right_on`` names, and with ``on``. All three
    results must have 8 columns and the per-rank row count listed below.
    """
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank, size = ctx.get_rank(), ctx.get_world_size()

    assert size == 4

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    table1_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)

    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print(tb1.column_names)
    print(tb2.column_names)

    join_type = 'inner'
    algorithm = 'sort'

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=join_type,
                                      algorithm=algorithm,
                                      left_on=[0],
                                      right_on=[3])

    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=join_type,
                                      algorithm=algorithm,
                                      left_on=['use_id'],
                                      right_on=['use_id'])

    tb5: Table = tb1.distributed_join(table=tb2,
                                      join_type=join_type,
                                      algorithm=algorithm,
                                      on=['use_id'])

    # tb5 is included here; the earlier chain compared tb4 with itself.
    assert tb3.column_count == tb4.column_count == tb5.column_count == 8

    # Expected local row count per rank (world size is asserted to be 4 above).
    expected_rows = {0: 640, 1: 624, 2: 592, 3: 688}
    assert tb3.row_count == tb4.row_count == tb5.row_count == expected_rows[rank]
Пример #9
0
def test_col_access():
    """Select the first column of a PyCylon table and print it as a flat list.

    Nothing is asserted; the test only exercises column access and conversion.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    print(tb1)

    first_column_name = tb1.column_names[0]
    first_column = tb1[first_column_name]
    print(first_column)

    flat_values = first_column.to_numpy().flatten().tolist()
    print(flat_values)
def test_read_csv_with_use_cols():
    """Check that the column-selection option yields the same columns as pandas ``usecols``."""
    ctx = CylonContext(config=None, distributed=False)
    use_cols = ['a', 'b']

    read_options = CSVReadOptions()
    read_options = read_options.use_threads(True)
    read_options = read_options.block_size(1 << 30)
    read_options = read_options.use_cols(use_cols)

    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, read_options)
    pdf = pd.read_csv(table_path, usecols=use_cols)

    # Both readers must report exactly the requested columns, in order.
    assert tb1.column_names == use_cols
    assert use_cols == pdf.columns.tolist()
def test_read_csv_with_skiprows():
    """Check that skipping one row on read gives the same values as pandas ``skiprows=1``."""
    ctx = CylonContext(config=None, distributed=False)

    read_options = CSVReadOptions()
    read_options = read_options.use_threads(True)
    read_options = read_options.block_size(1 << 30)
    read_options = read_options.skip_rows(1)

    table_path = '/tmp/duplicate_data_0.csv'
    tb1: Table = read_csv(ctx, table_path, read_options)
    pdf = pd.read_csv(table_path, skiprows=1)

    separator = "-" * 80
    print(tb1)
    print(separator)
    print(pdf)

    cylon_rows = tb1.to_pandas().values.tolist()
    pandas_rows = pdf.values.tolist()
    assert cylon_rows == pandas_rows
Пример #12
0
def multi_process():
    """Run a distributed inner sort-join three ways and show each result.

    The join is expressed with positional column indices, with
    ``left_on``/``right_on`` names, and with ``on``.

    NOTE(review): ``table1_path`` and ``table2_path`` are not defined in this
    function; presumably they are module-level names. Confirm against the
    rest of the file.
    """
    ctx: CylonContext = CylonContext(config=MPIConfig(), distributed=True)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, table1_path, read_options)
    tb2: Table = read_csv(ctx, table2_path, read_options)

    print(tb1.column_names)
    print(tb2.column_names)

    configs = {'join_type': 'inner', 'algorithm': 'sort', 'left_col': 0,
               'right_col': 0}
    join_type = configs['join_type']
    algorithm = configs['algorithm']

    # Join keyed by column position.
    joined_by_position: Table = tb1.distributed_join(table=tb2,
                                                     join_type=join_type,
                                                     algorithm=algorithm,
                                                     left_on=[0],
                                                     right_on=[3])
    joined_by_position.show()

    # Join keyed by column name on each side.
    joined_by_name: Table = tb1.distributed_join(table=tb2,
                                                 join_type=join_type,
                                                 algorithm=algorithm,
                                                 left_on=['use_id'],
                                                 right_on=['use_id'])
    joined_by_name.show()

    # Join keyed by a column name shared by both sides.
    joined_by_name = tb1.distributed_join(table=tb2,
                                          join_type=join_type,
                                          algorithm=algorithm,
                                          on=['use_id'])
    joined_by_name.show()

    ctx.finalize()
Пример #13
0
def test_uno_data_load():
    """Load a tab-separated file, replace ``DRUG`` with integer codes, and save it.

    Each distinct ``DRUG`` value is mapped to its position in the table's
    unique ``DRUG`` list. The original column is dropped, the encoded column
    is attached under the same name, and the result is written as a
    tab-separated file.

    NOTE(review): ``ctx`` is not defined in this function; presumably it is a
    module-level context. Confirm against the rest of the file.
    """
    source_dir = "/home/vibhatha/sandbox/UNO/Benchmarks/Data/Pilot1/"
    source_name = "combined_single_response_agg"
    save_file = "/tmp/combined_single_response_agg_enum"
    path = os.path.join(source_dir, source_name)
    read_options = CSVReadOptions().use_threads(True).block_size(
        1 << 30).with_delimiter("\t")

    started = time.time()
    tb = read_csv(ctx, path, read_options)
    finished = time.time()

    print(finished - started)

    print(tb.shape)

    print(tb.to_arrow())

    print(tb.column_names)

    all_drugs_tb = tb['DRUG']

    unique_drugs_tb = tb.unique(columns=['DRUG'], keep='first')['DRUG']

    unique_arrow = unique_drugs_tb.to_arrow().combine_chunks()
    unique_drugs = unique_arrow.column(0).chunk(0).tolist()

    all_arrow = all_drugs_tb.to_arrow().combine_chunks()
    all_drugs = all_arrow.column(0).chunk(0).tolist()

    # Map each drug name to its position in the unique list.
    code_by_drug = {drug: code for code, drug in enumerate(unique_drugs)}

    encoded_drugs = [code_by_drug[drug] for drug in all_drugs]

    tb_enum_drug = Table.from_list(ctx, ['DRUG'], [encoded_drugs])

    print(tb_enum_drug.shape, all_drugs_tb.shape)
    tb = tb.drop(['DRUG'])

    tb['DRUG'] = tb_enum_drug

    print(tb.to_arrow())

    pdf = tb.to_pandas()

    pdf.to_csv(save_file, sep="\t")
def test_read_csv_with_na_values():
    """Check that custom NA markers are read the same way as pandas ``na_values``.

    Missing values are filled with 0 on both sides before the values are compared.
    """
    ctx = CylonContext(config=None, distributed=False)

    read_options = CSVReadOptions()
    read_options = read_options.use_threads(True)
    read_options = read_options.block_size(1 << 30)
    read_options = read_options.na_values(['na', 'none'])

    table_path = 'data/input/null_data.csv'
    tb1: Table = read_csv(ctx, table_path, read_options)
    pdf = pd.read_csv(table_path, na_values=['na', 'none'])

    print(tb1)
    print("-" * 80)
    print(pdf)

    filled_tb = tb1.fillna(0)
    filled_pdf = pdf.fillna(0)
    assert filled_tb.to_pandas().values.tolist() == filled_pdf.values.tolist()
Пример #15
0
def test_single_process():
    """Run the same local inner sort-join three ways and compare shapes.

    The join is expressed with positional column indices, with
    ``left_on``/``right_on`` names, and with ``on``. All three results must
    have the same row and column counts. The context is finalized even when
    an assertion fails.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    try:
        csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

        table1_path = '/tmp/user_device_tm_1.csv'
        table2_path = '/tmp/user_usage_tm_1.csv'

        assert os.path.exists(table1_path) and os.path.exists(table2_path)

        tb1: Table = read_csv(ctx, table1_path, csv_read_options)

        tb2: Table = read_csv(ctx, table2_path, csv_read_options)

        configs = {'join_type': 'inner', 'algorithm': 'sort'}

        tb3: Table = tb1.join(table=tb2,
                              join_type=configs['join_type'],
                              algorithm=configs['algorithm'],
                              left_on=[0],
                              right_on=[3])

        print(tb3.row_count, tb3.column_count)

        tb4: Table = tb1.join(table=tb2,
                              join_type=configs['join_type'],
                              algorithm=configs['algorithm'],
                              left_on=['use_id'],
                              right_on=['use_id'])

        tb5: Table = tb1.join(table=tb2,
                              join_type=configs['join_type'],
                              algorithm=configs['algorithm'],
                              on=['use_id'])

        assert tb3.row_count == tb4.row_count == tb5.row_count
        assert tb3.column_count == tb4.column_count == tb5.column_count
    finally:
        # Release the context on failure as well as on success.
        ctx.finalize()
def test_drop():
    """Check that ``Table.drop`` returns a table without the dropped column."""
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    table1_path = '/tmp/user_usage_tm_1.csv'

    assert os.path.exists(table1_path)

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb: Table = read_csv(ctx, table1_path, csv_read_options)

    drop_column = 'outgoing_sms_per_month'

    tb_new = tb.drop([drop_column])

    # Membership operator instead of calling ``__contains__`` directly.
    assert drop_column not in tb_new.column_names
Пример #17
0
def join_op(num_rows: int, num_cols: int, algorithm: str,
            unique_factor: float):
    """Time an inner join on generated data with PyCylon, pandas and ``pd.eval``.

    Two frames are generated with ``get_dataframe``, written to CSV under
    ``/tmp`` and read back through PyCylon. The join is then run on the first
    column by each of the three engines.

    Args:
        num_rows: Forwarded to ``get_dataframe``.
        num_cols: Forwarded to ``get_dataframe``.
        algorithm: Join algorithm name passed to the PyCylon join.
        unique_factor: Forwarded to ``get_dataframe``.

    Returns:
        Tuple ``(pandas_time, cylon_time, pandas_eval_time)`` of elapsed
        seconds measured with ``time.time()``.
    """
    left_csv = "/tmp/left_table.csv"
    right_csv = "/tmp/right_table.csv"

    ctx: CylonContext = CylonContext(config=None, distributed=False)

    # The names pdf_left, pdf_right and join_col are referenced from the
    # pd.eval expression string below and must keep these exact spellings.
    pdf_left = get_dataframe(num_rows=num_rows,
                             num_cols=num_cols,
                             unique_factor=unique_factor,
                             stringify=False)
    pdf_right = get_dataframe(num_rows=num_rows,
                              num_cols=num_cols,
                              unique_factor=unique_factor,
                              stringify=False)
    # NOTE: sort join breaks when loaded data in-memory via Pandas dataframe
    pdf_left.to_csv(left_csv, index=False)
    pdf_right.to_csv(right_csv, index=False)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb_left = read_csv(ctx, left_csv, read_options)
    tb_right = read_csv(ctx, right_csv, read_options)
    join_col = tb_left.column_names[0]

    started = time.time()
    tb2 = tb_left.join(tb_right,
                       join_type='inner',
                       algorithm=algorithm,
                       on=[join_col])
    cylon_time = time.time() - started

    started = time.time()
    pdf2 = pdf_left.join(pdf_right,
                         how="inner",
                         on=join_col,
                         lsuffix="_l",
                         rsuffix="_r")
    pandas_time = time.time() - started

    started = time.time()
    pdf2 = pd.eval(
        "pdf_left.join(pdf_right, how='inner', on=join_col, lsuffix='_l', rsuffix='_r')"
    )
    pandas_eval_time = time.time() - started

    ctx.finalize()

    return pandas_time, cylon_time, pandas_eval_time
Пример #18
0
def test_series_tolist():
    """Print list, type and index views of the first column as a pandas Series.

    Nothing is asserted; the test only exercises the conversions.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    first_column_name = pdf.columns[0]
    series = pdf[first_column_name]

    print(type(series))

    as_list = series.tolist()
    as_array = series.to_numpy()

    print(as_list)
    index_values = series.index.values
    print(type(index_values), index_values)
def test_iterrows():
    """Check that ``Table.iterrows`` yields the same index and rows as pandas.

    Both frames are indexed on their first column before iterating.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)

    for pandas_item, cylon_item in zip(pdf.iterrows(), tb1.iterrows()):
        # pandas yields (index, Series); the Series is converted to a list
        # so it can be compared with the row Cylon yields.
        pandas_index = pandas_item[0]
        pandas_row = pandas_item[1].tolist()
        cylon_index = cylon_item[0]
        cylon_row = cylon_item[1]
        assert pandas_index == cylon_index
        assert pandas_row == cylon_row
Пример #20
0
def test_isin_with_getitem():
    """Filter rows with an index ``isin`` mask and compare against pandas.

    Both frames are indexed on their first column. The mask from
    ``index.isin`` is used to select rows on each side; on the Cylon side the
    mask is wrapped in a one-column table and the index of the selection is
    set explicitly from the masked index values.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb.to_pandas()

    tb.set_index(tb.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)

    assert tb.index.values.tolist() == pdf.index.values.tolist()

    wanted = [4, 1, 10, 100, 150]

    cylon_mask = tb.index.isin(wanted)
    pandas_mask = pdf.index.isin(wanted)

    assert cylon_mask.tolist() == pandas_mask.tolist()

    print(cylon_mask)

    print(pandas_mask)

    pandas_selected = pdf[pandas_mask]

    print("Pandas Output")
    print(pandas_selected)
    print(pandas_selected.index.values)

    # Cylon selects rows through a one-column table holding the mask.
    mask_table = Table.from_list(ctx, ['filter'], [cylon_mask.tolist()])
    cylon_selected = tb[mask_table]
    resultant_index = tb.index.values[cylon_mask].tolist()
    print(resultant_index)
    cylon_selected.set_index(resultant_index)
    print("PyCylon Output")
    print(cylon_selected)

    print(cylon_selected.index.values)

    assert pandas_selected.values.tolist() == cylon_selected.to_pandas().values.tolist()

    print(cylon_selected.index.values)
    print(pandas_selected.index.values)

    assert cylon_selected.index.values.tolist() == pandas_selected.index.values.tolist()
Пример #21
0
def test_filter():
    """Check that combined boolean filters select only rows satisfying them.

    For ``|`` and then ``&``, three comparisons on ``'monthly_mb'`` are
    combined left to right. Every value in the rows selected by the
    two-condition and three-condition filters must satisfy the same
    expression evaluated on the scalar.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_2.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb: Table = read_csv(ctx, table1_path, read_options)

    column_name = 'monthly_mb'

    # (combining operator, three limits, three comparison operators)
    cases = [
        (operator.__or__, [600, 5000, 15000],
         [operator.__gt__, operator.__le__, operator.__gt__]),
        (operator.__and__, [0, 5000, 1000],
         [operator.__gt__, operator.__le__, operator.__gt__]),
    ]

    for op, limit, comp_op in cases:
        print("Op ", op)
        cond_1, cond_2, cond_3 = (
            compare(tb[column_name], bound)
            for compare, bound in zip(comp_op, limit))

        first_two = op(cond_1, cond_2)
        all_three = op(first_two, cond_3)

        rows_first_two = tb[first_two]
        rows_all_three = tb[all_three]

        pdf_first_two = rows_first_two[column_name].to_pandas()
        pdf_all_three = rows_all_three[column_name].to_pandas()

        for value in pdf_first_two[column_name]:
            assert op(comp_op[0](value, limit[0]), comp_op[1](value, limit[1]))

        for value in pdf_all_three[column_name]:
            both = op(comp_op[0](value, limit[0]), comp_op[1](value, limit[1]))
            assert op(both, comp_op[2](value, limit[2]))
Пример #22
0
def test_isin():
    """Check that ``index.isin`` gives the same mask in PyCylon and pandas.

    Both frames are indexed on their first column before the comparison.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb.to_pandas()

    tb.set_index(tb.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)

    assert tb.index.values.tolist() == pdf.index.values.tolist()

    wanted = [4, 1, 10, 100, 150]

    cylon_mask = tb.index.isin(wanted)
    pandas_mask = pdf.index.isin(wanted)

    assert cylon_mask.tolist() == pandas_mask.tolist()
def test_conversion_check():
    """Check that a distributed join result converts to a NumPy array of matching shape.

    Requires a world size of 2. Each rank joins its own usage and device
    files, converts the result with ``to_numpy(order='C')``, and compares the
    array shape with the table's row and column counts.
    """
    ctx: CylonContext = CylonContext(config=MPIConfig(), distributed=True)

    rank, size = ctx.get_rank(), ctx.get_world_size()
    assert size == 2

    usage_path = f'/tmp/user_usage_tm_{rank + 1}.csv'
    device_path = f'/tmp/user_device_tm_{rank + 1}.csv'

    assert os.path.exists(usage_path)
    assert os.path.exists(device_path)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, usage_path, read_options)
    tb2: Table = read_csv(ctx, device_path, read_options)

    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type='inner',
                                      algorithm='sort',
                                      left_on=[3],
                                      right_on=[0])

    npy: np.ndarray = tb3.to_numpy(order='C')

    array_rows, array_columns = npy.shape[0], npy.shape[1]
    # The array must have one row per table row ...
    assert tb3.row_count == array_rows
    # ... and one column per table column.
    assert tb3.column_count == array_columns

    print(
        f"Rank[{ctx.get_rank()}]: Table.Rows={tb3.row_count}, Table.Columns={tb3.column_count}, "
        f"Numpy Array Shape = {npy.shape}")

    print(f"Array Config Rank[{ctx.get_rank()}], {npy.flags} {npy.dtype}")
Пример #24
0
def test_unique():
    """Check ``Table.unique`` on columns ``'a'`` and ``'b'`` against pandas.

    After de-duplicating with ``keep='first'`` and sorting by column index 3,
    the ``'d'`` column must equal the expected list and must also equal the
    ``'d'`` column of ``DataFrame.drop_duplicates``. The context is finalized
    even when an assertion fails.
    """
    ctx = cn.CylonContext(config=None, distributed=False)
    try:
        csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
        table_path = '/tmp/duplicate_data_0.csv'
        tb1: cn.Table = read_csv(ctx, table_path, csv_read_options)
        pdf: pd.DataFrame = tb1.to_pandas()

        expected_indices_of_sort_col = [1, 2, 3, 4, 5, 7, 10, 12, 13, 14, 15]

        tb2 = tb1.unique(columns=['a', 'b'], keep='first')
        pdf2 = pdf.drop_duplicates(subset=['a', 'b'])
        tb2.show()
        sort_col = tb2.sort(3).to_pydict()['d']

        assert sort_col == expected_indices_of_sort_col

        assert pdf2['d'].values.tolist() == sort_col
    finally:
        # Release the context on failure as well as on success.
        ctx.finalize()
def test_distributed_run():
    """Run a distributed inner sort-join and check the per-rank result shape.

    Requires a world size of 4. The result must have 8 columns and the
    per-rank row count listed below.

    Raises:
        Exception: If the rank is not one of 0-3.
    """
    ctx: CylonContext = CylonContext(config=MPIConfig(), distributed=True)

    usage_path = '/tmp/user_usage_tm_1.csv'
    device_path = '/tmp/user_device_tm_1.csv'

    assert os.path.exists(usage_path)
    assert os.path.exists(device_path)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    tb1: Table = read_csv(ctx, usage_path, read_options)
    tb2: Table = read_csv(ctx, device_path, read_options)

    configs = {'join_type': 'inner', 'algorithm': 'sort'}

    joined: Table = tb1.distributed_join(table=tb2,
                                         join_type=configs['join_type'],
                                         algorithm=configs['algorithm'],
                                         left_on=[3],
                                         right_on=[0])
    row_count = joined.row_count
    column_count = joined.column_count

    assert ctx.get_world_size() == 4
    assert column_count == 8

    # Expected local row count for each supported rank.
    expected_rows = {0: 640, 1: 624, 2: 592, 3: 688}

    rank = ctx.get_rank()
    if rank not in expected_rows:
        raise Exception("Parallelism not supported in this test")
    assert row_count == expected_rows[rank]
def test_getitem_with_index():
    """Check that ``__getitem__`` selections keep the same index as pandas.

    After indexing both frames on ``'a'``, a single column, a row slice and a
    column list are selected in turn; the index of each Cylon selection must
    equal the index of the pandas selection.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb.to_pandas()

    print(tb)
    print("-" * 80)
    print(pdf)

    tb.set_index('a', drop=True)
    pdf.set_index('a', drop=True, inplace=True)

    assert tb.index.values.tolist() == pdf.index.values.tolist()

    # The same selector is applied to both frames: column name, row slice,
    # then list of column names.
    for selector in ('b', slice(0, 10), ['c', 'd']):
        cylon_part = tb[selector]
        pandas_part = pdf[selector]

        print(cylon_part.index.values)
        print(pandas_part.index.values)

        assert cylon_part.index.values.tolist() == pandas_part.index.values.tolist()
Пример #27
0
def test_rl():
    """Check result shapes of local join, union, subtract and intersect.

    Two usage tables are read and combined with each operation; the row and
    column counts of every result are compared with fixed expected values.
    The context is finalized even when an assertion fails.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    try:
        table1_path = '/tmp/user_usage_tm_1.csv'
        table2_path = '/tmp/user_usage_tm_2.csv'

        assert os.path.exists(table1_path) and os.path.exists(table2_path)

        csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

        tb1: Table = read_csv(ctx, table1_path, csv_read_options)

        tb2: Table = read_csv(ctx, table2_path, csv_read_options)

        print("First Hello World From Rank {}, Size {}".format(
            ctx.get_rank(), ctx.get_world_size()))

        tb3: Table = tb1.join(table=tb2,
                              join_type='inner',
                              algorithm='hash',
                              left_on=[0],
                              right_on=[0])

        assert tb3.row_count == 458 and tb3.column_count == 8

        tb4: Table = tb1.union(tb2)

        assert tb4.row_count == 240 and tb4.column_count == 4

        tb5: Table = tb1.subtract(tb2)

        assert tb5.row_count == 0 and tb5.column_count == 4

        tb6: Table = tb1.intersect(tb2)

        assert tb6.row_count == 240 and tb6.column_count == 4
    finally:
        # Release the context on failure as well as on success.
        ctx.finalize()
Пример #28
0
def test_arrow_cylon():
    """Check that ``Table.from_arrow`` preserves the Arrow table's shape.

    A CSV file is read with ``csv.read_csv`` into an Arrow table, converted
    to a PyCylon table, and the row and column counts are compared. The
    context is finalized even when an assertion fails.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    try:
        table_path = '/tmp/user_device_tm_1.csv'

        assert os.path.exists(table_path)

        tb: pa.Table = csv.read_csv(table_path)

        arrow_columns = len(tb.columns)
        arrow_rows = tb.num_rows

        tbc = Table.from_arrow(ctx, tb)

        cylon_rows = tbc.row_count
        cylon_columns = tbc.column_count

        assert arrow_columns == cylon_columns
        assert arrow_rows == cylon_rows
    finally:
        # Release the context on failure as well as on success.
        ctx.finalize()
def test_setitem_with_index():
    """Check that adding a column with ``__setitem__`` leaves the index unchanged.

    Both frames are indexed on ``'a'``, a new column ``'e'`` is assigned on
    each side, and the index values are compared afterwards.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb.to_pandas()

    print(tb)
    print("-" * 80)
    print(pdf)

    tb.set_index('a', drop=True)
    pdf.set_index('a', drop=True, inplace=True)

    # One value per row: 0, 10, 20, ...
    new_data = list(range(0, tb.row_count * 10, 10))
    tb['e'] = Table.from_list(ctx, ['new_col'], [new_data])
    pdf['e'] = pd.DataFrame(new_data)

    print(tb.index.values)
    print(pdf.index.values)

    assert tb.index.values.tolist() == pdf.index.values.tolist()
def test_table_initialization_with_index():
    """Check the index of tables built from CSV, pandas, list and dict.

    A table read from CSV must have index ``0..row_count-1``. A table built
    from a pandas frame must share that frame's index. Tables built from a
    list or a dict of three-element columns must have index ``[0, 1, 2]``.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    csv_expected_index = list(range(tb.row_count))
    small_expected_index = [0, 1, 2]

    print(tb)
    print(tb.index.values)

    assert csv_expected_index == tb.index.values.tolist()

    cols = ['a', 'b', 'c']
    list_data = [[1, 2, 3], [4, 5, 6], [6, 7, 8]]
    dict_data = {'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [6, 7, 8]}
    pdf = pd.DataFrame(list_data, columns=cols)
    print(pdf)

    from_pandas = Table.from_pandas(ctx, pdf)
    print(from_pandas)

    assert from_pandas.index.values.tolist() == pdf.index.values.tolist()

    from_list = Table.from_list(ctx, cols, list_data)

    print(from_list)
    print(from_list.index.values)

    assert small_expected_index == from_list.index.values.tolist()

    from_dict = Table.from_pydict(ctx, dict_data)
    print(from_dict)
    print(from_dict.index.values)

    assert small_expected_index == from_dict.index.values.tolist()