def test_math_ops_for_table_values():
    """Compare Cylon column/table arithmetic with the equivalent pandas result.

    Every operator listed in ``ops`` is applied twice: column-with-column and
    table-with-column. Each Cylon result is converted back to pandas and its
    values are compared with what pandas yields for the same operands.
    """
    frame = DataFrame({
        '0': [1, 2, 3, 4],
        '1': [5, 6, 7, 9],
        '2': [1., 2., 3., 4.],
    })
    context: CylonContext = CylonContext()
    table: Table = Table.from_pandas(context, frame)

    from operator import add, sub, mul, truediv
    # Only ``add`` is active; sub, mul and truediv are imported but disabled.
    ops = [add]

    for binary_op in ops:
        # Column <op> column.
        cylon_column = binary_op(table['0'], table['0'])
        pandas_column = binary_op(frame['0'], frame['0'])

        # A pandas Series exposes a 1-D ``values`` array whereas a DataFrame
        # exposes a 2-D one, so the Cylon side is flattened before comparing.
        expected_flat = pandas_column.values.tolist()
        actual_flat = cylon_column.to_pandas().values.flatten().tolist()
        assert expected_flat == actual_flat

        # Table <op> column. On the pandas side the DataFrame method that
        # shares the operator's name is called with ``axis=0``.
        cylon_table = binary_op(table, table['0'])
        pandas_method = getattr(frame, binary_op.__name__)
        pandas_table = pandas_method(frame['0'], axis=0)

        assert pandas_table.values.tolist() == cylon_table.to_pandas().values.tolist()
Exemplo n.º 2
0
def shuffle():
    """Shuffle a small random table on column ``c1`` and print resulting shapes.

    Runs against a distributed ``CylonContext`` built from ``MPIConfig``. A
    table is first created from a Python dict and then replaced by one built
    from random NumPy columns; only the latter is shuffled. The context is
    finalized at the end.
    """
    ctx = CylonContext(config=MPIConfig(), distributed=True)
    rows = 5
    base = range(rows)

    # Deterministic table; immediately superseded by the random one below.
    tb: Table = Table.from_pydict(ctx, {
        'c1': [i for i in base],
        'c2': [i * 2 for i in base],
        'c3': [i * 3 for i in base],
    })

    random_columns = [np.random.random(size=rows) for _ in range(3)]
    tb: Table = Table.from_numpy(ctx, ['c1', 'c2', 'c3'], random_columns)

    print(tb.shape)

    tb_shuffle = tb.shuffle(['c1'])
    tb_shuffle_dna = tb_shuffle.dropna(axis=1, how='all')

    print("Rank : ", ctx.get_rank(), tb_shuffle.shape, tb.shape, tb_shuffle_dna.shape)

    from pycylon.io import CSVWriteOptions

    # The write options are built, but the CSV export itself is disabled:
    # tb_shuffle.to_csv(f'/tmp/shuffle_{rows}_{ctx.get_rank()}.csv', csv_write_options)
    csv_write_options = CSVWriteOptions().with_delimiter(',')

    ctx.finalize()
Exemplo n.º 3
0
def test_sorting():
    """Sort a three-column Cylon table by each column in both directions.

    The table holds an integer, a string and a float column. After every sort
    the sorted column is read back and each adjacent pair is checked for the
    requested ordering.
    """
    ctx: CylonContext = CylonContext()

    arrow_table: pa.Table = pa.Table.from_arrays(
        [
            pa.array([4, 2, 1, 4, 3]),
            pa.array(['ad', 'ac', 'ac', 'ab', 'a']),
            pa.array([4., 2., 1., 4., 3.]),
        ],
        names=['col1', 'col2', 'col3'])

    cylon_table = cn.Table.from_arrow(ctx, arrow_table)

    def check_sorted(column, ascending):
        """Sort by ``column`` and assert neighbouring values are ordered."""
        sorted_table = cylon_table.sort(column, ascending)
        values = sorted_table.to_pydict()[column]
        print(sorted_table)
        for pos in range(1, len(values)):
            previous, current = values[pos - 1], values[pos]
            if ascending:
                assert previous <= current
            else:
                assert previous >= current

    for ascending in (True, False):
        for column in ('col1', 'col2', 'col3'):
            check_sorted(column, ascending)
Exemplo n.º 4
0
def math_op_base():
    """Time a scalar add on one integer column in NumPy, PyCylon, pandas and PyArrow.

    A 10,000,000-row frame with 100 float columns plus an integer ``key``
    column is built, loaded into a Cylon table, and ``+ 1`` on the key column
    is timed once per library. The four timings are printed in seconds.

    The Arrow array is extracted from the Cylon table *before* the timed
    section so that the PyArrow figure covers only ``pc.add``, matching what
    is measured for the other libraries. Large intermediates that were never
    read (a NumPy copy of the frame, a record batch, extra Arrow tables) are
    no longer created.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    num_rows = 10_000_000
    data = np.random.randn(num_rows)

    df = pd.DataFrame({'data{}'.format(i): data for i in range(100)})

    np_key = np.random.randint(0, 100, size=num_rows)
    df['key'] = np_key

    ct = Table.from_pandas(ctx, df)

    # Conversion to a single Arrow chunk happens outside the timed region.
    ar_key = ct['key'].to_arrow().combine_chunks().columns[0].chunks[0]

    t1 = time.time()
    np_key + 1
    t2 = time.time()
    ct['key'] + 1
    t3 = time.time()
    df['key'] + 1
    t4 = time.time()
    pc.add(ar_key, 1)
    t5 = time.time()

    print(f"Numpy Time: {t2 - t1} s")
    print(f"PyCylon Time: {t3 - t2} s")
    print(f"Pandas Time: {t4 - t3} s")
    print(f"PyArrow Time: {t5 - t4} s")
def test_setitem():
    """Exercise ``Table.__setitem__`` with column and scalar right-hand sides.

    Column ``'4'`` is assigned over an existing column and to a new column
    name; then an int, a float and a string scalar are assigned in turn to
    column ``'6'`` and compared with a NumPy array filled with that scalar.
    """
    frame = DataFrame(np.array([[1, 2, 3, 4, 5], [-1, -2, -3, -4, -5]]))
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    # '0' replaces an existing column, '5' appends a new one at the end.
    for target in ('0', '5'):
        table[target] = table['4']
        assigned = table[target].to_pandas().values.tolist()
        source = table['4'].to_pandas().values.tolist()
        assert assigned == source

    # The first scalar creates column '6'; the later ones replace it.
    for scalar in (1, 1.0, 'aaa'):
        table['6'] = scalar
        observed = table['6'].to_pandas().values.flatten()
        assert np.array_equal(observed, np.full(table.row_count, scalar))
def test_math_ops_for_scalar():
    """Check scalar arithmetic on a Cylon column and table against pandas.

    For add, sub, mul and truediv the first column is updated in place with
    ``op(column, 2)`` on both sides, then the whole table/frame is combined
    with the scalar. Nothing is copied between iterations, so the in-place
    column updates accumulate from one operator to the next.
    """
    from operator import add, sub, mul, truediv

    frame = DataFrame(np.array([[20, 2, 3, 4, 5],
                                [10, -20, -30, -40, -50],
                                [10.2, 13.2, 16.4, 12.2, 10.8]]))
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    for binary_op in (add, sub, mul, truediv):
        # Column <op> scalar, written back to the same column on both sides.
        table['0'] = binary_op(table['0'], 2)
        frame[0] = binary_op(frame[0], 2)

        assert frame.values.tolist() == table.to_pandas().values.tolist()

        # Table <op> scalar, producing new objects.
        table_result = binary_op(table, 2)
        frame_result = binary_op(frame, 2)

        assert frame_result.values.tolist() == table_result.to_pandas().values.tolist()
def test_rename():
    """Rename table columns using a mapping and using a full list of names.

    With a dict, each renamed column must keep the position the old name had.
    With a list, the resulting column names must equal the list.
    """
    names = ['col1', 'col2', 'col3', 'col4']
    numeric_data = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10],
                    [11, 12, 13, 14, 15], [16, 17, 18, 19, 20]]
    index_values = [0, 1, 2, 3, 4]
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    # Rename through a dictionary (old name -> new name).
    dict_table = Table.from_list(ctx, names, numeric_data)
    dict_table.set_index(index_values)
    names_before = dict_table.column_names
    mapping = {'col1': 'col-1', 'col3': 'col-3'}
    dict_table.rename(mapping)
    names_after = dict_table.column_names

    for old_name, new_name in mapping.items():
        assert names_before.index(old_name) == names_after.index(new_name)

    # Rename through a list giving every column a new name.
    list_table = Table.from_list(ctx, names, numeric_data)
    list_table.set_index(index_values)
    replacement_names = ['col-1', 'col-2', 'col-3', 'col-4']
    list_table.rename(replacement_names)

    assert list_table.column_names == replacement_names
Exemplo n.º 8
0
def test_reset_index():
    """Verify ``set_index`` followed by ``reset_index`` on a Cylon table.

    Column ``'a'`` is made a LINEAR index (and dropped from the columns).
    After ``reset_index`` the table must expose columns ``['index', 'b']``
    and report a RANGE index.
    """
    from pycylon.indexing.index import IndexingType

    pdf = pd.DataFrame({
        # Pass the dtype class itself. ``np.int64()`` builds a scalar 0 and
        # only works as a dtype through implicit coercion.
        'a': pd.Series([1, 4, 7, 10, 20, 23, 10], dtype=np.int64),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    indexing_type = IndexingType.LINEAR
    drop_index = True
    cn_tb.set_index('a', indexing_type, drop_index)

    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    reset_drop_index = False
    cn_tb.reset_index(reset_drop_index)

    assert cn_tb.column_names == ['index', 'b']
    assert cn_tb.get_index().get_type() == IndexingType.RANGE
Exemplo n.º 9
0
def test_loc_with_list():
    """Build matching indexed Cylon/pandas tables and print basic information.

    Two identical random columns plus an ``index`` column are created, the
    ``index`` column is set as the index on both sides, and the shapes and the
    first pandas index value (with its type) are printed. No assertions are
    made.
    """
    ctx = CylonContext(config=None, distributed=False)

    num_rows = 10_000
    num_columns = 2

    data = np.random.randn(num_rows)
    pdf = pd.DataFrame({'data{}'.format(i): data for i in range(num_columns)})

    index_frame = pd.DataFrame(list(range(num_rows)))
    pdf['index'] = index_frame

    tb1 = Table.from_pandas(ctx, pdf)
    tb1['index'] = Table.from_pandas(ctx, index_frame)

    index_column = 'index'
    tb1.set_index(index_column, drop=True)
    pdf.set_index(index_column, drop=True, inplace=True)

    print(tb1.shape, pdf.shape)
    first_index = pdf.index.values[0]
    print(type(first_index), first_index)
Exemplo n.º 10
0
def test_isin_with_index():
    """Filter rows via ``isin`` on column ``'b'`` with column ``'a'`` as index.

    The CSV at ``/tmp/duplicate_data_0.csv`` is loaded through Cylon and
    mirrored into pandas. Both the filtered values and the filtered index
    values must match between the two libraries.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()
    wanted = [20, 11, 23]

    print(tb1)

    print(pdf)

    tb1.set_index('a', drop=True)
    pdf.set_index('a', inplace=True)

    # Not used by the assertions below; kept as in the original test.
    filter_pdf: pd.DataFrame = pdf[['b', 'c']].iloc[0:5]

    cylon_rows = tb1[tb1['b'].isin(wanted)]
    pandas_rows = pdf[pdf['b'].isin(wanted)]

    print(cylon_rows)
    print(pandas_rows)

    assert cylon_rows.to_pandas().values.tolist() == pandas_rows.values.tolist()

    print(cylon_rows.index.values)
    print(pandas_rows.index.values)

    assert cylon_rows.index.values.tolist() == pandas_rows.index.values.tolist()
Exemplo n.º 11
0
def test_df_iterrows():
    """Rebuild (index, row) pairs from a Cylon table and compare with ``iterrows``.

    The first column becomes the index on both the Cylon table and its pandas
    mirror. Rows are reassembled from the Cylon column dictionary and checked
    pairwise against ``DataFrame.iterrows``.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)
    print(pdf)

    for idx, row in pdf.iterrows():
        print(idx)
        print(row)

    column_data = tb1.to_pydict(with_index=False)
    indices = tb1.index.index_values

    # One list per index entry, holding that position's value of each column.
    rows = [
        [column_data[name][position] for name in column_data]
        for position, _ in enumerate(indices)
    ]

    for index, row in zip(indices, rows):
        print(index, row)

    for cylon_index, cylon_row, (pandas_index, pandas_row) in zip(
            indices, rows, pdf.iterrows()):
        assert cylon_index == pandas_index
        assert cylon_row == pandas_row.tolist()
Exemplo n.º 12
0
def test_isin_column():
    """Compare the boolean mask from ``isin`` on column ``'b'`` with pandas.

    The first column is used as the index on both sides before the masks are
    computed; the masks are then compared as flat Python lists.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    tb1.set_index(tb1.column_names[0], drop=True)
    pdf.set_index(pdf.columns[0], drop=True, inplace=True)

    print(tb1)
    print(pdf)

    candidates = [10, 20, 30, 5, 2, 8]

    cylon_mask = tb1['b'].isin(candidates)
    pandas_mask = pdf['b'].isin(candidates)

    print(cylon_mask)

    print(pandas_mask)

    # The Cylon mask converts to a one-column frame, hence the flatten.
    cylon_flags = cylon_mask.to_pandas().values.flatten().tolist()
    pandas_flags = pandas_mask.values.tolist()

    assert cylon_flags == pandas_flags

    print(cylon_flags)
    print(pandas_flags)
Exemplo n.º 13
0
def test_df_with_index():
    """Print how an indexed pandas frame looks once converted to Arrow.

    The CSV is read through Cylon and converted to pandas; its first column is
    set as the index and the frame is passed to ``pa.Table.from_pandas`` with
    ``preserve_index=True``. Intermediate frames and the final Arrow table are
    printed; nothing is asserted.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    first_column = pdf.columns[0]
    print(first_column)
    first_column_frame = pdf[[pdf.columns[0]]]

    print(pdf)

    print(first_column_frame)

    indexed_frame = pdf.set_index(pdf.columns[0], drop=True)

    print(indexed_frame)

    arrow_kwargs = dict(df=indexed_frame,
                        schema=None,
                        preserve_index=True,
                        nthreads=None,
                        columns=None,
                        safe=False)
    arrow_table = pa.Table.from_pandas(**arrow_kwargs)

    print(arrow_table)
Exemplo n.º 14
0
def test_unique():
    """Compare ``unique`` on column ``'b'`` between Cylon and pandas.

    The unique values must match both as ordered lists and as sets. The
    context is finalized once the assertions pass.
    """
    ctx = CylonContext(config=None, distributed=False)
    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, '/tmp/duplicate_data_0.csv', read_options)
    pdf: pd.DataFrame = tb1.to_pandas()

    print("Original Data")
    print(pdf)

    cylon_unique = tb1['b'].unique()
    pandas_unique = pdf['b'].unique()
    cylon_unique.show()

    print("Unique Pdf")
    print(pandas_unique)
    print(type(pandas_unique))

    print("Unique Cylon")
    print(cylon_unique)

    # The result table has a single column; take that column's values.
    cylon_values = list(cylon_unique.to_pydict().values())[0]
    pandas_values = pandas_unique.tolist()

    assert cylon_values == pandas_values

    pandas_set = set(pandas_unique)
    cylon_set = set(cylon_values)

    assert cylon_set == pandas_set

    ctx.finalize()
Exemplo n.º 15
0
def math_op(num_rows: int, num_cols: int, unique_factor: float, op=add):
    """Time ``op(table, scalar)`` for Cylon, pandas and ``pd.eval``.

    Args:
        num_rows: Row count forwarded to ``get_dataframe``.
        num_cols: Column count forwarded to ``get_dataframe``.
        unique_factor: Forwarded to ``get_dataframe``.
        op: Binary callable applied to the table/frame and the scalar.

    Returns:
        Tuple ``(pandas_time, pandas_eval_time, cylon_time)`` in seconds.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")

    # NOTE: ``pdf``, ``math_value`` and ``op`` are referenced by name inside
    # the ``pd.eval`` expression below, so these locals must keep their names.
    pdf = get_dataframe(num_rows=num_rows,
                        num_cols=num_cols,
                        unique_factor=unique_factor)
    first_column_values = pdf[pdf.columns[0]]
    # The scalar operand is a randomly chosen element of the first column.
    random_index = np.random.randint(low=0, high=pdf.shape[0])
    math_value = first_column_values.values[random_index]
    tb = Table.from_pandas(ctx, pdf)

    started = time.time()
    op(tb, math_value)
    cylon_elapsed = time.time() - started

    started = time.time()
    op(pdf, math_value)
    pandas_elapsed = time.time() - started

    started = time.time()
    pd.eval("op(pdf, math_value)")
    pandas_eval_elapsed = time.time() - started

    return pandas_elapsed, pandas_eval_elapsed, cylon_elapsed
Exemplo n.º 16
0
def test_temporal_types():
    """Sort a Cylon table ascending on a datetime column and verify the order.

    The table has an integer, a datetime and a float column; only the
    datetime column ``'col2'`` is sorted and checked.
    """
    ctx: CylonContext = CylonContext()

    timestamps = [
        datetime.datetime(2020, 5, 7),
        datetime.datetime(2020, 3, 17),
        datetime.datetime(2020, 1, 17),
    ]
    arrow_table: pa.Table = pa.Table.from_arrays(
        [pa.array([4, 2, 1]), pa.array(timestamps), pa.array([4., 2., 1.])],
        names=['col1', 'col2', 'col3'])

    cylon_table = cn.Table.from_arrow(ctx, arrow_table)

    def check_sorted(column, ascending):
        """Sort by ``column`` and assert neighbouring values are ordered."""
        sorted_table = cylon_table.sort(column, ascending)
        values = sorted_table.to_pydict()[column]
        print(sorted_table)
        for pos in range(1, len(values)):
            previous, current = values[pos - 1], values[pos]
            if ascending:
                assert previous <= current
            else:
                assert previous >= current

    check_sorted('col2', True)
            
Exemplo n.º 17
0
def test_distributed_sort():
    """Run ``distributed_sort`` on ``use_id`` and check the local ordering.

    Requires a world size of 4 and one CSV per rank at
    ``/tmp/user_usage_tm_<rank + 1>.csv``. After sorting, the ``use_id``
    column held by this rank must be non-decreasing.
    """
    import numpy as np

    ctx: CylonContext = CylonContext(config=MPIConfig(), distributed=True)

    rank = ctx.get_rank()
    size = ctx.get_world_size()

    assert size == 4

    table1_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb1: Table = read_csv(ctx, table1_path, read_options)

    print(tb1)

    sorted_tb = tb1.distributed_sort(order_by='use_id')

    # Collapse the column to one dimension, keeping its length.
    use_ids = sorted_tb['use_id'].to_numpy()
    use_ids = np.reshape(use_ids, (use_ids.shape[0]))

    # Sorted means no element is strictly greater than its successor.
    out_of_order = any(
        use_ids[pos] > use_ids[pos + 1] for pos in range(use_ids.shape[0] - 1)
    )

    assert not out_of_order
Exemplo n.º 18
0
def null_handling_op(num_rows: int, num_cols: int, unique_factor: float):
    """Time ``dropna`` for pandas, Cylon and ``pd.eval`` on a frame with nulls.

    Args:
        num_rows: Row count forwarded to ``get_dataframe``.
        num_cols: Column count forwarded to ``get_dataframe``.
        unique_factor: Forwarded to ``get_dataframe`` for the data frame.

    Returns:
        Tuple ``(pandas_time, cylon_time, pandas_eval_time)`` in seconds.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    # NOTE: ``df`` is referenced by name inside the ``pd.eval`` expression
    # below, so this local must keep its name.
    df = get_dataframe(num_rows=num_rows,
                       num_cols=num_cols,
                       unique_factor=unique_factor,
                       with_null=True)
    # A separate single-column, null-free frame supplies the index column.
    index_source = get_dataframe(num_rows=num_rows,
                                 num_cols=1,
                                 unique_factor=1.0,
                                 with_null=False)
    index_column = 'index_col'
    df[index_column] = index_source
    ct = Table.from_pandas(ctx, df)

    df.set_index(index_column, inplace=True, drop=True)
    ct.set_index(index_column, indexing_type=IndexingType.LINEAR, drop=True)

    # NOTE(review): pandas is called with axis=1 and Cylon with axis=0;
    # presumably the two libraries number the axes differently -- confirm.
    started = time.time()
    pandas_result = df.dropna(axis=1)
    pandas_elapsed = time.time() - started

    started = time.time()
    cylon_result = ct.dropna(axis=0)
    cylon_elapsed = time.time() - started

    started = time.time()
    pd.eval('df.dropna(axis=1)')
    pandas_eval_elapsed = time.time() - started

    print(pandas_result.shape, cylon_result.shape)
    return pandas_elapsed, cylon_elapsed, pandas_eval_elapsed
Exemplo n.º 19
0
def benchmark_map_numeric():
    """Time ``applymap`` with ``x + x`` on Cylon and pandas for 10M-row data.

    Two random float columns are wrapped in an Arrow table, which feeds both
    the pandas frame and the Cylon table. Elapsed times are printed in
    seconds.
    """
    N = 10_000_000
    first_values = np.random.random(size=N)
    second_values = np.random.random(size=N)

    arrow_table = pa.Table.from_arrays(
        [pa.array(first_values), pa.array(second_values)], ['c1', 'c2'])
    pdf: pd.DataFrame = arrow_table.to_pandas()

    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cylon_table: Table = Table.from_arrow(ctx, arrow_table)

    print(pdf.shape, cylon_table.shape)

    def double(x):
        """Return ``x`` added to itself."""
        return x + x

    cylon_start = time.time()
    cylon_mapped = cylon_table.applymap(double)
    cylon_end = time.time()
    pandas_mapped = pdf.applymap(double)
    pandas_end = time.time()

    print(f"Time for Cylon Apply Map {cylon_end - cylon_start} s")
    print(f"Time for Pandas Apply Map {pandas_end - cylon_end} s")
Exemplo n.º 20
0
def test_i_bitwise_ops():
    """Check in-place ``&=`` and ``|=`` on Cylon boolean masks against pandas.

    For each operator the in-place result must equal the out-of-place result
    on the Cylon side, and must equal the pandas in-place result.
    """
    # TODO: Improve test and functionality: https://github.com/cylondata/cylon/issues/229
    frame = DataFrame(np.array([[20, 2, 3, 4, 5],
                                [10, -20, -30, -40, -50],
                                [36.2, 13.2, 16.4, 12.2, 10.8]]))
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    def fresh_masks():
        """Return new (cylon_a, cylon_b, pandas_a, pandas_b) comparison masks."""
        return table['0'] > 10, table['1'] > 2, frame[0] > 10, frame[1] > 2

    # AND
    cylon_a, cylon_b, pandas_a, pandas_b = fresh_masks()

    cylon_out_of_place = cylon_a & cylon_b
    cylon_a &= cylon_b
    pandas_out_of_place = pandas_a & pandas_b
    pandas_a &= pandas_b

    assert cylon_out_of_place.to_pandas().values.tolist() == cylon_a.to_pandas().values.tolist()
    assert cylon_a.to_pandas().values.flatten().tolist() == pandas_a.values.tolist()

    # OR
    cylon_a, cylon_b, pandas_a, pandas_b = fresh_masks()

    cylon_out_of_place = cylon_a | cylon_b
    cylon_a |= cylon_b
    pandas_out_of_place = pandas_a | pandas_b
    pandas_a |= pandas_b

    assert cylon_out_of_place.to_pandas().values.tolist() == cylon_a.to_pandas().values.tolist()
    assert cylon_a.to_pandas().values.flatten().tolist() == pandas_a.values.tolist()
Exemplo n.º 21
0
def test_loc_op_mode_3():
    """Compare ``loc`` selections on a string-valued LINEAR index with pandas.

    Column ``'a'`` (strings) becomes the index on both sides. A closed slice,
    two half-open slices, a full slice and a label list with a full column
    slice are each applied through ``loc``; row values and index values must
    match pandas for every selector.
    """
    from pycylon.indexing.index import IndexingType
    from pycylon.indexing.index_utils import IndexUtil

    pdf = pd.DataFrame({
        'a': pd.Series(["1", "4", "7", "10", "20", "23", "11"]),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
        'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
        'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
        'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                       dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)

    print("Before Indexing")
    print(cn_tb)

    cn_tb.set_index('a', IndexingType.LINEAR, True)
    pdf = pdf.set_index('a')

    print("After Indexing")
    assert cn_tb.column_names == ['b', 'c', 'd', 'e']

    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    # Each entry is exactly what the matching subscript expression passes to
    # ``loc``: ["7":"20"], ["7":], [:"7"], [:] and [["7", "20"], :].
    selectors = [
        slice("7", "20"),
        slice("7", None),
        slice(None, "7"),
        slice(None, None),
        (["7", "20"], slice(None, None)),
    ]

    for position, selector in enumerate(selectors):
        cylon_selection = cn_tb.loc[selector]
        pandas_selection = pdf.loc[selector]

        # Index contents are printed for the first selector only.
        if position == 0:
            print(cylon_selection.get_index().get_index_array())
            print(pandas_selection.index.values)

        assert pandas_selection.values.tolist() == cylon_selection.to_pandas().values.tolist()
        assert cylon_selection.get_index().get_index_array() == pa.array(pandas_selection.index)
Exemplo n.º 22
0
def test_neg():
    """Check that unary minus on a Cylon table matches pandas."""
    frame = DataFrame(np.array([[1, 2, 3, 4, 5, -6, -7],
                                [-1, -2, -3, -4, -5, 6, 7]]))
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    negated_table: Table = -table
    negated_frame = -frame

    cylon_values = negated_table.to_pandas().values.tolist()
    assert cylon_values == negated_frame.values.tolist()
Exemplo n.º 23
0
def test_properties():
    """Compare comparison-operator filters on a Cylon table with pandas.

    For each of ==, !=, <, >, <=, >= a boolean filter is built against the
    value 519.12, once on column ``'monthly_mb'`` and once on the whole
    table, and then used to index the source. Both the filter and the
    filtered result must match pandas.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    table1_path = '/tmp/user_usage_tm_1.csv'
    table2_path = '/tmp/user_usage_tm_2.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)
    tb: Table = read_csv(ctx, table1_path, read_options)
    pdf = tb.to_pandas()

    def build_mask_and_selection(compare, column, data, threshold):
        """Return ``(mask, data[mask])`` for a column or for the whole object."""
        operand = data[column] if column else data
        mask = compare(operand, threshold)
        return mask, data[mask]

    def assert_matches(tb_mask, tb_selected, pdf_mask, pdf_selected, whole_table):
        """Assert Cylon mask/selection values equal their pandas counterparts."""
        if whole_table:
            assert tb_mask.to_pandas().values.tolist() == pdf_mask.values.tolist()
            # Whole-table selections are compared after filling nulls with 0.
            cylon_filled = tb_selected.to_pandas().fillna(0).values.tolist()
            assert cylon_filled == pdf_selected.fillna(0).values.tolist()
        else:
            # A column mask converts to a one-column frame, hence the flatten.
            assert tb_mask.to_pandas().values.flatten().tolist() == pdf_mask.values.tolist()
            assert tb_selected.to_pandas().values.tolist() == pdf_selected.values.tolist()

    comparisons = [
        operator.__eq__, operator.__ne__, operator.__lt__, operator.__gt__,
        operator.__le__, operator.__ge__
    ]
    value = 519.12
    # (column, whole_table): a falsy column selects the whole-table branch.
    cases = [('monthly_mb', False), (None, True)]

    for column, whole_table in cases:
        for compare in comparisons:
            tb_mask, tb_selected = build_mask_and_selection(
                compare, column, tb, value)
            pdf_mask, pdf_selected = build_mask_and_selection(
                compare, column, pdf, value)

            assert_matches(tb_mask, tb_selected, pdf_mask, pdf_selected,
                           whole_table)
Exemplo n.º 24
0
def fixed_filter_bench():
    """Time ``key > 5`` filtering and row selection in several libraries.

    Builds a 10,000,000-row frame (two float columns plus an integer ``key``)
    and measures, for PyCylon, pandas and NumPy, how long it takes to build
    the boolean mask and to select rows with it. Then the Cylon mask is
    applied column by column to the Arrow table; that filtering and the final
    table assembly are timed separately. All timings are printed.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    num_rows = 10_000_000
    data = np.random.randn(num_rows)

    df = pd.DataFrame({'data{}'.format(i): data for i in range(2)})

    np_key = np.random.randint(0, 100, size=num_rows)
    # Snapshot of the float columns only; ``key`` is added afterwards.
    np_all = df.to_numpy()

    df['key'] = np_key

    rb = pa.record_batch(df)
    arrow_table = pa.Table.from_pandas(df)

    ct = Table.from_pandas(ctx, df)

    print(ct.shape, df.shape)

    # Consecutive timestamps bracket: Cylon mask, pandas mask, Cylon select,
    # pandas select, NumPy mask, NumPy select.
    t1 = time.time()
    ct_filter = ct['key'] > 5
    t2 = time.time()
    df_filter = df['key'] > 5
    t3 = time.time()
    ct_res = ct[ct_filter]
    t4 = time.time()
    df_res = df[df_filter]
    t5 = time.time()
    np_filter = np_key > 5
    t6 = time.time()
    np_res = np_all[np_filter]
    t7 = time.time()

    print(f"PyCylon filter time :  {t2 - t1} s")
    print(f"Pandas filter time: {t3 - t2} s")
    print(f"Numpy filter time: {t6 - t5} s")
    print(f"PyCylon assign time: {t4 - t3} s")
    print(f"Pandas assign time: {t5 - t4} s")
    print(f"Numpy assign time: {t7 - t6} s")

    # Reuse the Cylon mask as a single Arrow boolean array.
    mask_table = ct_filter.to_arrow().combine_chunks()
    mask_array = mask_table.columns[0].chunks[0]

    t_ar_s = time.time()
    arrow_table = arrow_table.combine_chunks()
    from pyarrow import compute
    filtered_columns = []
    for column in arrow_table.itercolumns():
        filtered_columns.append(column.filter(mask_array))
    t_ar_e = time.time()
    res_t = pa.Table.from_arrays(filtered_columns, ct.column_names)
    t_ar_e_2 = time.time()

    print(f"PyArrow Filter Time : {t_ar_e - t_ar_s}")
    print(f"PyArrow Table Creation : {t_ar_e_2 - t_ar_e}")
Exemplo n.º 25
0
def test_notnull():
    """Check ``Table.notnull`` against ``DataFrame.notnull`` on data with Nones."""
    column_names = ['col1', 'col2']
    values = [[1, 2, 3, 4, 5, None], [None, 7, 8, 9, 10, 11]]
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table = cn.Table.from_list(ctx, column_names, values)
    frame = table.to_pandas()

    expected = frame.notnull().values.tolist()
    actual = table.notnull().to_pandas().values.tolist()
    assert expected == actual
Exemplo n.º 26
0
def test_empty_table():
    """Check that an ``EmptyTable`` converts to a pandas frame with no values."""
    from pycylon.data.table import EmptyTable
    from pycylon.index import RangeIndex
    import pandas as pd

    ctx: CylonContext = CylonContext(config=None, distributed=False)
    empty_index = RangeIndex(data=range(0, 0))
    empty_table = EmptyTable(ctx, empty_index)

    converted_values = empty_table.to_pandas().values.tolist()
    assert converted_values == pd.DataFrame().values.tolist()
Exemplo n.º 27
0
def test_context_and_configs():
    """Create a distributed context from ``MPIConfig`` and print rank and size."""
    from pycylon.net.mpi_config import MPIConfig
    from pycylon import CylonContext

    ctx: CylonContext = CylonContext(config=MPIConfig(), distributed=True)

    rank = ctx.get_rank()
    world_size = ctx.get_world_size()
    print("Hello World From Rank {}, Size {}".format(rank, world_size))
def test_multi_process():
    """Run the same distributed inner sort-join three ways and compare results.

    Requires a world size of 4 and, per rank, the CSV files
    ``/tmp/user_device_tm_<rank + 1>.csv`` and
    ``/tmp/user_usage_tm_<rank + 1>.csv``. The join is specified by column
    position, by column name on each side, and by a shared ``on`` name. All
    three results must have 8 columns and the row count expected for this
    rank.
    """
    mpi_config = MPIConfig()
    ctx: CylonContext = CylonContext(config=mpi_config, distributed=True)

    rank, size = ctx.get_rank(), ctx.get_world_size()

    assert size == 4

    csv_read_options = CSVReadOptions().use_threads(True).block_size(1 << 30)

    table1_path = f'/tmp/user_device_tm_{rank + 1}.csv'
    table2_path = f'/tmp/user_usage_tm_{rank + 1}.csv'

    assert os.path.exists(table1_path) and os.path.exists(table2_path)

    tb1: Table = read_csv(ctx, table1_path, csv_read_options)
    tb2: Table = read_csv(ctx, table2_path, csv_read_options)

    print(tb1.column_names)
    print(tb2.column_names)

    join_type = 'inner'
    algorithm = 'sort'

    # Join keys given as column positions.
    tb3: Table = tb1.distributed_join(table=tb2,
                                      join_type=join_type,
                                      algorithm=algorithm,
                                      left_on=[0],
                                      right_on=[3])

    # Join keys given as column names on each side.
    tb4: Table = tb1.distributed_join(table=tb2,
                                      join_type=join_type,
                                      algorithm=algorithm,
                                      left_on=['use_id'],
                                      right_on=['use_id'])

    # Join key given once for both sides.
    tb5: Table = tb1.distributed_join(table=tb2,
                                      join_type=join_type,
                                      algorithm=algorithm,
                                      on=['use_id'])

    # Previously ``tb4`` appeared twice here and ``tb5`` was never checked.
    assert tb3.column_count == tb4.column_count == tb5.column_count == 8

    # Expected joined row count per rank; ``size == 4`` guarantees a hit.
    expected_rows_by_rank = {0: 640, 1: 624, 2: 592, 3: 688}
    expected_rows = expected_rows_by_rank[rank]
    assert tb3.row_count == tb4.row_count == tb5.row_count == expected_rows
Exemplo n.º 29
0
def test_add_suffix():
    """Check ``Table.add_suffix`` yields the same column names as pandas."""
    frame = DataFrame(np.array([[20.2, 2.0, 3.2, 4.3, 5.5],
                                [10, -20, -30, -40, -50],
                                [36.8, 13.2, 16.4, 12.2, 10.8]]))
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    suffix = "item_"
    suffixed_table = table.add_suffix(suffix)
    suffixed_frame = frame.add_suffix(suffix)

    expected_names = suffixed_frame.columns.tolist()
    assert expected_names == suffixed_table.column_names
Exemplo n.º 30
0
def test_read_csv_with_use_cols():
    """Check that column selection at CSV read time matches pandas ``usecols``."""
    ctx = CylonContext(config=None, distributed=False)
    use_cols = ['a', 'b']
    read_options = (
        CSVReadOptions()
        .use_threads(True)
        .block_size(1 << 30)
        .use_cols(use_cols)
    )
    table_path = '/tmp/duplicate_data_0.csv'

    tb1: Table = read_csv(ctx, table_path, read_options)
    pdf = pd.read_csv(table_path, usecols=use_cols)

    # Both readers must return exactly the requested columns, in that order.
    assert tb1.column_names == use_cols
    assert use_cols == pdf.columns.tolist()