Пример #1
0
def test_loc_with_list():
    """Build a pandas frame and a PyCylon table indexed by the same column.

    Both containers receive an ``index`` column holding ``0..num_rows-1``
    which is then promoted to the index.  The resulting shapes and the first
    pandas index label are printed.

    NOTE(review): despite the name, no ``.loc`` lookup and no assertion is
    performed here.
    """
    ctx = CylonContext(config=None, distributed=False)

    num_rows = 10_000
    num_columns = 2

    data = np.random.randn(num_rows)
    index_vals = list(range(num_rows))

    # Every data column shares the same random values.
    pdf = pd.DataFrame({f'data{i}': data for i in range(num_columns)})
    index_df_col = pd.DataFrame(index_vals)
    pdf['index'] = index_df_col

    tb1 = Table.from_pandas(ctx, pdf)
    tb1['index'] = Table.from_pandas(ctx, index_df_col)
    index_column = 'index'
    tb1.set_index(index_column, drop=True)
    pdf.set_index(index_column, drop=True, inplace=True)

    print(tb1.shape, pdf.shape)
    i0 = pdf.index.values[0]
    print(type(i0), i0)
def test_rename():
    """Check ``Table.rename`` with a mapping and with a full list of names."""
    col_names = ['col1', 'col2', 'col3', 'col4']
    data_list_numeric = [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10],
                         [11, 12, 13, 14, 15], [16, 17, 18, 19, 20]]
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    index_values = [0, 1, 2, 3, 4]

    # Rename a subset of the columns through an {old: new} mapping.
    cn_tb = Table.from_list(ctx, col_names, data_list_numeric)
    cn_tb.set_index(index_values)
    prev_col_names = cn_tb.column_names
    columns = {'col1': 'col-1', 'col3': 'col-3'}
    cn_tb.rename(columns)
    new_col_names = cn_tb.column_names

    # A renamed column must sit at the position its old name had.
    for old_name, new_name in columns.items():
        assert prev_col_names.index(old_name) == new_col_names.index(new_name)

    # Rename every column by passing the complete list of new names.
    cn_tb_list = Table.from_list(ctx, col_names, data_list_numeric)
    cn_tb_list.set_index(index_values)
    new_column_names = ['col-1', 'col-2', 'col-3', 'col-4']
    cn_tb_list.rename(new_column_names)

    assert cn_tb_list.column_names == new_column_names
Пример #3
0
def shuffle():
    """Shuffle a small random table on column ``c1`` in a distributed context.

    Prints the table shape before the shuffle and then, together with the
    rank, the shapes of the shuffled table, the source table and the shuffled
    table after ``dropna(axis=1, how='all')``.  The context is finalized at
    the end.
    """
    mpi_config = MPIConfig()
    ctx = CylonContext(config=mpi_config, distributed=True)

    rows = 5
    # Three columns, each filled with an independent random draw.
    tb: Table = Table.from_numpy(ctx, ['c1', 'c2', 'c3'],
                                 [np.random.random(size=rows) for _ in range(3)])

    print(tb.shape)

    tb_shuffle = tb.shuffle(['c1'])
    tb_shuffle_dna = tb_shuffle.dropna(axis=1, how='all')

    print("Rank : ", ctx.get_rank(), tb_shuffle.shape, tb.shape, tb_shuffle_dna.shape)

    ctx.finalize()
Пример #4
0
def test_pdf_to_pdf_assign():
    """Print a masked Series assignment in pandas next to a PyCylon filter.

    Three small frames with different index labels are mirrored as PyCylon
    tables.  Column ``b`` of the first pandas frame is overwritten, where it
    is below 6, with column ``b`` of the third frame; column ``b`` of the
    first table is filtered with the same condition.  Everything is printed,
    nothing is asserted.
    """
    rule = "-----------"

    index1 = list(range(9))
    index2 = list(range(5))
    index3 = [10, 11, 12, 13, 14, 15, 16, 17, 7]

    pdf1 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 110, 111, 112, 113],
                         'b': [10, 11, 12, 13, 14, 5, 4, 3, 2]},
                        index=index1)
    pdf2 = pd.DataFrame({'a': [10, 20, 30, 40, 50],
                         'b': [100, 101, 102, 103, 104]},
                        index=index2)
    pdf3 = pd.DataFrame({'a': [1, 2, 3, 4, 5, 110, 111, 112, 113],
                         'b': [1110, 1111, 1112, 1113, 1114, 115, 114, 113, 112]},
                        index=index3)

    # Mirror each frame as a table carrying the same index labels.
    tables = []
    for frame, labels in ((pdf1, index1), (pdf2, index2), (pdf3, index3)):
        table = Table.from_pandas(ctx, frame)
        table.set_index(labels)
        tables.append(table)
    tb1 = tables[0]

    for frame in (pdf1, pdf2, pdf3):
        print(frame)
        print(rule)

    # pandas: show the selection, then overwrite it from the third frame.
    pd_col = pdf1['b']
    print(pd_col[pdf1['b'] < 6])
    print(pd_col)
    print(rule)
    pd_col[pdf1['b'] < 6] = pdf3['b']
    print(pd_col)

    # PyCylon: same column and the same condition, selection only.
    tb_col = tb1['b']
    print(tb_col)
    print(tb_col.index.index_values)
    tb_below = tb_col[tb1['b'] < 6]
    print(tb_below)
    print(tb_below.index.index_values)
Пример #5
0
    def do_comparison_on_pdf_and_tb(tb_filter: Table, tb_result: Table,
                                    pdf_filter: DataFrame,
                                    pdf_result: DataFrame, is_full_table):
        """Assert that a PyCylon filter/result pair equals its pandas pair.

        With ``is_full_table`` set, the filters are compared as nested lists
        and the results are compared after replacing missing values with 0.
        Otherwise the PyCylon filter is flattened before the comparison and
        the results are compared unchanged.
        """
        cylon_mask = tb_filter.to_pandas().values
        pandas_mask = pdf_filter.values.tolist()

        if not is_full_table:
            assert cylon_mask.flatten().tolist() == pandas_mask
            cylon_rows = tb_result.to_pandas().values.tolist()
            assert cylon_rows == pdf_result.values.tolist()
            return

        assert cylon_mask.tolist() == pandas_mask
        cylon_rows = tb_result.to_pandas().fillna(0).values.tolist()
        assert cylon_rows == pdf_result.fillna(0).values.tolist()
Пример #6
0
def math_op(num_rows: int, num_cols: int, unique_factor: float, op=add):
    """Time one binary operator on a PyCylon table and on a pandas frame.

    Args:
        num_rows: forwarded to ``get_dataframe``.
        num_cols: forwarded to ``get_dataframe``.
        unique_factor: forwarded to ``get_dataframe``.
        op: two-argument callable, invoked as ``op(table_or_frame, scalar)``.

    Returns:
        ``(pandas_math_op_time, pandas_eval_math_op_time, cylon_math_op_time)``
        as elapsed seconds.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")

    pdf = get_dataframe(num_rows=num_rows,
                        num_cols=num_cols,
                        unique_factor=unique_factor)
    # The scalar operand is a randomly picked value of the first column.
    first_column_data = pdf[pdf.columns[0]]
    random_index = np.random.randint(low=0, high=pdf.shape[0])
    math_value = first_column_data.values[random_index]
    tb = Table.from_pandas(ctx, pdf)

    # perf_counter is monotonic, unlike time.time(), so the differences below
    # cannot be distorted by wall-clock adjustments.
    started = time.perf_counter()
    op(tb, math_value)
    cylon_math_op_time = time.perf_counter() - started

    started = time.perf_counter()
    op(pdf, math_value)
    pandas_math_op_time = time.perf_counter() - started

    # pd.eval looks up `op`, `pdf` and `math_value` by name in this function's
    # scope, so those locals must keep exactly these names.
    started = time.perf_counter()
    pd.eval("op(pdf, math_value)")
    pandas_eval_math_op_time = time.perf_counter() - started

    return pandas_math_op_time, pandas_eval_math_op_time, cylon_math_op_time
Пример #7
0
def math_op_base():
    """Print how long ``key + 1`` takes in NumPy, PyCylon, pandas and PyArrow.

    The same integer key column of ``num_rows`` values is used for all four
    measurements.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    num_rows = 10_000_000
    data = np.random.randn(num_rows)

    df = pd.DataFrame({f'data{i}': data for i in range(100)})
    np_key = np.random.randint(0, 100, size=num_rows)
    df['key'] = np_key

    ct = Table.from_pandas(ctx, df)

    # Pull the Arrow array out before any clock starts so that the PyArrow
    # figure covers the addition only, not the PyCylon -> Arrow conversion.
    ar_key = ct['key'].to_arrow().combine_chunks().columns[0].chunks[0]

    t1 = time.perf_counter()
    np_key + 1
    t2 = time.perf_counter()
    ct['key'] + 1
    t3 = time.perf_counter()
    df['key'] + 1
    t4 = time.perf_counter()
    pc.add(ar_key, 1)
    t5 = time.perf_counter()

    print(f"Numpy Time: {t2 - t1} s")
    print(f"PyCylon Time: {t3 - t2} s")
    print(f"Pandas Time: {t4 - t3} s")
    print(f"PyArrow Time: {t5 - t4} s")
Пример #8
0
def test_additions_and_maps():
    """Map three columns of a PyCylon table, then filter rows by one of them.

    ``col-1`` gets an ``"_i"`` suffix and ``col-2`` is replaced by its base-10
    logarithm; rows are then restricted to those where ``col-3`` equals
    ``'Y'``.  The pandas frame is only filtered with the same condition.  Both
    results are printed as dictionaries; nothing is asserted.
    """
    from pycylon import Table
    from pycylon import CylonContext
    import pandas as pd
    import numpy as np

    frame = pd.DataFrame({
        'idx': ['x', 'y', 'z'],
        'col-1': ["a", "b", "c"],
        'col-2': [10, 20, 30],
        'col-3': ['Y', 'N', 'Y'],
    })

    table = Table.from_pandas(ctx, frame)
    print(table)

    # Compute all derived columns before writing any of them back.
    suffixed = table['col-1'].applymap(lambda x: x + "_i")
    logged = table['col-2'].applymap(lambda x: np.log10(x))
    is_yes = table['col-3'].applymap(lambda x: (x == 'Y'))

    table['col-1'] = suffixed
    table['col-2'] = logged

    table = table[is_yes]
    frame = frame[frame['col-3'].map(lambda x: (x == 'Y'))]

    print(frame.to_dict())

    print(table.to_pydict())
Пример #9
0
def null_handling_op(num_rows: int, num_cols: int, unique_factor: float):
    """Time ``dropna`` on a pandas frame, a PyCylon table and via ``pd.eval``.

    Args:
        num_rows: forwarded to ``get_dataframe``.
        num_cols: forwarded to ``get_dataframe`` for the data frame.
        unique_factor: forwarded to ``get_dataframe`` for the data frame.

    Returns:
        ``(pandas_time, cylon_time, pandas_eval_time)`` as elapsed seconds.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)

    df = get_dataframe(num_rows=num_rows,
                       num_cols=num_cols,
                       unique_factor=unique_factor,
                       with_null=True)
    index_df = get_dataframe(num_rows=num_rows,
                             num_cols=1,
                             unique_factor=1.0,
                             with_null=False)
    index_column = 'index_col'
    df[index_column] = index_df
    ct = Table.from_pandas(ctx, df)

    df.set_index(index_column, inplace=True, drop=True)
    ct.set_index(index_column, indexing_type=IndexingType.LINEAR, drop=True)

    # perf_counter is monotonic, so the intervals are immune to wall-clock
    # adjustments that can affect time.time().
    started = time.perf_counter()
    df_isna = df.dropna(axis=1)
    pandas_time = time.perf_counter() - started

    # NOTE(review): pandas is called with axis=1 and PyCylon with axis=0;
    # presumably the two libraries number the axes differently -- confirm.
    started = time.perf_counter()
    ct_isna = ct.dropna(axis=0)
    cylon_time = time.perf_counter() - started

    # pd.eval looks up `df` by name in this function's scope; keep the name.
    started = time.perf_counter()
    pd.eval('df.dropna(axis=1)')
    pandas_eval_time = time.perf_counter() - started

    print(df_isna.shape, ct_isna.shape)
    return pandas_time, cylon_time, pandas_eval_time
def test_i_bitwise_ops():
    """Check in-place ``&=`` / ``|=`` on PyCylon filters against pandas.

    For each operator the in-place result must equal the out-of-place result
    on the PyCylon side and, once flattened, the in-place pandas result.
    """
    # TODO: Improve test and functionality: https://github.com/cylondata/cylon/issues/229
    values = np.array([[20, 2, 3, 4, 5], [10, -20, -30, -40, -50],
                       [36.2, 13.2, 16.4, 12.2, 10.8]])
    frame = DataFrame(values)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    # --- AND ---
    lhs = table['0'] > 10
    rhs = table['1'] > 2
    lhs_pd = frame[0] > 10
    rhs_pd = frame[1] > 2

    combined = lhs & rhs
    lhs &= rhs
    combined_pd = lhs_pd & rhs_pd
    lhs_pd &= rhs_pd

    assert combined.to_pandas().values.tolist() == lhs.to_pandas().values.tolist()
    assert lhs.to_pandas().values.flatten().tolist() == lhs_pd.values.tolist()

    # --- OR --- (operands are rebuilt because the AND step mutated them)
    lhs = table['0'] > 10
    rhs = table['1'] > 2
    lhs_pd = frame[0] > 10
    rhs_pd = frame[1] > 2

    combined = lhs | rhs
    lhs |= rhs
    combined_pd = lhs_pd | rhs_pd
    lhs_pd |= rhs_pd

    assert combined.to_pandas().values.tolist() == lhs.to_pandas().values.tolist()
    assert lhs.to_pandas().values.flatten().tolist() == lhs_pd.values.tolist()
Пример #11
0
def benchmark_map_numeric():
    """Print the time of an element-wise ``x + x`` map in PyCylon and pandas.

    Both containers are built from the same two-column Arrow table of ``N``
    random floats per column.
    """
    N = 10_000_000
    a_rand = np.random.random(size=N)
    b_rand = np.random.random(size=N)

    tb = pa.Table.from_arrays([pa.array(a_rand), pa.array(b_rand)],
                              ['c1', 'c2'])
    pdf: pd.DataFrame = tb.to_pandas()

    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cntb: Table = Table.from_arrow(ctx, tb)

    print(pdf.shape, cntb.shape)

    def map_func(x):
        return x + x

    # perf_counter is the monotonic clock intended for interval timing.
    t1 = time.perf_counter()
    cntb.applymap(map_func)
    t2 = time.perf_counter()
    pdf.applymap(map_func)
    t3 = time.perf_counter()

    print(f"Time for Cylon Apply Map {t2 - t1} s")
    print(f"Time for Pandas Apply Map {t3 - t2} s")
def test_math_ops_for_table_values():
    """Compare column-by-column and table-by-column arithmetic with pandas.

    Only addition is exercised at the moment.
    """
    frame = DataFrame({
        '0': [1, 2, 3, 4],
        '1': [5, 6, 7, 9],
        '2': [1., 2., 3., 4.],
    })
    ctx: CylonContext = CylonContext()
    table: Table = Table.from_pandas(ctx, frame)

    from operator import add, sub, mul, truediv

    # sub, mul and truediv are imported but currently left out of the run.
    for op in (add,):
        # column <op> column
        cylon_column = op(table['0'], table['0'])
        pandas_column = op(frame['0'], frame['0'])

        # Series.values is one-dimensional while the PyCylon result converts
        # to a two-dimensional frame, so the latter is flattened first.
        expected = pandas_column.values.tolist()
        assert expected == cylon_column.to_pandas().values.flatten().tolist()

        # table <op> column; pandas needs the method named after the operator
        # so that axis=0 can be passed.
        cylon_table = op(table, table['0'])
        pandas_table = getattr(frame, op.__name__)(frame['0'], axis=0)

        assert pandas_table.values.tolist() == cylon_table.to_pandas().values.tolist()
def test_setitem():
    """Exercise item assignment on a table with column and scalar values."""
    values = np.array([[1, 2, 3, 4, 5], [-1, -2, -3, -4, -5]])
    frame = DataFrame(values)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    # '0' replaces an existing column, '5' adds a new column at the end.
    for target in ('0', '5'):
        table[target] = table['4']
        assigned = table[target].to_pandas().values.tolist()
        assert assigned == table['4'].to_pandas().values.tolist()

    # The int creates column '6'; the float and the string each replace it.
    for scalar in (1, 1.0, 'aaa'):
        table['6'] = scalar
        assert np.array_equal(table['6'].to_pandas().values.flatten(),
                              np.full(table.row_count, scalar))
Пример #14
0
def indexing_op(num_rows: int, num_cols: int, unique_factor: float):
    """Time ``set_index`` and a list-based ``.loc`` lookup in both libraries.

    The first column becomes the index; the lookup uses the first half of
    that column's values as labels.

    Args:
        num_rows: forwarded to ``get_dataframe``.
        num_cols: forwarded to ``get_dataframe``.
        unique_factor: forwarded to ``get_dataframe``.

    Returns:
        ``(pandas_filter_time, cylon_filter_time, pdf_indexing_time,
        cylon_indexing_time)`` as elapsed seconds.
    """
    from pycylon.indexing.index import IndexingType
    ctx: cn.CylonContext = cn.CylonContext(config=None, distributed=False)
    pdf = get_dataframe(num_rows=num_rows,
                        num_cols=num_cols,
                        unique_factor=unique_factor)
    filter_column = pdf.columns[0]
    # Must be captured before set_index drops the column from the frame.
    filter_values = pdf[filter_column].values.tolist()[:pdf.shape[0] // 2]
    tb = Table.from_pandas(ctx, pdf)

    # perf_counter is the monotonic clock intended for interval timing.
    started = time.perf_counter()
    tb.set_index(filter_column, indexing_type=IndexingType.LINEAR, drop=True)
    cylon_indexing_time = time.perf_counter() - started

    started = time.perf_counter()
    pdf.set_index(filter_column, drop=True, inplace=True)
    pdf_indexing_time = time.perf_counter() - started

    started = time.perf_counter()
    tb_filter = tb.loc[filter_values]
    cylon_filter_time = time.perf_counter() - started

    started = time.perf_counter()
    pdf_filtered = pdf.loc[filter_values]
    pandas_filter_time = time.perf_counter() - started

    print(tb_filter.shape, pdf_filtered.shape)

    return pandas_filter_time, cylon_filter_time, pdf_indexing_time, cylon_indexing_time
Пример #15
0
def test_reset_index():
    """Check the index type and column names around ``reset_index``.

    Column ``a`` is made a LINEAR index (and dropped); after ``reset_index``
    the columns must be ``['index', 'b']`` and the index type RANGE.
    """
    from pycylon.indexing.index import IndexingType

    # dtype takes the type itself (np.int64), not an instance (np.int64()).
    pdf_float = pd.DataFrame({'a': pd.Series([1, 4, 7, 10, 20, 23, 10], dtype=np.int64),
                              'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int')})
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf_float)
    indexing_type = IndexingType.LINEAR
    drop_index = True

    cn_tb.set_index('a', indexing_type, drop_index)

    assert cn_tb.get_index().get_type() == IndexingType.LINEAR

    rest_drop_index = False
    cn_tb.reset_index(rest_drop_index)

    assert cn_tb.column_names == ['index', 'b']

    assert cn_tb.get_index().get_type() == IndexingType.RANGE
def test_math_ops_for_scalar():
    """Compare scalar arithmetic on a column and on a whole table with pandas.

    The column update is written back in place, so each operator works on the
    data left behind by the previous one.
    """
    values = np.array([[20, 2, 3, 4, 5], [10, -20, -30, -40, -50],
                       [10.2, 13.2, 16.4, 12.2, 10.8]])
    frame = DataFrame(values)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    from operator import add, sub, mul, truediv

    for op in (add, sub, mul, truediv):
        # column <op> scalar, stored back into the same column
        table['0'] = op(table['0'], 2)
        frame[0] = op(frame[0], 2)

        assert frame.values.tolist() == table.to_pandas().values.tolist()

        # table <op> scalar, producing new objects
        table_result = op(table, 2)
        frame_result = op(frame, 2)

        assert frame_result.values.tolist() == table_result.to_pandas().values.tolist()
Пример #17
0
def test_loc_op_mode_3():
    """Compare ``.loc`` with string labels between PyCylon and pandas.

    Column ``a`` (strings) becomes a LINEAR index.  Closed, open-ended and
    full slices plus a label list with a full column slice are looked up in
    both libraries; the values and the resulting index must match.
    """
    from pycylon.indexing.index import IndexingType
    from pycylon.indexing.index_utils import IndexUtil

    frame = pd.DataFrame({
        'a': pd.Series(["1", "4", "7", "10", "20", "23", "11"]),
        'b': pd.Series([2, 5, 8, 11, 22, 25, 12], dtype='int'),
        'c': pd.Series([12, 15, 18, 111, 122, 125, 112], dtype='int'),
        'd': pd.Series([212, 215, 218, 211, 222, 225, 312], dtype='int'),
        'e': pd.Series([1121, 12151, 12181, 12111, 12221, 12251, 13121],
                       dtype='int'),
    })
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    print("Before Indexing")
    print(table)

    table.set_index('a', IndexingType.LINEAR, True)
    frame = frame.set_index('a')

    print("After Indexing")
    assert table.column_names == ['b', 'c', 'd', 'e']
    assert table.get_index().get_type() == IndexingType.LINEAR

    # Each entry is the object that the corresponding subscript syntax
    # produces: "7":"20", "7":, :"7", :, and (["7", "20"], :).
    selectors = [
        slice("7", "20"),
        slice("7", None),
        slice(None, "7"),
        slice(None),
        (["7", "20"], slice(None)),
    ]

    for position, selector in enumerate(selectors):
        cylon_part = table.loc[selector]
        pandas_part = frame.loc[selector]

        if position == 0:
            # Only the first case echoes both index arrays.
            print(cylon_part.get_index().get_index_array())
            print(pandas_part.index.values)

        assert pandas_part.values.tolist() == cylon_part.to_pandas().values.tolist()
        assert cylon_part.get_index().get_index_array() == pa.array(pandas_part.index)
def test_neg():
    """Check that unary minus on a table matches unary minus on the frame."""
    values = np.array([[1, 2, 3, 4, 5, -6, -7], [-1, -2, -3, -4, -5, 6, 7]])
    frame = DataFrame(values)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    negated_table: Table = -table
    negated_frame = -frame

    actual = negated_table.to_pandas().values.tolist()
    assert actual == negated_frame.values.tolist()
Пример #19
0
def fixed_filter_bench():
    """Print filter-construction and filter-application times per library.

    The condition ``key > 5`` is built and applied with PyCylon, pandas and
    NumPy.  The PyCylon mask is then converted to an Arrow array and applied
    column by column to the Arrow table; that filtering and the subsequent
    table construction are timed separately.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    num_rows = 10_000_000
    data = np.random.randn(num_rows)

    df = pd.DataFrame({f'data{i}': data for i in range(2)})

    np_key = np.random.randint(0, 100, size=num_rows)
    # Snapshot taken before 'key' is added, so it holds the data columns only.
    np_all = df.to_numpy()

    df['key'] = np_key

    t = pa.Table.from_pandas(df)
    ct = Table.from_pandas(ctx, df)

    print(ct.shape, df.shape)

    # perf_counter is the monotonic clock intended for interval timing.
    t1 = time.perf_counter()
    ct_filter = ct['key'] > 5
    t2 = time.perf_counter()
    df_filter = df['key'] > 5
    t3 = time.perf_counter()
    ct[ct_filter]
    t4 = time.perf_counter()
    df[df_filter]
    t5 = time.perf_counter()
    np_filter = np_key > 5
    t6 = time.perf_counter()
    np_all[np_filter]
    t7 = time.perf_counter()

    print(f"PyCylon filter time :  {t2 - t1} s")
    print(f"Pandas filter time: {t3 - t2} s")
    print(f"Numpy filter time: {t6 - t5} s")
    print(f"PyCylon assign time: {t4 - t3} s")
    print(f"Pandas assign time: {t5 - t4} s")
    print(f"Numpy assign time: {t7 - t6} s")

    # Mask conversion happens before the Arrow clock starts.
    artb_filter = ct_filter.to_arrow().combine_chunks()
    artb_array_filter = artb_filter.columns[0].chunks[0]

    t_ar_s = time.perf_counter()
    artb = t.combine_chunks()
    res = [chunk_arr.filter(artb_array_filter)
           for chunk_arr in artb.itercolumns()]
    t_ar_e = time.perf_counter()
    pa.Table.from_arrays(res, ct.column_names)
    t_ar_e_2 = time.perf_counter()
    print(f"PyArrow Filter Time : {t_ar_e - t_ar_s}")
    print(f"PyArrow Table Creation : {t_ar_e_2 - t_ar_e}")
Пример #20
0
def test_tb_to_pydict_with_index():
    """Check ``to_pydict(with_index=True)`` against ``DataFrame.to_dict()``."""
    frame = pd.DataFrame({
        'idx': ['x', 'y', 'z'],
        'col-1': ["a", "b", "c"],
        'col-2': [10, 20, 30],
        'col-3': ['Y', 'N', 'Y'],
    })
    table = Table.from_pandas(ctx, frame)

    actual = table.to_pydict(with_index=True)
    assert actual == frame.to_dict()
def test_add_suffix():
    """Check that ``add_suffix`` yields the same column names as pandas."""
    values = np.array([[20.2, 2.0, 3.2, 4.3, 5.5], [10, -20, -30, -40, -50],
                       [36.8, 13.2, 16.4, 12.2, 10.8]])
    frame = DataFrame(values)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_pandas(ctx, frame)

    suffix = "item_"
    suffixed_table = table.add_suffix(suffix)
    suffixed_frame = frame.add_suffix(suffix)

    expected_names = suffixed_frame.columns.tolist()
    assert expected_names == suffixed_table.column_names
Пример #22
0
def test_column_str_ops():
    """Strip ``'-'`` from column ``d`` in pandas and in PyCylon and print both.

    NOTE(review): the CSV is read from a hard-coded absolute path, so this
    only runs where that file exists.
    """
    file_path = '/home/vibhatha/data/cylon/none_data.csv'
    df = pd.read_csv(file_path, na_values='Nan', header=0)
    tb_cn = Table.from_pandas(ctx, df)
    print(df)

    # regex=False makes this a literal replacement on every pandas version,
    # matching the plain str.replace applied on the PyCylon side below.
    df['d'] = df['d'].str.replace('-', '', regex=False)
    print(df)
    tb_cn['d'] = tb_cn['d'].applymap(lambda x: x.replace('-', ''))

    print(tb_cn)
def test_string_type_filters():
    """Compare string comparison filters on column ``A`` with pandas.

    Each of the six rich-comparison operators is applied against the value
    ``"a"``; both the boolean filter and the filtered rows must match.
    """
    ctx: CylonContext = CylonContext()

    tb: Table = Table.from_pydict(ctx, {
        "A": ['a', 'b', 'c', 'ab', 'a'],
        "B": [1, 2, 3, 4, 5]
    })
    pdf = tb.to_pandas()

    def generate_filter_and_result(op, column: str, data, comparison_value):
        """Return ``(mask, data[mask])`` for ``op`` on a column or on all data.

        A falsy ``column`` means the operator is applied to ``data`` itself.
        """
        operand = data[column] if column else data
        mask = op(operand, comparison_value)
        return mask, data[mask]

    def do_comparison_on_pdf_and_tb(tb_filter: Table, tb_result: Table,
                                    pdf_filter: DataFrame,
                                    pdf_result: DataFrame, is_full_table):
        """Assert that the PyCylon filter/result pair equals the pandas pair.

        For a full-table filter missing values in the results are replaced
        with 0 first; for a column filter the PyCylon mask is flattened.
        """
        if is_full_table:
            assert tb_filter.to_pandas().values.tolist(
            ) == pdf_filter.values.tolist()
            assert tb_result.to_pandas().fillna(
                0).values.tolist() == pdf_result.fillna(0).values.tolist()
        else:
            assert tb_filter.to_pandas().values.flatten().tolist(
            ) == pdf_filter.values.tolist()
            assert tb_result.to_pandas().values.tolist(
            ) == pdf_result.values.tolist()

    ops = [
        operator.eq, operator.ne, operator.lt, operator.gt,
        operator.le, operator.ge
    ]
    value = "a"
    columns = ["A"]
    is_full_table_flags = [False]

    for column, is_full_table in zip(columns, is_full_table_flags):
        for op in ops:
            tb_filter_all, tb_filter_all_result = generate_filter_and_result(
                op, column, tb, value)

            pdf_filter_all, pdf_filter_all_result = generate_filter_and_result(
                op, column, pdf, value)

            do_comparison_on_pdf_and_tb(tb_filter=tb_filter_all,
                                        tb_result=tb_filter_all_result,
                                        pdf_filter=pdf_filter_all,
                                        pdf_result=pdf_filter_all_result,
                                        is_full_table=is_full_table)
Пример #24
0
def tb_creation_op(num_rows: int, num_cols: int, duplication_factor: float):
    """Time table construction from pandas, lists, NumPy and Arrow inputs.

    Args:
        num_rows: row count of every input.
        num_cols: column count of every input.
        duplication_factor: passed to ``get_dataframe`` as ``unique_factor``.

    Returns:
        ``(t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow)`` as
        elapsed seconds.  ``t_arrow`` is the construction of the
        ``pyarrow.Table`` itself; ``t_cylon_from_arrow`` is the conversion of
        that table into a PyCylon table.
    """
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")
    pdf = get_dataframe(num_rows=num_rows,
                        num_cols=num_cols,
                        unique_factor=duplication_factor)

    # One constant-valued column, in three representations; every column of
    # an input shares the same underlying object.
    data_row = [np.random.randint(num_rows)] * num_rows
    data_row_ar = np.array(data_row)
    data_row_pa_ar = pa.array(data_row)

    data_set = [data_row] * num_cols
    data_set_ar = [data_row_ar] * num_cols
    data_set_pa_ar = [data_row_pa_ar] * num_cols

    column_names = ["data_" + str(i) for i in range(num_cols)]

    # perf_counter is the monotonic clock intended for interval timing.
    started = time.perf_counter()
    Table.from_pandas(ctx, pdf)
    t_pandas = time.perf_counter() - started

    started = time.perf_counter()
    Table.from_list(ctx, column_names, data_set)
    t_list = time.perf_counter() - started

    started = time.perf_counter()
    Table.from_numpy(ctx, column_names, data_set_ar)
    t_numpy = time.perf_counter() - started

    started = time.perf_counter()
    arrow_table = pa.Table.from_arrays(data_set_pa_ar, column_names)
    t_arrow = time.perf_counter() - started

    started = time.perf_counter()
    Table.from_arrow(ctx, arrow_table)
    t_cylon_from_arrow = time.perf_counter() - started

    return t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow
def test_fillna():
    """Check that ``fillna`` replaces every ``None`` with the fill value."""
    col_names = ['col1', 'col2']
    data_list_numeric = [[1, 2, None, 4, 5], [6, 7, 8, 9, None]]
    fill_value = 0
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb_numeric = Table.from_list(ctx, col_names, data_list_numeric)

    cn_tb_numeric_fillna = cn_tb_numeric.fillna(fill_value)

    # Each input column holds exactly one None, so each output column must
    # contain the fill value and no None.
    for col in cn_tb_numeric_fillna.to_pydict().values():
        assert None not in col
        assert fill_value in col
Пример #26
0
def test_uno_data_load():
    """Load a tab-separated file, enumerate its ``DRUG`` column and save it.

    Each distinct drug is mapped to its position in the table's unique
    ``DRUG`` values; the ``DRUG`` column is replaced by those integers and
    the result is written as a tab-separated file.

    NOTE(review): input and output locations are hard-coded absolute paths.
    """
    file_path = "/home/vibhatha/sandbox/UNO/Benchmarks/Data/Pilot1/"
    file_name = "combined_single_response_agg"
    save_file = "/tmp/combined_single_response_agg_enum"
    path = os.path.join(file_path, file_name)
    csv_read_options = CSVReadOptions().use_threads(True).block_size(
        1 << 30).with_delimiter("\t")
    t1 = time.time()
    tb = read_csv(ctx, path, csv_read_options)
    t2 = time.time()

    print(t2 - t1)

    print(tb.shape)

    print(tb.to_arrow())

    print(tb.column_names)

    tb_drugs = tb['DRUG']

    tb_drug = tb.unique(columns=['DRUG'], keep='first')['DRUG']

    tb_drug_ar_tb = tb_drug.to_arrow().combine_chunks()
    tb_drug_list = tb_drug_ar_tb.column(0).chunk(0).tolist()

    tb_drugs_ar_tb = tb_drugs.to_arrow().combine_chunks()
    tb_drugs_list = tb_drugs_ar_tb.column(0).chunk(0).tolist()

    # Drug name -> position among the unique values.
    tb_drug_list_dict = {drug: index
                         for index, drug in enumerate(tb_drug_list)}

    # Every row's drug name translated to its integer code.
    tb_drugs_enum_list = [tb_drug_list_dict[drug] for drug in tb_drugs_list]

    tb_enum_drug = Table.from_list(ctx, ['DRUG'], [tb_drugs_enum_list])

    print(tb_enum_drug.shape, tb_drugs.shape)
    tb = tb.drop(['DRUG'])

    tb['DRUG'] = tb_enum_drug

    print(tb.to_arrow())

    pdf = tb.to_pandas()

    pdf.to_csv(save_file, sep="\t")
Пример #27
0
def benchmark_table_conversion():
    """Print conversion-to-NumPy and ``x + 1`` map times for several paths.

    Timed steps: table/frame to NumPy, the map applied to the NumPy array
    directly, ``applymap`` in pandas and in PyCylon, and finally splitting
    the mapped array into a list of per-column arrays.
    """
    N = 10_000
    a_rand = np.random.random(size=N)

    num_cols = 10_000
    # Every column shares the same random array.
    data = [a_rand] * num_cols
    col_names = ['c_' + str(i) for i in range(num_cols)]

    tb = pa.Table.from_arrays(data, col_names)
    pdf: pd.DataFrame = tb.to_pandas()

    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cntb: Table = Table.from_arrow(ctx, tb)

    print(pdf.shape, cntb.shape)

    # perf_counter is the monotonic clock intended for interval timing.
    t1 = time.perf_counter()
    cntb.to_numpy()
    t2 = time.perf_counter()
    npr_df = pdf.to_numpy()
    t3 = time.perf_counter()

    def map_func(x):
        return x + 1

    t4 = time.perf_counter()
    npr_map = map_func(npr_df)
    t5 = time.perf_counter()
    pdf.applymap(map_func)
    t6 = time.perf_counter()
    cntb.applymap(map_func)
    t7 = time.perf_counter()

    print(f"PyCylon Time to table to Numpy {t2 - t1} s")
    print(f"Pandas Time to table to Numpy {t3 - t2} s")

    print(f"Numpy virtual map time : {t5-t4} s")
    print(f"Pandas map time : {t6 - t5} s")
    print(f"PyCylon map time : {t7 - t6} s")

    print(npr_map.shape)
    tx = time.perf_counter()
    # The loop runs over the column count, so take column i with [:, i];
    # [:][i] would select row i and only works while the array is square.
    lst = [npr_map[:, i] for i in range(npr_map.shape[1])]
    ty = time.perf_counter()
    print(ty - tx)
Пример #28
0
def test_default_indexing():
    """Check that a fresh table's index values equal the frame's index."""
    frame = pd.DataFrame({
        'idx': ['x', 'y', 'z'],
        'col-1': ["a", "b", "c"],
        'col-2': [10, 20, 30],
        'col-3': ['Y', 'N', 'Y'],
    })
    table = Table.from_pandas(ctx, frame)

    table_labels = table.index.index_values
    frame_labels = frame.index.values.tolist()

    assert table_labels == frame_labels
Пример #29
0
def test_non_numeric_applymap():
    """Check ``applymap`` with a string-prefixing function against pandas."""
    first_names = pa.array(['Rayan', 'Reynolds', 'Jack', 'Mat'])
    last_names = pa.array(['Cameron', 'Selena', 'Roger', 'Murphy'])

    arrow_table: pa.Table = pa.Table.from_arrays([first_names, last_names],
                                                 ['c1', 'c2'])
    frame: pd.DataFrame = arrow_table.to_pandas()
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table: Table = Table.from_arrow(ctx, arrow_table)

    def greet(x):
        return "Hello, " + x

    mapped_table = table.applymap(greet)
    mapped_frame = frame.applymap(greet)

    assert mapped_table.to_pandas().values.tolist() == mapped_frame.values.tolist()
def test_invert():
    """Check that ``~table`` matches ``~frame`` for boolean data."""
    bool_row = [False, True, False, True, True]
    frame = DataFrame([list(bool_row), list(bool_row)])
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    table = Table.from_pandas(ctx, frame)

    inverted_table = ~table
    inverted_frame = ~frame

    actual = inverted_table.to_pandas().values.tolist()
    assert actual == inverted_frame.values.tolist()