def null_handling_op(num_rows: int, num_cols: int, unique_factor: float):
    """Time ``dropna`` on a pandas DataFrame, a Cylon Table and via ``pd.eval``.

    A frame is requested from ``get_dataframe`` with ``with_null=True``, a
    second single-column frame (``with_null=False``) is attached to it as
    ``'index_col'``, and that column is then set as the index of both the
    pandas frame and the Cylon table built from it.

    Args:
        num_rows: Row count forwarded to ``get_dataframe``.
        num_cols: Column count forwarded to ``get_dataframe`` for the data frame.
        unique_factor: Forwarded to ``get_dataframe`` for the data frame.

    Returns:
        ``(pandas_time, cylon_time, pandas_eval_time)`` as wall-clock seconds
        measured with ``time.time()``.
    """
    ctx = CylonContext(config=None, distributed=False)
    index_column = 'index_col'

    # NOTE: the local name ``df`` must be kept; the pd.eval expression string
    # further down refers to it by name.
    df = get_dataframe(num_rows=num_rows,
                       num_cols=num_cols,
                       unique_factor=unique_factor,
                       with_null=True)
    df[index_column] = get_dataframe(num_rows=num_rows,
                                     num_cols=1,
                                     unique_factor=1.0,
                                     with_null=False)

    # The Cylon table is created before the index column is moved into the
    # pandas index, then both sides get the same column as their index.
    ct = Table.from_pandas(ctx, df)
    df.set_index(index_column, inplace=True, drop=True)
    ct.set_index(index_column, indexing_type=IndexingType.LINEAR, drop=True)

    # NOTE(review): pandas is called with axis=1 and Cylon with axis=0 --
    # presumably the two libraries number the axes differently; confirm.
    started = time.time()
    pandas_result = df.dropna(axis=1)
    pandas_time = time.time() - started

    started = time.time()
    cylon_result = ct.dropna(axis=0)
    cylon_time = time.time() - started

    started = time.time()
    pd.eval('df.dropna(axis=1)')
    pandas_eval_time = time.time() - started

    print(pandas_result.shape, cylon_result.shape)
    return pandas_time, cylon_time, pandas_eval_time
# Example #2
# 0
def math_op(num_rows: int, num_cols: int, unique_factor: float, op=add):
    """Time a binary operator applied to a Cylon Table and a pandas DataFrame.

    The scalar operand is one value picked at a random position from the
    first column of the generated frame.

    Args:
        num_rows: Row count forwarded to ``get_dataframe``.
        num_cols: Column count forwarded to ``get_dataframe``.
        unique_factor: Forwarded to ``get_dataframe``.
        op: Two-argument callable invoked as ``op(table_or_frame, scalar)``.

    Returns:
        ``(pandas_math_op_time, pandas_eval_math_op_time, cylon_math_op_time)``
        as wall-clock seconds. Note the Cylon timing comes last.
    """
    ctx = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")

    # NOTE: ``pdf``, ``math_value`` and ``op`` are referenced by name inside
    # the pd.eval expression string below, so these names must be kept.
    pdf = get_dataframe(num_rows=num_rows,
                        num_cols=num_cols,
                        unique_factor=unique_factor)
    first_column = pdf.columns[0]
    first_column_data = pdf[first_column]
    picked_row = np.random.randint(low=0, high=pdf.shape[0])
    math_value = first_column_data.values[picked_row]
    tb = Table.from_pandas(ctx, pdf)

    # Cylon is measured first, then plain pandas, then pandas through eval.
    started = time.time()
    cylon_result = op(tb, math_value)
    cylon_math_op_time = time.time() - started

    started = time.time()
    pandas_result = op(pdf, math_value)
    pandas_math_op_time = time.time() - started

    started = time.time()
    pandas_result = pd.eval("op(pdf, math_value)")
    pandas_eval_math_op_time = time.time() - started

    return pandas_math_op_time, pandas_eval_math_op_time, cylon_math_op_time
def indexing_op(num_rows: int, num_cols: int, unique_factor: float):
    """Time ``set_index`` and ``.loc`` lookups on Cylon versus pandas.

    The first column of the generated frame becomes the index on both sides.
    The lookup keys are the first half (``rows // 2``) of that column's values,
    captured before the column is moved into the index.

    Args:
        num_rows: Row count forwarded to ``get_dataframe``.
        num_cols: Column count forwarded to ``get_dataframe``.
        unique_factor: Forwarded to ``get_dataframe``.

    Returns:
        ``(pandas_filter_time, cylon_filter_time, pdf_indexing_time,
        cylon_indexing_time)`` as wall-clock seconds.
    """
    from pycylon.indexing.index import IndexingType

    ctx = cn.CylonContext(config=None, distributed=False)
    pdf = get_dataframe(num_rows=num_rows,
                        num_cols=num_cols,
                        unique_factor=unique_factor)

    index_name = pdf.columns[0]
    index_values = pdf[index_name]
    row_count = pdf.shape[0]
    # A single random value is still drawn although it is not used afterwards;
    # keeping the draw leaves the NumPy random stream untouched.
    random_position = np.random.randint(low=0, high=row_count)
    filter_value = index_values.values[random_position]
    filter_values = index_values.values.tolist()[:row_count // 2]

    tb = Table.from_pandas(ctx, pdf)

    # Index construction: Cylon first, then pandas (in place).
    tick = time.time()
    tb.set_index(index_name, indexing_type=IndexingType.LINEAR, drop=True)
    cylon_indexing_time = time.time() - tick

    tick = time.time()
    pdf.set_index(index_name, drop=True, inplace=True)
    pdf_indexing_time = time.time() - tick

    # Label-based lookup with the same key list on both sides.
    tick = time.time()
    cylon_selected = tb.loc[filter_values]
    cylon_filter_time = time.time() - tick

    tick = time.time()
    pandas_selected = pdf.loc[filter_values]
    pandas_filter_time = time.time() - tick

    print(cylon_selected.shape, pandas_selected.shape)

    return pandas_filter_time, cylon_filter_time, pdf_indexing_time, cylon_indexing_time
# Example #4
# 0
def join_op(num_rows: int, num_cols: int, algorithm: str,
            unique_factor: float):
    """Time an inner join in Cylon, in pandas, and in pandas via ``pd.eval``.

    Two frames are generated with ``get_dataframe`` and written to CSV so the
    Cylon tables can be loaded with ``read_csv`` instead of from the in-memory
    frames. The CSV files live in a private temporary directory, so parallel
    benchmark runs cannot overwrite each other's inputs and nothing is left
    behind on disk afterwards.

    Args:
        num_rows: Row count forwarded to ``get_dataframe`` for both frames.
        num_cols: Column count forwarded to ``get_dataframe`` for both frames.
        algorithm: Passed to the Cylon ``Table.join`` call as ``algorithm``.
        unique_factor: Forwarded to ``get_dataframe`` for both frames.

    Returns:
        ``(pandas_time, cylon_time, pandas_eval_time)`` as wall-clock seconds.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    ctx: CylonContext = CylonContext(config=None, distributed=False)

    # NOTE: ``pdf_left``, ``pdf_right`` and ``join_col`` are referenced by
    # name inside the pd.eval expression string below; keep these names.
    pdf_left = get_dataframe(num_rows=num_rows,
                             num_cols=num_cols,
                             unique_factor=unique_factor,
                             stringify=False)
    pdf_right = get_dataframe(num_rows=num_rows,
                              num_cols=num_cols,
                              unique_factor=unique_factor,
                              stringify=False)

    csv_read_options = CSVReadOptions() \
        .use_threads(True) \
        .block_size(1 << 30)

    # NOTE: sort join breaks when loaded data in-memory via Pandas dataframe
    # The directory is kept alive until the Cylon join has finished, in case
    # the loaded tables still reference the files on disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        left_path = os.path.join(tmp_dir, "left_table.csv")
        right_path = os.path.join(tmp_dir, "right_table.csv")
        pdf_left.to_csv(left_path, index=False)
        pdf_right.to_csv(right_path, index=False)

        tb_left = read_csv(ctx, left_path, csv_read_options)
        tb_right = read_csv(ctx, right_path, csv_read_options)
        join_col = tb_left.column_names[0]

        cylon_time = time.time()
        tb2 = tb_left.join(tb_right,
                           join_type='inner',
                           algorithm=algorithm,
                           on=[join_col])
        cylon_time = time.time() - cylon_time

    pandas_time = time.time()
    pdf2 = pdf_left.join(pdf_right,
                         how="inner",
                         on=join_col,
                         lsuffix="_l",
                         rsuffix="_r")
    pandas_time = time.time() - pandas_time

    pandas_eval_time = time.time()
    pdf2 = pd.eval(
        "pdf_left.join(pdf_right, how='inner', on=join_col, lsuffix='_l', rsuffix='_r')"
    )
    pandas_eval_time = time.time() - pandas_eval_time

    ctx.finalize()

    return pandas_time, cylon_time, pandas_eval_time
def duplicate_op(num_rows: int, num_cols: int, filter_size: int, unique_factor: float):
    """Time duplicate removal in Cylon, in pandas, and in pandas via ``pd.eval``.

    Duplicates are judged on the first ``filter_size`` column names of the
    Cylon table built from the generated frame.

    Args:
        num_rows: Row count forwarded to ``get_dataframe``.
        num_cols: Column count forwarded to ``get_dataframe``.
        filter_size: Number of leading columns used as the duplicate key.
        unique_factor: Forwarded to ``get_dataframe``.

    Returns:
        ``(pandas_time, cylon_time, pandas_eval_time)`` as wall-clock seconds.
    """
    ctx = CylonContext(config=None, distributed=False)

    # NOTE: ``pdf`` and ``filter_columns`` are referenced by name inside the
    # pd.eval expression string below, so these names must be kept.
    pdf = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor)
    tb = Table.from_pandas(ctx, pdf)
    filter_columns = tb.column_names[:filter_size]

    begin = time.time()
    cylon_unique = tb.unique(columns=filter_columns)
    cylon_time = time.time() - begin

    begin = time.time()
    pandas_unique = pdf.drop_duplicates(subset=filter_columns)
    pandas_time = time.time() - begin

    begin = time.time()
    pandas_unique = pd.eval("pdf.drop_duplicates(subset=filter_columns)")
    pandas_eval_time = time.time() - begin

    return pandas_time, cylon_time, pandas_eval_time
# Example #6
# 0
def tb_creation_op(num_rows: int, num_cols: int, duplication_factor: float):
    """Time table construction from several kinds of input.

    Measured, in this order: Cylon ``Table.from_pandas``, ``Table.from_list``,
    ``Table.from_numpy``, ``pa.Table.from_arrays`` and Cylon
    ``Table.from_arrow`` (fed with the Arrow table from the previous step).

    The list, NumPy and Arrow inputs all hold ``num_cols`` references to one
    column made of a single random integer repeated ``num_rows`` times.

    Args:
        num_rows: Row count for the generated frame and the synthetic column.
        num_cols: Number of columns in every input.
        duplication_factor: Forwarded to ``get_dataframe`` as ``unique_factor``.

    Returns:
        ``(t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow)`` as
        wall-clock seconds. Note that numpy precedes list in the result even
        though list is measured first.
    """
    ctx = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "numpy")
    pdf = get_dataframe(num_rows=num_rows,
                        num_cols=num_cols,
                        unique_factor=duplication_factor)

    # One column in three representations: Python list, NumPy array, Arrow array.
    repeated_value = np.random.randint(num_rows)
    column_as_list = [repeated_value] * num_rows
    column_as_numpy = np.array(column_as_list)
    column_as_arrow = pa.array(column_as_list)

    # Every dataset reuses the same column object num_cols times.
    list_columns = [column_as_list] * num_cols
    numpy_columns = [column_as_numpy] * num_cols
    arrow_columns = [column_as_arrow] * num_cols

    column_names = [f"data_{idx}" for idx in range(num_cols)]

    t0 = time.time()
    from_pandas_tb = Table.from_pandas(ctx, pdf)
    t_pandas = time.time() - t0

    t0 = time.time()
    from_list_tb = Table.from_list(ctx, column_names, list_columns)
    t_list = time.time() - t0

    t0 = time.time()
    from_numpy_tb = Table.from_numpy(ctx, column_names, numpy_columns)
    t_numpy = time.time() - t0

    t0 = time.time()
    arrow_tb = pa.Table.from_arrays(arrow_columns, column_names)
    t_arrow = time.time() - t0

    t0 = time.time()
    from_arrow_tb = Table.from_arrow(ctx, arrow_tb)
    t_cylon_from_arrow = time.time() - t0

    return t_pandas, t_numpy, t_list, t_arrow, t_cylon_from_arrow
# Example #7
# 0
def isin_op(num_rows: int, num_cols: int, filter_size: int, unique_factor: float):
    """Time ``isin`` in pandas, in Cylon, and in pandas via ``pd.eval``.

    The comparison set is a list of ``filter_size`` values drawn with
    ``np.random.randn``.

    Args:
        num_rows: Row count forwarded to ``get_dataframe``.
        num_cols: Column count forwarded to ``get_dataframe``.
        filter_size: Number of values in the comparison list.
        unique_factor: Forwarded to ``get_dataframe``.

    Returns:
        ``(pandas_time, cylon_time, pandas_eval_time)`` as wall-clock seconds.
    """
    ctx = CylonContext(config=None, distributed=False)
    ctx.add_config("compute_engine", "arrow")

    # NOTE: ``df`` and ``cmp_data`` are referenced by name inside the pd.eval
    # expression string below, so these names must be kept.
    df = get_dataframe(num_rows=num_rows, num_cols=num_cols, unique_factor=unique_factor)
    ct = Table.from_pandas(ctx, df)

    cmp_data = np.random.randn(filter_size).tolist()

    clock = time.time()
    df.isin(cmp_data)
    pandas_time = time.time() - clock

    clock = time.time()
    ct.isin(cmp_data)
    cylon_time = time.time() - clock

    clock = time.time()
    pd.eval('df.isin(cmp_data)')
    pandas_eval_time = time.time() - clock

    return pandas_time, cylon_time, pandas_eval_time