Example #1
def create_joined_df(perf_table):
    delinquency_12_expr = (
        ibis.case()
        .when(
            perf_table["current_loan_delinquency_status"].notnull(),
            perf_table["current_loan_delinquency_status"],
        )
        .else_(-1)
        .end()
    )
    upb_12_expr = (
        ibis.case()
        .when(
            perf_table["current_actual_upb"].notnull(),
            perf_table["current_actual_upb"],
        )
        .else_(999999999)
        .end()
    )
    joined_df = perf_table[
        "loan_id",
        perf_table["monthly_reporting_period"].month().name("timestamp_month").cast("int32"),
        perf_table["monthly_reporting_period"].year().name("timestamp_year").cast("int32"),
        delinquency_12_expr.name("delinquency_12"),
        upb_12_expr.name("upb_12"),
    ]
    return joined_df
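Both delinquency_12_expr and upb_12_expr above follow the same fill-null-with-sentinel idiom: keep the column where it is not null, otherwise substitute a constant. A minimal standalone sketch of that idiom, using an illustrative table name and only the public ibis.case() API:

import ibis

# hypothetical one-column table; in the benchmark this is the performance table
perf = ibis.table([("current_actual_upb", "float64")], name="perf")

# keep the value where present, otherwise fall back to a sentinel
upb_12 = (
    ibis.case()
    .when(perf.current_actual_upb.notnull(), perf.current_actual_upb)
    .else_(999999999)
    .end()
    .name("upb_12")
)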
Example #2
def test_case_where(backend, alltypes, df):
    table = alltypes
    table = table.mutate(
        new_col=(
            ibis.case()
            .when(table['int_col'] == 1, 20)
            .when(table['int_col'] == 0, 10)
            .else_(0)
            .end()
            .cast('int64')
        )
    )

    result = table.execute()

    expected = df.copy()
    mask_0 = expected['int_col'] == 1
    mask_1 = expected['int_col'] == 0

    expected['new_col'] = 0
    expected.loc[mask_0, 'new_col'] = 20
    expected.loc[mask_1, 'new_col'] = 10
    expected['new_col'] = expected['new_col'].astype('int64')  # match the explicit int64 cast in the ibis expression

    backend.assert_frame_equal(result, expected)
Example #3
def test_multiple_case_null_else(table):
    expr = ibis.case().when(table.g == "foo", "bar").end()
    op = expr.op()

    assert isinstance(expr, ir.StringColumn)
    assert isinstance(op.default, ir.ValueExpr)
    assert isinstance(op.default.op(), ops.NullLiteral)
Example #4
def test_case_in_projection(alltypes):
    t = alltypes

    expr = (
        t.g.case().when('foo', 'bar').when('baz', 'qux').else_('default').end()
    )

    expr2 = ibis.case().when(t.g == 'foo', 'bar').when(t.g == 'baz', t.g).end()

    proj = t[expr.name('col1'), expr2.name('col2'), t]

    result = Compiler.to_sql(proj)
    expected = """\
SELECT
  CASE `g`
    WHEN 'foo' THEN 'bar'
    WHEN 'baz' THEN 'qux'
    ELSE 'default'
  END AS `col1`,
  CASE
    WHEN `g` = 'foo' THEN 'bar'
    WHEN `g` = 'baz' THEN `g`
    ELSE CAST(NULL AS string)
  END AS `col2`, *
FROM alltypes"""
    assert result == expected
Example #5
def test_select_filter_mutate(backend, alltypes, df):
    """Test that select, filter and mutate are executed in right order.

    Before Pr 2635, try_fusion in analysis.py would fuse these operations
    together in a way that the order of the operations were wrong. (mutate
    was executed before filter).
    """
    t = alltypes

    # Prepare the float_col so that filter must execute
    # before the cast to get the correct result.
    t = t.mutate(
        float_col=ibis.case()
        .when(t['bool_col'], t['float_col'])
        .else_(np.nan)
        .end()
    )

    # Actual test
    t = t[t.columns]
    t = t[~t['float_col'].isnan()]
    t = t.mutate(float_col=t['float_col'].cast('int32'))
    result = t.execute()

    expected = df.copy()
    expected.loc[~df['bool_col'], 'float_col'] = None
    expected = expected[~expected['float_col'].isna()]
    expected = expected.assign(float_col=expected['float_col'].astype('int32'))

    backend.assert_frame_equal(result, expected)
Example #6
    def test_multiple_case_null_else(self):
        expr = ibis.case().when(self.table.g == "foo", "bar").end()
        op = expr.op()

        assert isinstance(expr, ir.StringArray)
        assert isinstance(op.default, ir.ValueExpr)
        assert isinstance(op.default.op(), ir.NullLiteral)
Example #7
    def ifelse(
        self,
        true_expr: ir.Value,
        false_expr: ir.Value,
    ) -> ir.Value:
        """Construct a ternary conditional expression.

        Parameters
        ----------
        true_expr
            Expression to return if `self` evaluates to `True`
        false_expr
            Expression to return if `self` evaluates to `False`

        Returns
        -------
        Value
            The value of `true_expr` if `arg` is `True` else `false_expr`

        Examples
        --------
        >>> import ibis
        >>> t = ibis.table([("is_person", "boolean")], name="t")
        >>> expr = t.is_person.ifelse("yes", "no")
        >>> print(ibis.impala.compile(expr))
        SELECT CASE WHEN `is_person` THEN 'yes' ELSE 'no' END AS `tmp`
        FROM t
        """
        import ibis

        # Result will be the result of promotion of true/false exprs. These
        # might be conflicting types; same type resolution as case expressions
        # must be used.
        return ibis.case().when(self, true_expr).else_(false_expr).end()
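As the implementation above shows, ifelse is just sugar over a two-branch searched case. The following sketch (illustrative table name) builds the same expression both ways:

import ibis

t = ibis.table([("is_person", "boolean")], name="t")

via_ifelse = t.is_person.ifelse("yes", "no")
via_case = ibis.case().when(t.is_person, "yes").else_("no").end()
# both should compile to: CASE WHEN `is_person` THEN 'yes' ELSE 'no' END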
Example #8
def _extract_quarter(t, expr):
    (arg,) = expr.op().args

    expr_new = ops.ExtractMonth(arg).to_expr()
    expr_new = (
        ibis.case()
        .when(expr_new.isin([1, 2, 3]), 1)
        .when(expr_new.isin([4, 5, 6]), 2)
        .when(expr_new.isin([7, 8, 9]), 3)
        .else_(4)
        .end()
    )
    return sa.cast(t.translate(expr_new), sa.Integer)
Example #9
def test_searched_case_column(batting, batting_df):
    t = batting
    df = batting_df
    expr = (ibis.case().when(t.RBI < 5, 'really bad team').when(
        t.teamID == 'PH1', 'ph1 team').else_(t.teamID).end())
    result = expr.execute()
    expected = pd.Series(
        np.select([df.RBI < 5, df.teamID == 'PH1'],
                  ['really bad team', 'ph1 team'], df.teamID))
    tm.assert_series_equal(result, expected)
Example #10
    def test_search_case(self):
        expr = ibis.case().when(self.table.f > 0, self.table.d * 2).when(self.table.c < 0, self.table.a * 2).end()

        result = self._translate(expr)
        expected = """CASE
  WHEN f > 0 THEN d * 2
  WHEN c < 0 THEN a * 2
  ELSE NULL
END"""
        assert result == expected
Example #11
    def test_search_case(self):
        expr = (ibis.case().when(self.table.f > 0, self.table.d * 2).when(
            self.table.c < 0, self.table.a * 2).end())

        result = self._translate(expr)
        expected = """CASE
  WHEN f > 0 THEN d * 2
  WHEN c < 0 THEN a * 2
  ELSE NULL
END"""
        assert result == expected
Example #12
    def _get_hist_disease_expr(
        cls,
        disease: str,
        geocodes: List[int],
        year_week_start: int,
        year_week_end: int,
    ) -> ibis.expr.types.Expr:
        """
        Return an ibis expression with the history for a given disease.

        Parameters
        ----------
        disease : str, {'dengue', 'chik', 'zika'}
        geocodes : List[int]
        year_week_start : int
            The starting Year/Week, e.g.: 202002
        year_week_end : int
            The ending Year/Week, e.g.: 202005

        Returns
        -------
        ibis.expr.types.Expr
        """
        table_suffix = ''
        if disease != 'dengue':
            table_suffix = '_{}'.format(disease)

        schema_city = con.schema('Municipio')
        t_hist = schema_city.table('Historico_alerta{}'.format(table_suffix))

        case_level = (
            ibis.case()
            .when(t_hist.nivel.cast('string') == '1', 'verde')
            .when(t_hist.nivel.cast('string') == '2', 'amarelo')
            .when(t_hist.nivel.cast('string') == '3', 'laranja')
            .when(t_hist.nivel.cast('string') == '4', 'vermelho')
            .else_('-')
            .end()
        ).name(f'nivel_{disease}')

        hist_keys = [
            t_hist.SE.name(f'SE_{disease}'),
            t_hist.casos.name(f'casos_{disease}'),
            t_hist.p_rt1.name(f'p_rt1_{disease}'),
            t_hist.casos_est.name(f'casos_est_{disease}'),
            t_hist.p_inc100k.name(f'p_inc100k_{disease}'),
            t_hist.nivel.name(f'level_code_{disease}'),
            case_level,
            t_hist.municipio_geocodigo.name(f'geocode_{disease}'),
        ]

        hist_filter = (t_hist['SE'].between(
            year_week_start,
            year_week_end)) & (t_hist['municipio_geocodigo'].isin(geocodes))

        return t_hist[hist_filter][hist_keys].sort_by(f'SE_{disease}')
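Because every branch above compares the same casted column against a literal, the mapping can also be written as a simple (value) case, as in Example #4. A hedged sketch with a stand-in table:

import ibis

t_hist = ibis.table([("nivel", "int32")], name="Historico_alerta")

case_level = (
    t_hist.nivel.cast("string")
    .case()
    .when("1", "verde")
    .when("2", "amarelo")
    .when("3", "laranja")
    .when("4", "vermelho")
    .else_("-")
    .end()
    .name("nivel_dengue")
)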
Example #13
def test_searched_case_column(batting, batting_df):
    t = batting
    df = batting_df
    expr = (ibis.case().when(t.RBI < 5, 'really bad team').when(
        t.teamID == 'PH1', 'ph1 team').else_(t.teamID).end())
    result = expr.compile()
    expected = dd.from_array(
        np.select(
            [df.RBI < 5, df.teamID == 'PH1'],
            ['really bad team', 'ph1 team'],
            df.teamID,
        ))
    tm.assert_series_equal(result.compute(), expected.compute())
Example #14
def test_substr_with_null_values(backend, alltypes, df):
    table = alltypes.mutate(substr_col_null=ibis.case().when(
        alltypes['bool_col'], alltypes['string_col']).else_(None).end().substr(
            0, 2))
    result = table.execute()

    expected = df.copy()
    mask = ~expected['bool_col']
    expected['substr_col_null'] = expected['string_col']
    expected.loc[mask, 'substr_col_null'] = None
    expected['substr_col_null'] = expected['substr_col_null'].str.slice(0, 2)

    backend.assert_frame_equal(result, expected)
Example #15
def test_search_case(con, alltypes, translate):
    t = alltypes
    expr = (ibis.case().when(t.float_col > 0,
                             t.int_col * 2).when(t.float_col < 0,
                                                 t.int_col).else_(0).end())

    expected = """CASE
  WHEN `float_col` > 0 THEN `int_col` * 2
  WHEN `float_col` < 0 THEN `int_col`
  ELSE 0
END"""
    assert translate(expr) == expected
    assert len(con.execute(expr))
Example #16
def test_pickle_multiple_case_node(table):
    case1 = table.a == 5
    case2 = table.b == 128
    case3 = table.c == 1000

    result1 = table.f
    result2 = table.b * 2
    result3 = table.e

    default = table.d
    expr = (ibis.case().when(case1, result1).when(case2, result2).when(
        case3, result3).else_(default).end())

    op = expr.op()
    assert_pickle_roundtrip(op)
Example #17
def _bucket(expr):
    op = expr.op()
    stmt = ibis.case()

    if op.closed == 'left':
        l_cmp = operator.le
        r_cmp = operator.lt
    else:
        l_cmp = operator.lt
        r_cmp = operator.le

    user_num_buckets = len(op.buckets) - 1

    bucket_id = 0
    if op.include_under:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else r_cmp
        else:
            cmp = operator.le if op.closed == 'right' else operator.lt
        stmt = stmt.when(cmp(op.arg, op.buckets[0]), bucket_id)
        bucket_id += 1

    for j, (lower, upper) in enumerate(zip(op.buckets, op.buckets[1:])):
        if op.close_extreme and (
            (op.closed == 'right' and j == 0) or
            (op.closed == 'left' and j == (user_num_buckets - 1))):
            stmt = stmt.when((lower <= op.arg) & (op.arg <= upper), bucket_id)
        else:
            stmt = stmt.when(
                l_cmp(lower, op.arg) & r_cmp(op.arg, upper), bucket_id)
        bucket_id += 1

    if op.include_over:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else l_cmp
        else:
            cmp = operator.lt if op.closed == 'right' else operator.le

        stmt = stmt.when(cmp(op.buckets[-1], op.arg), bucket_id)
        bucket_id += 1

    result = stmt.end()
    if expr.has_name():
        result = result.name(expr.get_name())

    return result
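To make the output of _bucket concrete, here is a hand-written equivalent of the searched case it would emit for buckets [0, 10, 25, 50] with closed='left', close_extreme enabled, and no under/over buckets (illustrative table, not from the original translator):

import ibis

t = ibis.table([("f", "float64")], name="t")

bucketed = (
    ibis.case()
    .when((0 <= t.f) & (t.f < 10), 0)
    .when((10 <= t.f) & (t.f < 25), 1)
    .when((25 <= t.f) & (t.f <= 50), 2)  # close_extreme closes the last edge
    .end()
)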
Example #18
def _bucket(translator, expr):
    import operator

    op = expr.op()
    stmt = ibis.case()

    if op.closed == 'left':
        l_cmp = operator.le
        r_cmp = operator.lt
    else:
        l_cmp = operator.lt
        r_cmp = operator.le

    user_num_buckets = len(op.buckets) - 1

    bucket_id = 0
    if op.include_under:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else r_cmp
        else:
            cmp = operator.le if op.closed == 'right' else operator.lt
        stmt = stmt.when(cmp(op.arg, op.buckets[0]), bucket_id)
        bucket_id += 1

    for j, (lower, upper) in enumerate(zip(op.buckets, op.buckets[1:])):
        if (op.close_extreme and
            ((op.closed == 'right' and j == 0) or
             (op.closed == 'left' and j == (user_num_buckets - 1)))):
            stmt = stmt.when((lower <= op.arg) & (op.arg <= upper),
                             bucket_id)
        else:
            stmt = stmt.when(l_cmp(lower, op.arg) & r_cmp(op.arg, upper),
                             bucket_id)
        bucket_id += 1

    if op.include_over:
        if user_num_buckets > 0:
            cmp = operator.lt if op.close_extreme else l_cmp
        else:
            cmp = operator.lt if op.closed == 'right' else operator.le

        stmt = stmt.when(cmp(op.buckets[-1], op.arg), bucket_id)
        bucket_id += 1

    case_expr = stmt.end().name(expr._name)
    return _searched_case(translator, case_expr)
Example #19
    def test_multiple_case_expr(self):
        case1 = self.table.a == 5
        case2 = self.table.b == 128
        case3 = self.table.c == 1000

        result1 = self.table.f
        result2 = self.table.b * 2
        result3 = self.table.e

        default = self.table.d

        expr = (ibis.case().when(case1, result1).when(case2, result2).when(
            case3, result3).else_(default).end())

        op = expr.op()
        assert isinstance(expr, ir.DoubleArray)
        assert isinstance(op, ops.SearchedCase)
        assert op.default is default
Example #20
def test_multiple_case_expr(table):
    case1 = table.a == 5
    case2 = table.b == 128
    case3 = table.c == 1000

    result1 = table.f
    result2 = table.b * 2
    result3 = table.e

    default = table.d

    expr = (ibis.case().when(case1, result1).when(case2, result2).when(
        case3, result3).else_(default).end())

    op = expr.op()
    assert isinstance(expr, ir.FloatingColumn)
    assert isinstance(op, ops.SearchedCase)
    assert op.default is default
Example #21
def tpc_h08(
    con,
    NATION="BRAZIL",
    REGION="AMERICA",
    TYPE="ECONOMY ANODIZED STEEL",
    DATE="1995-01-01",
):
    part = con.table("part")
    supplier = con.table("supplier")
    lineitem = con.table("lineitem")
    orders = con.table("orders")
    customer = con.table("customer")
    region = con.table("region")
    n1 = con.table("nation")
    n2 = n1.view()

    q = part
    q = q.join(lineitem, part.p_partkey == lineitem.l_partkey)
    q = q.join(supplier, supplier.s_suppkey == lineitem.l_suppkey)
    q = q.join(orders, lineitem.l_orderkey == orders.o_orderkey)
    q = q.join(customer, orders.o_custkey == customer.c_custkey)
    q = q.join(n1, customer.c_nationkey == n1.n_nationkey)
    q = q.join(region, n1.n_regionkey == region.r_regionkey)
    q = q.join(n2, supplier.s_nationkey == n2.n_nationkey)

    q = q[
        orders.o_orderdate.year().cast("string").name("o_year"),
        (lineitem.l_extendedprice * (1 - lineitem.l_discount)).name("volume"),
        n2.n_name.name("nation"),
        region.r_name,
        orders.o_orderdate,
        part.p_type,
    ]

    q = q.filter([
        q.r_name == REGION,
        q.o_orderdate.between(DATE, add_date(DATE, dy=2, dd=-1)),
        q.p_type == TYPE,
    ])

    q = q.mutate(
        nation_volume=ibis.case().when(q.nation == NATION, q.volume).else_(0).end()
    )
    gq = q.group_by([q.o_year])
    q = gq.aggregate(mkt_share=q.nation_volume.sum() / q.volume.sum())
    q = q.sort_by([q.o_year])
    return q
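The mkt_share aggregation above is the standard conditional-aggregation idiom: zero out non-matching rows with a searched case, then divide the sums. A reduced sketch with made-up column names:

import ibis

q = ibis.table([("nation", "string"), ("volume", "float64")], name="q")

nation_volume = ibis.case().when(q.nation == "BRAZIL", q.volume).else_(0).end()
mkt_share = (nation_volume.sum() / q.volume.sum()).name("mkt_share")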
Example #22
def test_searched_case_column(batting, batting_df):
    t = batting
    df = batting_df
    expr = (
        ibis.case()
        .when(t.RBI < 5, 'really bad team')
        .when(t.teamID == 'PH1', 'ph1 team')
        .else_(t.teamID)
        .end()
    )
    result = expr.execute()
    expected = pd.Series(
        np.select(
            [df.RBI < 5, df.teamID == 'PH1'],
            ['really bad team', 'ph1 team'],
            df.teamID,
        )
    )
    tm.assert_series_equal(result, expected)
Example #23
    def case_expression(self, when_expressions: List[Union[Tuple[Value, Value],
                                                           Value]]):
        """
        Handles sql_to_ibis case expressions
        :param when_expressions:
        :return:
        """
        case_expression = ibis.case()
        for i, when_expression in enumerate(when_expressions):
            if isinstance(when_expression, tuple):
                conditional_boolean = when_expression[0].get_value()
                conditional_value = when_expression[1].get_value()
                case_expression = case_expression.when(conditional_boolean,
                                                       conditional_value)
            else:
                case_expression = case_expression.else_(
                    when_expression.get_value()).end()

        return Expression(value=case_expression)
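The handler above folds a list of WHEN branches plus an optional ELSE into one builder chain. A standalone sketch of the same fold without the sql_to_ibis Value/Expression wrappers (the helper name and its arguments are illustrative):

import ibis

def build_searched_case(branches, default=None):
    """Fold (condition, value) pairs into a single ibis searched-case expression."""
    case_expr = ibis.case()
    for condition, value in branches:
        case_expr = case_expr.when(condition, value)
    if default is not None:
        case_expr = case_expr.else_(default)
    return case_expr.end()

t = ibis.table([("int_col", "int32")], name="t")
new_col = build_searched_case([(t.int_col == 1, 20), (t.int_col == 0, 10)], default=0)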
Example #24
    def test_multiple_case_expr(self):
        case1 = self.table.a == 5
        case2 = self.table.b == 128
        case3 = self.table.c == 1000

        result1 = self.table.f
        result2 = self.table.b * 2
        result3 = self.table.e

        default = self.table.d

        expr = (ibis.case()
                .when(case1, result1)
                .when(case2, result2)
                .when(case3, result3)
                .else_(default)
                .end())

        op = expr.op()
        assert isinstance(expr, ir.DoubleArray)
        assert isinstance(op, ops.SearchedCase)
        assert op.default is default
Example #25
def create_12_mon_features(joined_df):
    delinq_df = None
    n_months = 12
    for y in range(1, n_months + 1):
        year_dec = (
            ibis.case().when(joined_df["timestamp_month"] < ibis.literal(y), 1).else_(0).end()
        )
        tmp_df = joined_df[
            "loan_id", "delinquency_12", "upb_12",
            (joined_df["timestamp_year"] - year_dec).name("timestamp_year"),
        ]

        delinquency_12 = (tmp_df["delinquency_12"].max() > 3).cast("int32") + (
            tmp_df["upb_12"].min() == 0).cast("int32")
        tmp_df = tmp_df.groupby(["loan_id", "timestamp_year"]).aggregate(
            delinquency_12.name("delinquency_12"))

        tmp_df = tmp_df.mutate(timestamp_month=ibis.literal(y, "int32"))

        if delinq_df is None:
            delinq_df = tmp_df
        else:
            delinq_df = delinq_df.union(tmp_df)

    return delinq_df
Example #26
def test_searched_case_scalar(client):
    expr = ibis.case().when(True, 1).when(False, 2).end()
    result = client.execute(expr)
    expected = np.int8(1)
    assert result == expected
Example #27
def search_case(con):
    t = con.table('alltypes')
    return ibis.case().when(t.f > 0, t.d * 2).when(t.c < 0, t.a * 2).end()
Example #28
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
    import_mode,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"] +
            ["var_%s DOUBLE, \n" % i for i in range(199)] + ["var_199 DOUBLE"])
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str)
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true")
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename)

        # data file import by ibis
        columns_types_import_query = ["string", "int64"
                                      ] + ["float64" for _ in range(200)]
        schema_table_import = ibis.Schema(names=columns_names,
                                          types=columns_types_import_query)
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)

            t0 = timer()
            table_import.read_csv(filename,
                                  header=True,
                                  quotechar="",
                                  delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)

        elif import_mode == "pandas":
            # Datafiles import
            columns_types_converted = [
                "float64" if (x.startswith("decimal")) else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types_converted,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round(
                (t_import_pandas + t_import_ibis) * 1000)

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/santander-fsi.csv"

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table)
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)

            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name("var_%d_gt1" % i))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
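The gt1_cols loop above repeats one idiom per column: count rows per value with a grouped window, then NULL out values that occur only once. A single-column sketch of that idiom (table and column names are placeholders):

import ibis

train = ibis.table([("var_0", "float64")], name="train")

w = ibis.window(group_by="var_0")
var_0_count = train.var_0.count().over(w).name("var_0_count")
var_0_gt1 = (
    ibis.case()
    .when(train.var_0.count().over(w) > 1, train.var_0.cast("float32"))
    .else_(ibis.null())
    .end()
    .name("var_0_gt1")
)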
Example #29
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
):
    import ibis

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    # Create table and import data
    if create_new_table:
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            # Create table and import data for ETL queries
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)

            t0 = timer()
            table_import.read_csv(filename,
                                  header=True,
                                  quotechar="",
                                  delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)

        elif import_mode == "pandas":
            # Datafiles import
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round(
                (t_import_pandas + t_import_ibis) * 1000)

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/census-fsi.csv"

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table)
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)

            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "PERNUM",
        "SEX",
        "AGE",
        "INCTOT",
        "EDUC",
        "EDUCD",
        "EDUC_HEAD",
        "EDUC_POP",
        "EDUC_MOM",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
        "INCTOT_HEAD",
        "SEX_HEAD",
    ]

    if import_mode == "pandas" and validation:
        keep_cols.append("id")

    table = table[keep_cols]

    # first, we do all filters and eliminate redundant fillna operations for EDUC and EDUCD
    table = table[table.INCTOT != 9999999]
    table = table[table["EDUC"].notnull()]
    table = table[table["EDUCD"].notnull()]

    table = table.set_column("INCTOT", table["INCTOT"] * table["CPI99"])

    cols = []
    # final fillna and casting for necessary columns
    for column in keep_cols:
        cols.append(ibis.case().when(
            table[column].notnull(),
            table[column]).else_(-1).end().cast("float64").name(column))

    table = table.mutate(cols)

    df = table.execute()

    if import_mode == "pandas" and validation:
        df.index = df["id"].values

    # here we use pandas to split table
    y = df["EDUC"]
    X = df.drop(["EDUC", "CPI99"], axis=1)

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
Example #30
def test_searched_case_scalar(client):
    expr = ibis.case().when(True, 1).when(False, 2).end()
    result = client.execute(expr)
    expected = np.int8(1)
    assert result == expected
Example #31
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
    fragments_size,
):
    etl_times = {key: 0.0 for key in etl_keys}

    fragments_size = check_fragments_size(fragments_size,
                                          count_table=1,
                                          import_mode=import_mode)

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)
            etl_times["t_connect"] += timer() - t0

            t0 = timer()
            table_import.read_csv(filename,
                                  header=True,
                                  quotechar="",
                                  delimiter=",")
            etl_times["t_readcsv"] = timer() - t0

        elif import_mode == "pandas":
            # decimal(8, 4) is converted to decimal(9, 6) in order to provide better data conversion
            # accuracy during import from Pandas into OmniSciDB for proper results validation
            columns_types = [
                "decimal(9, 6)" if (x == "decimal(8, 4)") else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith(".gz") else None,
                use_columns_types_for_pd=False,
            )
            etl_times["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_times[
                "t_connect"] += omnisci_server_worker.get_conn_creation_time()

        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith(".gz"):
                    import gzip

                    unzip_name = get_tmp_filepath("santander-fsi.csv")

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name,
                    unzip_name or filename,
                    schema_table,
                    fragment_size=fragments_size[0],
                )
                etl_times["t_readcsv"] = timer() - t0
                etl_times[
                    "t_connect"] += omnisci_server_worker.get_conn_creation_time(
                    )

            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_times["t_connect"] += timer() - t0

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name(col_gt1))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = timer() - t_etl_start
    return table_df, etl_times
Example #32
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
):
    import ibis

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    # Create table and import data
    if create_new_table:
        # Datafiles import
        t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
            table_name=table_name,
            data_files_names=filename,
            files_limit=1,
            columns_names=columns_names,
            columns_types=columns_types,
            header=0,
            nrows=None,
            compression_type="gzip",
            validation=validation,
        )
    etl_times["t_readcsv"] = t_import_pandas + t_import_ibis

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    t_etl_start = timer()

    keep_cols = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "PERNUM",
        "SEX",
        "AGE",
        "INCTOT",
        "EDUC",
        "EDUCD",
        "EDUC_HEAD",
        "EDUC_POP",
        "EDUC_MOM",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
        "INCTOT_HEAD",
        "SEX_HEAD",
    ]

    if validation:
        keep_cols.append("id")

    table = table[keep_cols]

    # first, we do all filters and eliminate redundant fillna operations for EDUC and EDUCD
    table = table[table.INCTOT != 9999999]
    table = table[table["EDUC"].notnull()]
    table = table[table["EDUCD"].notnull()]

    table = table.set_column("INCTOT", table["INCTOT"] * table["CPI99"])

    cols = []
    # final fillna and casting for necessary columns
    for column in keep_cols:
        cols.append(ibis.case().when(
            table[column].notnull(),
            table[column]).else_(-1).end().cast("float64").name(column))

    table = table.mutate(cols)

    df = table.execute()

    # here we use pandas to split table
    y = df["EDUC"]
    X = df.drop(["EDUC", "CPI99"], axis=1)

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    print("DataFrame shape:", X.shape)

    return df, X, y, etl_times
Example #33
def _calculate_difference(field_differences, datatype, validation, is_value_comparison):
    pct_threshold = ibis.literal(validation.threshold)

    if isinstance(datatype, ibis.expr.datatypes.Timestamp):
        source_value = field_differences["differences_source_value"].epoch_seconds()
        target_value = field_differences["differences_target_value"].epoch_seconds()
    elif isinstance(datatype, ibis.expr.datatypes.Float64):
        # Float64 type results from AVG() aggregation
        source_value = field_differences["differences_source_value"].round(digits=4)
        target_value = field_differences["differences_target_value"].round(digits=4)
    elif isinstance(datatype, ibis.expr.datatypes.Decimal):
        source_value = (
            field_differences["differences_source_value"]
            .cast("float64")
            .round(digits=4)
        )
        target_value = (
            field_differences["differences_target_value"]
            .cast("float64")
            .round(digits=4)
        )
    else:
        source_value = field_differences["differences_source_value"]
        target_value = field_differences["differences_target_value"]

    # Does not calculate difference between agg values for row hash due to int64 overflow
    if is_value_comparison:
        difference = pct_difference = ibis.null()
        validation_status = (
            ibis.case()
            .when(
                target_value.isnull() & source_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS)
            .else_(consts.VALIDATION_STATUS_FAIL)
            .end()
        )
    # String data types i.e "None" can be returned for NULL timestamp/datetime aggs
    elif isinstance(datatype, ibis.expr.datatypes.String):
        difference = pct_difference = ibis.null().cast("float64")
        validation_status = (
            ibis.case()
            .when(
                target_value.isnull() & source_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .else_(consts.VALIDATION_STATUS_FAIL)
            .end()
        )
    else:
        difference = (target_value - source_value).cast("float64")

        pct_difference_nonzero = (
            ibis.literal(100.0)
            * difference
            / (
                source_value.case()
                .when(ibis.literal(0), target_value)
                .else_(source_value)
                .end()
            ).cast("float64")
        ).cast("float64")

        # Considers case that source and target agg values can both be 0
        pct_difference = (
            ibis.case()
            .when(difference == ibis.literal(0), ibis.literal(0).cast("float64"))
            .else_(pct_difference_nonzero)
            .end()
        )

        th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
        validation_status = (
            ibis.case()
            .when(
                source_value.isnull() & target_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL)
            .else_(consts.VALIDATION_STATUS_SUCCESS)
            .end()
        )
    return (
        difference.name("difference"),
        pct_difference.name("pct_difference"),
        pct_threshold.name("pct_threshold"),
        validation_status.name("validation_status"),
    )
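The percentage difference computed above is 100 * (target - source) / source, with the denominator falling back to target when source is 0 and a short-circuit to 0 when the aggregates are equal. A plain-Python restatement of that arithmetic (not the library API), useful as a quick sanity check:

def pct_difference(source_value, target_value):
    # mirrors the ibis expression above: difference relative to source,
    # with target used as the denominator when source is 0
    difference = target_value - source_value
    if difference == 0:
        return 0.0
    denominator = target_value if source_value == 0 else source_value
    return 100.0 * difference / denominator

assert pct_difference(100.0, 110.0) == 10.0  # 10% above source
assert pct_difference(0.0, 5.0) == 100.0     # denominator falls back to target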
Example #34
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"] +
            ["var_%s DOUBLE, \n" % i for i in range(199)] + ["var_199 DOUBLE"])
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str)
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true")
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename)

        # data file import by ibis
        columns_types_import_query = ["string", "int64"
                                      ] + ["float64" for _ in range(200)]
        schema_table_import = ibis.Schema(names=columns_names,
                                          types=columns_types_import_query)
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
        )

        table_import = omnisci_server_worker.database(database_name).table(
            table_name)
        t0 = timer()
        table_import.read_csv(filename, delimiter=",")
        etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t_etl_start = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name("var_%d_gt1" % i))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
Example #35
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):

    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import = omnisci_server_worker.database(database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        sys.exit(0)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We create 400 new columns and then insert them into the original table,
    # thus avoiding nested SQL requests
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s'%i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0
    
    etl_times["t_etl"] = etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]
    
    x_train = training_part.drop(['target0'], axis=1)
    y_train = training_part['target0']
    x_valid = validation_part.drop(['target0'], axis=1)
    y_valid = validation_part['target0']
    
    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times