def test_filter_with_analytic():
    x = ibis.table(ibis.schema([('col', 'int32')]), 'x')
    with_filter_col = x[x.columns + [ibis.null().name('filter')]]
    filtered = with_filter_col[with_filter_col['filter'].isnull()]
    subquery = filtered[filtered.columns]
    with_analytic = subquery[['col', subquery.count().name('analytic')]]
    expr = with_analytic[with_analytic.columns]

    result = ibis.impala.compile(expr)
    expected = """\
SELECT `col`, `analytic`
FROM (
  SELECT `col`, count(*) OVER () AS `analytic`
  FROM (
    SELECT `col`, `filter`
    FROM (
      SELECT *
      FROM (
        SELECT `col`, NULL AS `filter`
        FROM x
      ) t3
      WHERE `filter` IS NULL
    ) t2
  ) t1
) t0"""

    assert result == expected
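A hedged aside on how such expected-SQL tests work: any ibis expression can be compiled to a SQL string without executing it. A minimal sketch using the same ibis.impala.compile entry point as the test above (the exact output text may vary by ibis version):

import ibis

t = ibis.table(ibis.schema([('col', 'int32')]), 'x')
expr = t[t['col'].isnull()]
print(ibis.impala.compile(expr))  # roughly: SELECT * FROM x WHERE `col` IS NULL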
def test_null(self):
    expr = ibis.literal(None)
    assert isinstance(expr, ir.NullScalar)
    assert isinstance(expr.op(), ir.NullLiteral)

    expr2 = ibis.null()
    assert_equal(expr, expr2)
def test_null():
    expr = ibis.literal(None)
    assert isinstance(expr, ir.NullScalar)
    assert isinstance(expr.op(), ir.NullLiteral)
    assert expr._arg.value is None

    expr2 = ibis.null()
    assert_equal(expr, expr2)
def test_is_ancestor_analytic():
    x = ibis.table(ibis.schema([('col', 'int32')]), 'x')
    with_filter_col = x[x.columns + [ibis.null().name('filter')]]
    filtered = with_filter_col[with_filter_col['filter'].isnull()]
    subquery = filtered[filtered.columns]
    with_analytic = subquery[subquery.columns + [subquery.count().name('analytic')]]

    assert not subquery.op().equals(with_analytic.op())
def test_null():
    expr = ibis.literal(None)
    assert isinstance(expr, ir.NullScalar)
    assert isinstance(expr.op(), ops.NullLiteral)
    assert expr._arg.value is None

    expr2 = ibis.null()
    assert_equal(expr, expr2)
    assert expr is expr2

    assert expr.type().equals(dt.null)
    assert expr2.type().equals(dt.null)
def test_null():
    expr = ibis.literal(None)
    assert isinstance(expr, ir.NullScalar)
    assert isinstance(expr.op(), ops.NullLiteral)
    assert expr._arg.value is None

    expr2 = ibis.null()
    assert_equal(expr, expr2)
    assert expr is expr2

    assert expr.type() is dt.null
    assert expr2.type() is dt.null
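The tests above pin down the behaviour of the untyped NULL literal. A minimal usage sketch, hedged since the API surface shifts between ibis versions (cast, name, isnull, and coalesce all appear elsewhere in these snippets):

import ibis

n = ibis.null()                        # typed NULL scalar literal
flag = n.cast('int32').name('filter')  # give the NULL an explicit type and name
present = flag.isnull()                # compiles to `filter` IS NULL
defaulted = ibis.coalesce(flag, 0)     # COALESCE(`filter`, 0)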
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"])
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str)
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true")
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename)

        # data file import by ibis
        columns_types_import_query = ["string", "int64"
                                      ] + ["float64" for _ in range(200)]
        schema_table_import = ibis.Schema(names=columns_names,
                                          types=columns_types_import_query)
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
        )
        table_import = omnisci_server_worker.database(database_name).table(
            table_name)
        t0 = timer()
        table_import.read_csv(filename, delimiter=",")
        etl_times["t_readcsv"] = round((timer() - t0) * 1000)

    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We build 400 derived columns and add them to the original table in one
    # pass, avoiding nested SQL requests
    t_etl_start = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name("var_%d_gt1" % i))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
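The 400-column loop above is a per-value frequency encoding built from a window function. A self-contained sketch of the same pattern on a single column (the table and column names here are illustrative, not from the benchmark):

import ibis

t = ibis.table(ibis.schema([('var_0', 'float64')]), 'train')
w = ibis.window(group_by='var_0')
freq = t['var_0'].count().over(w).name('var_0_count')  # COUNT() OVER (PARTITION BY var_0)
gt1 = (ibis.case()
       .when(freq > 1, t['var_0'].cast('float32'))  # keep values seen more than once
       .else_(ibis.null())                          # NULL out singletons
       .end()
       .name('var_0_gt1'))
encoded = t.mutate([freq, gt1])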
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    run_import_queries,
    etl_keys,
    import_mode,
):
    tmp_table_name = "tmp_table"

    etl_times = {key: 0.0 for key in etl_keys}

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }

        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"])
        import_query_cols_str = "".join(import_query_cols_list)

        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str)
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true")
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename)

        # data file import by ibis
        columns_types_import_query = ["string", "int64"
                                      ] + ["float64" for _ in range(200)]
        schema_table_import = ibis.Schema(names=columns_names,
                                          types=columns_types_import_query)
        omnisci_server_worker.create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
        )

        table_import_query = omnisci_server_worker.database(
            database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times_import["t_readcsv_by_ibis"] = round((timer() - t0) * 1000)

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times_import["t_readcsv_by_FSI"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times_import["t_readcsv_by_COPY"] = round((timer() - t0) * 1000)

        omnisci_server_worker.drop_table(tmp_table_name)

        etl_times.update(etl_times_import)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)
            t0 = timer()
            table_import.read_csv(filename, header=True, quotechar="", delimiter=",")
            etl_times["t_readcsv"] = round((timer() - t0) * 1000)
        elif import_mode == "pandas":
            # Datafiles import
            columns_types_converted = [
                "float64" if (x.startswith("decimal")) else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types_converted,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith("gz") else None,
                validation=validation,
            )
            etl_times["t_readcsv"] = round(
                (t_import_pandas + t_import_ibis) * 1000)
        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith("gz"):
                    import gzip

                    unzip_name = "/tmp/santander-fsi.csv"

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name, unzip_name or filename, schema_table)
                etl_times["t_readcsv"] = round((timer() - t0) * 1000)
            finally:
                if filename.endswith("gz"):
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We build 400 derived columns and add them to the original table in one
    # pass, avoiding nested SQL requests
    t_etl_start = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name("var_%d_gt1" % i))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = round((timer() - t_etl_start) * 1000)
    return table_df, etl_times
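The fsi branch above inflates a .gz input to a temporary CSV before handing it to create_table_from_csv. A hedged, self-contained sketch of that decompression step, using tempfile/shutil instead of a hard-coded /tmp path (the function name and paths are illustrative):

import gzip
import os
import shutil
import tempfile

def decompress_to_tmp(path: str) -> str:
    # stream the gzip payload into a fresh temp file instead of
    # reading the whole archive into memory at once
    fd, tmp_path = tempfile.mkstemp(suffix=".csv")
    with gzip.open(path, "rb") as src, os.fdopen(fd, "wb") as dst:
        shutil.copyfileobj(src, dst)
    return tmp_path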
def equal_operator(l, r):
    if ibis.null().equals(l):
        return r.isnull()
    if ibis.null().equals(r):
        return l.isnull()
    return l == r
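A usage sketch for equal_operator: in SQL, x = NULL evaluates to NULL rather than TRUE, so comparisons against a null literal are rewritten as IS NULL checks (the table and column below are illustrative):

import ibis

t = ibis.table(ibis.schema([('a', 'string')]), 't')
equal_operator(t['a'], ibis.null())        # -> t['a'].isnull(), i.e. `a` IS NULL
equal_operator(t['a'], ibis.literal('x'))  # -> plain equality, `a` = 'x'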
""" if isinstance(value, float) and math.isnan(value): return False if isinstance(value, (int, float)): return True if isinstance(value, ibis.expr.types.NumericValue): return ~value.isnull() return False # From https://vega.github.io/vega/docs/expressions/ VEGAJS_NAMESPACE: Dict[str, Any] = { # Constants "null": ibis.null(), "NaN": ibis.NA, "E": math.e, "LN2": math.log(2), "LN10": math.log(10), "LOG2E": math.log2(math.e), "LOG10E": math.log10(math.e), "MAX_VALUE": sys.float_info.max, "MIN_VALUE":
def etl_ibis(
    filename,
    columns_names,
    columns_types,
    database_name,
    table_name,
    omnisci_server_worker,
    delete_old_database,
    create_new_table,
    ipc_connection,
    validation,
    etl_keys,
    import_mode,
    fragments_size,
):
    etl_times = {key: 0.0 for key in etl_keys}

    fragments_size = check_fragments_size(fragments_size,
                                          count_table=1,
                                          import_mode=import_mode)

    omnisci_server_worker.create_database(database_name,
                                          delete_if_exists=delete_old_database)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        if import_mode == "copy-from":
            t0 = timer()
            omnisci_server_worker.create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=fragments_size[0],
            )
            table_import = omnisci_server_worker.database(database_name).table(
                table_name)
            etl_times["t_connect"] += timer() - t0

            t0 = timer()
            table_import.read_csv(filename, header=True, quotechar="", delimiter=",")
            etl_times["t_readcsv"] = timer() - t0
        elif import_mode == "pandas":
            # decimal(8, 4) is converted to decimal(9, 6) in order to provide
            # better data conversion accuracy during import from Pandas into
            # OmniSciDB for proper results validation
            columns_types = [
                "decimal(9, 6)" if (x == "decimal(8, 4)") else x
                for x in columns_types
            ]
            t_import_pandas, t_import_ibis = omnisci_server_worker.import_data_by_ibis(
                table_name=table_name,
                data_files_names=filename,
                files_limit=1,
                columns_names=columns_names,
                columns_types=columns_types,
                header=0,
                nrows=None,
                compression_type="gzip" if filename.endswith(".gz") else None,
                use_columns_types_for_pd=False,
            )
            etl_times["t_readcsv"] = t_import_pandas + t_import_ibis
            etl_times[
                "t_connect"] += omnisci_server_worker.get_conn_creation_time()
        elif import_mode == "fsi":
            try:
                unzip_name = None
                if filename.endswith(".gz"):
                    import gzip

                    unzip_name = get_tmp_filepath("santander-fsi.csv")

                    with gzip.open(filename, "rb") as gz_input:
                        with open(unzip_name, "wb") as output:
                            output.write(gz_input.read())

                t0 = timer()
                omnisci_server_worker._conn.create_table_from_csv(
                    table_name,
                    unzip_name or filename,
                    schema_table,
                    fragment_size=fragments_size[0],
                )
                etl_times["t_readcsv"] = timer() - t0
                etl_times[
                    "t_connect"] += omnisci_server_worker.get_conn_creation_time()
            finally:
                # guard on unzip_name rather than the filename suffix, so a
                # missing temp file is never passed to os.remove
                if unzip_name is not None:
                    import os

                    os.remove(unzip_name)

    # Second connection - this is ibis's ipc connection for DML
    t0 = timer()
    omnisci_server_worker.connect_to_server(database_name, ipc=ipc_connection)
    table = omnisci_server_worker.database(database_name).table(table_name)
    etl_times["t_connect"] += timer() - t0

    # group_by/count, merge (join) and filtration queries
    # We build 400 derived columns and add them to the original table in one
    # pass, avoiding nested SQL requests
    t_etl_start = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(ibis.case().when(
            table[col].count().over(w).name(col_count) > 1,
            table[col].cast("float32"),
        ).else_(ibis.null()).end().name(col_gt1))
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()

    etl_times["t_etl"] = timer() - t_etl_start
    return table_df, etl_times
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):
    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)

    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statements preparation for data file import queries
        connect_to_db_sql_template = r"\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(
            tmp_table_name
        )
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )
        table_import = omnisci_server_worker.database(database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        sys.exit(0)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We build 400 derived columns and add them to the original table in one
    # pass, avoiding nested SQL requests
    t0 = timer()

    count_cols = []
    orig_cols = ["ID_code", "target"] + ['var_%s' % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0

    etl_times["t_etl"] = etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]

    x_train = training_part.drop(['target0'], axis=1)
    y_train = training_part['target0']
    x_valid = validation_part.drop(['target0'], axis=1)
    y_valid = validation_part['target0']

    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times
def _calculate_difference(field_differences, datatype, validation, is_value_comparison):
    pct_threshold = ibis.literal(validation.threshold)

    if isinstance(datatype, ibis.expr.datatypes.Timestamp):
        source_value = field_differences["differences_source_value"].epoch_seconds()
        target_value = field_differences["differences_target_value"].epoch_seconds()
    elif isinstance(datatype, ibis.expr.datatypes.Float64):
        # Float64 type results from AVG() aggregation
        source_value = field_differences["differences_source_value"].round(digits=4)
        target_value = field_differences["differences_target_value"].round(digits=4)
    elif isinstance(datatype, ibis.expr.datatypes.Decimal):
        source_value = (
            field_differences["differences_source_value"]
            .cast("float64")
            .round(digits=4)
        )
        target_value = (
            field_differences["differences_target_value"]
            .cast("float64")
            .round(digits=4)
        )
    else:
        source_value = field_differences["differences_source_value"]
        target_value = field_differences["differences_target_value"]

    # Does not calculate difference between agg values for row hash due to int64 overflow
    if is_value_comparison:
        difference = pct_difference = ibis.null()
        validation_status = (
            ibis.case()
            .when(
                target_value.isnull() & source_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .when(target_value == source_value, consts.VALIDATION_STATUS_SUCCESS)
            .else_(consts.VALIDATION_STATUS_FAIL)
            .end()
        )
    # String data types i.e "None" can be returned for NULL timestamp/datetime aggs
    elif isinstance(datatype, ibis.expr.datatypes.String):
        difference = pct_difference = ibis.null().cast("float64")
        validation_status = (
            ibis.case()
            .when(
                target_value.isnull() & source_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .else_(consts.VALIDATION_STATUS_FAIL)
            .end()
        )
    else:
        difference = (target_value - source_value).cast("float64")
        pct_difference_nonzero = (
            ibis.literal(100.0)
            * difference
            / (
                source_value.case()
                .when(ibis.literal(0), target_value)
                .else_(source_value)
                .end()
            ).cast("float64")
        ).cast("float64")
        # Considers case that source and target agg values can both be 0
        pct_difference = (
            ibis.case()
            .when(difference == ibis.literal(0), ibis.literal(0).cast("float64"))
            .else_(pct_difference_nonzero)
            .end()
        )
        th_diff = (pct_difference.abs() - pct_threshold).cast("float64")
        validation_status = (
            ibis.case()
            .when(
                source_value.isnull() & target_value.isnull(),
                consts.VALIDATION_STATUS_SUCCESS,
            )
            .when(th_diff.isnan() | (th_diff > 0.0), consts.VALIDATION_STATUS_FAIL)
            .else_(consts.VALIDATION_STATUS_SUCCESS)
            .end()
        )

    return (
        difference.name("difference"),
        pct_difference.name("pct_difference"),
        pct_threshold.name("pct_threshold"),
        validation_status.name("validation_status"),
    )
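A worked plain-Python restatement of the percentage-difference logic above, useful as a sanity check (the numbers are hypothetical, not from a real validation run): with source = 80 and target = 100, difference = 20 and pct_difference = 100 * 20 / 80 = 25.0, so a pct_threshold of at least 25.0 yields success.

def pct_difference(source: float, target: float) -> float:
    difference = target - source
    if difference == 0:
        return 0.0  # also covers source == target == 0
    # a zero source falls back to the target as denominator,
    # mirroring the source_value.case() expression above
    denominator = target if source == 0 else source
    return 100.0 * difference / denominator

assert pct_difference(80.0, 100.0) == 25.0
assert pct_difference(0.0, 0.0) == 0.0
assert pct_difference(0.0, 50.0) == 100.0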