def q3_ibis(table, input_for_validation, debug_mode):
    """Run taxi-benchmark query 3 (trip count per passenger_count and pickup
    year) through Ibis, timing only query execution.

    If ``input_for_validation`` is not None, the pandas reference result under
    key "Query3" is reshaped and compared against the Ibis output.  When
    ``debug_mode`` is set, the reshaped reference is dumped to CSV for
    comparison with the SQL version of the query.  Returns the elapsed query
    time.
    """
    start = timer()
    q3_output_ibis = (  # noqa: F841 (assigned, but unused. Used in commented code.)
        table.groupby(
            [table.passenger_count, table.pickup_datetime.year().name("pickup_datetime")]
        )
        .aggregate(count=table.passenger_count.count())
        .execute()
    )
    elapsed = timer() - start

    if input_for_validation is not None:
        print("Validating query 3 results ...")

        reference = input_for_validation["Query3"]

        # Reshape the pandas reference into a DataFrame layout compatible with
        # the Ibis output; the count column arrives under the positional key 0.
        reference_casted = pd.DataFrame(
            {
                "passenger_count": reference["passenger_count"],
                "pickup_datetime": reference["pickup_datetime"],
                "count": reference[0],
            }
        )

        # NOTE(review): the pandas frame is passed as ibis_dfs and vice versa —
        # presumably compare_dataframes treats both sides symmetrically; verify.
        compare_dataframes(
            ibis_dfs=[reference_casted], pandas_dfs=[q3_output_ibis], sort_cols=[], drop_cols=[]
        )

        # Query result extraction for comparison with SQL version query
        if debug_mode:
            reference_casted.to_csv("./q3_pd_result.csv", index=False)

    return elapsed
def q1_ibis(table, input_for_validation, debug_mode):
    """Run taxi-benchmark query 1 (trip count per cab_type) through Ibis,
    timing only query execution.

    Parameters
    ----------
    table
        Ibis table expression for the trips data.
    input_for_validation : dict or None
        Pandas reference results keyed by query name ("Query1"), or None to
        skip validation.
    debug_mode : bool
        When True, dump the pandas reference result to ./q1_pd_result.csv.

    Returns
    -------
    float
        Elapsed query-execution time.
    """
    t_query = 0
    t0 = timer()
    q1_output_ibis = (  # noqa: F841 (assigned, but unused. Used in commented code.)
        table.groupby("cab_type").count().sort_by("cab_type")["cab_type", "count"].execute()
    )
    t_query += timer() - t0

    # Fix: explicit None check, consistent with q2/q3/q4 — an empty (falsy)
    # validation container should not silently skip validation.
    if input_for_validation is not None:
        print("Validating query 1 results ...")

        q1_output_pd = input_for_validation["Query1"]

        # Cast the pandas q1 output (a Series indexed by cab_type — presumably
        # Series.name is "cab_type") to a pandas.DataFrame compatible with the
        # Ibis q1 output.
        q1_output_pd_data = {
            q1_output_pd.name: q1_output_pd.index.to_numpy(),
            "count": q1_output_pd.to_numpy(),
        }
        q1_output_pd_df = pd.DataFrame(q1_output_pd_data, columns=[q1_output_pd.name, "count"])
        # Match the categorical dtype produced by the Ibis backend.
        q1_output_pd_df = q1_output_pd_df.astype({"cab_type": "category"}, copy=False)

        compare_dataframes(
            ibis_dfs=[q1_output_pd_df], pandas_dfs=[q1_output_ibis], sort_cols=[], drop_cols=[]
        )

        # Query result extraction for comparison with SQL version query
        if debug_mode:
            q1_output_pd.to_csv("./q1_pd_result.csv")

    return t_query
def q4_ibis(table, input_for_validation, debug_mode):
    """Run taxi-benchmark query 4 (trip counts grouped by passenger_count,
    pickup year and integer trip_distance, ordered by year asc / count desc)
    through Ibis, timing only query execution.

    Optionally validates the Ibis output against the pandas reference result
    ("Query4") and, in debug mode, dumps both raw and sorted reference frames
    to CSV.  Returns the elapsed query time.
    """
    start = timer()
    grouped_sizes = table.groupby(
        [
            table.passenger_count,
            table.pickup_datetime.year().name("pickup_datetime"),
            table.trip_distance.cast("int64").name("trip_distance"),
        ]
    ).size()
    q4_output_ibis = grouped_sizes.sort_by(  # noqa: F841 (assigned, but unused. Used in commented code.)
        [("pickup_datetime", True), ("count", False)]
    ).execute()
    elapsed = timer() - start

    if input_for_validation is not None:
        print("Validating query 4 results ...")

        reference = input_for_validation["Query4"]

        # Sort both frames into a common deterministic order before comparing;
        # the pandas reference carries its count column under positional key 0.
        sort_columns = ["passenger_count", "pickup_datetime", "trip_distance"]
        ibis_sorted = q4_output_ibis.sort_values(
            by=sort_columns + ["count"],
            ascending=[True, True, True, True],
        )
        reference_sorted = reference.sort_values(
            by=sort_columns + [0],
            ascending=[True, True, True, True],
        )
        reference_sorted.columns = sort_columns + ["count"]

        compare_dataframes(
            ibis_dfs=[ibis_sorted],
            pandas_dfs=[reference_sorted],
            sort_cols=[],
            drop_cols=[],
        )

        # Query result extraction for comparison with SQL version query
        if debug_mode:
            reference.to_csv("./q4_pd_result.csv", index=False)
            reference_sorted.to_csv("./q4_pd_result_sorted.csv", index=False)

    return elapsed
def q2_ibis(table, input_for_validation, debug_mode):
    """Run taxi-benchmark query 2 (mean total_amount per passenger_count)
    through Ibis, timing only query execution.

    Optionally validates the Ibis output against the pandas reference result
    ("Query2"); in debug mode the reference is dumped to CSV.  Returns the
    elapsed query time.
    """
    start = timer()
    q2_output_ibis = (  # noqa: F841 (assigned, but unused. Used in commented code.)
        table.groupby("passenger_count")
        .aggregate(total_amount=table.total_amount.mean())[["passenger_count", "total_amount"]]
        .execute()
    )
    elapsed = timer() - start

    if input_for_validation is not None:
        print("Validating query 2 results ...")

        reference = input_for_validation["Query2"]

        compare_dataframes(
            ibis_dfs=[reference], pandas_dfs=[q2_output_ibis], sort_cols=[], drop_cols=[]
        )

        # Query result extraction for comparison with SQL version query
        if debug_mode:
            reference.to_csv("./q2_pd_result.csv", index=False)

    return elapsed
def run_benchmark(parameters):
    """Santander benchmark driver: run ETL and ML with Ibis and pandas
    backends, print per-stage timings, optionally cross-validate the two
    ETL outputs, and return the collected timing dictionaries.

    On any failure the traceback is printed and the process exits with
    status 1.
    """
    ignored_parameters = {
        "dfiles_num": parameters["dfiles_num"],
        "gpu_memory": parameters["gpu_memory"],
    }
    # Fix: message typo "irnored" -> "ignored".
    warnings.warn(f"Parameters {ignored_parameters} are ignored", RuntimeWarning)

    # Strip stray quotes that may wrap the path on the command line.
    parameters["data_file"] = parameters["data_file"].replace("'", "")

    etl_times_ibis = None
    etl_times = None
    ml_times_ibis = None
    ml_times = None

    # Dataset layout: an ID column, a binary target and 200 numeric features.
    # (Removed unused locals count_cols / gt1_cols — never referenced here.)
    var_cols = ["var_%s" % i for i in range(200)]
    columns_names = ["ID_code", "target"] + var_cols
    columns_types_pd = ["object", "int64"] + ["float64" for _ in range(200)]
    columns_types_ibis = ["string", "int32"] + ["decimal(8, 4)" for _ in range(200)]
    etl_keys = ["t_readcsv", "t_etl"]
    ml_keys = ["t_train_test_split", "t_ml", "t_train", "t_inference", "t_dmatrix"]
    ml_score_keys = ["mse", "cod"]
    try:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

        if not parameters["no_ibis"]:
            ml_data_ibis, etl_times_ibis = etl_ibis(
                filename=parameters["data_file"],
                run_import_queries=False,
                columns_names=columns_names,
                columns_types=columns_types_ibis,
                database_name=parameters["database_name"],
                table_name=parameters["table"],
                omnisci_server_worker=parameters["omnisci_server_worker"],
                delete_old_database=not parameters["dnd"],
                create_new_table=not parameters["dni"],
                ipc_connection=parameters["ipc_connection"],
                validation=parameters["validation"],
                etl_keys=etl_keys,
                import_mode=parameters["import_mode"],
            )
            print_results(results=etl_times_ibis, backend="Ibis", unit="ms")
            etl_times_ibis["Backend"] = "Ibis"

        # Pandas ETL always runs in this variant (no "no_pandas" guard).
        ml_data, etl_times = etl_pandas(
            filename=parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types_pd,
            etl_keys=etl_keys,
        )
        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="ms")
        etl_times["Backend"] = parameters["pandas_mode"]

        if not parameters["no_ml"]:
            ml_scores, ml_times = ml(
                ml_data=ml_data,
                target="target",
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times, backend=parameters["pandas_mode"], unit="ms")
            ml_times["Backend"] = parameters["pandas_mode"]
            print_results(results=ml_scores, backend=parameters["pandas_mode"])
            ml_scores["Backend"] = parameters["pandas_mode"]

            if not parameters["no_ibis"]:
                # The Ibis ETL names the target column "target0".
                ml_scores_ibis, ml_times_ibis = ml(
                    ml_data=ml_data_ibis,
                    target="target0",
                    ml_keys=ml_keys,
                    ml_score_keys=ml_score_keys,
                )
                print_results(results=ml_times_ibis, backend="Ibis", unit="ms")
                ml_times_ibis["Backend"] = "Ibis"
                print_results(results=ml_scores_ibis, backend="Ibis")
                ml_scores_ibis["Backend"] = "Ibis"

        # Results validation block (comparison of etl_ibis and etl_pandas outputs)
        if parameters["validation"] and not parameters["no_ibis"]:
            # Fix: dropped stray "with" from the message.
            print("Validation of ETL query results ...")
            cols_to_sort = ["var_0", "var_1", "var_2", "var_3", "var_4"]

            # Align column naming before comparison.
            ml_data_ibis = ml_data_ibis.rename(columns={"target0": "target"})
            # compare_dataframes doesn't sort pandas dataframes
            ml_data.sort_values(by=cols_to_sort, inplace=True)

            compare_dataframes(
                ibis_dfs=[ml_data_ibis], pandas_dfs=[ml_data], sort_cols=cols_to_sort, drop_cols=[]
            )

        pandas_original()
        compare_all_with_pandas_original()

        return {"ETL": [etl_times_ibis, etl_times], "ML": [ml_times_ibis, ml_times]}
    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
def run_benchmark(parameters):
    """Mortgage benchmark driver: run ETL and ML with Ibis and pandas
    backends and return {"ETL": [...], "ML": [...]} timing dictionaries.

    Optionally re-runs both ETL pipelines with ``do_validate=True`` and
    compares the resulting frames row-for-row.
    """
    # Strip stray quotes that may wrap the path on the command line.
    parameters["data_file"] = parameters["data_file"].replace("'", "")
    parameters["dfiles_num"] = parameters["dfiles_num"] or 1
    parameters["no_ml"] = parameters["no_ml"] or False

    check_support(parameters, unsupported_params=["gpu_memory"])
    if parameters["validation"]:
        print("WARNING: Validation not yet supported")
    if not parameters["no_ibis"]:
        # Only filesystem import (fsi) is supported for the Ibis path here.
        if parameters["import_mode"] not in ("fsi",):
            raise ValueError("Unsupported import mode: %s" % parameters["import_mode"])

    if not parameters["no_pandas"]:
        # Inject the selected pandas implementation (pandas/Modin) into both
        # this module's and etl_pandas's module globals.
        import_pandas_into_module_namespace(
            namespace=[run_benchmark.__globals__, etl_pandas.__globals__],
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

    # Schema of the loan-acquisition files.
    acq_schema = ibis.Schema(
        names=(
            "loan_id",
            "orig_channel",
            "seller_name",
            "orig_interest_rate",
            "orig_upb",
            "orig_loan_term",
            "orig_date",
            "first_pay_date",
            "orig_ltv",
            "orig_cltv",
            "num_borrowers",
            "dti",
            "borrower_credit_score",
            "first_home_buyer",
            "loan_purpose",
            "property_type",
            "num_units",
            "occupancy_status",
            "property_state",
            "zip",
            "mortgage_insurance_percent",
            "product_type",
            "coborrow_credit_score",
            "mortgage_insurance_type",
            "relocation_mortgage_indicator",
            "year_quarter_ignore",
        ),
        types=(
            "int64",
            "category",
            "string",
            "float64",
            "int64",
            "int64",
            "timestamp",
            "timestamp",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "category",
            "category",
            "category",
            "int64",
            "category",
            "category",
            "int64",
            "float64",
            "category",
            "float64",
            "float64",
            "category",
            "int32",
        ),
    )
    # Schema of the loan-performance files.
    perf_schema = ibis.Schema(
        names=(
            "loan_id",
            "monthly_reporting_period",
            "servicer",
            "interest_rate",
            "current_actual_upb",
            "loan_age",
            "remaining_months_to_legal_maturity",
            "adj_remaining_months_to_maturity",
            "maturity_date",
            "msa",
            "current_loan_delinquency_status",
            "mod_flag",
            "zero_balance_code",
            "zero_balance_effective_date",
            "last_paid_installment_date",
            "foreclosed_after",
            "disposition_date",
            "foreclosure_costs",
            "prop_preservation_and_repair_costs",
            "asset_recovery_costs",
            "misc_holding_expenses",
            "holding_taxes",
            "net_sale_proceeds",
            "credit_enhancement_proceeds",
            "repurchase_make_whole_proceeds",
            "other_foreclosure_proceeds",
            "non_interest_bearing_upb",
            "principal_forgiveness_upb",
            "repurchase_make_whole_proceeds_flag",
            "foreclosure_principal_write_off_amount",
            "servicing_activity_indicator",
        ),
        types=(
            "int64",
            "timestamp",
            "category",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "timestamp",
            "float64",
            "int32",
            "category",
            "category",
            "timestamp",
            "timestamp",
            "timestamp",
            "timestamp",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "float64",
            "category",
            "float64",
            "category",
        ),
    )
    etl_keys = ["t_readcsv", "t_etl", "t_connect"]
    ml_keys = ["t_dmatrix", "t_ml", "t_train"]
    ml_score_keys = ["mse_mean", "cod_mean", "mse_dev", "cod_dev"]
    N_RUNS = 1

    result = {"ETL": [], "ML": []}

    # gets data directory size in MB
    dataset_size = get_dir_size(parameters["data_file"])

    if not parameters["no_ibis"]:
        df_ibis, mb_ibis, etl_times_ibis = _etl_ibis(parameters, acq_schema, perf_schema, etl_keys)
        print_results(results=etl_times_ibis, backend="Ibis", unit="s")
        etl_times_ibis["Backend"] = "Ibis"
        etl_times_ibis["dataset_size"] = dataset_size
        result["ETL"].append(etl_times_ibis)
        if not parameters["no_ml"]:
            result["ML"].append(_run_ml(df_ibis, N_RUNS, mb_ibis, ml_keys, ml_score_keys, "Ibis"))

    if not parameters["no_pandas"]:
        df_pd, mb_pd, etl_times_pd = _etl_pandas(parameters, acq_schema, perf_schema, etl_keys)
        print_results(results=etl_times_pd, backend=parameters["pandas_mode"], unit="s")
        etl_times_pd["Backend"] = parameters["pandas_mode"]
        etl_times_pd["dataset_size"] = dataset_size
        result["ETL"].append(etl_times_pd)
        if not parameters["no_ml"]:
            result["ML"].append(
                _run_ml(df_pd, N_RUNS, mb_pd, ml_keys, ml_score_keys, parameters["pandas_mode"])
            )

    if parameters["validation"]:
        # recompute frames but leave categories as strings
        idf, _, _ = _etl_ibis(parameters, acq_schema, perf_schema, etl_keys, do_validate=True)
        pdf, _, _ = _etl_pandas(parameters, acq_schema, perf_schema, etl_keys, do_validate=True)

        # Normalize categoricals on both sides: order categories
        # deterministically and map NaN to an explicit "N/A" category so the
        # frames compare cleanly.
        for df in (pdf, idf):
            for colname, coltype in df.dtypes.items():
                if str(coltype) == "category":
                    df[colname] = (
                        df[colname]
                        .cat.reorder_categories(sorted(df[colname].cat.categories), True)
                        .cat.add_categories("N/A")
                        .fillna("N/A")
                    )
        # Sort both frames by all columns and re-number rows so comparison is
        # positional.
        sortBy = sorted(pdf.dtypes.index)
        pdf.sort_values(by=sortBy, axis=0, inplace=True)
        idf.sort_values(by=sortBy, axis=0, inplace=True)
        pdf = pdf.reset_index().drop("index", axis=1)
        idf = idf.reset_index().drop("index", axis=1)
        compare_dataframes((idf,), (pdf,), [], [])
        # pdf['servicer'] = pdf['servicer'].cat.add_categories('N/A').fillna('N/A')
        # pdb.set_trace()
        # # df_pd.drop(dropCols, axis=1, inplace=True)
        # compare_dataframes(
        #     ibis_dfs=(df_ibis,), pandas_dfs=(df_pd,), sort_cols=sortBy, drop_cols=dropCols
        # )
    return result
def run_benchmark(parameters):
    """Census benchmark driver: run ETL and ML with Ibis and pandas
    backends, print per-stage timings, optionally validate pandas-vs-Ibis
    outputs, and return the collected timing dictionaries.

    On any failure the traceback is printed and the process exits with
    status 1.
    """
    ignored_parameters = {
        "dfiles_num": parameters["dfiles_num"],
        "gpu_memory": parameters["gpu_memory"],
    }
    # Fix: message typo "irnored" -> "ignored".
    warnings.warn(f"Parameters {ignored_parameters} are ignored", RuntimeWarning)

    # Strip stray quotes that may wrap the path on the command line.
    parameters["data_file"] = parameters["data_file"].replace("'", "")

    # ML specific
    N_RUNS = 50
    TEST_SIZE = 0.1
    RANDOM_STATE = 777

    columns_names = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "QGQ",
        "PERNUM",
        "PERWT",
        "SEX",
        "AGE",
        "EDUC",
        "EDUCD",
        "INCTOT",
        "SEX_HEAD",
        "SEX_MOM",
        "SEX_POP",
        "SEX_SP",
        "SEX_MOM2",
        "SEX_POP2",
        "AGE_HEAD",
        "AGE_MOM",
        "AGE_POP",
        "AGE_SP",
        "AGE_MOM2",
        "AGE_POP2",
        "EDUC_HEAD",
        "EDUC_MOM",
        "EDUC_POP",
        "EDUC_SP",
        "EDUC_MOM2",
        "EDUC_POP2",
        "EDUCD_HEAD",
        "EDUCD_MOM",
        "EDUCD_POP",
        "EDUCD_SP",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_HEAD",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_SP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
    ]
    columns_types = [
        "int64",
        "int64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
    ]
    etl_keys = ["t_readcsv", "t_etl"]
    ml_keys = ["t_train_test_split", "t_ml", "t_train", "t_inference"]
    ml_score_keys = ["mse_mean", "cod_mean", "mse_dev", "cod_dev"]
    try:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

        etl_times_ibis = None
        ml_times_ibis = None
        etl_times = None
        ml_times = None

        # NOTE(review): the guard checks pandas_mode but the message talks
        # about -import_mode; looks like one of the two is wrong — confirm
        # against the CLI before changing.
        if not parameters["pandas_mode"] and parameters["validation"]:
            print("WARNING: validation working only for '-import_mode pandas'")

        if not parameters["no_ibis"]:
            df_ibis, X_ibis, y_ibis, etl_times_ibis = etl_ibis(
                filename=parameters["data_file"],
                columns_names=columns_names,
                columns_types=columns_types,
                database_name=parameters["database_name"],
                table_name=parameters["table"],
                omnisci_server_worker=parameters["omnisci_server_worker"],
                delete_old_database=not parameters["dnd"],
                create_new_table=not parameters["dni"],
                ipc_connection=parameters["ipc_connection"],
                validation=parameters["validation"],
                etl_keys=etl_keys,
                import_mode=parameters["import_mode"],
            )
            print_results(results=etl_times_ibis, backend="Ibis", unit="ms")
            etl_times_ibis["Backend"] = "Ibis"

            if not parameters["no_ml"]:
                ml_scores_ibis, ml_times_ibis = ml(
                    X=X_ibis,
                    y=y_ibis,
                    random_state=RANDOM_STATE,
                    n_runs=N_RUNS,
                    test_size=TEST_SIZE,
                    optimizer=parameters["optimizer"],
                    ml_keys=ml_keys,
                    ml_score_keys=ml_score_keys,
                )
                print_results(results=ml_times_ibis, backend="Ibis", unit="ms")
                ml_times_ibis["Backend"] = "Ibis"
                print_results(results=ml_scores_ibis, backend="Ibis")
                ml_scores_ibis["Backend"] = "Ibis"

        # Pandas ETL always runs in this variant (no "no_pandas" guard).
        df, X, y, etl_times = etl_pandas(
            parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types,
            etl_keys=etl_keys,
        )
        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="ms")
        etl_times["Backend"] = parameters["pandas_mode"]

        if not parameters["no_ml"]:
            ml_scores, ml_times = ml(
                X=X,
                y=y,
                random_state=RANDOM_STATE,
                n_runs=N_RUNS,
                test_size=TEST_SIZE,
                optimizer=parameters["optimizer"],
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times, backend=parameters["pandas_mode"], unit="ms")
            ml_times["Backend"] = parameters["pandas_mode"]
            print_results(results=ml_scores, backend=parameters["pandas_mode"])
            ml_scores["Backend"] = parameters["pandas_mode"]

        if parameters["pandas_mode"] and parameters["validation"]:
            # this should work only for pandas mode
            compare_dataframes(
                ibis_dfs=(X_ibis, y_ibis),
                pandas_dfs=(X, y),
            )

        return {"ETL": [etl_times_ibis, etl_times], "ML": [ml_times_ibis, ml_times]}
    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
def main():
    """Benchmark entry point: optionally launch an OmniSci server, run the
    Ibis ETL + ML pipeline against it, then run the pandas pipeline, and
    optionally validate the two result sets against each other.

    The server worker is always terminated in the ``finally`` block, even on
    failure part-way through.
    """
    args = None
    omnisci_server_worker = None
    train_final, test_final = None, None

    parser, args, skip_rows = get_args()
    try:
        if not args.no_ibis:
            # The server modules live one directory up from this script.
            sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
            from server import OmnisciServer

            if args.omnisci_executable is None:
                parser.error("Omnisci executable should be specified with -e/--executable")
            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker

            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            train_final, test_final, etl_times = etl_all_ibis(
                filename=args.dataset_path,
                database_name=args.name,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                skip_rows=skip_rows,
                validation=args.val,
            )
            ml_data, etl_times = split_step(train_final, test_final, etl_times)
            print_times(etl_times)
            # Release the server as soon as ETL is done; clear the reference so
            # the finally block does not terminate twice.
            omnisci_server_worker.terminate()
            omnisci_server_worker = None

            if not args.no_ml:
                print("using ml with dataframes from ibis")
                ml_times = ml(ml_data)
                print_times(ml_times)

        ptrain_final, ptest_final, petl_times = etl_all_pandas(args.dataset_path, skip_rows)
        ml_data, petl_times = split_step(ptrain_final, ptest_final, petl_times)
        print_times(petl_times)

        if not args.no_ml:
            print("using ml with dataframes from pandas")
            ml_times = ml(ml_data)
            print_times(ml_times)

        # Fix: idiomatic "is not None" instead of "not ... is None" (E714).
        if args.val and train_final is not None and test_final is not None:
            print("validating result ...")
            compare_dataframes((train_final, test_final), (ptrain_final, ptest_final))
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
def run_benchmark(parameters):
    """PLAsTiCC benchmark driver: run ETL and ML with Ibis and pandas
    backends and return {"ETL": [...], "ML": [...]} timing dictionaries.

    Validation of the two ETL outputs is only possible with
    ``-import_mode pandas``; otherwise a warning is printed.
    """
    check_support(parameters, unsupported_params=["dfiles_num"])

    # Strip stray quotes that may wrap the path on the command line.
    parameters["data_file"] = parameters["data_file"].replace("'", "")
    parameters["gpu_memory"] = parameters["gpu_memory"] or 16
    parameters["no_ml"] = parameters["no_ml"] or False

    # Number of rows to skip is derived from the available GPU memory.
    skip_rows = compute_skip_rows(parameters["gpu_memory"])

    # Column dtypes of the time-series (observations) file.
    dtypes = OrderedDict(
        [
            ("object_id", "int32"),
            ("mjd", "float32"),
            ("passband", "int32"),
            ("flux", "float32"),
            ("flux_err", "float32"),
            ("detected", "int32"),
        ]
    )

    # load metadata
    columns_names = [
        "object_id",
        "ra",
        "decl",
        "gal_l",
        "gal_b",
        "ddf",
        "hostgal_specz",
        "hostgal_photoz",
        "hostgal_photoz_err",
        "distmod",
        "mwebv",
        "target",
    ]
    meta_dtypes = ["int32"] + ["float32"] * 4 + ["int32"] + ["float32"] * 5 + ["int32"]
    # Pair each metadata column name with its dtype.
    meta_dtypes = OrderedDict(
        [(columns_names[i], meta_dtypes[i]) for i in range(len(meta_dtypes))]
    )

    etl_keys = ["t_readcsv", "t_etl", "t_connect"]
    ml_keys = ["t_train_test_split", "t_dmatrix", "t_training", "t_infer", "t_ml"]

    if not parameters["no_pandas"]:
        # Inject the selected pandas implementation (pandas/Modin) into this
        # module's globals.
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

    etl_times_ibis = None
    ml_times_ibis = None
    etl_times = None
    ml_times = None

    if not parameters["no_ibis"]:
        train_final_ibis, test_final_ibis, etl_times_ibis = etl_all_ibis(
            dataset_path=parameters["data_file"],
            database_name=parameters["database_name"],
            omnisci_server_worker=parameters["omnisci_server_worker"],
            delete_old_database=not parameters["dnd"],
            create_new_table=not parameters["dni"],
            ipc_connection=parameters["ipc_connection"],
            skip_rows=skip_rows,
            validation=parameters["validation"],
            dtypes=dtypes,
            meta_dtypes=meta_dtypes,
            etl_keys=etl_keys,
            import_mode=parameters["import_mode"],
            fragments_size=parameters["fragments_size"],
        )
        print_results(results=etl_times_ibis, backend="Ibis", unit="s")
        etl_times_ibis["Backend"] = "Ibis"

        if not parameters["no_ml"]:
            print("using ml with dataframes from Ibis")
            ml_times_ibis = ml(train_final_ibis, test_final_ibis, ml_keys)
            print_results(results=ml_times_ibis, backend="Ibis", unit="s")
            ml_times_ibis["Backend"] = "Ibis"

    if not parameters["no_pandas"]:
        train_final, test_final, etl_times = etl_all_pandas(
            dataset_path=parameters["data_file"],
            skip_rows=skip_rows,
            dtypes=dtypes,
            meta_dtypes=meta_dtypes,
            etl_keys=etl_keys,
            pandas_mode=parameters["pandas_mode"],
        )
        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="s")
        etl_times["Backend"] = parameters["pandas_mode"]

        if not parameters["no_ml"]:
            print("using ml with dataframes from Pandas")
            ml_times = ml(train_final, test_final, ml_keys)
            print_results(results=ml_times, backend=parameters["pandas_mode"], unit="s")
            ml_times["Backend"] = parameters["pandas_mode"]

    if parameters["validation"] and parameters["import_mode"] != "pandas":
        print(
            "WARNING: validation can not be performed, it works only for 'pandas' import mode, '{}' passed".format(
                parameters["import_mode"]
            )
        )

    if parameters["validation"] and parameters["import_mode"] == "pandas":
        # Cross-check Ibis vs pandas ETL outputs (both train and test splits).
        compare_dataframes(
            ibis_dfs=[train_final_ibis, test_final_ibis],
            pandas_dfs=[train_final, test_final],
            parallel_execution=True,
        )

    return {"ETL": [etl_times_ibis, etl_times], "ML": [ml_times_ibis, ml_times]}
def run_benchmark(parameters):
    """Santander benchmark driver (no_pandas-aware variant): run ETL and ML
    with Ibis and pandas backends and return {"ETL": [...], "ML": [...]}
    timing dictionaries.
    """
    check_support(parameters, unsupported_params=["dfiles_num", "gpu_memory", "optimizer"])

    # Strip stray quotes that may wrap the path on the command line.
    parameters["data_file"] = parameters["data_file"].replace("'", "")
    parameters["no_ml"] = parameters["no_ml"] or False

    etl_times_ibis = None
    etl_times = None
    ml_times_ibis = None
    ml_times = None

    # Dataset layout: an ID column, a binary target and 200 numeric features.
    var_cols = ["var_%s" % i for i in range(200)]
    columns_names = ["ID_code", "target"] + var_cols
    columns_types_pd = ["object", "int64"] + ["float64" for _ in range(200)]
    columns_types_ibis = ["string", "int32"] + ["decimal(8, 4)" for _ in range(200)]
    etl_keys = ["t_readcsv", "t_etl", "t_connect"]
    ml_keys = ["t_train_test_split", "t_ml", "t_train", "t_inference", "t_dmatrix"]
    ml_score_keys = ["mse", "cod"]

    if not parameters["no_pandas"]:
        # Inject the selected pandas implementation (pandas/Modin) into this
        # module's globals.
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

    if not parameters["no_ibis"]:
        ml_data_ibis, etl_times_ibis = etl_ibis(
            filename=parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types_ibis,
            database_name=parameters["database_name"],
            table_name=parameters["table"],
            omnisci_server_worker=parameters["omnisci_server_worker"],
            delete_old_database=not parameters["dnd"],
            create_new_table=not parameters["dni"],
            ipc_connection=parameters["ipc_connection"],
            validation=parameters["validation"],
            etl_keys=etl_keys,
            import_mode=parameters["import_mode"],
            fragments_size=parameters["fragments_size"],
        )
        print_results(results=etl_times_ibis, backend="Ibis", unit="s")
        etl_times_ibis["Backend"] = "Ibis"

    if not parameters["no_pandas"]:
        ml_data, etl_times = etl_pandas(
            filename=parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types_pd,
            etl_keys=etl_keys,
        )
        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="s")
        etl_times["Backend"] = parameters["pandas_mode"]

    if not parameters["no_ml"]:
        if not parameters["no_pandas"]:
            ml_scores, ml_times = ml(
                ml_data=ml_data,
                target="target",
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times, backend=parameters["pandas_mode"], unit="s")
            ml_times["Backend"] = parameters["pandas_mode"]
            print_results(results=ml_scores, backend=parameters["pandas_mode"])
            ml_scores["Backend"] = parameters["pandas_mode"]

        if not parameters["no_ibis"]:
            # The Ibis ETL names the target column "target0".
            ml_scores_ibis, ml_times_ibis = ml(
                ml_data=ml_data_ibis,
                target="target0",
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times_ibis, backend="Ibis", unit="s")
            ml_times_ibis["Backend"] = "Ibis"
            print_results(results=ml_scores_ibis, backend="Ibis")
            ml_scores_ibis["Backend"] = "Ibis"

    # Results validation block (comparison of etl_ibis and etl_pandas outputs)
    if parameters["validation"]:
        print("Validation of ETL query results ...")
        cols_to_sort = ["var_0", "var_1", "var_2", "var_3", "var_4"]

        # Align column naming before comparison.
        ml_data_ibis = ml_data_ibis.rename(columns={"target0": "target"})
        # compare_dataframes doesn't sort pandas dataframes
        ml_data.sort_values(by=cols_to_sort, inplace=True)

        compare_dataframes(
            ibis_dfs=[ml_data_ibis],
            pandas_dfs=[ml_data],
            sort_cols=cols_to_sort,
            drop_cols=[],
            parallel_execution=True,
        )

    return {"ETL": [etl_times_ibis, etl_times], "ML": [ml_times_ibis, ml_times]}
def run_benchmark(parameters):
    """Census benchmark driver (no_pandas-aware variant): run ETL and ML with
    Ibis and pandas backends, attach the uncompressed dataset size to the
    timing records, and return {"ETL": [...], "ML": [...]} dictionaries.

    Validation of the two ETL outputs is only possible with
    ``-import_mode pandas``.
    """
    check_support(parameters, unsupported_params=["dfiles_num", "gpu_memory"])

    # Strip stray quotes that may wrap the path on the command line.
    parameters["data_file"] = parameters["data_file"].replace("'", "")
    parameters["optimizer"] = parameters["optimizer"] or "intel"
    parameters["no_ml"] = parameters["no_ml"] or False

    # ML specific
    N_RUNS = 50
    TEST_SIZE = 0.1
    RANDOM_STATE = 777

    columns_names = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "QGQ",
        "PERNUM",
        "PERWT",
        "SEX",
        "AGE",
        "EDUC",
        "EDUCD",
        "INCTOT",
        "SEX_HEAD",
        "SEX_MOM",
        "SEX_POP",
        "SEX_SP",
        "SEX_MOM2",
        "SEX_POP2",
        "AGE_HEAD",
        "AGE_MOM",
        "AGE_POP",
        "AGE_SP",
        "AGE_MOM2",
        "AGE_POP2",
        "EDUC_HEAD",
        "EDUC_MOM",
        "EDUC_POP",
        "EDUC_SP",
        "EDUC_MOM2",
        "EDUC_POP2",
        "EDUCD_HEAD",
        "EDUCD_MOM",
        "EDUCD_POP",
        "EDUCD_SP",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_HEAD",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_SP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
    ]
    columns_types = [
        "int64",
        "int64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
    ]
    etl_keys = ["t_readcsv", "t_etl", "t_connect"]
    ml_keys = ["t_train_test_split", "t_ml", "t_train", "t_inference"]
    ml_score_keys = ["mse_mean", "cod_mean", "mse_dev", "cod_dev"]

    if not parameters["no_pandas"]:
        # Inject the selected pandas implementation (pandas/Modin) into this
        # module's globals.
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

    etl_times_ibis = None
    ml_times_ibis = None
    etl_times = None
    ml_times = None

    if parameters["validation"] and parameters["import_mode"] != "pandas":
        print(
            f"WARNING: validation can not be performed, it works only for 'pandas' \
import mode, '{parameters['import_mode']}' passed"
        )

    if parameters["data_file"].endswith(".csv"):
        csv_size = getsize(parameters["data_file"])
    else:
        print(
            "WARNING: uncompressed datafile not found, default value for dataset_size is set"
        )
        # deafault csv_size value (unit - MB) obtained by calling getsize
        # function on the ipums_education2income_1970-2010.csv file
        # (default Census benchmark data file)
        csv_size = 2100.0

    if not parameters["no_ibis"]:
        df_ibis, X_ibis, y_ibis, etl_times_ibis = etl_ibis(
            filename=parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types,
            database_name=parameters["database_name"],
            table_name=parameters["table"],
            omnisci_server_worker=parameters["omnisci_server_worker"],
            delete_old_database=not parameters["dnd"],
            create_new_table=not parameters["dni"],
            ipc_connection=parameters["ipc_connection"],
            validation=parameters["validation"],
            etl_keys=etl_keys,
            import_mode=parameters["import_mode"],
            fragments_size=parameters["fragments_size"],
        )
        print_results(results=etl_times_ibis, backend="Ibis", unit="s")
        etl_times_ibis["Backend"] = "Ibis"
        etl_times_ibis["dataset_size"] = csv_size

        if not parameters["no_ml"]:
            ml_scores_ibis, ml_times_ibis = ml(
                X=X_ibis,
                y=y_ibis,
                random_state=RANDOM_STATE,
                n_runs=N_RUNS,
                test_size=TEST_SIZE,
                optimizer=parameters["optimizer"],
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times_ibis, backend="Ibis", unit="s")
            ml_times_ibis["Backend"] = "Ibis"
            print_results(results=ml_scores_ibis, backend="Ibis")
            ml_scores_ibis["Backend"] = "Ibis"

    if not parameters["no_pandas"]:
        df, X, y, etl_times = etl_pandas(
            parameters["data_file"],
            columns_names=columns_names,
            columns_types=columns_types,
            etl_keys=etl_keys,
            pandas_mode=parameters["pandas_mode"],
        )
        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="s")
        etl_times["Backend"] = parameters["pandas_mode"]
        etl_times["dataset_size"] = csv_size

        if not parameters["no_ml"]:
            ml_scores, ml_times = ml(
                X=X,
                y=y,
                random_state=RANDOM_STATE,
                n_runs=N_RUNS,
                test_size=TEST_SIZE,
                optimizer=parameters["optimizer"],
                ml_keys=ml_keys,
                ml_score_keys=ml_score_keys,
            )
            print_results(results=ml_times, backend=parameters["pandas_mode"], unit="s")
            ml_times["Backend"] = parameters["pandas_mode"]
            print_results(results=ml_scores, backend=parameters["pandas_mode"])
            ml_scores["Backend"] = parameters["pandas_mode"]

    if parameters["validation"] and parameters["import_mode"] == "pandas":
        # this should work only for pandas mode
        compare_dataframes(ibis_dfs=(X_ibis, y_ibis), pandas_dfs=(X, y))

    return {"ETL": [etl_times_ibis, etl_times], "ML": [ml_times_ibis, ml_times]}
def main():
    """Entry point for the census benchmark CLI.

    Parses command-line arguments, then runs the Ibis (OmniSci-backed) ETL/ML
    benchmark unless ``-no_ibis`` is given, followed by the Pandas/Modin
    version.  Optionally validates that both backends produce identical
    dataframes (``-val``) and stores timing results in a MySQL database
    (when ``-db-user`` is set).  Exits with status 1 on any failure.
    """
    omnisci_server_worker = None

    parser = argparse.ArgumentParser(description="Run internal tests from ibis project")
    # Re-group argparse's auto-created "optional arguments" section so the
    # required arguments are listed first in --help output.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-f",
        "--file",
        dest="file",
        required=True,
        help="A datafile that should be loaded",
    )
    optional.add_argument("-dnd", action="store_true", help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        action="store_true",
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-val",
        action="store_true",
        help="validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-o",
        "--optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    # MySQL database parameters
    optional.add_argument(
        "-db-server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db-port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db-user",
        dest="db_user",
        default="",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db-pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db-name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db-table",
        dest="db_table",
        help="Table to use to store results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-e",
        "--executable",
        dest="omnisci_executable",
        required=False,
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-w",
        "--workdir",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        "--omnisci_port",
        dest="omnisci_port",
        default=6274,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-u",
        "--user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-p",
        "--password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-n",
        "--name",
        dest="name",
        default="census_database",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-t",
        "--table",
        dest="table",
        default="census_table",
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-no_ibis",
        action="store_true",
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=["pandas", "modin_on_ray", "modin_on_dask", "modin_on_python"],
        default="pandas",
        help="Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        action="store_true",
        help="Do not run machine learning benchmark, only ETL part",
    )

    args = parser.parse_args()
    args.file = args.file.replace("'", "")

    # ML specific
    N_RUNS = 50
    TRAIN_SIZE = 0.9
    RANDOM_STATE = 777

    columns_names = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "QGQ",
        "PERNUM",
        "PERWT",
        "SEX",
        "AGE",
        "EDUC",
        "EDUCD",
        "INCTOT",
        "SEX_HEAD",
        "SEX_MOM",
        "SEX_POP",
        "SEX_SP",
        "SEX_MOM2",
        "SEX_POP2",
        "AGE_HEAD",
        "AGE_MOM",
        "AGE_POP",
        "AGE_SP",
        "AGE_MOM2",
        "AGE_POP2",
        "EDUC_HEAD",
        "EDUC_MOM",
        "EDUC_POP",
        "EDUC_SP",
        "EDUC_MOM2",
        "EDUC_POP2",
        "EDUCD_HEAD",
        "EDUCD_MOM",
        "EDUCD_POP",
        "EDUCD_SP",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_HEAD",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_SP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
    ]
    columns_types = [
        "int64",
        "int64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
    ]

    db_reporter = None
    try:
        if not args.no_ibis:
            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker

            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            # BUGFIX: was `args.db_user is not ""` — identity comparison with a
            # string literal relies on CPython interning (and raises
            # SyntaxWarning on modern interpreters); value equality is intended.
            if args.db_user != "":
                print("Connecting to database")
                db = mysql.connector.connect(
                    host=args.db_server,
                    port=args.db_port,
                    user=args.db_user,
                    passwd=args.db_pass,
                    db=args.db_name,
                )
                db_reporter = DbReport(
                    db,
                    args.db_table,
                    {
                        "QueryName": "VARCHAR(500) NOT NULL",
                        "FirstExecTimeMS": "BIGINT UNSIGNED",
                        "WorstExecTimeMS": "BIGINT UNSIGNED",
                        "BestExecTimeMS": "BIGINT UNSIGNED",
                        "AverageExecTimeMS": "BIGINT UNSIGNED",
                        "TotalTimeMS": "BIGINT UNSIGNED",
                        "IbisCommitHash": "VARCHAR(500) NOT NULL",
                        "BackEnd": "VARCHAR(100) NOT NULL",
                    },
                    {
                        "ScriptName": "census_pandas_ibis.py",
                        "CommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis,
                    },
                )

            df_ibis, X_ibis, y_ibis, etl_times_ibis = etl_ibis(
                filename=args.file,
                columns_names=columns_names,
                columns_types=columns_types,
                database_name=args.name,
                table_name=args.table,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                validation=args.val,
            )
            # Terminate the worker as soon as the Ibis part is done and clear
            # the reference so the `finally` clause does not terminate twice.
            omnisci_server_worker.terminate()
            omnisci_server_worker = None
            print_times(etl_times_ibis, "Ibis", db_reporter)

            if not args.no_ml:
                mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                    X_ibis, y_ibis, RANDOM_STATE, N_RUNS, TRAIN_SIZE, args.optimizer
                )
                print_times(ml_times, "Ibis")
                print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(mse_mean, mse_dev))
                print("mean COD ± deviation: {:.9f} ± {:.9f}".format(cod_mean, cod_dev))

        import_pandas_into_module_namespace(
            main.__globals__, args.pandas_mode, args.ray_tmpdir, args.ray_memory
        )
        df, X, y, etl_times = etl_pandas(
            args.file, columns_names=columns_names, columns_types=columns_types
        )
        print_times(etl_times, args.pandas_mode, db_reporter)

        if not args.no_ml:
            mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                X, y, RANDOM_STATE, N_RUNS, TRAIN_SIZE, args.optimizer
            )
            print_times(ml_times, args.pandas_mode)
            print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(mse_mean, mse_dev))
            print("mean COD ± deviation: {:.9f} ± {:.9f}".format(cod_mean, cod_dev))

        # BUGFIX: guard on no_ibis — `df_ibis` is never bound when the Ibis
        # part was skipped, so `-val -no_ibis` previously raised NameError.
        if args.val and not args.no_ibis:
            compare_dataframes((df_ibis,), (df,))
    except Exception as err:
        print("Failed: ", err)
        sys.exit(1)
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
def run_benchmark(parameters):
    """Run the PLAsTiCC ETL/ML benchmark for the Ibis and Pandas/Modin backends.

    Parameters
    ----------
    parameters : dict
        Benchmark configuration (data file path, backend switches such as
        ``no_ibis``/``no_ml``, OmniSci connection settings, Ray options, etc.).

    Returns
    -------
    dict
        ``{"ETL": [etl_times_ibis, etl_times], "ML": [ml_times_ibis, ml_times]}``
        where entries for skipped parts remain ``None``.

    Exits the process with status 1 on any exception (stack trace printed).
    """
    ignored_parameters = {
        "dfiles_num": parameters["dfiles_num"],
    }
    # BUGFIX: warning text previously said "irnored".
    warnings.warn(f"Parameters {ignored_parameters} are ignored", RuntimeWarning)

    # Strip stray quote characters that may wrap the path on the command line.
    parameters["data_file"] = parameters["data_file"].replace("'", "")

    skip_rows = compute_skip_rows(parameters["gpu_memory"])

    dtypes = OrderedDict(
        [
            ("object_id", "int32"),
            ("mjd", "float32"),
            ("passband", "int32"),
            ("flux", "float32"),
            ("flux_err", "float32"),
            ("detected", "int32"),
        ]
    )

    # load metadata
    columns_names = [
        "object_id",
        "ra",
        "decl",
        "gal_l",
        "gal_b",
        "ddf",
        "hostgal_specz",
        "hostgal_photoz",
        "hostgal_photoz_err",
        "distmod",
        "mwebv",
        "target",
    ]
    meta_dtypes = ["int32"] + ["float32"] * 4 + ["int32"] + ["float32"] * 5 + ["int32"]
    meta_dtypes = OrderedDict(
        [(columns_names[i], meta_dtypes[i]) for i in range(len(meta_dtypes))]
    )

    etl_keys = ["t_readcsv", "t_etl"]
    ml_keys = ["t_train_test_split", "t_dmatrix", "t_training", "t_infer", "t_ml"]
    try:
        import_pandas_into_module_namespace(
            namespace=run_benchmark.__globals__,
            mode=parameters["pandas_mode"],
            ray_tmpdir=parameters["ray_tmpdir"],
            ray_memory=parameters["ray_memory"],
        )

        etl_times_ibis = None
        ml_times_ibis = None
        etl_times = None
        ml_times = None

        if not parameters["no_ibis"]:
            train_final_ibis, test_final_ibis, etl_times_ibis = etl_all_ibis(
                dataset_path=parameters["data_file"],
                database_name=parameters["database_name"],
                omnisci_server_worker=parameters["omnisci_server_worker"],
                delete_old_database=not parameters["dnd"],
                create_new_table=not parameters["dni"],
                ipc_connection=parameters["ipc_connection"],
                skip_rows=skip_rows,
                validation=parameters["validation"],
                dtypes=dtypes,
                meta_dtypes=meta_dtypes,
                etl_keys=etl_keys,
                import_mode=parameters["import_mode"],
            )
            print_results(results=etl_times_ibis, backend="Ibis", unit="ms")
            etl_times_ibis["Backend"] = "Ibis"

            if not parameters["no_ml"]:
                print("using ml with dataframes from Ibis")
                ml_times_ibis = ml(train_final_ibis, test_final_ibis, ml_keys)
                print_results(results=ml_times_ibis, backend="Ibis", unit="ms")
                ml_times_ibis["Backend"] = "Ibis"

        train_final, test_final, etl_times = etl_all_pandas(
            dataset_path=parameters["data_file"],
            skip_rows=skip_rows,
            dtypes=dtypes,
            meta_dtypes=meta_dtypes,
            etl_keys=etl_keys,
        )
        print_results(results=etl_times, backend=parameters["pandas_mode"], unit="ms")
        etl_times["Backend"] = parameters["pandas_mode"]

        if not parameters["no_ml"]:
            print("using ml with dataframes from Pandas")
            ml_times = ml(train_final, test_final, ml_keys)
            print_results(
                results=ml_times, backend=parameters["pandas_mode"], unit="ms"
            )
            ml_times["Backend"] = parameters["pandas_mode"]

        # BUGFIX: validation compares against the Ibis dataframes, which are
        # never bound when "no_ibis" is set; previously this raised NameError.
        if parameters["validation"] and not parameters["no_ibis"]:
            compare_dataframes(
                ibis_dfs=[train_final_ibis, test_final_ibis],
                pandas_dfs=[train_final, test_final],
            )

        return {"ETL": [etl_times_ibis, etl_times], "ML": [ml_times_ibis, ml_times]}
    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)