'BenchName': 'VARCHAR(500) NOT NULL', 'BestExecTimeMS': 'BIGINT UNSIGNED', 'BestTotalTimeMS': 'BIGINT UNSIGNED', 'WorstExecTimeMS': 'BIGINT UNSIGNED', 'WorstTotalTimeMS': 'BIGINT UNSIGNED', 'AverageExecTimeMS': 'BIGINT UNSIGNED', 'AverageTotalTimeMS': 'BIGINT UNSIGNED' }, { 'ScriptName': 'run_omnisci_benchmark.py', 'CommitHash': args.commit }) try: omnisci_server = OmnisciServer(omnisci_executable=args.omnisci_executable, omnisci_port=args.omnisci_port, database_name=args.name, omnisci_cwd=args.omnisci_cwd, user=args.user, password=args.passwd) omnisci_server.launch() with open(args.report, "w") as report: print( "datafiles,fragment_size,query,query_exec_min,query_total_min,query_exec_max," "query_total_max,query_exec_avg,query_total_avg,query_error_info", file=report, flush=True) if args.fragment_size is not None: for fs in args.fragment_size: print("RUNNING WITH FRAGMENT SIZE", fs) execute_benchmark(datafiles, import_cmdline, args.benchmarks_path, benchmark_cmdline, fs,
def main():
    """Run the census benchmark through Ibis/OmniSci and through Pandas.

    Command-line arguments are parsed with ``argparse``. Unless ``-no_ibis``
    is given, an OmniSci server is launched, the ETL part is executed through
    Ibis and the server is terminated again. The Pandas (or Modin) ETL part is
    always executed. Unless ``-no_ml`` is given, the ML part is run on the
    result of each ETL part. With ``-val`` the Ibis and Pandas ETL results are
    compared.

    The ``-db-*`` and ``-commit_*`` options are accepted but not read by this
    function.

    Any exception is reported on stdout and turned into exit code 1; the
    OmniSci server, if still running, is terminated in all cases.
    """
    args = None
    omnisci_server = None
    # Stay None when the Ibis part is skipped, so that validation can detect
    # that there is nothing to compare against.
    X_ibis, y_ibis = None, None

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    # Re-order the groups so that "required arguments" is listed before the
    # optional ones in the help output.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-f",
        "--file",
        dest="file",
        required=True,
        help="A datafile that should be loaded",
    )
    optional.add_argument("-dnd",
                          action="store_true",
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        action="store_true",
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-val",
        action="store_true",
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-o",
        "--optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    # MySQL database parameters
    optional.add_argument(
        "-db-server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db-port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db-user",
        dest="db_user",
        default="",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db-pass",
        dest="db_password",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db-name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db-table",
        dest="db_table",
        help="Table to use to store results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-e",
        "--executable",
        dest="omnisci_executable",
        required=False,
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-w",
        "--workdir",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        "--omnisci_port",
        dest="omnisci_port",
        default=6274,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-u",
        "--user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-p",
        "--password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-n",
        "--name",
        dest="name",
        default="census_database",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-t",
        "--table",
        dest="table",
        default="census_table",
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-no_ibis",
        action="store_true",
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version")
    optional.add_argument(
        "-pandas_mode",
        choices=["pandas", "modin_on_ray", "modin_on_dask"],
        default="pandas",
        help=
        "Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask"
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help=
        "Location where to keep Ray plasma store. It should have enough space to keep -ray_memory"
    )
    # type=int keeps a value given on the command line consistent with the
    # integer default (argparse would otherwise hand over a string).
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store")
    optional.add_argument(
        "-no_ml",
        action="store_true",
        help="Do not run machine learning benchmark, only ETL part")

    args = parser.parse_args()
    # Strip single quotes from the datafile argument.
    args.file = args.file.replace("'", "")

    # ML specific
    N_RUNS = 50
    TRAIN_SIZE = 0.9
    RANDOM_STATE = 777

    columns_names = [
        "YEAR0", "DATANUM", "SERIAL", "CBSERIAL", "HHWT", "CPI99", "GQ",
        "QGQ", "PERNUM", "PERWT", "SEX", "AGE", "EDUC", "EDUCD", "INCTOT",
        "SEX_HEAD", "SEX_MOM", "SEX_POP", "SEX_SP", "SEX_MOM2", "SEX_POP2",
        "AGE_HEAD", "AGE_MOM", "AGE_POP", "AGE_SP", "AGE_MOM2", "AGE_POP2",
        "EDUC_HEAD", "EDUC_MOM", "EDUC_POP", "EDUC_SP", "EDUC_MOM2",
        "EDUC_POP2",
        "EDUCD_HEAD", "EDUCD_MOM", "EDUCD_POP", "EDUCD_SP", "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_HEAD", "INCTOT_MOM", "INCTOT_POP", "INCTOT_SP",
        "INCTOT_MOM2", "INCTOT_POP2",
    ]
    # One type per entry of columns_names, in the same order.
    columns_types = [
        "int64", "int64", "int64", "float64", "int64",
        "float64", "int64", "float64", "int64", "int64",
        "int64", "int64", "int64", "int64", "int64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
        "float64", "float64", "float64", "float64", "float64", "float64",
    ]

    try:
        if not args.no_ibis:
            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker

            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            X_ibis, y_ibis, etl_times_ibis = etl_ibis(
                filename=args.file,
                columns_names=columns_names,
                columns_types=columns_types,
                database_name=args.name,
                table_name=args.table,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
            )

            # The server is only needed for the ETL part; clearing the
            # reference prevents a second terminate() in the finally clause.
            omnisci_server.terminate()
            omnisci_server = None

            print_times(etl_times_ibis, name='Ibis')

            if not args.no_ml:
                mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                    X_ibis, y_ibis, RANDOM_STATE, N_RUNS, TRAIN_SIZE,
                    args.optimizer)
                print_times(ml_times)
                print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(
                    mse_mean, mse_dev))
                print("mean COD ± deviation: {:.9f} ± {:.9f}".format(
                    cod_mean, cod_dev))

        import_pandas_into_module_namespace(main.__globals__,
                                            args.pandas_mode,
                                            args.ray_tmpdir, args.ray_memory)
        X, y, etl_times = etl_pandas(args.file,
                                     columns_names=columns_names,
                                     columns_types=columns_types)
        print_times(etl_times, name=args.pandas_mode)

        if not args.no_ml:
            mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                X, y, RANDOM_STATE, N_RUNS, TRAIN_SIZE, args.optimizer)
            print_times(ml_times)
            print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(
                mse_mean, mse_dev))
            print("mean COD ± deviation: {:.9f} ± {:.9f}".format(
                cod_mean, cod_dev))

        if args.val:
            if X_ibis is None:
                # -no_ibis was given: there is no Ibis result to compare with.
                print("WARNING: validation was skipped as it requires "
                      "Ibis results to compare.")
            else:
                compare_dataframes(ibis_df=(X_ibis, y_ibis),
                                   pandas_df=(X, y))
    except Exception as err:
        print("Failed: ", err)
        sys.exit(1)
    finally:
        if omnisci_server:
            omnisci_server.terminate()
def main():
    """Run one of the supported benchmarks and optionally report to MySQL.

    The benchmark module is selected by ``-bench_name`` and imported
    dynamically; its ``run_benchmark(parameters)`` function is called once per
    iteration. An OmniSci server is launched for every iteration when the Ibis
    part is enabled and the benchmark is listed in
    ``benchmarks_with_ibis_queries``. When ``-db_user`` is given, the results
    of the last iteration are submitted to MySQL through ``DbReport``.

    The server worker and the server are terminated in the ``finally`` clause
    regardless of how the run ends.
    """
    args = None
    omnisci_server = None
    omnisci_server_worker = None
    port_default_value = -1

    # Maps the value of -bench_name to the module providing run_benchmark.
    benchmarks = {
        "ny_taxi": "taxi",
        "santander": "santander",
        "census": "census",
        "plasticc": "plasticc",
        "mortgage": "mortgage",
        "h2o": "h2o",
    }
    benchmarks_with_ibis_queries = [
        "ny_taxi", "santander", "census", "plasticc", "mortgage"
    ]

    ignore_fields_for_bd_report_etl = ["t_connect"]
    ignore_fields_for_bd_report_ml = []
    ignore_fields_for_results_unit_conversion = [
        "Backend",
        "dfiles_num",
        "dataset_size",
        "query_name",
    ]

    def _report_fields(results):
        """Build the DbReport column schema from the first two result rows."""
        fields = {x: "VARCHAR(500) NOT NULL" for x in results[0]}
        if len(results) != 1:
            fields.update({x: "VARCHAR(500) NOT NULL" for x in results[1]})
        return fields

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    # Re-order the groups so that "required arguments" is listed first.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=sorted(benchmarks.keys()),
        help="Benchmark name.",
        required=True,
    )
    required.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
        required=True,
    )
    optional.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=None,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    optional.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help=
        "Number of iterations to run every query. Best result is selected.",
    )
    optional.add_argument("-dnd",
                          default=False,
                          type=str_arg_to_bool,
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-import_mode",
        dest="import_mode",
        default="fsi",
        help="measure 'COPY FROM' import, FSI import, import through pandas",
    )
    optional.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default=None,
        help="Which optimizer is used",
    )
    optional.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-no_pandas",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Pandas version of benchmark",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=[
            "Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python",
            "Modin_on_omnisci"
        ],
        default="Pandas",
        help=
        "Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help=
        "Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        default=None,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu"
        "(This controls the lines to be used. Also work for CPU version. )",
        default=None,
    )
    optional.add_argument(
        "-extended_functionality",
        dest="extended_functionality",
        default=False,
        type=str_arg_to_bool,
        help=
        "Extends functionality of H2O benchmark by adding 'chk' functions and verbose local reporting of results",
    )
    # MySQL database parameters
    optional.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-executable",
        dest="executable",
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-database_name",
        dest="database_name",
        default="omnisci",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    # The value is forwarded to the benchmark as parameters["ipc_connection"].
    optional.add_argument(
        "-ipc_conn",
        dest="ipc_conn",
        default=True,
        type=str_arg_to_bool,
        help="Use IPC connection to omniscidb server.",
    )
    optional.add_argument(
        "-debug_timer",
        dest="debug_timer",
        default=False,
        type=str_arg_to_bool,
        help="Enable fine-grained query execution timers for debug.",
    )
    optional.add_argument(
        "-columnar_output",
        dest="columnar_output",
        default=True,
        type=str_arg_to_bool,
        help="Allows OmniSci Core to directly materialize intermediate projections "
        "and the final ResultSet in Columnar format where appropriate.",
    )
    optional.add_argument(
        "-lazy_fetch",
        dest="lazy_fetch",
        default=None,
        type=str_arg_to_bool,
        help="[lazy_fetch help message]",
    )
    optional.add_argument(
        "-multifrag_rs",
        dest="multifrag_rs",
        default=None,
        type=str_arg_to_bool,
        help="[multifrag_rs help message]",
    )
    optional.add_argument(
        "-fragments_size",
        dest="fragments_size",
        default=None,
        nargs="*",
        type=int,
        help="Number of rows per fragment that is a unit of the table for query processing. "
        "Should be specified for each table in workload",
    )
    optional.add_argument(
        "-omnisci_run_kwargs",
        dest="omnisci_run_kwargs",
        default={},
        metavar="KEY1=VAL1,KEY2=VAL2...",
        action=KeyValueListParser,
        help="options to start omnisci server",
    )
    # Additional information
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_omniscripts",
        dest="commit_omniscripts",
        default="1234567890123456789012345678901234567890",
        help="Omniscripts commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_modin",
        dest="commit_modin",
        default="1234567890123456789012345678901234567890",
        help="Modin commit hash used for benchmark.",
    )
    optional.add_argument(
        "-debug_mode",
        dest="debug_mode",
        default=False,
        type=str_arg_to_bool,
        help="Enable debug mode.",
    )

    try:
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        args = parser.parse_args()

        launch_omnisci_server = (not args.no_ibis and args.bench_name
                                 in benchmarks_with_ibis_queries)

        # Ports left at the sentinel default are replaced by free ports.
        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        run_benchmark = __import__(benchmarks[args.bench_name]).run_benchmark

        parameters = {
            "data_file": args.data_file,
            "dfiles_num": args.dfiles_num,
            "no_ml": args.no_ml,
            "no_ibis": args.no_ibis,
            "optimizer": args.optimizer,
            "pandas_mode": args.pandas_mode,
            "ray_tmpdir": args.ray_tmpdir,
            "ray_memory": args.ray_memory,
            "gpu_memory": args.gpu_memory,
            "validation": args.validation,
            "no_pandas": args.no_pandas,
            "debug_mode": args.debug_mode,
            "extended_functionality": args.extended_functionality,
        }

        if launch_omnisci_server:
            if args.executable is None:
                parser.error(
                    "Omnisci executable should be specified with -executable for Ibis part"
                )

            from server import OmnisciServer

            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
                debug_timer=args.debug_timer,
                columnar_output=args.columnar_output,
                lazy_fetch=args.lazy_fetch,
                multifrag_rs=args.multifrag_rs,
                omnisci_run_kwargs=args.omnisci_run_kwargs,
            )

            parameters["database_name"] = args.database_name
            parameters["table"] = args.table
            parameters["dnd"] = args.dnd
            parameters["dni"] = args.dni
            parameters["import_mode"] = args.import_mode
            parameters["fragments_size"] = args.fragments_size

        if parameters["validation"] and (parameters["no_pandas"]
                                         or parameters["no_ibis"]):
            parameters["validation"] = False
            print(
                "WARNING: validation was turned off as it requires both sides to compare."
            )

        etl_results = []
        ml_results = []
        # Created on first use; the ML reporter may be needed later than the
        # first iteration if that iteration produced no ML results.
        db_reporter_etl = None
        db_reporter_ml = None
        print(parameters)
        run_id = int(round(time.time()))
        for iter_num in range(1, args.iterations + 1):
            print(f"Iteration #{iter_num}")

            if launch_omnisci_server:
                from server_worker import OmnisciServerWorker

                omnisci_server_worker = OmnisciServerWorker(omnisci_server)
                parameters["omnisci_server_worker"] = omnisci_server_worker
                parameters["ipc_connection"] = args.ipc_conn
                omnisci_server.launch()

            # Expand environment variables in every string-valued parameter.
            parameters = {
                key: os.path.expandvars(value)
                if isinstance(value, str) else value
                for key, value in parameters.items()
            }
            benchmark_results = run_benchmark(parameters)

            if launch_omnisci_server:
                omnisci_server_worker.terminate()
                omnisci_server.terminate()

            additional_fields_for_reporting = {
                "ETL": {
                    "Iteration": iter_num,
                    "run_id": run_id
                },
                "ML": {
                    "Iteration": iter_num,
                    "run_id": run_id
                },
            }
            etl_ml_results = refactor_results_for_reporting(
                benchmark_results=benchmark_results,
                ignore_fields_for_results_unit_conversion=
                ignore_fields_for_results_unit_conversion,
                additional_fields=additional_fields_for_reporting,
                reporting_unit="ms",
            )
            # Only the results of the current iteration are kept.
            etl_results = list(etl_ml_results["ETL"])
            ml_results = list(etl_ml_results["ML"])

            # Reporting to MySQL database
            if args.db_user is not None:
                if iter_num == 1:
                    db = mysql.connector.connect(
                        host=args.db_server,
                        port=args.db_port,
                        user=args.db_user,
                        passwd=args.db_pass,
                        db=args.db_name,
                    )

                    reporting_init_fields = {
                        "OmnisciCommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis,
                        "OmniscriptsCommitHash": args.commit_omniscripts,
                        "ModinCommitHash": args.commit_modin,
                    }

                    db_reporter_etl = DbReport(
                        db,
                        args.db_table_etl,
                        _report_fields(etl_results),
                        reporting_init_fields,
                    )

                if len(ml_results) != 0 and db_reporter_ml is None:
                    db_reporter_ml = DbReport(
                        db,
                        args.db_table_ml,
                        _report_fields(ml_results),
                        reporting_init_fields,
                    )

                # Results are submitted once, after the last iteration.
                if iter_num == args.iterations:
                    for result_etl in etl_results:
                        remove_fields_from_dict(
                            result_etl, ignore_fields_for_bd_report_etl)
                        db_reporter_etl.submit(result_etl)

                    if len(ml_results) != 0:
                        for result_ml in ml_results:
                            remove_fields_from_dict(
                                result_ml, ignore_fields_for_bd_report_ml)
                            db_reporter_ml.submit(result_ml)
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
        if omnisci_server:
            omnisci_server.terminate()
def main():
    """Run the benchmark ETL through Ibis and Pandas, then optional ML and validation.

    Arguments come from ``get_args()``. Unless ``args.no_ibis`` is set, an
    OmniSci server is launched and the ETL part is executed through an
    ``OmnisciServerWorker``, which is terminated as soon as the Ibis ETL is
    done. The Pandas ETL always runs. The ML part runs after each ETL part
    unless ``args.no_ml`` is set. With ``args.val`` the Ibis and Pandas frames
    are compared, provided the Ibis part produced them.

    A worker that is still alive is terminated in the ``finally`` clause.
    """
    omnisci_server_worker = None
    train_final = None
    test_final = None

    parser, args, skip_rows = get_args()

    try:
        if not args.no_ibis:
            # The server module lives one directory above this script.
            parent_dir = os.path.join(os.path.dirname(__file__), "..")
            sys.path.append(parent_dir)
            from server import OmnisciServer

            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )

            server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            server.launch()

            from server_worker import OmnisciServerWorker

            omnisci_server_worker = OmnisciServerWorker(server)

            train_final, test_final, ibis_times = etl_all_ibis(
                filename=args.dataset_path,
                database_name=args.name,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                skip_rows=skip_rows,
                validation=args.val,
            )
            ibis_ml_data, ibis_times = split_step(train_final, test_final,
                                                  ibis_times)
            print_times(ibis_times)

            # Clear the reference so the finally clause does not terminate
            # the worker a second time.
            omnisci_server_worker.terminate()
            omnisci_server_worker = None

            if not args.no_ml:
                print("using ml with dataframes from ibis")
                print_times(ml(ibis_ml_data))

        pandas_train, pandas_test, pandas_times = etl_all_pandas(
            args.dataset_path, skip_rows)
        pandas_ml_data, pandas_times = split_step(pandas_train, pandas_test,
                                                  pandas_times)
        print_times(pandas_times)

        if not args.no_ml:
            print("using ml with dataframes from pandas")
            print_times(ml(pandas_ml_data))

        # Validation needs the Ibis frames, which exist only if Ibis ran.
        ibis_frames_available = (train_final is not None
                                 and test_final is not None)
        if args.val and ibis_frames_available:
            print("validating result ...")
            compare_dataframes((train_final, test_final),
                               (pandas_train, pandas_test))
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
def main():
    """Run one of the supported benchmarks and optionally report to MySQL.

    The benchmark's ``run_benchmark`` function is imported according to
    ``-bench_name`` and called once per iteration with a ``parameters`` dict.
    Unless ``-no_ibis`` is set, an OmniSci server is launched before and
    terminated after every iteration. Non-empty ETL and ML results are tagged
    with the iteration number and a run id. When ``-db_user`` is given, the
    rows produced by each iteration are submitted to MySQL through
    ``DbReport`` right after that iteration.

    Any exception is printed with its traceback and turned into exit code 1;
    a server worker that was created is terminated in all cases.
    """
    args = None
    omnisci_server = None
    omnisci_server_worker = None
    port_default_value = -1

    benchmarks = ["ny_taxi", "santander", "census", "plasticc"]

    def _report_fields(results):
        """Build the DbReport column schema from the first two result rows."""
        fields = {x: "VARCHAR(500) NOT NULL" for x in results[0]}
        if len(results) != 1:
            fields.update({x: "VARCHAR(500) NOT NULL" for x in results[1]})
        return fields

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    # Re-order the groups so that "required arguments" is listed first.
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=benchmarks,
        help="Benchmark name.",
        required=True,
    )
    required.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
        required=True,
    )
    optional.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=1,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    optional.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help=
        "Number of iterations to run every query. Best result is selected.",
    )
    optional.add_argument("-dnd",
                          default=False,
                          type=str_arg_to_bool,
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    optional.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python"],
        default="Pandas",
        help=
        "Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help=
        "Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    # type=int keeps a value given on the command line consistent with the
    # integer default (argparse would otherwise hand over a string).
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        default=False,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help=
        "specify the memory of your gpu, default 16. (This controls the lines to be used. Also work for CPU version. )",
        default=16,
    )
    # MySQL database parameters
    optional.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-executable",
        dest="executable",
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-database_name",
        dest="database_name",
        default="omnisci",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    # The value is forwarded to the benchmark as parameters["ipc_connection"].
    optional.add_argument(
        "-ipc_conn",
        dest="ipc_connection",
        default=True,
        type=str_arg_to_bool,
        help="Use IPC connection to omniscidb server.",
    )
    # Additional information
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )

    try:
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        args = parser.parse_args()

        # Ports left at the sentinel default are replaced by free ports.
        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        # argparse restricts -bench_name to the four names handled here.
        if args.bench_name == "ny_taxi":
            from taxi import run_benchmark
        elif args.bench_name == "santander":
            from santander import run_benchmark
        elif args.bench_name == "census":
            from census import run_benchmark
        elif args.bench_name == "plasticc":
            from plasticc import run_benchmark

        parameters = {
            "data_file": args.data_file,
            "dfiles_num": args.dfiles_num,
            "no_ml": args.no_ml,
            "no_ibis": args.no_ibis,
            "optimizer": args.optimizer,
            "pandas_mode": args.pandas_mode,
            "ray_tmpdir": args.ray_tmpdir,
            "ray_memory": args.ray_memory,
            "gpu_memory": args.gpu_memory,
        }

        if not args.no_ibis:
            if args.executable is None:
                parser.error(
                    "Omnisci executable should be specified with -executable for Ibis part"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                user=args.user,
                password=args.password,
            )

            parameters["database_name"] = args.database_name
            parameters["table"] = args.table
            parameters["dnd"] = args.dnd
            parameters["dni"] = args.dni
            parameters["validation"] = args.validation

        etl_results = []
        ml_results = []
        # Reporters are created on first use; the ML one may be needed later
        # than the first iteration if that iteration produced no ML results.
        db_reporter_etl = None
        db_reporter_ml = None
        print(parameters)
        run_id = int(round(time.time()))
        for iter_num in range(1, args.iterations + 1):
            print(f"Iteration #{iter_num}")

            if not args.no_ibis:
                omnisci_server_worker = OmnisciServerWorker(omnisci_server)
                parameters["omnisci_server_worker"] = omnisci_server_worker
                parameters["ipc_connection"] = args.ipc_connection
                omnisci_server.launch()

            result = run_benchmark(parameters)

            if not args.no_ibis:
                omnisci_server.terminate()

            # Rows produced by this iteration only; they are what gets
            # submitted below, so earlier iterations are not re-inserted.
            iter_etl_results = []
            for backend_res in result["ETL"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    iter_etl_results.append(backend_res)
            iter_ml_results = []
            for backend_res in result["ML"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    iter_ml_results.append(backend_res)
            etl_results.extend(iter_etl_results)
            ml_results.extend(iter_ml_results)

            # Reporting to MySQL database
            if args.db_user is not None:
                if iter_num == 1:
                    db = mysql.connector.connect(
                        host=args.db_server,
                        port=args.db_port,
                        user=args.db_user,
                        passwd=args.db_pass,
                        db=args.db_name,
                    )

                    reporting_init_fields = {
                        "OmnisciCommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis
                    }

                    db_reporter_etl = DbReport(db, args.db_table_etl,
                                               _report_fields(etl_results),
                                               reporting_init_fields)

                if ml_results and db_reporter_ml is None:
                    db_reporter_ml = DbReport(db, args.db_table_ml,
                                              _report_fields(ml_results),
                                              reporting_init_fields)

                for result_etl in iter_etl_results:
                    db_reporter_etl.submit(result_etl)

                for result_ml in iter_ml_results:
                    db_reporter_ml.submit(result_ml)
    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):
    """Run the Ibis ETL stage against a freshly launched OmniSci server.

    The function launches an OmniSci server, optionally measures three ways
    of importing the CSV file (Ibis ``read_csv``, an FSI temporary table and
    the SQL ``COPY`` statement), optionally creates and fills the working
    table, and then executes the feature-building query: for every
    ``var_<i>`` column it adds a windowed count column, a "count greater
    than one" column and a ``float32`` cast, and finally splits the resulting
    dataframe into a training part and the last 10000 rows for validation.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``file``, ``name``, ``table``, ``dnd``, ``dni``,
            ``omnisci_executable``, ``omnisci_port``, ``user``, ``password``,
            ``server_columnar_output``, ``server_lazy_fetch``,
            ``fragment_size`` and ``server_conn_type``.
        run_import_queries: Value converted with ``str_arg_to_bool``; when it
            is true the three import timings are measured.
        columns_names: Column names passed to ``ibis.Schema``.
        columns_types: Column types passed to ``ibis.Schema`` for the working
            table.
        validation: Converted with ``str_arg_to_bool`` for argument checking;
            the value is not used by this function otherwise.

    Returns:
        Tuple ``(x_train, y_train, x_valid, y_valid, etl_times)`` where
        ``etl_times`` maps timer names to elapsed seconds.

    Raises:
        SystemExit: With status 1 when ``args.server_conn_type`` is neither
            ``"regular"`` nor ``"ipc"``.
    """
    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    # Kept so that an unparsable value is still rejected; the result itself
    # is not consumed below.
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"
    n_vars = 200  # number of ``var_<i>`` feature columns in the data file

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}
    if run_import_queries:
        etl_times.update(
            {
                "t_readcsv_by_ibis": 0.0,
                "t_readcsv_by_COPY": 0.0,
                "t_readcsv_by_FSI": 0.0,
            }
        )

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )

    # Everything that follows runs under try/finally so that the server
    # process is terminated on every exit path (normal return, exception or
    # sys.exit), not only after a fully successful run.
    try:
        omnisci_server.launch()

        import ibis
        from server_worker import OmnisciServerWorker

        omnisci_server_worker = OmnisciServerWorker(omnisci_server)
        omnisci_server_worker.create_database(
            database_name, delete_if_exists=delete_old_database
        )

        # NOTE(review): presumably gives the server time to settle before
        # the first connection - confirm whether the pause is still needed.
        time.sleep(2)
        omnisci_server_worker.connect_to_server()

        if run_import_queries:
            # SQL statements preparation for data file import queries
            create_table_sql_template = """ CREATE TABLE {0} ({1}); """
            import_by_COPY_sql_template = """ COPY {0} FROM '{1}' WITH (header='{2}'); """
            import_by_FSI_sql_template = (
                """ CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}'); """
            )

            import_query_cols = ["ID_code TEXT ENCODING NONE", "target SMALLINT"] + [
                "var_%s DOUBLE" % i for i in range(n_vars)
            ]
            import_query_cols_str = ", \n".join(import_query_cols)

            create_table_sql = create_table_sql_template.format(
                tmp_table_name, import_query_cols_str
            )
            import_by_COPY_sql = import_by_COPY_sql_template.format(
                tmp_table_name, filename, "true"
            )
            import_by_FSI_sql = import_by_FSI_sql_template.format(
                tmp_table_name, import_query_cols_str, filename
            )

            # data file import by ibis
            columns_types_import_query = ["string", "int64"] + ["float64"] * n_vars
            schema_table_import = ibis.Schema(
                names=columns_names, types=columns_types_import_query
            )
            omnisci_server_worker.get_conn().create_table(
                table_name=tmp_table_name,
                schema=schema_table_import,
                database=database_name,
                fragment_size=args.fragment_size,
            )

            table_import_query = omnisci_server_worker.database(database_name).table(
                tmp_table_name
            )
            t0 = timer()
            table_import_query.read_csv(filename, delimiter=",")
            etl_times["t_readcsv_by_ibis"] = timer() - t0

            # data file import by FSI
            omnisci_server_worker.drop_table(tmp_table_name)
            t0 = timer()
            omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
            etl_times["t_readcsv_by_FSI"] = timer() - t0

            omnisci_server_worker.drop_table(tmp_table_name)

            # data file import by SQL COPY statement; table creation is
            # deliberately kept outside of the timed region
            omnisci_server_worker.execute_sql_query(create_table_sql)

            t0 = timer()
            omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
            etl_times["t_readcsv_by_COPY"] = timer() - t0

            omnisci_server_worker.drop_table(tmp_table_name)

        if create_new_table:
            # Create table and import data for ETL queries
            schema_table = ibis.Schema(names=columns_names, types=columns_types)
            omnisci_server_worker.get_conn().create_table(
                table_name=table_name,
                schema=schema_table,
                database=database_name,
                fragment_size=args.fragment_size,
            )

            table_import = omnisci_server_worker.database(database_name).table(table_name)
            table_import.read_csv(filename, delimiter=",")

        if args.server_conn_type == "regular":
            omnisci_server_worker.connect_to_server()
        elif args.server_conn_type == "ipc":
            omnisci_server_worker.ipc_connect_to_server()
        else:
            print("Wrong connection type is specified!")
            # This is a failure, so report it with a non-zero exit status.
            sys.exit(1)

        db = omnisci_server_worker.database(database_name)
        table = db.table(table_name)

        # group_by/count, merge (join) and filtration queries
        # We are making 400 columns and then insert them into original table thus
        # avoiding nested sql requests
        t0 = timer()
        orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(n_vars)]
        count_cols = []
        gt1_cols = []
        cast_cols = [table["target"].cast("int64").name("target0")]

        for i in range(n_vars):
            col = "var_%d" % i
            col_count = "var_%d_count" % i
            col_gt1 = "var_%d_gt1" % i

            w = ibis.window(group_by=col)
            # Built once and reused for both the count column and the
            # "greater than one" condition.
            count_expr = table[col].count().over(w).name(col_count)
            count_cols.append(count_expr)
            gt1_cols.append(
                ibis.case()
                .when(count_expr > 1, table[col].cast("float32"))
                .else_(ibis.null())
                .end()
                .name(col_gt1)
            )
            cast_cols.append(table[col].cast("float32").name(col))

        table = table.mutate(count_cols)
        table = table.drop(orig_cols)
        table = table.mutate(gt1_cols)
        table = table.mutate(cast_cols)

        table_df = table.execute()
        etl_times["t_groupby_merge_where"] = timer() - t0

        # rows split query
        t0 = timer()
        training_part, validation_part = table_df[:-10000], table_df[-10000:]
        etl_times["t_train_test_split"] = timer() - t0

        etl_times["t_etl"] = (
            etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]
        )

        x_train = training_part.drop(["target0"], axis=1)
        y_train = training_part["target0"]
        x_valid = validation_part.drop(["target0"], axis=1)
        y_valid = validation_part["target0"]

        return x_train, y_train, x_valid, y_valid, etl_times
    finally:
        omnisci_server.terminate()
def main():
    """Prepare a conda environment and run the requested build/test/benchmark tasks.

    Command-line arguments are parsed with ``argparse``. ``-task`` is a
    comma-separated list of ``build``, ``test`` and ``benchmark``:

    * ``build`` installs Ibis and/or Modin into the conda environment and,
      when an ``-executable`` is given and an ``Embedded`` directory exists
      next to its parent directory, installs the DBE components as well;
    * ``test`` launches an OmniSci server, downloads/imports the Ibis test
      data and runs the Ibis ``pytest`` suite with an HTML report;
    * ``benchmark`` forwards the benchmark-related options to
      ``run_ibis_benchmark.py`` executed inside the conda environment.

    Any exception is printed to stdout and re-raised. On exit the server (if
    one was started) is terminated and the conda environment is removed
    unless ``--save_env`` was set.

    Raises:
        ValueError: If none of the requested tasks is supported, or if the
            ``benchmark`` task is requested without ``-data_file``.
        NotImplementedError: If ``--python_version`` is not 3.7 or 3.6.
    """
    omniscript_path = os.path.dirname(__file__)
    omnisci_server = None
    # Bound up-front so the ``finally`` clause can tell whether the
    # environment object was ever created.
    conda_env = None
    args = None
    port_default_value = -1

    parser = argparse.ArgumentParser(description="Run internal tests from ibis project")
    required = parser.add_argument_group("common")
    optional = parser.add_argument_group("optional arguments")
    omnisci = parser.add_argument_group("omnisci")
    benchmark = parser.add_argument_group("benchmark")
    mysql_group = parser.add_argument_group("mysql")
    commits = parser.add_argument_group("commits")

    possible_tasks = ["build", "test", "benchmark"]
    benchmarks = ["ny_taxi", "santander", "census", "plasticc", "mortgage", "h2o"]
    supported_python_versions = ["3.7", "3.6"]

    # Task
    required.add_argument(
        "-task",
        dest="task",
        required=True,
        help=f"Task for execute {possible_tasks}. Use , separator for multiple tasks",
    )

    # Environment
    required.add_argument("-en", "--env_name", dest="env_name", help="Conda env name.")
    optional.add_argument(
        "-ec",
        "--env_check",
        dest="env_check",
        default=False,
        type=str_arg_to_bool,
        help="Check if env exists. If it exists don't recreate.",
    )
    optional.add_argument(
        "-s",
        "--save_env",
        dest="save_env",
        default=False,
        type=str_arg_to_bool,
        help="Save conda env after executing.",
    )
    optional.add_argument(
        "-r",
        "--report_path",
        dest="report_path",
        default=os.path.join(omniscript_path, ".."),
        help="Path to report file.",
    )
    optional.add_argument(
        "-ci",
        "--ci_requirements",
        dest="ci_requirements",
        default=os.path.join(omniscript_path, "ci_requirements.yml"),
        help="File with ci requirements for conda env.",
    )
    optional.add_argument(
        "-py",
        "--python_version",
        dest="python_version",
        default="3.7",
        help="Python version that should be installed in conda env.",
    )

    # Ibis
    optional.add_argument(
        "-i", "--ibis_path", dest="ibis_path", required=False, help="Path to ibis directory."
    )

    # Ibis tests
    optional.add_argument(
        "-expression",
        dest="expression",
        default=" ",
        help="Run tests which match the given substring test names and their parent "
        "classes. Example: 'test_other', while 'not test_method' matches those "
        "that don't contain 'test_method' in their names.",
    )

    # Modin
    optional.add_argument(
        "-m", "--modin_path", dest="modin_path", default=None, help="Path to modin directory."
    )
    optional.add_argument(
        "--modin_pkgs_dir",
        dest="modin_pkgs_dir",
        default=None,
        type=str,
        help="Path where to store built Modin dependencies (--target flag for pip), "
        "can be helpful if you have space limited home directory.",
    )
    optional.add_argument(
        "--manage_dbe_dir",
        dest="manage_dbe_dir",
        default=False,
        type=str_arg_to_bool,
        help="Manage (create and initialize) DBE data directory on the 'build' step.",
    )

    # Omnisci server parameters
    omnisci.add_argument(
        "-executable", dest="executable", required=False, help="Path to omnisci_server executable."
    )
    omnisci.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    omnisci.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-user", dest="user", default="admin", help="User name to use on omniscidb server."
    )
    omnisci.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-database_name",
        dest="database_name",
        default="agent_test_ibis",
        help="Database name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-ipc_conn",
        dest="ipc_conn",
        default=True,
        type=str_arg_to_bool,
        help="Connection type for ETL operations",
    )
    omnisci.add_argument(
        "-debug_timer",
        dest="debug_timer",
        default=False,
        type=str_arg_to_bool,
        help="Enable fine-grained query execution timers for debug.",
    )
    omnisci.add_argument(
        "-columnar_output",
        dest="columnar_output",
        default=True,
        type=str_arg_to_bool,
        help="Allows OmniSci Core to directly materialize intermediate projections "
        "and the final ResultSet in Columnar format where appropriate.",
    )
    omnisci.add_argument(
        "-lazy_fetch",
        dest="lazy_fetch",
        default=None,
        type=str_arg_to_bool,
        help="[lazy_fetch help message]",
    )
    omnisci.add_argument(
        "-multifrag_rs",
        dest="multifrag_rs",
        default=None,
        type=str_arg_to_bool,
        help="[multifrag_rs help message]",
    )
    omnisci.add_argument(
        "-fragments_size",
        dest="fragments_size",
        default=None,
        nargs="*",
        type=int,
        help="Number of rows per fragment that is a unit of the table for query processing. "
        "Should be specified for each table in workload",
    )
    omnisci.add_argument(
        "-omnisci_run_kwargs",
        dest="omnisci_run_kwargs",
        default={},
        metavar="KEY1=VAL1,KEY2=VAL2...",
        action=KeyValueListParser,
        help="options to start omnisci server",
    )

    # Benchmark parameters
    benchmark.add_argument(
        "-bench_name", dest="bench_name", choices=benchmarks, help="Benchmark name."
    )
    benchmark.add_argument(
        "-data_file", dest="data_file", help="A datafile that should be loaded."
    )
    benchmark.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=None,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    benchmark.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help="Number of iterations to run every query. Best result is selected.",
    )
    benchmark.add_argument(
        "-dnd", default=False, type=str_arg_to_bool, help="Do not delete old table."
    )
    benchmark.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    benchmark.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help="validate queries results (by comparison with Pandas queries results).",
    )
    benchmark.add_argument(
        "-import_mode",
        dest="import_mode",
        default="fsi",
        help="you can choose: {copy-from, pandas, fsi}",
    )
    benchmark.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default=None,
        help="Which optimizer is used",
    )
    benchmark.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    benchmark.add_argument(
        "-no_pandas",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Pandas version of benchmark",
    )
    benchmark.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python", "Modin_on_omnisci"],
        default="Pandas",
        help="Specifies which version of Pandas to use: "
        "plain Pandas, Modin runing on Ray or on Dask",
    )
    benchmark.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. "
        "It should have enough space to keep -ray_memory",
    )
    benchmark.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store",
    )
    benchmark.add_argument(
        "-no_ml",
        default=None,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu"
        "(This controls the lines to be used. Also work for CPU version. )",
        default=None,
    )
    benchmark.add_argument(
        "-extended_functionality",
        dest="extended_functionality",
        default=False,
        type=str_arg_to_bool,
        help="Extends functionality of H2O benchmark by adding 'chk' functions "
        "and verbose local reporting of results",
    )

    # MySQL database parameters
    mysql_group.add_argument(
        "-db_server", dest="db_server", default="localhost", help="Host name of MySQL server."
    )
    mysql_group.add_argument(
        "-db_port", dest="db_port", default=3306, type=int, help="Port number of MySQL server."
    )
    mysql_group.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    mysql_group.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    mysql_group.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )

    # Additional information
    commits.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash used for tests.",
    )
    commits.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash used for tests.",
    )
    commits.add_argument(
        "-commit_omniscripts",
        dest="commit_omniscripts",
        default="1234567890123456789012345678901234567890",
        help="Omniscripts commit hash used for tests.",
    )
    commits.add_argument(
        "-commit_modin",
        dest="commit_modin",
        default="1234567890123456789012345678901234567890",
        help="Modin commit hash used for tests.",
    )

    optional.add_argument(
        "-debug_mode",
        dest="debug_mode",
        default=False,
        type=str_arg_to_bool,
        help="Enable debug mode.",
    )

    try:
        args = parser.parse_args()

        # Replace the "-1" placeholders with real free ports first, so that
        # the port exported to the Ibis tests below is the one the server is
        # actually started on.
        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        os.environ["IBIS_TEST_OMNISCIDB_DATABASE"] = args.database_name
        os.environ["IBIS_TEST_DATA_DB"] = args.database_name
        os.environ["IBIS_TEST_OMNISCIDB_PORT"] = str(args.port)
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        required_tasks = args.task.split(",")
        tasks = {task: task in required_tasks for task in possible_tasks}

        if not any(tasks.values()):
            raise ValueError(
                f"Only {list(tasks.keys())} are supported, {required_tasks} cannot find possible tasks"
            )

        if args.python_version not in supported_python_versions:
            raise NotImplementedError(
                f"Only 3.7 and 3.6 python versions are supported, {args.python_version} is not supported"
            )

        conda_env = CondaEnvironment(args.env_name)

        print("PREPARING ENVIRONMENT")
        conda_env.create(
            args.env_check,
            requirements_file=args.ci_requirements,
            python_version=args.python_version,
        )

        if tasks["build"]:
            install_cmdline = ["python3", "setup.py", "install"]

            if args.ibis_path:
                ibis_requirements = os.path.join(
                    args.ibis_path, "ci", f"requirements-{args.python_version}-dev.yml"
                )
                install_ibis_reqs_cmdline = [
                    "conda",
                    "env",
                    "update",
                    "--name",
                    f"{args.env_name}",
                    "--file",
                    ibis_requirements,
                ]

                print("INSTALLATION OF IBIS DEPENDENCIES")
                conda_env.run(install_ibis_reqs_cmdline, print_output=False)

                print("IBIS INSTALLATION")
                conda_env.run(install_cmdline, cwd=args.ibis_path, print_output=False)

            if args.modin_path:
                install_modin_reqs_cmdline = [
                    "conda",
                    "env",
                    "update",
                    "--name",
                    f"{args.env_name}",
                    "--file",
                    "environment-dev.yml",
                ]

                if args.modin_pkgs_dir:
                    current_pythonpath = os.getenv("PYTHONPATH")
                    os.environ["PYTHONPATH"] = (
                        current_pythonpath + os.pathsep + args.modin_pkgs_dir
                        if current_pythonpath
                        else args.modin_pkgs_dir
                    )

                print("INSTALLATION OF MODIN DEPENDENCIES")
                # Installation of Modin dependencies can proceed with errors. If error
                # occurs, please try to rebase your branch to the current Modin master
                try:
                    conda_env.run(
                        install_modin_reqs_cmdline, cwd=args.modin_path, print_output=False
                    )
                except Exception:
                    print("INSTALLATION OF MODIN DEPENDENCIES PROCESSED WITH ERRORS")

                print("MODIN INSTALLATION")
                # Modin installation handled this way because "conda run --name env_name
                # python3 setup.py install" (called by "conda_env.run") processed with
                # warning that is not raised via "python3 setup.py install". This warning
                # is handled by omniscripts as error, that causing exception raise.
                try:
                    conda_env.run(install_cmdline, cwd=args.modin_path, print_output=False)
                except Exception:
                    print("MODIN INSTALLATION PROCESSED WITH ERRORS")

            # Trying to install dbe extension if omnisci generated it. The
            # executable is optional, so without it there is no location to
            # look for the extension in and this step is skipped.
            if args.executable:
                executables_path = os.path.dirname(args.executable)
                dbe_path = os.path.join(os.path.abspath(f"{executables_path}/.."), "Embedded")
                initdb_path = os.path.join(executables_path, "initdb")
                data_dir = os.path.join(os.path.dirname(__file__), "data")
                initdb_cmdline = [initdb_path, "--data", data_dir]

                if not os.path.isdir(data_dir) and args.manage_dbe_dir:
                    print("MANAGING OMNISCI DATA DIR", data_dir)
                    os.makedirs(data_dir)
                    conda_env.run(initdb_cmdline, print_output=False)

                if os.path.exists(dbe_path):
                    print("DBE INSTALLATION")
                    omniscidb_root = os.path.abspath(f"{executables_path}/../../")
                    # Same install command for every cmake component, in this order.
                    for component in ("DBE", "QE", "thrift", "jar"):
                        cmake_cmdline = [
                            "cmake",
                            "--install",
                            "build",
                            "--component",
                            component,
                            "--prefix",
                            "$CONDA_PREFIX",
                        ]
                        conda_env.run(cmake_cmdline, cwd=omniscidb_root, print_output=False)
                    conda_env.run(install_cmdline, cwd=dbe_path, print_output=False)
                else:
                    print("Using Omnisci server")

        if tasks["test"]:
            ibis_data_script = os.path.join(args.ibis_path, "ci", "datamgr.py")
            dataset_download_cmdline = ["python3", ibis_data_script, "download"]
            dataset_import_cmdline = [
                "python3",
                ibis_data_script,
                "omniscidb",
                "-P",
                str(args.port),
                "--database",
                args.database_name,
            ]
            report_file_name = f"report-{args.commit_ibis[:8]}-{args.commit_omnisci[:8]}.html"
            if not os.path.isdir(args.report_path):
                os.makedirs(args.report_path)
            report_file_path = os.path.join(args.report_path, report_file_name)

            ibis_tests_cmdline = [
                "pytest",
                "-m",
                "omniscidb",
                "--disable-pytest-warnings",
                "-k",
                args.expression,
                f"--html={report_file_path}",
            ]

            print("STARTING OMNISCI SERVER")
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
                debug_timer=args.debug_timer,
                columnar_output=args.columnar_output,
                lazy_fetch=args.lazy_fetch,
                multifrag_rs=args.multifrag_rs,
                omnisci_run_kwargs=args.omnisci_run_kwargs,
            )
            omnisci_server.launch()

            print("PREPARING DATA")
            conda_env.run(dataset_download_cmdline)
            conda_env.run(dataset_import_cmdline)

            print("RUNNING TESTS")
            conda_env.run(ibis_tests_cmdline, cwd=args.ibis_path)

        if tasks["benchmark"]:
            # ``-bench_name`` values are restricted by argparse ``choices``;
            # only the presence of the data file is checked here.
            if not args.data_file:
                raise ValueError(
                    "Parameter --data_file was received empty, but it is required for benchmarks"
                )

            benchmark_script_path = os.path.join(omniscript_path, "run_ibis_benchmark.py")
            benchmark_cmd = ["python3", benchmark_script_path]

            # Options that are forwarded to the benchmark script.
            possible_benchmark_args = {
                "bench_name",
                "data_file",
                "dfiles_num",
                "iterations",
                "dnd",
                "dni",
                "validation",
                "optimizer",
                "no_ibis",
                "no_pandas",
                "pandas_mode",
                "ray_tmpdir",
                "ray_memory",
                "no_ml",
                "gpu_memory",
                "db_server",
                "db_port",
                "db_user",
                "db_pass",
                "db_name",
                "db_table_etl",
                "db_table_ml",
                "executable",
                "omnisci_cwd",
                "port",
                "http_port",
                "calcite_port",
                "user",
                "password",
                "ipc_conn",
                "database_name",
                "table",
                "commit_omnisci",
                "commit_ibis",
                "import_mode",
                "debug_timer",
                "columnar_output",
                "lazy_fetch",
                "multifrag_rs",
                "fragments_size",
                "omnisci_run_kwargs",
                "commit_omniscripts",
                "debug_mode",
                "extended_functionality",
                "commit_modin",
            }
            args_dict = vars(args)
            # The path is wrapped in single quotes before it is put on the
            # benchmark command line.
            args_dict["data_file"] = f"'{args_dict['data_file']}'"
            for arg_name in parser._option_string_actions:
                pure_arg = arg_name.lstrip("-")
                if pure_arg not in possible_benchmark_args:
                    continue
                try:
                    arg_value = args_dict[pure_arg]
                except KeyError:
                    # option string does not match any parsed attribute name
                    continue
                # None means "not specified"; let the benchmark script use
                # its own default in that case.
                if arg_value is None:
                    continue
                if isinstance(arg_value, dict):
                    if arg_value:
                        benchmark_cmd.extend(
                            [
                                arg_name,
                                ",".join(f"{key}={value}" for key, value in arg_value.items()),
                            ]
                        )
                elif isinstance(arg_value, (list, tuple)):
                    if arg_value:
                        benchmark_cmd.extend([arg_name] + [str(x) for x in arg_value])
                else:
                    benchmark_cmd.extend([arg_name, str(arg_value)])

            print(benchmark_cmd)

            conda_env.run(benchmark_cmd)

    except Exception:
        traceback.print_exc(file=sys.stdout)
        raise

    finally:
        if omnisci_server:
            omnisci_server.terminate()
        # conda_env stays None when the failure happened before the
        # environment object was created; there is nothing to remove then.
        if conda_env is not None and args and args.save_env is False:
            conda_env.remove()
def main():
    """Prepare a conda environment and run the requested build/test/benchmark tasks.

    Command-line arguments are parsed with ``argparse``. ``-task`` is a
    comma-separated list of ``build``, ``test`` and ``benchmark``:

    * ``build`` installs Ibis into the conda environment;
    * ``test`` launches an OmniSci server, downloads/imports the Ibis test
      data and runs the Ibis ``pytest`` suite with an HTML report;
    * ``benchmark`` forwards the benchmark-related options to
      ``run_ibis_benchmark.py`` executed inside the conda environment.

    Validation failures print a message and exit with status 1; any other
    exception is printed to stdout and also ends the process with status 1.
    On exit the server (if one was started) is terminated and the conda
    environment is removed unless ``--save_env`` was set.
    """
    omniscript_path = os.path.dirname(__file__)
    omnisci_server = None
    # Bound up-front so the ``finally`` clause can tell whether the
    # environment object was ever created.
    conda_env = None
    args = None
    port_default_value = -1

    parser = argparse.ArgumentParser(description="Run internal tests from ibis project")
    # A regular argument group keeps these options (and the built-in -h)
    # visible in the generated help output.
    required = parser.add_argument_group("common")
    optional = parser.add_argument_group("optional arguments")
    omnisci = parser.add_argument_group("omnisci")
    benchmark = parser.add_argument_group("benchmark")
    mysql_group = parser.add_argument_group("mysql")
    commits = parser.add_argument_group("commits")

    possible_tasks = ["build", "test", "benchmark"]
    benchmarks = ["ny_taxi", "santander", "census", "plasticc"]
    supported_python_versions = ["3.7", "3.6"]

    # Task
    required.add_argument(
        "-task",
        dest="task",
        required=True,
        help=f"Task for execute {possible_tasks}. Use , separator for multiple tasks",
    )

    # Environment
    required.add_argument("-en", "--env_name", dest="env_name", help="Conda env name.")
    optional.add_argument(
        "-ec",
        "--env_check",
        dest="env_check",
        default=False,
        type=str_arg_to_bool,
        help="Check if env exists. If it exists don't recreate.",
    )
    optional.add_argument(
        "-s",
        "--save_env",
        dest="save_env",
        default=False,
        type=str_arg_to_bool,
        help="Save conda env after executing.",
    )
    optional.add_argument(
        "-r",
        "--report_path",
        dest="report_path",
        default=os.path.join(omniscript_path, ".."),
        help="Path to report file.",
    )
    optional.add_argument(
        "-ci",
        "--ci_requirements",
        dest="ci_requirements",
        default=os.path.join(omniscript_path, "ci_requirements.yml"),
        help="File with ci requirements for conda env.",
    )
    optional.add_argument(
        "-py",
        "--python_version",
        dest="python_version",
        default="3.7",
        help="Python version whose Ibis requirements file should be used.",
    )

    # Ibis
    required.add_argument(
        "-i",
        "--ibis_path",
        dest="ibis_path",
        required=True,
        help="Path to ibis directory.",
    )

    # Ibis tests
    optional.add_argument(
        "-expression",
        dest="expression",
        default=" ",
        help="Run tests which match the given substring test names and their parent "
        "classes. Example: 'test_other', while 'not test_method' matches those "
        "that don't contain 'test_method' in their names.",
    )

    # Omnisci server parameters
    omnisci.add_argument(
        "-executable",
        dest="executable",
        required=True,
        help="Path to omnisci_server executable.",
    )
    omnisci.add_argument(
        "--omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    omnisci.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-database_name",
        dest="database_name",
        default="agent_test_ibis",
        help="Database name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-ipc_conn",
        dest="ipc_connection",
        default=True,
        type=str_arg_to_bool,
        help="Connection type for ETL operations",
    )

    # Benchmark parameters
    benchmark.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=benchmarks,
        help="Benchmark name.",
    )
    benchmark.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
    )
    benchmark.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=1,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    benchmark.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help="Number of iterations to run every query. Best result is selected.",
    )
    benchmark.add_argument(
        "-dnd", default=False, type=str_arg_to_bool, help="Do not delete old table."
    )
    benchmark.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    benchmark.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help="validate queries results (by comparison with Pandas queries results).",
    )
    benchmark.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    benchmark.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    benchmark.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python"],
        default="Pandas",
        help="Specifies which version of Pandas to use: "
        "plain Pandas, Modin runing on Ray or on Dask",
    )
    benchmark.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. "
        "It should have enough space to keep -ray_memory",
    )
    benchmark.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store",
    )
    benchmark.add_argument(
        "-no_ml",
        default=False,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu, default 16. "
        "(This controls the lines to be used. Also work for CPU version. )",
        default=16,
    )

    # MySQL database parameters
    mysql_group.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    mysql_group.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    mysql_group.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    mysql_group.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    mysql_group.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )

    # Additional information
    commits.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for tests.",
    )
    commits.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for tests.",
    )

    try:
        args = parser.parse_args()

        # Replace the "-1" placeholders with real free ports first, so that
        # the port exported to the Ibis tests below is the one the server is
        # actually started on.
        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        os.environ["IBIS_TEST_OMNISCIDB_DATABASE"] = args.database_name
        os.environ["IBIS_TEST_DATA_DB"] = args.database_name
        os.environ["IBIS_TEST_OMNISCIDB_PORT"] = str(args.port)
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        required_tasks = args.task.split(",")
        tasks = {task: task in required_tasks for task in possible_tasks}

        if not any(tasks.values()):
            print(
                f"Only {list(tasks.keys())} are supported, {required_tasks} cannot find possible tasks"
            )
            sys.exit(1)

        if args.python_version not in supported_python_versions:
            print(
                f"Only 3.7 and 3.6 python versions are supported, {args.python_version} is not supported"
            )
            sys.exit(1)

        ibis_requirements = os.path.join(
            args.ibis_path, "ci", f"requirements-{args.python_version}-dev.yml"
        )
        requirements_file = "requirements.yml"
        conda_env = CondaEnvironment(args.env_name)

        print("PREPARING ENVIRONMENT")
        combinate_requirements(ibis_requirements, args.ci_requirements, requirements_file)
        conda_env.create(args.env_check, requirements_file=requirements_file)

        if tasks["build"]:
            install_ibis_cmdline = ["python3", "setup.py", "install"]

            print("IBIS INSTALLATION")
            conda_env.run(install_ibis_cmdline, cwd=args.ibis_path, print_output=False)

        if tasks["test"]:
            ibis_data_script = os.path.join(args.ibis_path, "ci", "datamgr.py")
            dataset_download_cmdline = ["python3", ibis_data_script, "download"]
            dataset_import_cmdline = [
                "python3",
                ibis_data_script,
                "omniscidb",
                "-P",
                str(args.port),
                "--database",
                args.database_name,
            ]
            report_file_name = f"report-{args.commit_ibis[:8]}-{args.commit_omnisci[:8]}.html"
            if not os.path.isdir(args.report_path):
                os.makedirs(args.report_path)
            report_file_path = os.path.join(args.report_path, report_file_name)

            ibis_tests_cmdline = [
                "pytest",
                "-m",
                "omniscidb",
                "--disable-pytest-warnings",
                "-k",
                args.expression,
                f"--html={report_file_path}",
            ]

            print("STARTING OMNISCI SERVER")
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            print("PREPARING DATA")
            conda_env.run(dataset_download_cmdline)
            conda_env.run(dataset_import_cmdline)

            print("RUNNING TESTS")
            conda_env.run(ibis_tests_cmdline, cwd=args.ibis_path)

        if tasks["benchmark"]:
            # ``-bench_name`` values are restricted by argparse ``choices``;
            # only the presence of the data file is checked here.
            if not args.data_file:
                print(
                    "Parameter --data_file was received empty, but it is required for benchmarks"
                )
                sys.exit(1)

            benchmark_script_path = os.path.join(omniscript_path, "run_ibis_benchmark.py")
            benchmark_cmd = ["python3", benchmark_script_path]

            # Parsed attribute names (argparse ``dest``) that are forwarded
            # to the benchmark script.
            possible_benchmark_args = {
                "bench_name",
                "data_file",
                "dfiles_num",
                "iterations",
                "dnd",
                "dni",
                "validation",
                "optimizer",
                "no_ibis",
                "pandas_mode",
                "ray_tmpdir",
                "ray_memory",
                "no_ml",
                "gpu_memory",
                "db_server",
                "db_port",
                "db_user",
                "db_pass",
                "db_name",
                "db_table_etl",
                "db_table_ml",
                "executable",
                "omnisci_cwd",
                "port",
                "http_port",
                "calcite_port",
                "user",
                "password",
                "ipc_connection",
                "database_name",
                "table",
                "commit_omnisci",
                "commit_ibis",
            }
            args_dict = vars(args)
            # The path is wrapped in single quotes before it is put on the
            # benchmark command line.
            args_dict["data_file"] = f"'{args_dict['data_file']}'"
            for option_string, action in parser._option_string_actions.items():
                # Match on the action's dest rather than on the option text:
                # "-ipc_conn" stores its value as "ipc_connection", so a
                # comparison by option text would never forward it.
                dest = action.dest
                if dest not in possible_benchmark_args:
                    continue
                arg_value = args_dict.get(dest)
                # Only unspecified (None) values are skipped; an explicit
                # False or 0 is a real setting and has to be forwarded.
                if arg_value is not None:
                    benchmark_cmd.extend([option_string, str(arg_value)])

            print(benchmark_cmd)

            conda_env.run(benchmark_cmd)

    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    finally:
        if omnisci_server:
            omnisci_server.terminate()
        # conda_env stays None when the run stopped before the environment
        # object was created (e.g. on argument validation errors).
        if conda_env is not None and args and args.save_env is False:
            conda_env.remove()
def main():
    """Run the benchmark ETL (and optionally ML) via Ibis/OmniSci, then via Pandas.

    Steps, driven by the values returned from ``get_args()``:

    1. Unless ``args.no_ibis`` is set: launch an OmniSci server, run
       ``etl_all_ibis`` against it, print the ETL timings, stop the server
       and, unless ``args.no_ml`` is set, run ``ml`` on the resulting data.
    2. Always: run ``etl_all_pandas`` on the same dataset, print the ETL
       timings and, unless ``args.no_ml`` is set, run ``ml`` on the result.
    3. If ``args.val`` is set, only a notice is printed (see TODO below).

    Any ``Exception`` is reported on stdout together with its traceback and
    the process exits with status 1. A server that is still referenced when
    the function leaves the ``try`` block is terminated in every case.
    """
    # Function-scope import, matching the other local imports in this
    # function; it is only needed on the failure path.
    import traceback

    omnisci_server = None
    parser, args, skip_rows = get_args()

    try:
        if not args.no_ibis:
            # Validate before touching sys.path or starting any process.
            # NOTE(review): parser is presumably an argparse.ArgumentParser,
            # whose error() prints usage and exits -- confirm in get_args().
            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )

            # The server helpers are imported from the parent directory of
            # this script. Both imports happen before the launch so an
            # import failure cannot leave a server process behind.
            sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
            from server import OmnisciServer
            from server_worker import OmnisciServerWorker

            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            (
                X_train,
                y_train,
                X_test,
                y_test,
                Xt,
                classes,
                class_weights,
                etl_times,
            ) = etl_all_ibis(
                filename=args.dataset_path,
                database_name=args.name,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                skip_rows=skip_rows,
            )
            print_times(etl_times)

            # The server is stopped before the ML step and the reference is
            # cleared so the ``finally`` clause does not terminate it twice.
            omnisci_server.terminate()
            omnisci_server = None

            if not args.no_ml:
                print("using ml with dataframes from ibis")
                ml_times = ml(
                    X_train, y_train, X_test, y_test, Xt, classes, class_weights
                )
                print_times(ml_times)

        # The Pandas run is unconditional and rebinds the same names that the
        # Ibis run (if any) produced.
        (
            X_train,
            y_train,
            X_test,
            y_test,
            Xt,
            classes,
            class_weights,
            etl_times,
        ) = etl_all_pandas(args.dataset_path, skip_rows)
        print_times(etl_times)

        if not args.no_ml:
            print("using ml with dataframes from pandas")
            ml_times = ml(
                X_train, y_train, X_test, y_test, Xt, classes, class_weights
            )
            print_times(ml_times)

        if args.val:
            # TODO: validation is not implemented. Comparing the Ibis and
            # Pandas dataframes directly (e.g. via compare_dataframes) is not
            # straightforward here, so only a notice is printed.
            print("validate by ml results")

    except Exception as err:
        print("Failed: ", err)
        # Keep the full traceback: the message alone does not show which
        # ETL/ML stage failed.
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    finally:
        if omnisci_server:
            omnisci_server.terminate()
) try: args = parser.parse_args() if args.df <= 0: print("Bad number of data files specified", args.df) sys.exit(1) if args.i < 1: print("Bad number of iterations specified", args.i) database_name = args.n omnisci_server = OmnisciServer( omnisci_executable=args.e, omnisci_port=args.port, database_name=database_name, user=args.u, password=args.p, ) omnisci_server.launch() omnisci_server_worker = OmnisciServerWorker(omnisci_server) time.sleep(2) omnisci_server_worker.connect_to_server() taxibench_columns_names = [ "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag",