Example #1
            'BenchName': 'VARCHAR(500) NOT NULL',
            'BestExecTimeMS': 'BIGINT UNSIGNED',
            'BestTotalTimeMS': 'BIGINT UNSIGNED',
            'WorstExecTimeMS': 'BIGINT UNSIGNED',
            'WorstTotalTimeMS': 'BIGINT UNSIGNED',
            'AverageExecTimeMS': 'BIGINT UNSIGNED',
            'AverageTotalTimeMS': 'BIGINT UNSIGNED'
        }, {
            'ScriptName': 'run_omnisci_benchmark.py',
            'CommitHash': args.commit
        })

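# Launch the OmniSciDB server with the configured executable, port, database and credentials,
# write the CSV header for the benchmark report, and run the benchmark once per fragment size.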
try:
    omnisci_server = OmnisciServer(omnisci_executable=args.omnisci_executable,
                                   omnisci_port=args.omnisci_port,
                                   database_name=args.name,
                                   omnisci_cwd=args.omnisci_cwd,
                                   user=args.user,
                                   password=args.passwd)
    omnisci_server.launch()

    with open(args.report, "w") as report:
        print(
            "datafiles,fragment_size,query,query_exec_min,query_total_min,query_exec_max,"
            "query_total_max,query_exec_avg,query_total_avg,query_error_info",
            file=report,
            flush=True)
        if args.fragment_size is not None:
            for fs in args.fragment_size:
                print("RUNNING WITH FRAGMENT SIZE", fs)
                execute_benchmark(datafiles, import_cmdline,
                                  args.benchmarks_path, benchmark_cmdline, fs,
Example #2
def main():
    omniscript_path = os.path.dirname(__file__)
    args = None
    omnisci_server = None

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-f",
        "--file",
        dest="file",
        required=True,
        help="A datafile that should be loaded",
    )
    optional.add_argument("-dnd",
                          action="store_true",
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        action="store_true",
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-val",
        action="store_true",
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-o",
        "--optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    # MySQL database parameters
    optional.add_argument(
        "-db-server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db-port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db-user",
        dest="db_user",
        default="",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db-pass",
        dest="db_password",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db-name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db-table",
        dest="db_table",
        help="Table to use to store results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-e",
        "--executable",
        dest="omnisci_executable",
        required=False,
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-w",
        "--workdir",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        "--omnisci_port",
        dest="omnisci_port",
        default=6274,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-u",
        "--user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-p",
        "--password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-n",
        "--name",
        dest="name",
        default="census_database",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-t",
        "--table",
        dest="table",
        default="census_table",
        help="Table name name to use in omniscidb server.",
    )

    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-no_ibis",
        action="store_true",
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version")
    optional.add_argument(
        "-pandas_mode",
        choices=["pandas", "modin_on_ray", "modin_on_dask"],
        default="pandas",
        help=
        "Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask"
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help=
        "Location where to keep Ray plasma store. It should have enough space to keep -ray_memory"
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store")
    optional.add_argument(
        "-no_ml",
        action="store_true",
        help="Do not run machine learning benchmark, only ETL part")

    args = parser.parse_args()
    args.file = args.file.replace("'", "")

    # ML specific
    N_RUNS = 50
    TRAIN_SIZE = 0.9
    RANDOM_STATE = 777

    columns_names = [
        "YEAR0",
        "DATANUM",
        "SERIAL",
        "CBSERIAL",
        "HHWT",
        "CPI99",
        "GQ",
        "QGQ",
        "PERNUM",
        "PERWT",
        "SEX",
        "AGE",
        "EDUC",
        "EDUCD",
        "INCTOT",
        "SEX_HEAD",
        "SEX_MOM",
        "SEX_POP",
        "SEX_SP",
        "SEX_MOM2",
        "SEX_POP2",
        "AGE_HEAD",
        "AGE_MOM",
        "AGE_POP",
        "AGE_SP",
        "AGE_MOM2",
        "AGE_POP2",
        "EDUC_HEAD",
        "EDUC_MOM",
        "EDUC_POP",
        "EDUC_SP",
        "EDUC_MOM2",
        "EDUC_POP2",
        "EDUCD_HEAD",
        "EDUCD_MOM",
        "EDUCD_POP",
        "EDUCD_SP",
        "EDUCD_MOM2",
        "EDUCD_POP2",
        "INCTOT_HEAD",
        "INCTOT_MOM",
        "INCTOT_POP",
        "INCTOT_SP",
        "INCTOT_MOM2",
        "INCTOT_POP2",
    ]
    columns_types = [
        "int64",
        "int64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "float64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "int64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
        "float64",
    ]
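    # The two lists above define the schema of the input datafile (the column names look like
    # census/IPUMS fields); they are passed to both the Ibis and the Pandas ETL paths below.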

    try:
        if not args.no_ibis:
            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()
            from server_worker import OmnisciServerWorker
            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            X_ibis, y_ibis, etl_times_ibis = etl_ibis(
                filename=args.file,
                columns_names=columns_names,
                columns_types=columns_types,
                database_name=args.name,
                table_name=args.table,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
            )
            omnisci_server.terminate()
            omnisci_server = None
            print_times(etl_times_ibis, name='Ibis')

            if not args.no_ml:
                mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                    X_ibis, y_ibis, RANDOM_STATE, N_RUNS, TRAIN_SIZE,
                    args.optimizer)
                print_times(ml_times)
                print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(
                    mse_mean, mse_dev))
                print("mean COD ± deviation: {:.9f} ± {:.9f}".format(
                    cod_mean, cod_dev))

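        # Pandas/Modin part: inject the selected pandas-compatible module into this module's
        # namespace, then repeat the ETL (and optionally the ML step) without Ibis.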
        import_pandas_into_module_namespace(main.__globals__, args.pandas_mode,
                                            args.ray_tmpdir, args.ray_memory)
        X, y, etl_times = etl_pandas(args.file,
                                     columns_names=columns_names,
                                     columns_types=columns_types)
        print_times(etl_times, name=args.pandas_mode)

        if not args.no_ml:
            mse_mean, cod_mean, mse_dev, cod_dev, ml_times = ml(
                X, y, RANDOM_STATE, N_RUNS, TRAIN_SIZE, args.optimizer)
            print_times(ml_times)
            print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(
                mse_mean, mse_dev))
            print("mean COD ± deviation: {:.9f} ± {:.9f}".format(
                cod_mean, cod_dev))

        if args.val:
            compare_dataframes(ibis_df=(X_ibis, y_ibis), pandas_df=(X, y))
    except Exception as err:
        print("Failed: ", err)
        sys.exit(1)
    finally:
        if omnisci_server:
            omnisci_server.terminate()
Example #3
def main():
    args = None
    omnisci_server = None
    port_default_value = -1

    benchmarks = {
        "ny_taxi": "taxi",
        "santander": "santander",
        "census": "census",
        "plasticc": "plasticc",
        "mortgage": "mortgage",
        "h2o": "h2o",
    }
    benchmarks_with_ibis_queries = [
        "ny_taxi", "santander", "census", "plasticc", "mortgage"
    ]

    ignore_fields_for_bd_report_etl = ["t_connect"]
    ignore_fields_for_bd_report_ml = []
    ignore_fields_for_results_unit_conversion = [
        "Backend",
        "dfiles_num",
        "dataset_size",
        "query_name",
    ]

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=sorted(benchmarks.keys()),
        help="Benchmark name.",
        required=True,
    )
    required.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
        required=True,
    )
    optional.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=None,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    optional.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help=
        "Number of iterations to run every query. Best result is selected.",
    )
    optional.add_argument("-dnd",
                          default=False,
                          type=str_arg_to_bool,
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-import_mode",
        dest="import_mode",
        default="fsi",
        help="measure 'COPY FROM' import, FSI import, import through pandas",
    )
    optional.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default=None,
        help="Which optimizer is used",
    )
    optional.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-no_pandas",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Pandas version of benchmark",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=[
            "Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python",
            "Modin_on_omnisci"
        ],
        default="Pandas",
        help=
        "Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help=
        "Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        default=None,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu"
        "(This controls the lines to be used. Also work for CPU version. )",
        default=None,
    )
    optional.add_argument(
        "-extended_functionality",
        dest="extended_functionality",
        default=False,
        type=str_arg_to_bool,
        help=
        "Extends functionality of H2O benchmark by adding 'chk' functions and verbose local reporting of results",
    )

    # MySQL database parameters
    optional.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-executable",
        dest="executable",
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-database_name",
        dest="database_name",
        default="omnisci",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-ipc_conn",
        dest="ipc_conn",
        default=True,
        type=str_arg_to_bool,
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-debug_timer",
        dest="debug_timer",
        default=False,
        type=str_arg_to_bool,
        help="Enable fine-grained query execution timers for debug.",
    )
    optional.add_argument(
        "-columnar_output",
        dest="columnar_output",
        default=True,
        type=str_arg_to_bool,
        help="Allows OmniSci Core to directly materialize intermediate projections "
        "and the final ResultSet in Columnar format where appropriate.",
    )
    optional.add_argument(
        "-lazy_fetch",
        dest="lazy_fetch",
        default=None,
        type=str_arg_to_bool,
        help="[lazy_fetch help message]",
    )
    optional.add_argument(
        "-multifrag_rs",
        dest="multifrag_rs",
        default=None,
        type=str_arg_to_bool,
        help="[multifrag_rs help message]",
    )
    optional.add_argument(
        "-fragments_size",
        dest="fragments_size",
        default=None,
        nargs="*",
        type=int,
        help="Number of rows per fragment that is a unit of the table for query processing. "
        "Should be specified for each table in the workload.",
    )
    optional.add_argument(
        "-omnisci_run_kwargs",
        dest="omnisci_run_kwargs",
        default={},
        metavar="KEY1=VAL1,KEY2=VAL2...",
        action=KeyValueListParser,
        help="options to start omnisci server",
    )
    # Additional information
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_omniscripts",
        dest="commit_omniscripts",
        default="1234567890123456789012345678901234567890",
        help="Omniscripts commit hash used for benchmark.",
    )
    optional.add_argument(
        "-commit_modin",
        dest="commit_modin",
        default="1234567890123456789012345678901234567890",
        help="Modin commit hash used for benchmark.",
    )
    optional.add_argument(
        "-debug_mode",
        dest="debug_mode",
        default=False,
        type=str_arg_to_bool,
        help="Enable debug mode.",
    )

    try:
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"
        omnisci_server_worker = None
        omnisci_server = None

        args = parser.parse_args()

        launch_omnisci_server = (not args.no_ibis and args.bench_name
                                 in benchmarks_with_ibis_queries)

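        # Replace the -1 sentinel values with free TCP ports chosen at runtime.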
        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

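        # Import the selected benchmark module by name and fetch its run_benchmark entry point.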
        run_benchmark = __import__(benchmarks[args.bench_name]).run_benchmark

        parameters = {
            "data_file": args.data_file,
            "dfiles_num": args.dfiles_num,
            "no_ml": args.no_ml,
            "no_ibis": args.no_ibis,
            "optimizer": args.optimizer,
            "pandas_mode": args.pandas_mode,
            "ray_tmpdir": args.ray_tmpdir,
            "ray_memory": args.ray_memory,
            "gpu_memory": args.gpu_memory,
            "validation": args.validation,
            "no_pandas": args.no_pandas,
            "debug_mode": args.debug_mode,
            "extended_functionality": args.extended_functionality,
        }

        if launch_omnisci_server:
            if args.executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable for Ibis part"
                )
            from server import OmnisciServer

            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
                debug_timer=args.debug_timer,
                columnar_output=args.columnar_output,
                lazy_fetch=args.lazy_fetch,
                multifrag_rs=args.multifrag_rs,
                omnisci_run_kwargs=args.omnisci_run_kwargs,
            )

            parameters["database_name"] = args.database_name
            parameters["table"] = args.table
            parameters["dnd"] = args.dnd
            parameters["dni"] = args.dni
            parameters["import_mode"] = args.import_mode
            parameters["fragments_size"] = args.fragments_size

        if parameters["validation"] and (parameters["no_pandas"]
                                         or parameters["no_ibis"]):
            parameters["validation"] = False
            print(
                "WARNING: validation was turned off as it requires both sides to compare."
            )

        etl_results = []
        ml_results = []
        print(parameters)
        run_id = int(round(time.time()))
        for iter_num in range(1, args.iterations + 1):
            print(f"Iteration #{iter_num}")

            if launch_omnisci_server:
                from server_worker import OmnisciServerWorker

                omnisci_server_worker = OmnisciServerWorker(omnisci_server)
                parameters["omnisci_server_worker"] = omnisci_server_worker
                parameters["ipc_connection"] = args.ipc_conn
                omnisci_server.launch()
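            # Expand environment variables in every string-valued parameter before running the benchmark.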
            parameters = {
                key:
                os.path.expandvars(value) if isinstance(value, str) else value
                for key, value in parameters.items()
            }
            benchmark_results = run_benchmark(parameters)

            if launch_omnisci_server:
                omnisci_server_worker.terminate()
                omnisci_server.terminate()

            additional_fields_for_reporting = {
                "ETL": {
                    "Iteration": iter_num,
                    "run_id": run_id
                },
                "ML": {
                    "Iteration": iter_num,
                    "run_id": run_id
                },
            }
            etl_ml_results = refactor_results_for_reporting(
                benchmark_results=benchmark_results,
                ignore_fields_for_results_unit_conversion=
                ignore_fields_for_results_unit_conversion,
                additional_fields=additional_fields_for_reporting,
                reporting_unit="ms",
            )
            etl_results = list(etl_ml_results["ETL"])
            ml_results = list(etl_ml_results["ML"])

            # Reporting to MySQL database
            if args.db_user is not None:
                if iter_num == 1:
                    db = mysql.connector.connect(
                        host=args.db_server,
                        port=args.db_port,
                        user=args.db_user,
                        passwd=args.db_pass,
                        db=args.db_name,
                    )

                    reporting_init_fields = {
                        "OmnisciCommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis,
                        "OmniscriptsCommitHash": args.commit_omniscripts,
                        "ModinCommitHash": args.commit_modin,
                    }

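                    # Declare every reported field as VARCHAR(500) so DbReport can create
                    # matching MySQL columns for the ETL (and, below, the ML) results.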
                    reporting_fields_benchmark_etl = {
                        x: "VARCHAR(500) NOT NULL"
                        for x in etl_results[0]
                    }
                    if len(etl_results) != 1:
                        reporting_fields_benchmark_etl.update({
                            x: "VARCHAR(500) NOT NULL"
                            for x in etl_results[1]
                        })

                    db_reporter_etl = DbReport(
                        db,
                        args.db_table_etl,
                        reporting_fields_benchmark_etl,
                        reporting_init_fields,
                    )

                    if len(ml_results) != 0:
                        reporting_fields_benchmark_ml = {
                            x: "VARCHAR(500) NOT NULL"
                            for x in ml_results[0]
                        }
                        if len(ml_results) != 1:
                            reporting_fields_benchmark_ml.update({
                                x: "VARCHAR(500) NOT NULL"
                                for x in ml_results[1]
                            })

                        db_reporter_ml = DbReport(
                            db,
                            args.db_table_ml,
                            reporting_fields_benchmark_ml,
                            reporting_init_fields,
                        )

                if iter_num == args.iterations:
                    for result_etl in etl_results:
                        remove_fields_from_dict(
                            result_etl, ignore_fields_for_bd_report_etl)
                        db_reporter_etl.submit(result_etl)

                    if len(ml_results) != 0:
                        for result_ml in ml_results:
                            remove_fields_from_dict(
                                result_ml, ignore_fields_for_bd_report_ml)
                            db_reporter_ml.submit(result_ml)

    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
        if omnisci_server:
            omnisci_server.terminate()
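
# find_free_port and KeyValueListParser are used above but are not defined in these examples.
# Minimal sketches (assumptions, not the repository's actual implementations) could look like this:
import argparse
import socket


def find_free_port():
    # Bind to port 0 so the OS picks an unused ephemeral port, then release it and return its number.
    # Note: the port could in principle be taken by another process before the server binds to it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]


class KeyValueListParser(argparse.Action):
    # argparse action that turns "KEY1=VAL1,KEY2=VAL2,..." into a dict stored on the args namespace.
    def __call__(self, parser, namespace, values, option_string=None):
        kwargs = {}
        for pair in values.split(","):
            key, _, val = pair.partition("=")
            kwargs[key] = val
        setattr(namespace, self.dest, kwargs)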
Example #4
def main():
    args = None
    omnisci_server_worker = None
    train_final, test_final = None, None

    parser, args, skip_rows = get_args()

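    # Ibis part: launch an OmniSciDB instance, run the Ibis-based ETL and split steps, and
    # optionally the ML step; the Pandas path below repeats the same flow for comparison.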
    try:
        if not args.no_ibis:
            sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
            from server import OmnisciServer

            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )

            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker

            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            train_final, test_final, etl_times = etl_all_ibis(
                filename=args.dataset_path,
                database_name=args.name,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                skip_rows=skip_rows,
                validation=args.val,
            )
            ml_data, etl_times = split_step(train_final, test_final, etl_times)
            print_times(etl_times)

            omnisci_server_worker.terminate()
            omnisci_server_worker = None

            if not args.no_ml:
                print("using ml with dataframes from ibis")
                ml_times = ml(ml_data)
                print_times(ml_times)

        ptrain_final, ptest_final, petl_times = etl_all_pandas(
            args.dataset_path, skip_rows
        )
        ml_data, petl_times = split_step(ptrain_final, ptest_final, petl_times)
        print_times(petl_times)

        if not args.no_ml:
            print("using ml with dataframes from pandas")
            ml_times = ml(ml_data)
            print_times(ml_times)

        if args.val and train_final is not None and test_final is not None:
            print("validating result ...")
            compare_dataframes((train_final, test_final), (ptrain_final, ptest_final))

    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
Example #5
def main():
    omniscript_path = os.path.dirname(__file__)
    args = None
    omnisci_server = None
    port_default_value = -1

    benchmarks = ["ny_taxi", "santander", "census", "plasticc"]

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    optional = parser._action_groups.pop()
    required = parser.add_argument_group("required arguments")
    parser._action_groups.append(optional)

    required.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=benchmarks,
        help="Benchmark name.",
        required=True,
    )
    required.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
        required=True,
    )
    optional.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=1,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    optional.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help=
        "Number of iterations to run every query. Best result is selected.",
    )
    optional.add_argument("-dnd",
                          default=False,
                          type=str_arg_to_bool,
                          help="Do not delete old table.")
    optional.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    optional.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    optional.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    optional.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    optional.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python"],
        default="Pandas",
        help=
        "Specifies which version of Pandas to use: plain Pandas, Modin runing on Ray or on Dask",
    )
    optional.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help=
        "Location where to keep Ray plasma store. It should have enough space to keep -ray_memory",
    )
    optional.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        help="Size of memory to allocate for Ray plasma store",
    )
    optional.add_argument(
        "-no_ml",
        default=False,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="Specify the memory of your GPU, default 16 "
        "(this controls the number of lines to be used; also works for the CPU version).",
        default=16,
    )
    # MySQL database parameters
    optional.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    optional.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    optional.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    optional.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    optional.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Omnisci server parameters
    optional.add_argument(
        "-executable",
        dest="executable",
        help="Path to omnisci_server executable.",
    )
    optional.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    optional.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    optional.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    optional.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    optional.add_argument(
        "-database_name",
        dest="database_name",
        default="omnisci",
        help="Database name to use in omniscidb server.",
    )
    optional.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    optional.add_argument(
        "-ipc_conn",
        dest="ipc_connection",
        default=True,
        type=str_arg_to_bool,
        help="Table name name to use in omniscidb server.",
    )
    # Additional information
    optional.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for benchmark.",
    )
    optional.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for benchmark.",
    )

    try:
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"
        omnisci_server_worker = None

        args = parser.parse_args()

        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

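        # Pick the benchmark module that provides run_benchmark for the requested bench_name.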
        if args.bench_name == "ny_taxi":
            from taxi import run_benchmark
        elif args.bench_name == "santander":
            from santander import run_benchmark
        elif args.bench_name == "census":
            from census import run_benchmark
        elif args.bench_name == "plasticc":
            from plasticc import run_benchmark

        parameters = {
            "data_file": args.data_file,
            "dfiles_num": args.dfiles_num,
            "no_ml": args.no_ml,
            "no_ibis": args.no_ibis,
            "optimizer": args.optimizer,
            "pandas_mode": args.pandas_mode,
            "ray_tmpdir": args.ray_tmpdir,
            "ray_memory": args.ray_memory,
            "gpu_memory": args.gpu_memory,
        }

        if not args.no_ibis:
            if args.executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable for Ibis part"
                )
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                user=args.user,
                password=args.password,
            )

            parameters["database_name"] = args.database_name
            parameters["table"] = args.table
            parameters["dnd"] = args.dnd
            parameters["dni"] = args.dni
            parameters["validation"] = args.validation

        etl_results = []
        ml_results = []
        print(parameters)
        run_id = int(round(time.time()))
        for iter_num in range(1, args.iterations + 1):
            print(f"Iteration #{iter_num}")

            if not args.no_ibis:
                omnisci_server_worker = OmnisciServerWorker(omnisci_server)
                parameters["omnisci_server_worker"] = omnisci_server_worker
                parameters["ipc_connection"] = args.ipc_connection
                omnisci_server.launch()

            result = run_benchmark(parameters)

            if not args.no_ibis:
                omnisci_server.terminate()

            for backend_res in result["ETL"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    etl_results.append(backend_res)
            for backend_res in result["ML"]:
                if backend_res:
                    backend_res["Iteration"] = iter_num
                    backend_res["run_id"] = run_id
                    ml_results.append(backend_res)

            # Reporting to MySQL database
            if args.db_user is not None:
                if iter_num == 1:
                    db = mysql.connector.connect(
                        host=args.db_server,
                        port=args.db_port,
                        user=args.db_user,
                        passwd=args.db_pass,
                        db=args.db_name,
                    )

                    reporting_init_fields = {
                        "OmnisciCommitHash": args.commit_omnisci,
                        "IbisCommitHash": args.commit_ibis
                    }

                    reporting_fields_benchmark_etl = {
                        x: "VARCHAR(500) NOT NULL"
                        for x in etl_results[0]
                    }
                    if len(etl_results) != 1:
                        reporting_fields_benchmark_etl.update({
                            x: "VARCHAR(500) NOT NULL"
                            for x in etl_results[1]
                        })

                    db_reporter_etl = DbReport(db, args.db_table_etl,
                                               reporting_fields_benchmark_etl,
                                               reporting_init_fields)

                    if len(ml_results) != 0:
                        reporting_fields_benchmark_ml = {
                            x: "VARCHAR(500) NOT NULL"
                            for x in ml_results[0]
                        }
                        if len(ml_results) != 1:
                            reporting_fields_benchmark_ml.update({
                                x: "VARCHAR(500) NOT NULL"
                                for x in ml_results[1]
                            })

                        db_reporter_ml = DbReport(
                            db, args.db_table_ml,
                            reporting_fields_benchmark_ml,
                            reporting_init_fields)

                for result_etl in etl_results:
                    db_reporter_etl.submit(result_etl)

                if len(ml_results) != 0:
                    for result_ml in ml_results:
                        db_reporter_ml.submit(result_ml)

    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    finally:
        if omnisci_server_worker:
            omnisci_server_worker.terminate()
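
# str_arg_to_bool is used as an argparse `type` throughout these examples but is not shown here.
# A minimal sketch of such a helper (an assumption, not the repository's actual code):
import argparse


def str_arg_to_bool(value):
    # Accept the usual textual spellings of booleans coming from the command line.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Cannot recognize boolean value: {value}")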
Example #6
def etl_ibis(args, run_import_queries, columns_names, columns_types, validation=False):

    filename = args.file
    database_name = args.name
    table_name = args.table
    delete_old_database = not args.dnd
    create_new_table = not args.dni
    run_import_queries = str_arg_to_bool(run_import_queries)
    validation = str_arg_to_bool(validation)

    tmp_table_name = "tmp_table"

    etl_times = {"t_groupby_merge_where": 0.0, "t_train_test_split": 0.0, "t_etl": 0.0}

    if run_import_queries:
        etl_times_import = {
            "t_readcsv_by_ibis": 0.0,
            "t_readcsv_by_COPY": 0.0,
            "t_readcsv_by_FSI": 0.0,
        }
        etl_times.update(etl_times_import)

    omnisci_server = OmnisciServer(
        omnisci_executable=args.omnisci_executable,
        omnisci_port=args.omnisci_port,
        database_name=args.name,
        user=args.user,
        password=args.password,
        debug_timer=True,
        columnar_output=args.server_columnar_output,
        lazy_fetch=args.server_lazy_fetch,
    )
    omnisci_server.launch()

    import ibis
    from server_worker import OmnisciServerWorker

    omnisci_server_worker = OmnisciServerWorker(omnisci_server)
    omnisci_server_worker.create_database(
        database_name, delete_if_exists=delete_old_database
    )

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    if run_import_queries:
        # SQL statement preparation for the data file import queries
        connect_to_db_sql_template = "\c {0} admin HyperInteractive"
        create_table_sql_template = """
        CREATE TABLE {0} ({1});
        """
        import_by_COPY_sql_template = """
        COPY {0} FROM '{1}' WITH (header='{2}');
        """
        import_by_FSI_sql_template = """
        CREATE TEMPORARY TABLE {0} ({1}) WITH (storage_type='CSV:{2}');
        """
        drop_table_sql_template = """
        DROP TABLE IF EXISTS {0};
        """

        import_query_cols_list = (
            ["ID_code TEXT ENCODING NONE, \n", "target SMALLINT, \n"]
            + ["var_%s DOUBLE, \n" % i for i in range(199)]
            + ["var_199 DOUBLE"]
        )
        import_query_cols_str = "".join(import_query_cols_list)

        connect_to_db_sql = connect_to_db_sql_template.format(database_name)
        create_table_sql = create_table_sql_template.format(
            tmp_table_name, import_query_cols_str
        )
        import_by_COPY_sql = import_by_COPY_sql_template.format(
            tmp_table_name, filename, "true"
        )
        import_by_FSI_sql = import_by_FSI_sql_template.format(
            tmp_table_name, import_query_cols_str, filename
        )

        # data file import by ibis
        columns_types_import_query = ["string", "int64"] + [
            "float64" for _ in range(200)
        ]
        schema_table_import = ibis.Schema(
            names=columns_names, types=columns_types_import_query
        )
        omnisci_server_worker.get_conn().create_table(
            table_name=tmp_table_name,
            schema=schema_table_import,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import_query = omnisci_server_worker.database(database_name).table(tmp_table_name)
        t0 = timer()
        table_import_query.read_csv(filename, delimiter=",")
        etl_times["t_readcsv_by_ibis"] = timer() - t0

        # data file import by FSI
        omnisci_server_worker.drop_table(tmp_table_name)
        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_FSI_sql)
        etl_times["t_readcsv_by_FSI"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

        # data file import by SQL COPY statement
        omnisci_server_worker.execute_sql_query(create_table_sql)

        t0 = timer()
        omnisci_server_worker.execute_sql_query(import_by_COPY_sql)
        etl_times["t_readcsv_by_COPY"] = timer() - t0

        omnisci_server_worker.drop_table(tmp_table_name)

    if create_new_table:
        # Create table and import data for ETL queries
        schema_table = ibis.Schema(names=columns_names, types=columns_types)
        omnisci_server_worker.get_conn().create_table(
            table_name=table_name,
            schema=schema_table,
            database=database_name,
            fragment_size=args.fragment_size,
        )

        table_import = omnisci_server_worker.database(database_name).table(table_name)
        table_import.read_csv(filename, delimiter=",")

    if args.server_conn_type == "regular":
        omnisci_server_worker.connect_to_server()
    elif args.server_conn_type == "ipc":
        omnisci_server_worker.ipc_connect_to_server()
    else:
        print("Wrong connection type is specified!")
        sys.exit(1)

    db = omnisci_server_worker.database(database_name)
    table = db.table(table_name)

    # group_by/count, merge (join) and filtration queries
    # We build 400 derived columns and then insert them into the original table, thus avoiding
    # nested SQL requests.
    t0 = timer()
    count_cols = []
    orig_cols = ["ID_code", "target"] + ["var_%s" % i for i in range(200)]
    cast_cols = []
    cast_cols.append(table["target"].cast("int64").name("target0"))
    gt1_cols = []
    for i in range(200):
        col = "var_%d" % i
        col_count = "var_%d_count" % i
        col_gt1 = "var_%d_gt1" % i
        w = ibis.window(group_by=col)
        count_cols.append(table[col].count().over(w).name(col_count))
        gt1_cols.append(
            ibis.case()
            .when(
                table[col].count().over(w).name(col_count) > 1,
                table[col].cast("float32"),
            )
            .else_(ibis.null())
            .end()
            .name("var_%d_gt1" % i)
        )
        cast_cols.append(table[col].cast("float32").name(col))

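    # The mutate/drop calls below only build Ibis expressions; the work runs on the server
    # when table.execute() materializes the result into a pandas DataFrame.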
    table = table.mutate(count_cols)
    table = table.drop(orig_cols)
    table = table.mutate(gt1_cols)
    table = table.mutate(cast_cols)

    table_df = table.execute()
    etl_times["t_groupby_merge_where"] = timer() - t0

    # rows split query
    t0 = timer()
    training_part, validation_part = table_df[:-10000], table_df[-10000:]
    etl_times["t_train_test_split"] = timer() - t0
    
    etl_times["t_etl"] = etl_times["t_groupby_merge_where"] + etl_times["t_train_test_split"]
    
    x_train = training_part.drop(["target0"], axis=1)
    y_train = training_part["target0"]
    x_valid = validation_part.drop(["target0"], axis=1)
    y_valid = validation_part["target0"]
    
    omnisci_server.terminate()
    omnisci_server = None

    return x_train, y_train, x_valid, y_valid, etl_times
Example #7
def main():
    omniscript_path = os.path.dirname(__file__)
    omnisci_server = None
    args = None
    port_default_value = -1

    parser = argparse.ArgumentParser(description="Run internal tests from ibis project")
    required = parser.add_argument_group("common")
    optional = parser.add_argument_group("optional arguments")
    omnisci = parser.add_argument_group("omnisci")
    benchmark = parser.add_argument_group("benchmark")
    mysql = parser.add_argument_group("mysql")
    commits = parser.add_argument_group("commits")

    possible_tasks = ["build", "test", "benchmark"]
    benchmarks = ["ny_taxi", "santander", "census", "plasticc", "mortgage", "h2o"]

    # Task
    required.add_argument(
        "-task",
        dest="task",
        required=True,
        help=f"Task for execute {possible_tasks}. Use , separator for multiple tasks",
    )

    # Environment
    required.add_argument("-en", "--env_name", dest="env_name", help="Conda env name.")
    optional.add_argument(
        "-ec",
        "--env_check",
        dest="env_check",
        default=False,
        type=str_arg_to_bool,
        help="Check if env exists. If it exists don't recreate.",
    )
    optional.add_argument(
        "-s",
        "--save_env",
        dest="save_env",
        default=False,
        type=str_arg_to_bool,
        help="Save conda env after executing.",
    )
    optional.add_argument(
        "-r",
        "--report_path",
        dest="report_path",
        default=os.path.join(omniscript_path, ".."),
        help="Path to report file.",
    )
    optional.add_argument(
        "-ci",
        "--ci_requirements",
        dest="ci_requirements",
        default=os.path.join(omniscript_path, "ci_requirements.yml"),
        help="File with ci requirements for conda env.",
    )
    optional.add_argument(
        "-py",
        "--python_version",
        dest="python_version",
        default="3.7",
        help="File with ci requirements for conda env.",
    )

    # Ibis
    optional.add_argument(
        "-i", "--ibis_path", dest="ibis_path", required=False, help="Path to ibis directory."
    )

    # Ibis tests
    optional.add_argument(
        "-expression",
        dest="expression",
        default=" ",
        help="Run tests which match the given substring test names and their parent "
        "classes. Example: 'test_other', while 'not test_method' matches those "
        "that don't contain 'test_method' in their names.",
    )

    # Modin
    optional.add_argument(
        "-m", "--modin_path", dest="modin_path", default=None, help="Path to modin directory."
    )
    optional.add_argument(
        "--modin_pkgs_dir",
        dest="modin_pkgs_dir",
        default=None,
        type=str,
        help="Path where to store built Modin dependencies (--target flag for pip), can be helpful if you have space limited home directory.",
    )
    optional.add_argument(
        "--manage_dbe_dir",
        dest="manage_dbe_dir",
        default=False,
        type=str_arg_to_bool,
        help="Manage (create and initialize) DBE data directory on the 'build' step.",
    )

    # Omnisci server parameters
    omnisci.add_argument(
        "-executable", dest="executable", required=False, help="Path to omnisci_server executable."
    )
    omnisci.add_argument(
        "-omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    omnisci.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-user", dest="user", default="admin", help="User name to use on omniscidb server."
    )
    omnisci.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-database_name",
        dest="database_name",
        default="agent_test_ibis",
        help="Database name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-ipc_conn",
        dest="ipc_conn",
        default=True,
        type=str_arg_to_bool,
        help="Connection type for ETL operations",
    )
    omnisci.add_argument(
        "-debug_timer",
        dest="debug_timer",
        default=False,
        type=str_arg_to_bool,
        help="Enable fine-grained query execution timers for debug.",
    )
    omnisci.add_argument(
        "-columnar_output",
        dest="columnar_output",
        default=True,
        type=str_arg_to_bool,
        help="Allows OmniSci Core to directly materialize intermediate projections \
            and the final ResultSet in Columnar format where appropriate.",
    )
    omnisci.add_argument(
        "-lazy_fetch",
        dest="lazy_fetch",
        default=None,
        type=str_arg_to_bool,
        help="[lazy_fetch help message]",
    )
    omnisci.add_argument(
        "-multifrag_rs",
        dest="multifrag_rs",
        default=None,
        type=str_arg_to_bool,
        help="[multifrag_rs help message]",
    )
    omnisci.add_argument(
        "-fragments_size",
        dest="fragments_size",
        default=None,
        nargs="*",
        type=int,
        help="Number of rows per fragment that is a unit of the table for query processing. \
            Should be specified for each table in workload",
    )
    omnisci.add_argument(
        "-omnisci_run_kwargs",
        dest="omnisci_run_kwargs",
        default={},
        metavar="KEY1=VAL1,KEY2=VAL2...",
        action=KeyValueListParser,
        help="options to start omnisci server",
    )

    # Benchmark parameters
    benchmark.add_argument(
        "-bench_name", dest="bench_name", choices=benchmarks, help="Benchmark name."
    )
    benchmark.add_argument(
        "-data_file", dest="data_file", help="A datafile that should be loaded."
    )
    benchmark.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=None,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    benchmark.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help="Number of iterations to run every query. Best result is selected.",
    )
    benchmark.add_argument(
        "-dnd", default=False, type=str_arg_to_bool, help="Do not delete old table."
    )
    benchmark.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    benchmark.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help="validate queries results (by comparison with Pandas queries results).",
    )
    benchmark.add_argument(
        "-import_mode",
        dest="import_mode",
        default="fsi",
        help="you can choose: {copy-from, pandas, fsi}",
    )
    benchmark.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default=None,
        help="Which optimizer is used",
    )
    benchmark.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    benchmark.add_argument(
        "-no_pandas",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Pandas version of benchmark",
    )
    benchmark.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python", "Modin_on_omnisci"],
        default="Pandas",
        help="Specifies which version of Pandas to use: "
        "plain Pandas, Modin runing on Ray or on Dask",
    )
    benchmark.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. "
        "It should have enough space to keep -ray_memory",
    )
    benchmark.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store",
    )
    benchmark.add_argument(
        "-no_ml",
        default=None,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu"
        "(This controls the lines to be used. Also work for CPU version. )",
        default=None,
    )
    benchmark.add_argument(
        "-extended_functionality",
        dest="extended_functionality",
        default=False,
        type=str_arg_to_bool,
        help="Extends functionality of H2O benchmark by adding 'chk' functions and verbose local reporting of results",
    )
    # MySQL database parameters
    mysql.add_argument(
        "-db_server", dest="db_server", default="localhost", help="Host name of MySQL server."
    )
    mysql.add_argument(
        "-db_port", dest="db_port", default=3306, type=int, help="Port number of MySQL server."
    )
    mysql.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    mysql.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    mysql.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Additional information
    commits.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash used for tests.",
    )
    commits.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash used for tests.",
    )
    commits.add_argument(
        "-commit_omniscripts",
        dest="commit_omniscripts",
        default="1234567890123456789012345678901234567890",
        help="Omniscripts commit hash used for tests.",
    )
    commits.add_argument(
        "-commit_modin",
        dest="commit_modin",
        default="1234567890123456789012345678901234567890",
        help="Modin commit hash used for tests.",
    )
    optional.add_argument(
        "-debug_mode",
        dest="debug_mode",
        default=False,
        type=str_arg_to_bool,
        help="Enable debug mode.",
    )

    try:
        args = parser.parse_args()

        os.environ["IBIS_TEST_OMNISCIDB_DATABASE"] = args.database_name
        os.environ["IBIS_TEST_DATA_DB"] = args.database_name
        os.environ["IBIS_TEST_OMNISCIDB_PORT"] = str(args.port)
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        required_tasks = args.task.split(",")
        tasks = {}
        for task in possible_tasks:
            tasks[task] = task in required_tasks

        if True not in list(tasks.values()):
            raise ValueError(
                f"Only {list(tasks.keys())} tasks are supported; none of them were found in {required_tasks}"
            )

        if args.python_version not in ["3.7", "3.6"]:
            raise NotImplementedError(
                f"Only 3.7 and 3.6 python versions are supported, {args.python_version} is not supported"
            )

        conda_env = CondaEnvironment(args.env_name)
        print("PREPARING ENVIRONMENT")
        conda_env.create(
            args.env_check,
            requirements_file=args.ci_requirements,
            python_version=args.python_version,
        )
        if tasks["build"]:
            install_cmdline = ["python3", "setup.py", "install"]

            if args.ibis_path:
                ibis_requirements = os.path.join(
                    args.ibis_path, "ci", f"requirements-{args.python_version}-dev.yml"
                )
                install_ibis_reqs_cmdline = [
                    "conda",
                    "env",
                    "update",
                    "--name",
                    f"{args.env_name}",
                    "--file",
                    ibis_requirements,
                ]

                print("INSTALLATION OF IBIS DEPENDENCIES")
                conda_env.run(install_ibis_reqs_cmdline, print_output=False)

                print("IBIS INSTALLATION")
                conda_env.run(install_cmdline, cwd=args.ibis_path, print_output=False)

            if args.modin_path:
                install_modin_reqs_cmdline = [
                    "conda",
                    "env",
                    "update",
                    "--name",
                    f"{args.env_name}",
                    "--file",
                    "environment-dev.yml",
                ]
                if args.modin_pkgs_dir:
                    os.environ["PYTHONPATH"] = (
                        os.getenv("PYTHONPATH") + os.pathsep + args.modin_pkgs_dir
                        if os.getenv("PYTHONPATH")
                        else args.modin_pkgs_dir
                    )
                print("INSTALLATION OF MODIN DEPENDENCIES")
                # Installation of Modin dependencies may finish with errors. If an error occurs,
                # try to rebase your branch onto the current Modin master.
                try:
                    conda_env.run(
                        install_modin_reqs_cmdline, cwd=args.modin_path, print_output=False
                    )
                except Exception:
                    print("INSTALLATION OF MODIN DEPENDENCIES PROCESSED WITH ERRORS")

                print("MODIN INSTALLATION")
                # Modin installation is wrapped in try/except because "conda run --name env_name
                # python3 setup.py install" (called by "conda_env.run") emits a warning that a plain
                # "python3 setup.py install" does not. Omniscripts treats this warning as an error,
                # which raises an exception.
                try:
                    conda_env.run(install_cmdline, cwd=args.modin_path, print_output=False)
                except Exception:
                    print("MODIN INSTALLATION PROCESSED WITH ERRORS")

            # Try to install the DBE extension if omniscidb generated it
            executables_path = os.path.dirname(args.executable)
            dbe_path = os.path.join(os.path.abspath(f"{executables_path}/.."), "Embedded")
            initdb_path = os.path.join(executables_path, "initdb")
            data_dir = os.path.join(os.path.dirname(__file__), "data")
            initdb_cmdline = [initdb_path, "--data", data_dir]

            if not os.path.isdir(data_dir) and args.manage_dbe_dir:
                print("MANAGING OMNISCI DATA DIR", data_dir)
                os.makedirs(data_dir)
                conda_env.run(initdb_cmdline, print_output=False)

            if os.path.exists(dbe_path):
                print("DBE INSTALLATION")
                cmake_cmdline = [
                    "cmake",
                    "--install",
                    "build",
                    "--component",
                    "DBE",
                    "--prefix",
                    "$CONDA_PREFIX",
                ]
                cmake_qe_cmdline = [
                    "cmake",
                    "--install",
                    "build",
                    "--component",
                    "QE",
                    "--prefix",
                    "$CONDA_PREFIX",
                ]
                cmake_thrift_cmdline = [
                    "cmake",
                    "--install",
                    "build",
                    "--component",
                    "thrift",
                    "--prefix",
                    "$CONDA_PREFIX",
                ]
                cmake_jar_cmdline = [
                    "cmake",
                    "--install",
                    "build",
                    "--component",
                    "jar",
                    "--prefix",
                    "$CONDA_PREFIX",
                ]
                omniscidb_root = os.path.abspath(f"{executables_path}/../../")
                conda_env.run(cmake_cmdline, cwd=omniscidb_root, print_output=False)
                conda_env.run(cmake_qe_cmdline, cwd=omniscidb_root, print_output=False)
                conda_env.run(cmake_thrift_cmdline, cwd=omniscidb_root, print_output=False)
                conda_env.run(cmake_jar_cmdline, cwd=omniscidb_root, print_output=False)
                conda_env.run(install_cmdline, cwd=dbe_path, print_output=False)
            else:
                print("Using Omnisci server")

        if tasks["test"]:
            ibis_data_script = os.path.join(args.ibis_path, "ci", "datamgr.py")
            dataset_download_cmdline = ["python3", ibis_data_script, "download"]
            dataset_import_cmdline = [
                "python3",
                ibis_data_script,
                "omniscidb",
                "-P",
                str(args.port),
                "--database",
                args.database_name,
            ]
            report_file_name = f"report-{args.commit_ibis[:8]}-{args.commit_omnisci[:8]}.html"
            if not os.path.isdir(args.report_path):
                os.makedirs(args.report_path)
            report_file_path = os.path.join(args.report_path, report_file_name)

            ibis_tests_cmdline = [
                "pytest",
                "-m",
                "omniscidb",
                "--disable-pytest-warnings",
                "-k",
                args.expression,
                f"--html={report_file_path}",
            ]

            print("STARTING OMNISCI SERVER")
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
                debug_timer=args.debug_timer,
                columnar_output=args.columnar_output,
                lazy_fetch=args.lazy_fetch,
                multifrag_rs=args.multifrag_rs,
                omnisci_run_kwargs=args.omnisci_run_kwargs,
            )
            omnisci_server.launch()

            print("PREPARING DATA")
            conda_env.run(dataset_download_cmdline)
            conda_env.run(dataset_import_cmdline)

            print("RUNNING TESTS")
            conda_env.run(ibis_tests_cmdline, cwd=args.ibis_path)

        if tasks["benchmark"]:
            # if not args.bench_name or args.bench_name not in benchmarks:
            #     print(
            #     f"Benchmark {args.bench_name} is not supported, only {benchmarks} are supported")
            # sys.exit(1)

            if not args.data_file:
                raise ValueError(
                    "Parameter --data_file was received empty, but it is required for benchmarks"
                )

            benchmark_script_path = os.path.join(omniscript_path, "run_ibis_benchmark.py")

            benchmark_cmd = ["python3", benchmark_script_path]

            possible_benchmark_args = [
                "bench_name",
                "data_file",
                "dfiles_num",
                "iterations",
                "dnd",
                "dni",
                "validation",
                "optimizer",
                "no_ibis",
                "no_pandas",
                "pandas_mode",
                "ray_tmpdir",
                "ray_memory",
                "no_ml",
                "gpu_memory",
                "db_server",
                "db_port",
                "db_user",
                "db_pass",
                "db_name",
                "db_table_etl",
                "db_table_ml",
                "executable",
                "omnisci_cwd",
                "port",
                "http_port",
                "calcite_port",
                "user",
                "password",
                "ipc_conn",
                "database_name",
                "table",
                "commit_omnisci",
                "commit_ibis",
                "import_mode",
                "debug_timer",
                "columnar_output",
                "lazy_fetch",
                "multifrag_rs",
                "fragments_size",
                "omnisci_run_kwargs",
                "commit_omniscripts",
                "debug_mode",
                "extended_functionality",
                "commit_modin",
            ]
            args_dict = vars(args)
            args_dict["data_file"] = f"'{args_dict['data_file']}'"
            for arg_name in list(parser._option_string_actions.keys()):
                try:
                    pure_arg = re.sub(r"^--*", "", arg_name)
                    if pure_arg in possible_benchmark_args:
                        arg_value = args_dict[pure_arg]
                        # only forward arguments that are not left at their default None value
                        if arg_value is not None:
                            if isinstance(arg_value, dict):
                                if arg_value:
                                    benchmark_cmd.extend(
                                        [
                                            arg_name,
                                            ",".join(
                                                f"{key}={value}"
                                                for key, value in arg_value.items()
                                            ),
                                        ]
                                    )
                            elif isinstance(arg_value, (list, tuple)):
                                if arg_value:
                                    benchmark_cmd.extend([arg_name] + [str(x) for x in arg_value])
                            else:
                                benchmark_cmd.extend([arg_name, str(arg_value)])

                except KeyError:
                    pass

            print(benchmark_cmd)

            conda_env.run(benchmark_cmd)

    except Exception:
        traceback.print_exc(file=sys.stdout)
        raise

    finally:
        if omnisci_server:
            omnisci_server.terminate()
        if args and args.save_env is False:
            conda_env.remove()
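
The argument definitions above rely on two helpers from omniscripts' utilities, str_arg_to_bool and KeyValueListParser, whose implementations are not shown in this listing. A minimal sketch of what such helpers typically look like (assumed behavior, not the project's actual code):

import argparse


def str_arg_to_bool(value):
    # Convert command-line strings such as "True"/"false"/"1" into a real bool.
    if isinstance(value, bool):
        return value
    if value.lower() in ("yes", "true", "t", "y", "1"):
        return True
    if value.lower() in ("no", "false", "f", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(f"Cannot recognize boolean value: {value}")


class KeyValueListParser(argparse.Action):
    # Parse "KEY1=VAL1,KEY2=VAL2" into a dict and store it on the namespace.
    def __call__(self, parser, namespace, values, option_string=None):
        kwargs = {}
        for pair in values.split(","):
            key, _, value = pair.partition("=")
            kwargs[key] = value
        setattr(namespace, self.dest, kwargs)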
Example #8
def main():
    omniscript_path = os.path.dirname(__file__)
    omnisci_server = None
    args = None
    port_default_value = -1

    parser = argparse.ArgumentParser(
        description="Run internal tests from ibis project")
    required = parser._action_groups.pop()
    optional = parser.add_argument_group("optional arguments")
    omnisci = parser.add_argument_group("omnisci")
    benchmark = parser.add_argument_group("benchmark")
    mysql = parser.add_argument_group("mysql")
    commits = parser.add_argument_group("commits")

    possible_tasks = ["build", "test", "benchmark"]
    benchmarks = ["ny_taxi", "santander", "census", "plasticc"]
    # Task
    required.add_argument(
        "-task",
        dest="task",
        required=True,
        help=
        f"Task to execute, one of {possible_tasks}. Use ',' as a separator for multiple tasks.",
    )

    # Environment
    required.add_argument("-en",
                          "--env_name",
                          dest="env_name",
                          help="Conda env name.")
    optional.add_argument(
        "-ec",
        "--env_check",
        dest="env_check",
        default=False,
        type=str_arg_to_bool,
        help="Check if env exists. If it exists don't recreate.",
    )
    optional.add_argument(
        "-s",
        "--save_env",
        dest="save_env",
        default=False,
        type=str_arg_to_bool,
        help="Save conda env after executing.",
    )
    optional.add_argument(
        "-r",
        "--report_path",
        dest="report_path",
        default=os.path.join(omniscript_path, ".."),
        help="Path to report file.",
    )
    optional.add_argument(
        "-ci",
        "--ci_requirements",
        dest="ci_requirements",
        default=os.path.join(omniscript_path, "ci_requirements.yml"),
        help="File with ci requirements for conda env.",
    )
    optional.add_argument(
        "-py",
        "--python_version",
        dest="python_version",
        default="3.7",
        help="File with ci requirements for conda env.",
    )
    # Ibis
    required.add_argument(
        "-i",
        "--ibis_path",
        dest="ibis_path",
        required=True,
        help="Path to ibis directory.",
    )
    # Ibis tests
    optional.add_argument(
        "-expression",
        dest="expression",
        default=" ",
        help=
        "Run tests which match the given substring test names and their parent "
        "classes. Example: 'test_other', while 'not test_method' matches those "
        "that don't contain 'test_method' in their names.",
    )
    # Omnisci server parameters
    omnisci.add_argument(
        "-executable",
        dest="executable",
        required=True,
        help="Path to omnisci_server executable.",
    )
    omnisci.add_argument(
        "--omnisci_cwd",
        dest="omnisci_cwd",
        help="Path to omnisci working directory. "
        "By default parent directory of executable location is used. "
        "Data directory is used in this location.",
    )
    omnisci.add_argument(
        "-port",
        dest="port",
        default=port_default_value,
        type=int,
        help="TCP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-http_port",
        dest="http_port",
        default=port_default_value,
        type=int,
        help="HTTP port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-calcite_port",
        dest="calcite_port",
        default=port_default_value,
        type=int,
        help="Calcite port number to run omnisci_server on.",
    )
    omnisci.add_argument(
        "-user",
        dest="user",
        default="admin",
        help="User name to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-password",
        dest="password",
        default="HyperInteractive",
        help="User password to use on omniscidb server.",
    )
    omnisci.add_argument(
        "-database_name",
        dest="database_name",
        default="agent_test_ibis",
        help="Database name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-table",
        dest="table",
        default="benchmark_table",
        help="Table name name to use in omniscidb server.",
    )
    omnisci.add_argument(
        "-ipc_conn",
        dest="ipc_connection",
        default=True,
        type=str_arg_to_bool,
        help="Table name name to use in omniscidb server.",
    )
    # Benchmark parameters
    benchmark.add_argument(
        "-bench_name",
        dest="bench_name",
        choices=benchmarks,
        help="Benchmark name.",
    )
    benchmark.add_argument(
        "-data_file",
        dest="data_file",
        help="A datafile that should be loaded.",
    )
    benchmark.add_argument(
        "-dfiles_num",
        dest="dfiles_num",
        default=1,
        type=int,
        help="Number of datafiles to input into database for processing.",
    )
    benchmark.add_argument(
        "-iterations",
        dest="iterations",
        default=1,
        type=int,
        help=
        "Number of iterations to run every query. Best result is selected.",
    )
    benchmark.add_argument("-dnd",
                           default=False,
                           type=str_arg_to_bool,
                           help="Do not delete old table.")
    benchmark.add_argument(
        "-dni",
        default=False,
        type=str_arg_to_bool,
        help="Do not create new table and import any data from CSV files.",
    )
    benchmark.add_argument(
        "-validation",
        dest="validation",
        default=False,
        type=str_arg_to_bool,
        help=
        "validate queries results (by comparison with Pandas queries results).",
    )
    benchmark.add_argument(
        "-optimizer",
        choices=["intel", "stock"],
        dest="optimizer",
        default="intel",
        help="Which optimizer is used",
    )
    benchmark.add_argument(
        "-no_ibis",
        default=False,
        type=str_arg_to_bool,
        help="Do not run Ibis benchmark, run only Pandas (or Modin) version",
    )
    benchmark.add_argument(
        "-pandas_mode",
        choices=["Pandas", "Modin_on_ray", "Modin_on_dask", "Modin_on_python"],
        default="Pandas",
        help="Specifies which version of Pandas to use: "
        "plain Pandas, Modin runing on Ray or on Dask",
    )
    benchmark.add_argument(
        "-ray_tmpdir",
        default="/tmp",
        help="Location where to keep Ray plasma store. "
        "It should have enough space to keep -ray_memory",
    )
    benchmark.add_argument(
        "-ray_memory",
        default=200 * 1024 * 1024 * 1024,
        type=int,
        help="Size of memory to allocate for Ray plasma store",
    )
    benchmark.add_argument(
        "-no_ml",
        default=False,
        type=str_arg_to_bool,
        help="Do not run machine learning benchmark, only ETL part",
    )
    optional.add_argument(
        "-gpu_memory",
        dest="gpu_memory",
        type=int,
        help="specify the memory of your gpu, default 16. "
        "(This controls the lines to be used. Also work for CPU version. )",
        default=16,
    )
    # MySQL database parameters
    mysql.add_argument(
        "-db_server",
        dest="db_server",
        default="localhost",
        help="Host name of MySQL server.",
    )
    mysql.add_argument(
        "-db_port",
        dest="db_port",
        default=3306,
        type=int,
        help="Port number of MySQL server.",
    )
    mysql.add_argument(
        "-db_user",
        dest="db_user",
        help="Username to use to connect to MySQL database. "
        "If user name is specified, script attempts to store results in MySQL "
        "database using other -db-* parameters.",
    )
    mysql.add_argument(
        "-db_pass",
        dest="db_pass",
        default="omniscidb",
        help="Password to use to connect to MySQL database.",
    )
    mysql.add_argument(
        "-db_name",
        dest="db_name",
        default="omniscidb",
        help="MySQL database to use to store benchmark results.",
    )
    optional.add_argument(
        "-db_table_etl",
        dest="db_table_etl",
        help="Table to use to store ETL results for this benchmark.",
    )
    optional.add_argument(
        "-db_table_ml",
        dest="db_table_ml",
        help="Table to use to store ML results for this benchmark.",
    )
    # Additional information
    commits.add_argument(
        "-commit_omnisci",
        dest="commit_omnisci",
        default="1234567890123456789012345678901234567890",
        help="Omnisci commit hash to use for tests.",
    )
    commits.add_argument(
        "-commit_ibis",
        dest="commit_ibis",
        default="1234567890123456789012345678901234567890",
        help="Ibis commit hash to use for tests.",
    )

    try:
        args = parser.parse_args()

        os.environ["IBIS_TEST_OMNISCIDB_DATABASE"] = args.database_name
        os.environ["IBIS_TEST_DATA_DB"] = args.database_name
        os.environ["IBIS_TEST_OMNISCIDB_PORT"] = str(args.port)
        os.environ["PYTHONIOENCODING"] = "UTF-8"
        os.environ["PYTHONUNBUFFERED"] = "1"

        if args.port == port_default_value:
            args.port = find_free_port()
        if args.http_port == port_default_value:
            args.http_port = find_free_port()
        if args.calcite_port == port_default_value:
            args.calcite_port = find_free_port()

        required_tasks = args.task.split(",")
        tasks = {}
        for task in possible_tasks:
            tasks[task] = task in required_tasks

        if True not in list(tasks.values()):
            print(
                f"Only {list(tasks.keys())} are supported, {required_tasks} cannot find possible tasks"
            )
            sys.exit(1)

        if args.python_version not in ["3.7", "3.6"]:
            print(
                f"Only 3.7 and 3.6 python versions are supported, {args.python_version} is not supported"
            )
            sys.exit(1)

        ibis_requirements = os.path.join(
            args.ibis_path, "ci",
            f"requirements-{args.python_version}-dev.yml")
        requirements_file = "requirements.yml"

        conda_env = CondaEnvironment(args.env_name)

        print("PREPARING ENVIRONMENT")
        combinate_requirements(ibis_requirements, args.ci_requirements,
                               requirements_file)
        conda_env.create(args.env_check, requirements_file=requirements_file)

        if tasks["build"]:
            install_ibis_cmdline = [
                "python3", os.path.join("setup.py"), "install"
            ]

            print("IBIS INSTALLATION")
            conda_env.run(install_ibis_cmdline,
                          cwd=args.ibis_path,
                          print_output=False)

        if tasks["test"]:
            ibis_data_script = os.path.join(args.ibis_path, "ci", "datamgr.py")
            dataset_download_cmdline = [
                "python3", ibis_data_script, "download"
            ]
            dataset_import_cmdline = [
                "python3",
                ibis_data_script,
                "omniscidb",
                "-P",
                str(args.port),
                "--database",
                args.database_name,
            ]
            report_file_name = (
                f"report-{args.commit_ibis[:8]}-{args.commit_omnisci[:8]}.html"
            )
            if not os.path.isdir(args.report_path):
                os.makedirs(args.report_path)
            report_file_path = os.path.join(args.report_path, report_file_name)

            ibis_tests_cmdline = [
                "pytest",
                "-m",
                "omniscidb",
                "--disable-pytest-warnings",
                "-k",
                args.expression,
                f"--html={report_file_path}",
            ]

            print("STARTING OMNISCI SERVER")
            omnisci_server = OmnisciServer(
                omnisci_executable=args.executable,
                omnisci_port=args.port,
                http_port=args.http_port,
                calcite_port=args.calcite_port,
                database_name=args.database_name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            print("PREPARING DATA")
            conda_env.run(dataset_download_cmdline)
            conda_env.run(dataset_import_cmdline)

            print("RUNNING TESTS")
            conda_env.run(ibis_tests_cmdline, cwd=args.ibis_path)

        if tasks["benchmark"]:
            # if not args.bench_name or args.bench_name not in benchmarks:
            #     print(
            #     f"Benchmark {args.bench_name} is not supported, only {benchmarks} are supported")
            # sys.exit(1)

            if not args.data_file:
                print(
                    f"Parameter --data_file was received empty, but it is required for benchmarks"
                )
                sys.exit(1)

            benchmark_script_path = os.path.join(omniscript_path,
                                                 "run_ibis_benchmark.py")

            benchmark_cmd = ["python3", benchmark_script_path]

            possible_benchmark_args = [
                "bench_name",
                "data_file",
                "dfiles_num",
                "iterations",
                "dnd",
                "dni",
                "validation",
                "optimizer",
                "no_ibis",
                "pandas_mode",
                "ray_tmpdir",
                "ray_memory",
                "no_ml",
                "gpu_memory",
                "db_server",
                "db_port",
                "db_user",
                "db_pass",
                "db_name",
                "db_table_etl",
                "db_table_ml",
                "executable",
                "omnisci_cwd",
                "port",
                "http_port",
                "calcite_port",
                "user",
                "password",
                "ipc_connection",
                "database_name",
                "table",
                "commit_omnisci",
                "commit_ibis",
            ]
            args_dict = vars(args)
            args_dict["data_file"] = f"'{args_dict['data_file']}'"
            for arg_name in list(parser._option_string_actions.keys()):
                try:
                    pure_arg = re.sub(r"^--*", "", arg_name)
                    if pure_arg in possible_benchmark_args:
                        arg_value = args_dict[pure_arg]
                        if arg_value:
                            benchmark_cmd.extend([arg_name, str(arg_value)])
                except KeyError:
                    pass

            print(benchmark_cmd)

            conda_env.run(benchmark_cmd)

    except Exception:
        traceback.print_exc(file=sys.stdout)
        sys.exit(1)

    finally:
        if omnisci_server:
            omnisci_server.terminate()
        if args and args.save_env is False:
            conda_env.remove()
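
Both examples replace the port sentinel value -1 (port_default_value) with a port returned by find_free_port. A minimal sketch of such a helper, assuming the usual "bind to port 0 and let the OS choose" approach (not necessarily the omniscripts implementation):

import socket


def find_free_port():
    # Bind to port 0 so the OS assigns a free ephemeral port, then return it.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))
        return sock.getsockname()[1]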
Example #9
def main():
    args = None
    omnisci_server = None

    parser, args, skip_rows = get_args()

    try:
        if not args.no_ibis:
            sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
            from server import OmnisciServer

            if args.omnisci_executable is None:
                parser.error(
                    "Omnisci executable should be specified with -e/--executable"
                )

            omnisci_server = OmnisciServer(
                omnisci_executable=args.omnisci_executable,
                omnisci_port=args.omnisci_port,
                database_name=args.name,
                omnisci_cwd=args.omnisci_cwd,
                user=args.user,
                password=args.password,
            )
            omnisci_server.launch()

            from server_worker import OmnisciServerWorker

            omnisci_server_worker = OmnisciServerWorker(omnisci_server)

            (
                X_train,
                y_train,
                X_test,
                y_test,
                Xt,
                classes,
                class_weights,
                etl_times,
            ) = etl_all_ibis(
                filename=args.dataset_path,
                database_name=args.name,
                omnisci_server_worker=omnisci_server_worker,
                delete_old_database=not args.dnd,
                create_new_table=not args.dni,
                skip_rows=skip_rows,
            )
            print_times(etl_times)

            omnisci_server.terminate()
            omnisci_server = None

            if not args.no_ml:
                print("using ml with dataframes from ibis")
                ml_times = ml(X_train, y_train, X_test, y_test, Xt, classes,
                              class_weights)
                print_times(ml_times)

        (
            X_train,
            y_train,
            X_test,
            y_test,
            Xt,
            classes,
            class_weights,
            etl_times,
        ) = etl_all_pandas(args.dataset_path, skip_rows)
        print_times(etl_times)

        if not args.no_ml:
            print("using ml with dataframes from pandas")
            ml_times = ml(X_train, y_train, X_test, y_test, Xt, classes,
                          class_weights)
            print_times(ml_times)

        if args.val:
            # this doesn't work that simply
            # compare_dataframes(ibis_df=(X_train_ibis, y_train_ibis), pandas_df=(X, y))
            print("validate by ml results")

    except Exception as err:
        print("Failed: ", err)
        sys.exit(1)
    finally:
        if omnisci_server:
            omnisci_server.terminate()
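
The ETL and ML helpers in this example return dictionaries of per-step timings that are reported through print_times. A minimal sketch of such a reporter (assumed signature and formatting, not the project's actual implementation):

def print_times(times, name=None):
    # Print each measured step with its duration in seconds, one per line.
    if name:
        print(f"{name} times:")
    for step, seconds in times.items():
        print(f"{step} = {seconds:.2f} s")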
Example #10
)

try:
    args = parser.parse_args()
    if args.df <= 0:
        print("Bad number of data files specified", args.df)
        sys.exit(1)

    if args.i < 1:
        print("Bad number of iterations specified", args.i)
        sys.exit(1)

    database_name = args.n
    omnisci_server = OmnisciServer(
        omnisci_executable=args.e,
        omnisci_port=args.port,
        database_name=database_name,
        user=args.u,
        password=args.p,
    )
    omnisci_server.launch()
    omnisci_server_worker = OmnisciServerWorker(omnisci_server)

    time.sleep(2)
    omnisci_server_worker.connect_to_server()

    taxibench_columns_names = [
        "trip_id",
        "vendor_id",
        "pickup_datetime",
        "dropoff_datetime",
        "store_and_fwd_flag",