# NOTE(review): this line is the tail of an option-parsing block — the opening
# parser.add_option(...) call for this help string sits above this chunk.
    help='A comma separated list of data types to exclude while generating queries.')
# Map lower-cased short profile names (e.g. "default" for DefaultProfile) to
# their classes so --profile can offer them as choices.
profiles = dict()
for profile in PROFILES:
  profile_name = profile.__name__
  if profile_name.endswith('Profile'):
    # Strip the trailing "Profile" so the CLI value is just the short name.
    profile_name = profile_name[:-1 * len('Profile')]
  profiles[profile_name.lower()] = profile
parser.add_option('--profile', default='default',
    choices=(sorted(profiles.keys())),
    help='Determines the mix of SQL features to use during query generation.')
# TODO: Seed the random query generator for repeatable queries?
cli_options.add_default_values_to_help(parser)
options, args = parser.parse_args()
cli_options.configure_logging(options.log_level)
# Build the reference-database connection from the per-db-type CLI options
# (attribute names look like "<db type>_user", "<db type>_password", ...).
# NOTE(review): these getattr() calls have no default, unlike the test
# connection below — a missing option raises AttributeError here. Confirm
# whether that asymmetry is intentional.
db_connector_param_key = options.ref_db_type.lower()
ref_connection = DbConnector(options.ref_db_type,
    user_name=getattr(options, db_connector_param_key + '_user'),
    password=getattr(options, db_connector_param_key + '_password'),
    host_name=getattr(options, db_connector_param_key + '_host'),
    port=getattr(options, db_connector_param_key + '_port')) \
    .create_connection(options.db_name)
# Build the test-database connection the same way, tolerating missing options
# (getattr default of None).
db_connector_param_key = options.test_db_type.lower()
test_connection = DbConnector(options.test_db_type,
    user_name=getattr(options, db_connector_param_key + '_user', None),
    password=getattr(options, db_connector_param_key + '_password', None),
    host_name=getattr(options, db_connector_param_key + '_host', None),
    port=getattr(options, db_connector_param_key + '_port', None)) \
    .create_connection(options.db_name)
# Validate the requested command and configure the populator. Exactly one
# command is accepted; with no command given, "populate" is the default.
if len(args.command) > 1:
  raise Exception(
      'Only one command can be chosen. Requested commands were: %s'
      % args.command)
if args.command:
  command = args.command[0]
else:
  command = 'populate'
if command not in ('populate', 'migrate'):
  raise Exception(
      'Command must either be "populate" or "migrate" but was "%s"' % command)
if command == 'migrate':
  # Migration needs at least one destination database flag.
  if not (args.use_mysql or args.use_postgresql or args.use_oracle):
    raise Exception(
        'At least one destination database must be chosen with '
        '--use-<database type>')
cli_options.configure_logging(args.log_level, debug_log_file=args.debug_log_file)
seed(args.randomization_seed)
cluster = cli_options.create_cluster(args)
# Populate through Hive or Impala depending on --use-hive.
if args.use_hive:
  populate_engine = db_connection.HIVE
else:
  populate_engine = db_connection.IMPALA
populator = DbPopulator(populate_engine)
if command == 'populate':
  # Carry the CLI sizing/randomization options onto the populator.
  populator.randomization_seed = args.randomization_seed
  populator.cluster = cluster
  populator.db_name = args.db_name
  populator.min_col_count = args.min_column_count
  populator.max_col_count = args.max_column_count
  populator.min_row_count = args.min_row_count
  populator.max_row_count = args.max_row_count
def main():
  """Run the Impala stress test.

  Parses CLI options, locates the Impala service through Cloudera Manager,
  loads (or measures and caches) per-query runtime/memory info for the TPC-DS
  and/or TPC-H queries, filters out queries that would need too large a share
  of cluster memory, then hands the survivors to StressRunner.
  """
  from optparse import OptionParser
  import tests.comparison.cli_options as cli_options

  parser = OptionParser(epilog=dedent(
      """Before running this script a CM cluster must be setup and any needed
      data such as TPC-H/DS must be loaded. The first time this script is run
      it will find memory limits and runtimes for each query and save the data
      to disk (since collecting the data is slow) at --runtime-info-path then
      run the stress test. Later runs will reuse the saved memory limits and
      timings. If the cluster changes significantly the memory limits should be
      re-measured (deleting the file at --runtime-info-path will cause
      re-measuring to happen)."""))
  cli_options.add_logging_options(parser)
  cli_options.add_cm_options(parser)
  cli_options.add_db_name_option(parser)
  parser.add_option("--runtime-info-path",
      default=os.path.join(gettempdir(), "{cm_host}_query_runtime_info.json"),
      help="The path to store query runtime info at. '{cm_host}' will be"
      " replaced with the actual host name from --cm-host.")
  parser.add_option("--no-status", action="store_true",
      help="Do not print the status table.")
  parser.add_option("--cancel-current-queries", action="store_true",
      help="Cancel any queries running on the cluster before beginning.")
  parser.add_option("--filter-query-mem-ratio", type=float, default=0.333,
      help="Queries that require this ratio of total available memory will be"
      " filtered.")
  parser.add_option("--mem-limit-padding-pct", type=int, default=25,
      help="Pad query mem limits found by solo execution with this percentage"
      " when running concurrently. After padding queries will not be expected"
      " to fail due to mem limit exceeded.")
  parser.add_option("--timeout-multiplier", type=float, default=1.0,
      help="Query timeouts will be multiplied by this value.")
  parser.add_option("--max-queries", type=int, default=100)
  parser.add_option("--tpcds-db-name")
  parser.add_option("--tpch-db-name")
  parser.add_option("--mem-overcommit-pct", type=float, default=0)
  parser.add_option("--mem-spill-probability", type=float, default=0.33,
      dest="spill_probability",
      help="The probability that a mem limit will be set low enough to induce"
      " spilling.")
  parser.add_option("--cancel-probability", type=float, default=0.1,
      help="The probability a query will be cancelled.")
  cli_options.add_default_values_to_help(parser)
  options, args = parser.parse_args()

  if not options.tpcds_db_name and not options.tpch_db_name:
    raise Exception("At least one of --tpcds-db-name --tpch-db-name is required")

  cli_options.configure_logging(options.log_level,
      debug_log_file=options.debug_log_file, log_thread_id=True,
      log_process_id=True)
  LOG.debug("CLI opts: %s" % (options, ))
  LOG.debug("CLI args: %s" % (args, ))

  impala = find_impala_in_cm(
      options.cm_host, options.cm_user, options.cm_password,
      options.cm_cluster_name)
  if options.cancel_current_queries:
    impala.cancel_queries()
  if impala.queries_are_running():
    raise Exception("Queries are currently running on the cluster")

  runtime_info_path = options.runtime_info_path
  if "{cm_host}" in runtime_info_path:
    runtime_info_path = runtime_info_path.format(cm_host=options.cm_host)
  queries_by_db_and_sql = load_runtime_info(runtime_info_path, impala)

  # Collect the TPC queries for every workload whose db name flag is set,
  # pointing each query at that database.
  queries = list()
  for workload, workload_db in (
      ("tpcds", options.tpcds_db_name), ("tpch", options.tpch_db_name)):
    if not workload_db:
      continue
    workload_queries = load_tpc_queries(workload)
    for query in workload_queries:
      query.db_name = workload_db
    queries.extend(workload_queries)

  # Walk backwards so entries can be deleted in place while iterating.
  for idx in reversed(xrange(len(queries))):
    query = queries[idx]
    cached_queries = queries_by_db_and_sql[query.db_name]
    if query.sql in cached_queries:
      # Reuse the runtime data measured on a previous run.
      query = cached_queries[query.sql]
      LOG.debug("Reusing previous runtime data for query: " + query.sql)
      queries[idx] = query
    else:
      populate_runtime_info(query, impala)
      save_runtime_info(runtime_info_path, query, impala)
    # Pad the measured mem limits so concurrent runs aren't expected to hit
    # mem-limit-exceeded, and scale the solo runtimes used as timeouts.
    if query.required_mem_mb_with_spilling:
      query.required_mem_mb_with_spilling += int(
          query.required_mem_mb_with_spilling * options.mem_limit_padding_pct
          / 100.0)
    if query.required_mem_mb_without_spilling:
      query.required_mem_mb_without_spilling += int(
          query.required_mem_mb_without_spilling * options.mem_limit_padding_pct
          / 100.0)
    if query.solo_runtime_secs_with_spilling:
      query.solo_runtime_secs_with_spilling *= options.timeout_multiplier
    if query.solo_runtime_secs_without_spilling:
      query.solo_runtime_secs_without_spilling *= options.timeout_multiplier
    # Remove any queries that would use "too many" resources. This way a larger
    # number of queries will run concurrently.
    if query.required_mem_mb_with_spilling is None \
        or query.required_mem_mb_with_spilling / impala.min_impalad_mem_mb \
        > options.filter_query_mem_ratio:
      LOG.debug("Filtered query due to mem ratio option: " + query.sql)
      del queries[idx]
  if not queries:
    raise Exception("All queries were filtered")

  stress_runner = StressRunner()
  stress_runner.cancel_probability = options.cancel_probability
  stress_runner.spill_probability = options.spill_probability
  stress_runner.run_queries(queries, impala, options.max_queries,
      options.mem_overcommit_pct, not options.no_status)
# Register each query-generation profile under its lower-cased short name
# (DefaultProfile -> "default") so --profile can list them as choices.
for profile_class in PROFILES:
  short_name = profile_class.__name__
  if short_name.endswith('Profile'):
    short_name = short_name[:-1 * len('Profile')]
  profiles[short_name.lower()] = profile_class
parser.add_argument(
    '--profile', default='default', choices=(sorted(profiles.keys())),
    help='Determines the mix of SQL features to use during query generation.')
# TODO: Seed the random query generator for repeatable queries?
args = parser.parse_args()
cli_options.configure_logging(
    args.log_level, debug_log_file=args.debug_log_file, log_thread_name=True)
cluster = cli_options.create_cluster(args)
# The reference connection always goes through the generic connection factory;
# the test connection uses the cluster's Impala/Hive handles when applicable.
ref_conn = cli_options.create_connection(args, args.ref_db_type, db_name=args.db_name)
test_db_type = args.test_db_type
if test_db_type == IMPALA:
  test_conn = cluster.impala.connect(db_name=args.db_name)
elif test_db_type == HIVE:
  test_conn = cluster.hive.connect(db_name=args.db_name)
else:
  test_conn = cli_options.create_connection(args, test_db_type, db_name=args.db_name)
# Create an instance of profile class (e.g. DefaultProfile)
# Parse the Kudu data-load CLI options, then run the (optional) clean step and
# the load itself.
cli_options.add_logging_options(parser)
cli_options.add_cluster_options(parser)
parser.add_argument("-s", "--source-db", required=True,
    help="Source DB to load data from.")
parser.add_argument("-t", "--target-db", required=True,
    help="Target DB to load data to.")
parser.add_argument("-w", "--workload", choices=['tpch', 'tpcds'],
    required=True)
parser.add_argument("--kudu_master", required=True,
    help="Address or host name of Kudu master")
# TODO: Automatically set #buckets as a function of cluster nodes and/or
# scale
parser.add_argument("-b", "--buckets", default="9",
    help="Number of buckets to partition Kudu tables (only for hash-based).")
parser.add_argument("-v", "--verbose", action='store_true',
    help="Print the executed statements.")
# Fixed typo in user-facing help text: "speficied" -> "specified".
parser.add_argument("--clean", action='store_true',
    help="Drop all tables in the specified target database.")
args = parser.parse_args()
cli_options.configure_logging(args.log_level, debug_log_file=args.debug_log_file)
cluster = cli_options.create_cluster(args)
# Stash the parsed options in named variables — presumably consumed by
# clean_data()/load_data() below (they take no arguments); confirm against
# their definitions.
source_db = args.source_db
target_db = args.target_db
buckets = args.buckets
kudu_master = args.kudu_master
workload = args.workload
verbose = args.verbose
if args.clean:
  clean_data()
load_data()
parser.add_argument(
    '--explain-only', action='store_true',
    help="Don't run the queries only explain them to see if there was an error"
    " in planning.")
# Build the name -> class map for all query-generation profiles so --profile
# can offer the lower-cased short names (DefaultProfile -> "default").
profiles = dict()
for profile_class in PROFILES:
  base_name = profile_class.__name__
  if base_name.endswith('Profile'):
    base_name = base_name[:-1 * len('Profile')]
  profiles[base_name.lower()] = profile_class
parser.add_argument(
    '--profile', default='default', choices=(sorted(profiles.keys())),
    help='Determines the mix of SQL features to use during query generation.')
# TODO: Seed the random query generator for repeatable queries?
args = parser.parse_args()
cli_options.configure_logging(
    args.log_level, debug_log_file=args.debug_log_file, log_thread_name=True)
cluster = cli_options.create_cluster(args)
# Reference connection via the generic factory; test connection prefers the
# cluster's own Impala/Hive handles.
ref_conn = cli_options.create_connection(args, args.ref_db_type, db_name=args.db_name)
if args.test_db_type == IMPALA:
  test_conn = cluster.impala.connect(db_name=args.db_name)
elif args.test_db_type == HIVE:
  test_conn = cluster.hive.connect(db_name=args.db_name)
else:
  test_conn = cli_options.create_connection(
      args, args.test_db_type, db_name=args.db_name)
# Create an instance of profile class (e.g. DefaultProfile)
query_profile = profiles[args.profile]()
if args.explain_only:
  # Only check the frontend for planning errors; don't execute anything.
  searcher = FrontendExceptionSearcher(query_profile, ref_conn, test_conn)
  searcher.search(args.query_count)
# NOTE(review): the first statements below are the body of a "for profile in
# PROFILES:" loop whose header is above this chunk. This whole section
# duplicates an earlier option-parsing/connection block in this file —
# consider extracting a shared helper.
  profile_name = profile.__name__
  if profile_name.endswith('Profile'):
    # Strip the trailing "Profile" so "DefaultProfile" registers as "default".
    profile_name = profile_name[:-1 * len('Profile')]
  profiles[profile_name.lower()] = profile
parser.add_option(
    '--profile', default='default', choices=(sorted(profiles.keys())),
    help='Determines the mix of SQL features to use during query generation.')
# TODO: Seed the random query generator for repeatable queries?
cli_options.add_default_values_to_help(parser)
options, args = parser.parse_args()
cli_options.configure_logging(options.log_level)
# Reference connection built from per-db-type CLI options ("<db type>_user",
# etc.). NOTE(review): no getattr() default here, unlike the test connection
# below — a missing option raises AttributeError; confirm intended.
db_connector_param_key = options.ref_db_type.lower()
ref_connection = DbConnector(options.ref_db_type,
    user_name=getattr(options, db_connector_param_key + '_user'),
    password=getattr(options, db_connector_param_key + '_password'),
    host_name=getattr(options, db_connector_param_key + '_host'),
    port=getattr(options, db_connector_param_key + '_port')) \
    .create_connection(options.db_name)
# Test connection built the same way but tolerating missing options.
db_connector_param_key = options.test_db_type.lower()
test_connection = DbConnector(options.test_db_type,
    user_name=getattr(options, db_connector_param_key + '_user', None),
    password=getattr(options, db_connector_param_key + '_password', None),
    host_name=getattr(options, db_connector_param_key + '_host', None),
    port=getattr(options, db_connector_param_key + '_port', None)) \
    .create_connection(options.db_name)