def main():
  """Drive the Impala cluster stress test.

  Parses CLI options, connects to Impala through Cloudera Manager, loads
  (or measures and persists) per-query memory limits and solo runtimes,
  filters out queries that would consume too large a share of cluster
  memory, then runs the surviving queries through a StressRunner.

  NOTE(review): relies on module-level names imported elsewhere in this
  file (os, gettempdir, dedent, LOG, find_impala_in_cm, load_runtime_info,
  load_tpc_queries, populate_runtime_info, save_runtime_info,
  StressRunner) -- confirm against the full module.
  """
  from optparse import OptionParser
  import tests.comparison.cli_options as cli_options

  # The epilog documents the expected workflow: first run measures and caches
  # per-query runtime info at --runtime-info-path; later runs reuse the cache.
  parser = OptionParser(epilog=dedent(
      """Before running this script a CM cluster must be setup and any needed data
         such as TPC-H/DS must be loaded. The first time this script is run it will
         find memory limits and runtimes for each query and save the data to disk (since
         collecting the data is slow) at --runtime-info-path then run the stress test.
         Later runs will reuse the saved memory limits and timings. If the cluster changes
         significantly the memory limits should be re-measured (deleting the file at
         --runtime-info-path will cause re-measuring to happen)."""))
  cli_options.add_logging_options(parser)
  cli_options.add_cm_options(parser)
  cli_options.add_db_name_option(parser)
  # The literal "{cm_host}" placeholder is substituted later (see
  # runtime_info_path handling below) so each cluster gets its own cache file.
  parser.add_option("--runtime-info-path",
      default=os.path.join(gettempdir(), "{cm_host}_query_runtime_info.json"),
      help="The path to store query runtime info at. '{cm_host}' will be replaced with"
      " the actual host name from --cm-host.")
  parser.add_option("--no-status", action="store_true",
      help="Do not print the status table.")
  parser.add_option("--cancel-current-queries", action="store_true",
      help="Cancel any queries running on the cluster before beginning.")
  parser.add_option("--filter-query-mem-ratio", type=float, default=0.333,
      help="Queries that require this ratio of total available memory will be filtered.")
  parser.add_option("--mem-limit-padding-pct", type=int, default=25,
      help="Pad query mem limits found by solo execution with this percentage when"
      " running concurrently. After padding queries will not be expected to fail"
      " due to mem limit exceeded.")
  parser.add_option("--timeout-multiplier", type=float, default=1.0,
      help="Query timeouts will be multiplied by this value.")
  parser.add_option("--max-queries", type=int, default=100)
  parser.add_option("--tpcds-db-name")
  parser.add_option("--tpch-db-name")
  parser.add_option("--mem-overcommit-pct", type=float, default=0)
  parser.add_option("--mem-spill-probability", type=float, default=0.33,
      dest="spill_probability",
      help="The probability that a mem limit will be set low enough to induce spilling.")
  parser.add_option("--cancel-probability", type=float, default=0.1,
      help="The probability a query will be cancelled.")
  cli_options.add_default_values_to_help(parser)
  opts, args = parser.parse_args()

  # At least one workload (TPC-DS and/or TPC-H) must be selected.
  if not opts.tpcds_db_name and not opts.tpch_db_name:
    raise Exception("At least one of --tpcds-db-name --tpch-db-name is required")

  cli_options.configure_logging(opts.log_level, debug_log_file=opts.debug_log_file,
      log_thread_id=True, log_process_id=True)
  LOG.debug("CLI opts: %s" % (opts, ))
  LOG.debug("CLI args: %s" % (args, ))

  # Locate the Impala service through the Cloudera Manager API.
  impala = find_impala_in_cm(
      opts.cm_host, opts.cm_user, opts.cm_password, opts.cm_cluster_name)
  if opts.cancel_current_queries:
    impala.cancel_queries()
  # Refuse to start while other queries are active: they would skew the
  # measured memory limits and runtimes.
  if impala.queries_are_running():
    raise Exception("Queries are currently running on the cluster")

  # Expand the "{cm_host}" placeholder (if present) so each cluster has a
  # distinct runtime-info cache file.
  runtime_info_path = opts.runtime_info_path
  if "{cm_host}" in runtime_info_path:
    runtime_info_path = runtime_info_path.format(cm_host=opts.cm_host)
  # Nested mapping: db name -> query sql -> query with cached runtime info.
  queries_with_runtime_info_by_db_and_sql = load_runtime_info(runtime_info_path, impala)
  # Collect the selected workloads, tagging each query with its target db.
  queries = list()
  if opts.tpcds_db_name:
    tpcds_queries = load_tpc_queries("tpcds")
    for query in tpcds_queries:
      query.db_name = opts.tpcds_db_name
    queries.extend(tpcds_queries)
  if opts.tpch_db_name:
    tpch_queries = load_tpc_queries("tpch")
    for query in tpch_queries:
      query.db_name = opts.tpch_db_name
    queries.extend(tpch_queries)
  # Iterate in reverse so "del queries[idx]" below cannot shift the indexes
  # of entries not yet visited.
  for idx in xrange(len(queries) - 1, -1, -1):
    query = queries[idx]
    if query.sql in queries_with_runtime_info_by_db_and_sql[query.db_name]:
      # Cache hit: swap in the saved query object, which carries the
      # previously measured mem limits and runtimes.
      query = queries_with_runtime_info_by_db_and_sql[query.db_name][query.sql]
      LOG.debug("Reusing previous runtime data for query: " + query.sql)
      queries[idx] = query
    else:
      # Cache miss: measure this query solo (slow) and persist the results.
      populate_runtime_info(query, impala)
      save_runtime_info(runtime_info_path, query, impala)
    # Pad the solo-execution mem limits by --mem-limit-padding-pct to leave
    # headroom for concurrent execution.
    if query.required_mem_mb_with_spilling:
      query.required_mem_mb_with_spilling += int(query.required_mem_mb_with_spilling
          * opts.mem_limit_padding_pct / 100.0)
    if query.required_mem_mb_without_spilling:
      query.required_mem_mb_without_spilling += int(query.required_mem_mb_without_spilling
          * opts.mem_limit_padding_pct / 100.0)
    # Scale timeouts by --timeout-multiplier.
    if query.solo_runtime_secs_with_spilling:
      query.solo_runtime_secs_with_spilling *= opts.timeout_multiplier
    if query.solo_runtime_secs_without_spilling:
      query.solo_runtime_secs_without_spilling *= opts.timeout_multiplier

    # Remove any queries that would use "too many" resources. This way a larger number
    # of queries will run concurrently.
    if query.required_mem_mb_with_spilling is None \
        or query.required_mem_mb_with_spilling / impala.min_impalad_mem_mb \
            > opts.filter_query_mem_ratio:
      LOG.debug("Filtered query due to mem ratio option: " + query.sql)
      del queries[idx]
  if len(queries) == 0:
    raise Exception("All queries were filtered")

  stress_runner = StressRunner()
  stress_runner.cancel_probability = opts.cancel_probability
  stress_runner.spill_probability = opts.spill_probability
  stress_runner.run_queries(queries, impala, opts.max_queries, opts.mem_overcommit_pct,
      not opts.no_status)
# NOTE(review): the two lines that stood here ("Exemplo n.º 2" / "0") are a
# scrape artifact from a code-examples site, not program text. Everything
# below this point is an unrelated, truncated fragment of a different main()
# (no def header, cut off mid-call) and will not parse as-is; it should be
# removed or restored from its original source file.
  parser.add_option('--query-count', default=1000000, type=int,
      help='Exit after running the given number of queries.')
  parser.add_option('--exclude-types', default='',
      help='A comma separated list of data types to exclude while generating queries.')
  profiles = dict()
  for profile in PROFILES:
    profile_name = profile.__name__
    if profile_name.endswith('Profile'):
      profile_name = profile_name[:-1 * len('Profile')]
    profiles[profile_name.lower()] = profile
  parser.add_option('--profile', default='default',
      choices=(sorted(profiles.keys())),
      help='Determines the mix of SQL features to use during query generation.')
  # TODO: Seed the random query generator for repeatable queries?

  cli_options.add_default_values_to_help(parser)

  options, args = parser.parse_args()
  cli_options.configure_logging(options.log_level)

  db_connector_param_key = options.ref_db_type.lower()
  ref_connection = DbConnector(options.ref_db_type,
      user_name=getattr(options, db_connector_param_key + '_user'),
      password=getattr(options, db_connector_param_key + '_password'),
      host_name=getattr(options, db_connector_param_key + '_host'),
      port=getattr(options, db_connector_param_key + '_port')) \
      .create_connection(options.db_name)
  db_connector_param_key = options.test_db_type.lower()
  test_connection = DbConnector(options.test_db_type,
      user_name=getattr(options, db_connector_param_key + '_user', None),
      password=getattr(options, db_connector_param_key + '_password', None),
    )
    profiles = dict()
    for profile in PROFILES:
        profile_name = profile.__name__
        if profile_name.endswith('Profile'):
            profile_name = profile_name[:-1 * len('Profile')]
        profiles[profile_name.lower()] = profile
    parser.add_option(
        '--profile',
        default='default',
        choices=(sorted(profiles.keys())),
        help=
        'Determines the mix of SQL features to use during query generation.')
    # TODO: Seed the random query generator for repeatable queries?

    cli_options.add_default_values_to_help(parser)

    options, args = parser.parse_args()
    cli_options.configure_logging(options.log_level)

    db_connector_param_key = options.ref_db_type.lower()
    ref_connection = DbConnector(options.ref_db_type,
        user_name=getattr(options, db_connector_param_key + '_user'),
        password=getattr(options, db_connector_param_key + '_password'),
        host_name=getattr(options, db_connector_param_key + '_host'),
        port=getattr(options, db_connector_param_key + '_port')) \
        .create_connection(options.db_name)
    db_connector_param_key = options.test_db_type.lower()
    test_connection = DbConnector(options.test_db_type,
        user_name=getattr(options, db_connector_param_key + '_user', None),
        password=getattr(options, db_connector_param_key + '_password', None),