Example #1
      help='A comma-separated list of data types to exclude while generating queries.')
  profiles = dict()
  for profile in PROFILES:
    profile_name = profile.__name__
    if profile_name.endswith('Profile'):
      profile_name = profile_name[:-1 * len('Profile')]
    profiles[profile_name.lower()] = profile
  parser.add_option('--profile', default='default',
      choices=sorted(profiles.keys()),
      help='Determines the mix of SQL features to use during query generation.')
  # TODO: Seed the random query generator for repeatable queries?

  cli_options.add_default_values_to_help(parser)

  options, args = parser.parse_args()
  cli_options.configure_logging(options.log_level)

  db_connector_param_key = options.ref_db_type.lower()
  ref_connection = DbConnector(options.ref_db_type,
      user_name=getattr(options, db_connector_param_key + '_user'),
      password=getattr(options, db_connector_param_key + '_password'),
      host_name=getattr(options, db_connector_param_key + '_host'),
      port=getattr(options, db_connector_param_key + '_port')) \
      .create_connection(options.db_name)
  db_connector_param_key = options.test_db_type.lower()
  test_connection = DbConnector(options.test_db_type,
      user_name=getattr(options, db_connector_param_key + '_user', None),
      password=getattr(options, db_connector_param_key + '_password', None),
      host_name=getattr(options, db_connector_param_key + '_host', None),
      port=getattr(options, db_connector_param_key + '_port', None)) \
      .create_connection(options.db_name)
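
Example #1 resolves connection parameters dynamically: the option names are prefixed with the database type (e.g. mysql_user, postgresql_host), so getattr() with a computed key picks out the matching group. A minimal standalone sketch of that pattern, using hypothetical option names (DbConnector and cli_options are Impala-internal helpers):

from optparse import OptionParser

parser = OptionParser()
# Hypothetical namespaced options following the <db_type>_<param> convention.
parser.add_option('--mysql-user', dest='mysql_user', default='root')
parser.add_option('--mysql-host', dest='mysql_host', default='localhost')
options, _ = parser.parse_args(['--mysql-user', 'alice'])

key = 'MYSQL'.lower()
# getattr with a computed attribute name selects the matching option group;
# the third argument makes absent options fall back to None.
conn_kwargs = dict(
    user_name=getattr(options, key + '_user', None),
    host_name=getattr(options, key + '_host', None))
print(conn_kwargs)  # {'user_name': 'alice', 'host_name': 'localhost'}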
Example #2
    if len(args.command) > 1:
        raise Exception(
            'Only one command can be chosen. Requested commands were: %s' %
            args.command)
    command = args.command[0] if args.command else 'populate'
    if command not in ('populate', 'migrate'):
        raise Exception(
            'Command must either be "populate" or "migrate" but was "%s"' %
            command)
    if command == 'migrate' and \
        not any((args.use_mysql, args.use_postgresql, args.use_oracle)):
        raise Exception(
            'At least one destination database must be chosen with '
            '--use-<database type>')

    cli_options.configure_logging(args.log_level,
                                  debug_log_file=args.debug_log_file)

    seed(args.randomization_seed)

    cluster = cli_options.create_cluster(args)

    populator = DbPopulator(
        db_connection.HIVE if args.use_hive else db_connection.IMPALA)
    if command == 'populate':
        populator.randomization_seed = args.randomization_seed
        populator.cluster = cluster
        populator.db_name = args.db_name
        populator.min_col_count = args.min_column_count
        populator.max_col_count = args.max_column_count
        populator.min_row_count = args.min_row_count
        populator.max_row_count = args.max_row_count
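
Example #2 accepts at most one positional command and falls back to 'populate'. The declaration of args.command lies outside the snippet, but a list-valued positional (argparse nargs='*') reproduces the behavior; a minimal sketch under that assumption:

import argparse

parser = argparse.ArgumentParser()
# nargs='*' collects zero or more commands, so the snippet's length checks apply.
parser.add_argument('command', nargs='*')
args = parser.parse_args(['migrate'])

if len(args.command) > 1:
    raise Exception('Only one command can be chosen. Requested commands were: %s'
                    % args.command)
command = args.command[0] if args.command else 'populate'
if command not in ('populate', 'migrate'):
    raise Exception('Command must either be "populate" or "migrate" but was "%s"'
                    % command)
print(command)  # migrate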
Example #3
def main():
  from optparse import OptionParser
  import tests.comparison.cli_options as cli_options

  parser = OptionParser(epilog=dedent(
      """Before running this script a CM cluster must be setup and any needed data
         such as TPC-H/DS must be loaded. The first time this script is run it will
         find memory limits and runtimes for each query and save the data to disk (since
         collecting the data is slow) at --runtime-info-path then run the stress test.
         Later runs will reuse the saved memory limits and timings. If the cluster changes
         significantly the memory limits should be re-measured (deleting the file at
         --runtime-info-path will cause re-measuring to happen)."""))
  cli_options.add_logging_options(parser)
  cli_options.add_cm_options(parser)
  cli_options.add_db_name_option(parser)
  parser.add_option("--runtime-info-path",
      default=os.path.join(gettempdir(), "{cm_host}_query_runtime_info.json"),
      help="The path to store query runtime info at. '{cm_host}' will be replaced with"
      " the actual host name from --cm-host.")
  parser.add_option("--no-status", action="store_true",
      help="Do not print the status table.")
  parser.add_option("--cancel-current-queries", action="store_true",
      help="Cancel any queries running on the cluster before beginning.")
  parser.add_option("--filter-query-mem-ratio", type=float, default=0.333,
      help="Queries that require this ratio of total available memory will be filtered.")
  parser.add_option("--mem-limit-padding-pct", type=int, default=25,
      help="Pad query mem limits found by solo execution with this percentage when"
      " running concurrently. After padding queries will not be expected to fail"
      " due to mem limit exceeded.")
  parser.add_option("--timeout-multiplier", type=float, default=1.0,
      help="Query timeouts will be multiplied by this value.")
  parser.add_option("--max-queries", type=int, default=100)
  parser.add_option("--tpcds-db-name")
  parser.add_option("--tpch-db-name")
  parser.add_option("--mem-overcommit-pct", type=float, default=0)
  parser.add_option("--mem-spill-probability", type=float, default=0.33,
      dest="spill_probability",
      help="The probability that a mem limit will be set low enough to induce spilling.")
  parser.add_option("--cancel-probability", type=float, default=0.1,
      help="The probability a query will be cancelled.")
  cli_options.add_default_values_to_help(parser)
  opts, args = parser.parse_args()

  if not opts.tpcds_db_name and not opts.tpch_db_name:
    raise Exception("At least one of --tpcds-db-name --tpch-db-name is required")

  cli_options.configure_logging(opts.log_level, debug_log_file=opts.debug_log_file,
      log_thread_id=True, log_process_id=True)
  LOG.debug("CLI opts: %s" % (opts, ))
  LOG.debug("CLI args: %s" % (args, ))

  impala = find_impala_in_cm(
      opts.cm_host, opts.cm_user, opts.cm_password, opts.cm_cluster_name)
  if opts.cancel_current_queries:
    impala.cancel_queries()
  if impala.queries_are_running():
    raise Exception("Queries are currently running on the cluster")

  runtime_info_path = opts.runtime_info_path
  if "{cm_host}" in runtime_info_path:
    runtime_info_path = runtime_info_path.format(cm_host=opts.cm_host)
  queries_with_runtime_info_by_db_and_sql = load_runtime_info(runtime_info_path, impala)
  queries = list()
  if opts.tpcds_db_name:
    tpcds_queries = load_tpc_queries("tpcds")
    for query in tpcds_queries:
      query.db_name = opts.tpcds_db_name
    queries.extend(tpcds_queries)
  if opts.tpch_db_name:
    tpch_queries = load_tpc_queries("tpch")
    for query in tpch_queries:
      query.db_name = opts.tpch_db_name
    queries.extend(tpch_queries)
  for idx in xrange(len(queries) - 1, -1, -1):
    query = queries[idx]
    if query.sql in queries_with_runtime_info_by_db_and_sql[query.db_name]:
      query = queries_with_runtime_info_by_db_and_sql[query.db_name][query.sql]
      LOG.debug("Reusing previous runtime data for query: " + query.sql)
      queries[idx] = query
    else:
      populate_runtime_info(query, impala)
      save_runtime_info(runtime_info_path, query, impala)
    if query.required_mem_mb_with_spilling:
      query.required_mem_mb_with_spilling += int(query.required_mem_mb_with_spilling
          * opts.mem_limit_padding_pct / 100.0)
    if query.required_mem_mb_without_spilling:
      query.required_mem_mb_without_spilling += int(query.required_mem_mb_without_spilling
          * opts.mem_limit_padding_pct / 100.0)
    if query.solo_runtime_secs_with_spilling:
      query.solo_runtime_secs_with_spilling *= opts.timeout_multiplier
    if query.solo_runtime_secs_without_spilling:
      query.solo_runtime_secs_without_spilling *= opts.timeout_multiplier

    # Remove any queries that would use "too many" resources. This way a larger number
    # of queries will run concurrently.
    if query.required_mem_mb_with_spilling is None \
        or query.required_mem_mb_with_spilling / impala.min_impalad_mem_mb \
            > opts.filter_query_mem_ratio:
      LOG.debug("Filtered query due to mem ratio option: " + query.sql)
      del queries[idx]
  if len(queries) == 0:
    raise Exception("All queries were filtered")

  stress_runner = StressRunner()
  stress_runner.cancel_probability = opts.cancel_probability
  stress_runner.spill_probability = opts.spill_probability
  stress_runner.run_queries(queries, impala, opts.max_queries, opts.mem_overcommit_pct,
      not opts.no_status)
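
The filtering loop in Example #3 iterates indexes in reverse (len(queries) - 1 down to 0) so that del queries[idx] never shifts an element that has not been visited yet. A minimal sketch of that in-place filtering pattern:

queries = ['q1', 'q2', 'q3', 'q4']

# Deleting index idx only shifts elements at positions > idx, which a
# reverse walk has already visited, so no element is skipped.
for idx in range(len(queries) - 1, -1, -1):
    if queries[idx] in ('q2', 'q4'):  # stand-in for the mem-ratio filter
        del queries[idx]

print(queries)  # ['q1', 'q3']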
Example #4
    for profile in PROFILES:
        profile_name = profile.__name__
        if profile_name.endswith('Profile'):
            profile_name = profile_name[:-1 * len('Profile')]
        profiles[profile_name.lower()] = profile
    parser.add_argument(
        '--profile',
        default='default',
        choices=sorted(profiles.keys()),
        help='Determines the mix of SQL features to use during query generation.')
    # TODO: Seed the random query generator for repeatable queries?

    args = parser.parse_args()
    cli_options.configure_logging(args.log_level,
                                  debug_log_file=args.debug_log_file,
                                  log_thread_name=True)
    cluster = cli_options.create_cluster(args)

    ref_conn = cli_options.create_connection(args,
                                             args.ref_db_type,
                                             db_name=args.db_name)
    if args.test_db_type == IMPALA:
        test_conn = cluster.impala.connect(db_name=args.db_name)
    elif args.test_db_type == HIVE:
        test_conn = cluster.hive.connect(db_name=args.db_name)
    else:
        test_conn = cli_options.create_connection(args,
                                                  args.test_db_type,
                                                  db_name=args.db_name)
    # Create an instance of profile class (e.g. DefaultProfile)
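
Examples #1, #4, and #6 all build the --profile choices the same way: strip the 'Profile' suffix from each class name, lowercase the rest, and map it back to the class so the chosen one can be instantiated after parsing. A standalone sketch with stand-in classes:

class DefaultProfile(object):
    pass

class HiveProfile(object):
    pass

PROFILES = [DefaultProfile, HiveProfile]

profiles = dict()
for profile in PROFILES:
    profile_name = profile.__name__
    if profile_name.endswith('Profile'):
        profile_name = profile_name[:-len('Profile')]  # drop the suffix
    profiles[profile_name.lower()] = profile

print(sorted(profiles))  # ['default', 'hive']
# The selected class is instantiated later, e.g. profiles['default']()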
Example #5
  cli_options.add_logging_options(parser)
  cli_options.add_cluster_options(parser)
  parser.add_argument("-s", "--source-db", required=True,
      help="Source DB to load data from.")
  parser.add_argument("-t", "--target-db", required=True,
      help="Target DB to load data to.")
  parser.add_argument("-w", "--workload", choices=['tpch', 'tpcds'],
      required=True)
  parser.add_argument("--kudu_master", required=True,
      help="Address or host name of Kudu master")
  # TODO: Automatically set #buckets as a function of cluster nodes and/or
  # scale
  parser.add_argument("-b", "--buckets", default="9",
      help="Number of buckets to partition Kudu tables (only for hash-based).")
  parser.add_argument("-v", "--verbose", action='store_true',
      help="Print the executed statements.")
  parser.add_argument("--clean", action='store_true',
      help="Drop all tables in the speficied target database.")
  args = parser.parse_args()

  cli_options.configure_logging(args.log_level, debug_log_file=args.debug_log_file)
  cluster = cli_options.create_cluster(args)
  source_db = args.source_db
  target_db = args.target_db
  buckets = args.buckets
  kudu_master = args.kudu_master
  workload = args.workload
  verbose = args.verbose
  if args.clean:
    clean_data()
  load_data()
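
Given the arguments the Kudu loader defines above, an invocation might look like the following (the script name, host, and database names are placeholders):

  load_kudu_data.py -s tpch_parquet -t tpch_kudu -w tpch \
      --kudu_master kudu-master.example.com --buckets 9 --clean -v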
Example #6
  parser.add_argument('--explain-only', action='store_true',
      help="Don't run the queries, only explain them, to see if there was an error "
      "in planning.")
  profiles = dict()
  for profile in PROFILES:
    profile_name = profile.__name__
    if profile_name.endswith('Profile'):
      profile_name = profile_name[:-1 * len('Profile')]
    profiles[profile_name.lower()] = profile
  parser.add_argument('--profile', default='default',
      choices=sorted(profiles.keys()),
      help='Determines the mix of SQL features to use during query generation.')
  # TODO: Seed the random query generator for repeatable queries?

  args = parser.parse_args()
  cli_options.configure_logging(
      args.log_level, debug_log_file=args.debug_log_file, log_thread_name=True)
  cluster = cli_options.create_cluster(args)

  ref_conn = cli_options.create_connection(args, args.ref_db_type, db_name=args.db_name)
  if args.test_db_type == IMPALA:
    test_conn = cluster.impala.connect(db_name=args.db_name)
  elif args.test_db_type == HIVE:
    test_conn = cluster.hive.connect(db_name=args.db_name)
  else:
    test_conn = cli_options.create_connection(
        args, args.test_db_type, db_name=args.db_name)
  # Create an instance of profile class (e.g. DefaultProfile)
  query_profile = profiles[args.profile]()
  if args.explain_only:
    searcher = FrontendExceptionSearcher(query_profile, ref_conn, test_conn)
    searcher.search(args.query_count)
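
Example #6 picks the test connection by engine: Impala and Hive go through the mini-cluster object, anything else through the generic connector. A minimal stand-in for that dispatch (the connect function here is a placeholder, not the Impala test API):

IMPALA, HIVE, POSTGRESQL = 'IMPALA', 'HIVE', 'POSTGRESQL'

def connect(db_type, db_name):
    # Cluster-managed engines get dedicated connectors; everything else
    # falls through to a generic connection factory.
    if db_type == IMPALA:
        return 'impala://' + db_name  # stand-in for cluster.impala.connect()
    elif db_type == HIVE:
        return 'hive://' + db_name    # stand-in for cluster.hive.connect()
    return 'generic://' + db_name     # stand-in for create_connection()

print(connect(POSTGRESQL, 'functional'))  # generic://functional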