Example #1
  def test_hive_udfs_missing_jar(self, vector):
    """ IMPALA-2365: Impalad shouldn't crash if the udf jar isn't present
    on HDFS"""
    # Copy hive-exec.jar to a temporary file
    jar_path = "tmp/" + get_random_id(5) + ".jar"
    self.hdfs_client.copy('test-warehouse/hive-exec.jar', jar_path)
    drop_fn_stmt = "drop function if exists default.pi_missing_jar()"
    create_fn_stmt = "create function default.pi_missing_jar() returns double \
        location '/%s' symbol='org.apache.hadoop.hive.ql.udf.UDFPI'" % jar_path

    cluster = ImpalaCluster()
    impalad = cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    # Create and drop functions with sync_ddl to make sure they are reflected
    # in every impalad.
    exec_option = vector.get_value('exec_option')
    exec_option['sync_ddl'] = 1

    self.execute_query_expect_success(client, drop_fn_stmt, exec_option)
    self.execute_query_expect_success(client, create_fn_stmt, exec_option)
    # Delete the udf jar
    self.hdfs_client.delete_file_dir(jar_path)

    different_impalad = cluster.get_different_impalad(impalad)
    client = different_impalad.service.create_beeswax_client()
    # Run a query using the udf from an impalad other than the one
    # we used to create the function. This is to bypass loading from
    # the cache
    try:
      self.execute_query_using_client(client,
          "select default.pi_missing_jar()", vector)
      assert False, "Query expected to fail"
    except ImpalaBeeswaxException as e:
      assert "Failed to get file info" in str(e)
 def test_query_profile_encoded_unknown_query_id(self):
   """Test that /query_profile_encoded error message starts with the expected line in
   case of missing query and does not contain any leading whitespace.
   """
   cluster = ImpalaCluster()
   impalad = cluster.get_any_impalad()
   result = impalad.service.read_debug_webpage("query_profile_encoded?query_id=123")
   assert result.startswith("Could not obtain runtime profile: Query id")
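The read_debug_webpage() call above is, at bottom, an HTTP GET against the
daemon's debug web server. A minimal standalone sketch, assuming the default
impalad webserver port 25000 (the helper name and signature are illustrative,
not the test framework's actual API):

import requests

def read_debug_webpage(hostname, page, port=25000):
  # e.g. read_debug_webpage("localhost", "query_profile_encoded?query_id=123")
  return requests.get("http://%s:%d/%s" % (hostname, port, page)).text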
Example #3
  def test_create_drop_data_src(self, vector):
    """This will create, run, and drop the same data source repeatedly, exercising
    the lib cache mechanism.
    """
    create_ds_stmt = ("CREATE DATA SOURCE test_data_src "
        "LOCATION '%s/data-sources/test-data-source.jar' "
        "CLASS 'com.cloudera.impala.extdatasource.AllTypesDataSource' "
        "API_VERSION 'V1'" % WAREHOUSE)
    create_tbl_stmt = """CREATE TABLE data_src_tbl (x int)
        PRODUCED BY DATA SOURCE test_data_src('dummy_init_string')"""
    drop_ds_stmt = "drop data source %s test_data_src"
    drop_tbl_stmt = "drop table %s data_src_tbl"
    select_stmt = "select * from data_src_tbl limit 1"
    class_cache_hits_metric = "external-data-source.class-cache.hits"
    class_cache_misses_metric = "external-data-source.class-cache.misses"

    create_stmts = [create_ds_stmt, create_tbl_stmt]
    drop_stmts = [drop_tbl_stmt, drop_ds_stmt]

    # Get the impalad to capture metrics
    impala_cluster = ImpalaCluster()
    impalad = impala_cluster.get_first_impalad()

    # Initial metric values
    class_cache_hits = impalad.service.get_metric_value(class_cache_hits_metric)
    class_cache_misses = impalad.service.get_metric_value(class_cache_misses_metric)
    # Test with 1 node so we can check the metrics on only the coordinator
    vector.get_value('exec_option')['num_nodes'] = 1
    num_iterations = 2
    self.create_drop_ddl(vector, "data_src_test", create_stmts, drop_stmts,
        select_stmt, num_iterations)

    # Check class cache metrics. There shouldn't be any new cache hits, and there
    # should be 2 cache misses for every iteration (the jar is loaded by both the
    # FE and BE).
    expected_cache_misses = class_cache_misses + (num_iterations * 2)
    impalad.service.wait_for_metric_value(class_cache_hits_metric, class_cache_hits)
    impalad.service.wait_for_metric_value(class_cache_misses_metric,
        expected_cache_misses)

    # Test with a table that caches the class
    create_tbl_stmt = """CREATE TABLE data_src_tbl (x int)
        PRODUCED BY DATA SOURCE test_data_src('CACHE_CLASS::dummy_init_string')"""
    create_stmts = [create_ds_stmt, create_tbl_stmt]
    # Run once before capturing metrics because the class may already be cached from
    # a previous test run.
    # TODO: Provide a way to clear the cache
    self.create_drop_ddl(vector, "data_src_test", create_stmts, drop_stmts,
        select_stmt, 1)

    # Capture metric values and run again, should hit the cache.
    class_cache_hits = impalad.service.get_metric_value(class_cache_hits_metric)
    class_cache_misses = impalad.service.get_metric_value(class_cache_misses_metric)
    self.create_drop_ddl(vector, "data_src_test", create_stmts, drop_stmts,
        select_stmt, 1)
    impalad.service.wait_for_metric_value(class_cache_hits_metric, class_cache_hits + 2)
    impalad.service.wait_for_metric_value(class_cache_misses_metric, class_cache_misses)
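The create_drop_ddl() helper is not shown in this example. A plausible
reconstruction from its call sites, assuming the '%s' placeholder in each drop
statement takes "if exists" and using the execute_query() API seen elsewhere in
these examples:

def create_drop_ddl(self, vector, db_name, create_stmts, drop_stmts, select_stmt,
    num_iterations):
  # Hypothetical sketch; 'db_name' handling is elided.
  exec_option = vector.get_value('exec_option')
  for _ in range(num_iterations):
    # Drop first so every iteration starts from a clean slate.
    for stmt in drop_stmts:
      self.execute_query(stmt % "if exists", exec_option)
    for stmt in create_stmts:
      self.execute_query(stmt, exec_option)
    self.execute_query(select_stmt, exec_option)
    for stmt in drop_stmts:
      self.execute_query(stmt % "if exists", exec_option)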
Example #4
  def __test_invalid_result_caching(self, sql_stmt):
    """ Tests that invalid requests for query-result caching fail
    using the given sql_stmt."""
    impala_cluster = ImpalaCluster.get_e2e_test_cluster()
    impalad = impala_cluster.impalads[0].service

    execute_statement_req = TCLIService.TExecuteStatementReq()
    execute_statement_req.sessionHandle = self.session_handle
    execute_statement_req.statement = sql_stmt
    execute_statement_req.confOverlay = dict()

    # Test that a malformed result-cache size returns an error.
    execute_statement_req.confOverlay[self.IMPALA_RESULT_CACHING_OPT] = "bad_number"
    execute_statement_resp = self.hs2_client.ExecuteStatement(execute_statement_req)
    HS2TestSuite.check_response(execute_statement_resp,
        TCLIService.TStatusCode.ERROR_STATUS,
        "Invalid value 'bad_number' for 'impala.resultset.cache.size' option")
    self.__verify_num_cached_rows(0)
    assert 0 == impalad.get_num_in_flight_queries()

    # Test that a result-cache size exceeding the per-Impalad maximum returns an error.
    # The default maximum result-cache size is 100000.
    execute_statement_req.confOverlay[self.IMPALA_RESULT_CACHING_OPT] = "100001"
    execute_statement_resp = self.hs2_client.ExecuteStatement(execute_statement_req)
    HS2TestSuite.check_response(execute_statement_resp,
        TCLIService.TStatusCode.ERROR_STATUS,
        "Requested result-cache size of 100001 exceeds Impala's maximum of 100000")
    self.__verify_num_cached_rows(0)
    assert 0 == impalad.get_num_in_flight_queries()
Example #5
  def test_run_invalidate_refresh(self, vector):
    """Verifies that running concurrent invalidate table/catalog and refresh commands
    don't cause failures with other running workloads and ensures catalog versions
    are strictly increasing."""
    target_db = self.execute_scalar('select current_database()', vector=vector)
    impala_cluster = ImpalaCluster.get_e2e_test_cluster()
    impalad = impala_cluster.impalads[0].service
    catalogd = impala_cluster.catalogd.service

    for i in xrange(NUM_ITERATIONS):
      # Get the catalog versions for the table before running the workload
      before_versions = dict()
      before_versions['catalogd'] =\
          self.get_table_version(catalogd, target_db, 'lineitem')
      before_versions['impalad'] = self.get_table_version(impalad, target_db, 'lineitem')

      self.run_test_case('stress-with-invalidate-refresh', vector)

      # Get the catalog versions for the table after running the workload
      after_versions = dict()
      after_versions['catalogd'] = self.get_table_version(catalogd, target_db, 'lineitem')
      after_versions['impalad'] = self.get_table_version(impalad, target_db, 'lineitem')

      # Catalog versions should be strictly increasing
      assert before_versions['impalad'] < after_versions['impalad']
      assert before_versions['catalogd'] < after_versions['catalogd']
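get_table_version() is not shown in this example. One plausible implementation
scrapes the daemon's /catalog_object debug endpoint; the page layout (a
"version: N" field) and the regex below are assumptions:

import re

def get_table_version(self, service, db_name, table_name):
  dump = service.read_debug_webpage(
      "catalog_object?object_type=TABLE&object_name=%s.%s" % (db_name, table_name))
  return int(re.search(r'version: (\d+)', dump).group(1))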
Example #6
  def _verify_describe_view(self, vector, view_name, expected_substr):
    """
    Verify across all impalads that the view 'view_name' has the given substring in its
    expanded SQL.

    If SYNC_DDL is enabled, the verification should complete immediately. Otherwise,
    loops waiting for the expected condition to pass.
    """
    if vector.get_value('exec_option')['sync_ddl']:
      num_attempts = 1
    else:
      num_attempts = 60
    for impalad in ImpalaCluster.get_e2e_test_cluster().impalads:
      client = impalad.service.create_beeswax_client()
      try:
        for attempt in itertools.count(1):
          assert attempt <= num_attempts, "ran out of attempts"
          try:
            result = self.execute_query_expect_success(
                client, "describe formatted %s" % view_name)
            exp_line = [l for l in result.data if 'View Expanded' in l][0]
          except ImpalaBeeswaxException as e:
            # In non-SYNC_DDL tests, it's OK to get a "missing view" type error
            # until the metadata propagates.
            exp_line = "Exception: %s" % e
          if expected_substr in exp_line.lower():
            return
          time.sleep(1)
      finally:
        client.close()
Example #7
  def test_jvm_mem_tracking(self, vector):
    service = ImpalaCluster.get_e2e_test_cluster().impalads[0].service
    verifier = MemUsageVerifier(service)
    proc_values = verifier.get_mem_usage_values('Process')
    proc_total = proc_values['total']
    proc_limit = proc_values['limit']
    max_heap_size = verifier.get_mem_usage_values('JVM: max heap size')['total']
    non_heap_committed = verifier.get_mem_usage_values('JVM: non-heap committed')['total']
    MB = 1024 * 1024
    LOG.info("proc_total={0}, max_heap_size={1} non_heap_committed={2}".format(
        proc_total, max_heap_size, non_heap_committed))
    # The max heap size will be lower than -Xmx but should be in the same general range.
    assert max_heap_size >= 900 * MB and max_heap_size <= 1024 * MB
    # The non-heap committed value is hard to predict but should be non-zero.
    assert non_heap_committed > 0
    # Process mem consumption should include both of the above values.
    assert proc_total > max_heap_size + non_heap_committed

    # Make sure that the admittable memory is within 100MB of the process limit
    # minus the heap size (there may be some rounding errors).
    backend_json = json.loads(service.read_debug_webpage('backends?json'))
    admit_limit_human_readable = backend_json['backends'][0]['admit_mem_limit']
    admit_limit = parse_mem_value(admit_limit_human_readable)
    LOG.info("proc_limit={0}, admit_limit={1}".format(proc_limit, admit_limit))
    assert abs(admit_limit - (proc_limit - max_heap_size)) <= 100 * MB
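parse_mem_value() converts the human-readable sizes from the backends page back
into bytes. A self-contained sketch, assuming values look like "512.00 MB" or
"8.05 GB":

def parse_mem_value(value):
  # Assumed input format: "<number> <unit>", e.g. "100.00 MB".
  units = {"B": 1, "KB": 1 << 10, "MB": 1 << 20, "GB": 1 << 30, "TB": 1 << 40}
  number, unit = value.split()
  return int(float(number) * units[unit.upper()])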
Example #8
  def test_pull_stats_profile(self, vector, unique_database):
    """Checks that the frontend profile includes metrics when computing
       incremental statistics.
    """
    client = ImpalaCluster().impalads[0].service.create_beeswax_client()
    try:
      create = "create table test like functional.alltypes"
      load = "insert into test partition(year, month) select * from functional.alltypes"
      insert = """insert into test partition(year=2009, month=1) values
                  (29349999, true, 4, 4, 4, 40,4.400000095367432,40.4,
                  "10/21/09","4","2009-10-21 03:24:09.600000000")"""
      stats_all = "compute incremental stats test"
      stats_part = "compute incremental stats test partition (year=2009,month=1)"

      # Checks that profile does not have metrics for incremental stats when
      # the operation is not 'compute incremental stats'.
      self.execute_query_expect_success(client, "use %s" % unique_database)
      profile = self.execute_query_expect_success(client, create).runtime_profile
      assert profile.count("StatsFetch") == 0
      # Checks that incremental stats metrics are present when 'compute incremental
      # stats' is run. Since the table has no stats, expect that no bytes are fetched.
      self.execute_query_expect_success(client, load)
      profile = self.execute_query_expect_success(client, stats_all).runtime_profile
      assert profile.count("StatsFetch") > 1
      assert profile.count("StatsFetch.CompressedBytes: 0") == 1
      # Checks that bytes fetched is non-zero since incremental stats are present now
      # and should have been fetched.
      self.execute_query_expect_success(client, insert)
      profile = self.execute_query_expect_success(client, stats_part).runtime_profile
      assert profile.count("StatsFetch") > 1
      assert profile.count("StatsFetch.CompressedBytes") == 1
      assert profile.count("StatsFetch.CompressedBytes: 0") == 0
      # Adds a partition, computes stats, and checks that the metrics in the profile
      # reflect the operation.
      alter = "alter table test add partition(year=2011, month=1)"
      insert_new_partition = """
          insert into test partition(year=2011, month=1) values
          (29349999, true, 4, 4, 4, 40,4.400000095367432,40.4,
          "10/21/09","4","2009-10-21 03:24:09.600000000")
          """
      self.execute_query_expect_success(client, alter)
      self.execute_query_expect_success(client, insert_new_partition)
      profile = self.execute_query_expect_success(client, stats_all).runtime_profile
      assert profile.count("StatsFetch.TotalPartitions: 25") == 1
      assert profile.count("StatsFetch.NumPartitionsWithStats: 24") == 1
    finally:
      client.close()
Example #9
  def test_hash_join_timer(self, vector):
    # This test runs serially because it requires the query to come back within
    # some amount of time. Running this with other tests makes it hard to bound
    # that time. It also assumes that it will be able to get a thread token to
    # execute the join build in parallel.
    test_case = vector.get_value('test cases')
    query = test_case[0]
    join_type = test_case[1]

    # Ensure that the cluster is idle before starting the test query.
    for impalad in ImpalaCluster.get_e2e_test_cluster().impalads:
      verifier = MetricVerifier(impalad.service)
      verifier.wait_for_metric("impala-server.num-fragments-in-flight", 0)

    # Execute the query. The query summary and profile are stored in 'result'.
    result = self.execute_query(query, vector.get_value('exec_option'))

    # Parse the query summary; the join node is "id=3".
    # In the ExecSummary, search for the join operator's summary and verify the
    # avg and max times are within acceptable limits.
    exec_summary = result.exec_summary
    check_execsummary_count = 0
    join_node_name = "03:%s" % (join_type)
    for line in exec_summary:
      if line['operator'] == join_node_name:
        avg_time_ms = line['avg_time'] / self.NANOS_PER_MILLI
        self.__verify_join_time(avg_time_ms, "ExecSummary Avg")
        max_time_ms = line['max_time'] / self.NANOS_PER_MILLI
        self.__verify_join_time(max_time_ms, "ExecSummary Max")
        check_execsummary_count += 1
    assert (check_execsummary_count == 1), \
        "Unable to verify ExecSummary: {0}".format(exec_summary)

    # Parse the query profile; the join node is "id=3".
    # In the profiles, search for lines containing "(id=3)" and parse for the avg and
    # non-child times to verify that they are within acceptable limits. Also verify
    # that the build side is built in a different thread by searching for the string:
    # "Join Build-Side Prepared Asynchronously"
    profile = result.runtime_profile
    check_fragment_count = 0
    async_build = False
    for line in profile.split("\n"):
      if ("(id=3)" in line):
        # Sample line:
        # HASH_JOIN_NODE (id=3):(Total: 3s580ms, non-child: 11.89ms, % non-child: 0.31%)
        strip1 = re.split("non-child: ", line)[1]
        non_child_time = re.split(", ", strip1)[0]
        non_child_time_ms = parse_duration_string_ms(non_child_time)
        self.__verify_join_time(non_child_time_ms, "Fragment non-child")
        check_fragment_count += 1
      # Search for "Join Build-Side Prepared Asynchronously"
      if ("Join Build-Side Prepared Asynchronously" in line):
        async_build = True
    assert (async_build), "Join is not prepared asynchronously: {0}".format(profile)
    assert (check_fragment_count > 1), \
        "Unable to verify Fragment or Average Fragment: {0}".format(profile)
  def _start_impala_cluster(cls, options, impala_log_dir=os.getenv('LOG_DIR', "/tmp/"),
      cluster_size=DEFAULT_CLUSTER_SIZE, num_coordinators=NUM_COORDINATORS,
      use_exclusive_coordinators=False, log_level=1,
      expected_num_executors=DEFAULT_CLUSTER_SIZE, default_query_options=None):
    cls.impala_log_dir = impala_log_dir
    # We ignore TEST_START_CLUSTER_ARGS here. Custom cluster tests specifically test that
    # certain custom startup arguments work and we want to keep them independent of dev
    # environments.
    cmd = [os.path.join(IMPALA_HOME, 'bin/start-impala-cluster.py'),
           '--cluster_size=%d' % cluster_size,
           '--num_coordinators=%d' % num_coordinators,
           '--log_dir=%s' % impala_log_dir,
           '--log_level=%s' % log_level]

    if use_exclusive_coordinators:
      cmd.append("--use_exclusive_coordinators")

    if pytest.config.option.use_local_catalog:
      cmd.append("--impalad_args=--use_local_catalog=1")
      cmd.append("--catalogd_args=--catalog_topic_mode=minimal")

    if pytest.config.option.pull_incremental_statistics:
      cmd.append("--impalad_args=%s --catalogd_args=%s" %
                 ("--pull_incremental_statistics", "--pull_incremental_statistics"))

    default_query_option_kvs = []
    # Put any defaults first, then any arguments after that so they can override defaults.
    if os.environ.get("ERASURE_CODING") == "true":
      default_query_option_kvs.append(("allow_erasure_coded_files", "true"))
    if default_query_options is not None:
      default_query_option_kvs.extend(default_query_options)
    # Add the default query options after any arguments. This will override any default
    # options set in --impalad_args by design to force tests to pass default_query_options
    # into this function directly.
    options.append("--impalad_args=--default_query_options={0}".format(
        ','.join(["{0}={1}".format(k, v) for k, v in default_query_option_kvs])))

    logging.info("Starting cluster with command: %s" %
                 " ".join(pipes.quote(arg) for arg in cmd + options))
    try:
      check_call(cmd + options, close_fds=True)
    finally:
      # Failure tests expect cluster to be initialised even if start-impala-cluster fails.
      cls.cluster = ImpalaCluster.get_e2e_test_cluster()
    statestored = cls.cluster.statestored
    if statestored is None:
      raise Exception("statestored was not found")

    # The number of statestore subscribers is
    # cluster_size (# of impalad) + 1 (for catalogd).
    expected_subscribers = cluster_size + 1

    statestored.service.wait_for_live_subscribers(expected_subscribers, timeout=60)
    for impalad in cls.cluster.impalads:
      impalad.service.wait_for_num_known_live_backends(expected_num_executors, timeout=60)
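A hypothetical call site for _start_impala_cluster(), e.g. from a custom-cluster
test's setup method; every flag and option value below is illustrative, not
required:

cls._start_impala_cluster(
    options=['--impalad_args=--abort_on_config_error=false'],
    cluster_size=3,
    num_coordinators=1,
    use_exclusive_coordinators=True,
    default_query_options=[('default_spillable_buffer_size', '256k')])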
Example #11
 def verify_mem_usage(self, non_zero_peak_metrics):
   """Verifies that the memory used by KRPC is returned to the memtrackers and that
   metrics in 'non_zero_peak_metrics' have a peak value > 0.
   """
   verifiers = [MemUsageVerifier(i.service)
                for i in ImpalaCluster.get_e2e_test_cluster().impalads]
   for verifier in verifiers:
     for metric_name in ALL_METRICS:
       usage = verifier.get_mem_usage_values(metric_name)
       assert usage["total"] == 0
       if metric_name in non_zero_peak_metrics:
         assert usage["peak"] > 0, metric_name
Example #12
 def test_insert_mem_limit(self, vector):
   if (vector.get_value('table_format').file_format == 'parquet'):
     vector.get_value('exec_option')['COMPRESSION_CODEC'] = \
         vector.get_value('compression_codec')
   self.run_test_case('QueryTest/insert-mem-limit', vector,
       multiple_impalad=vector.get_value('exec_option')['sync_ddl'] == 1)
   # IMPALA-7023: These queries can linger and use up memory, causing subsequent
   # tests to hit memory limits. Wait for some time to allow the query to
   # be reclaimed.
   verifiers = [MetricVerifier(i.service)
                for i in ImpalaCluster.get_e2e_test_cluster().impalads]
   for v in verifiers:
     v.wait_for_metric("impala-server.num-fragments-in-flight", 0, timeout=60)
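MetricVerifier.wait_for_metric() amounts to polling a metric until it reaches
the expected value or a timeout expires. A minimal standalone sketch, assuming
the get_metric_value() accessor used elsewhere in these examples:

import time

def wait_for_metric(service, metric_name, expected_value, timeout=60, interval=1):
  start = time.time()
  while time.time() - start < timeout:
    if service.get_metric_value(metric_name) == expected_value:
      return
    time.sleep(interval)
  raise AssertionError("metric %s did not reach %s within %ss" %
                       (metric_name, expected_value, timeout))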
Example #13
  def test_hive_udfs_missing_jar(self, vector, unique_database):
    """ IMPALA-2365: Impalad shouldn't crash if the udf jar isn't present
    on HDFS"""
    # Copy hive-exec.jar to a temporary file
    jar_path = get_fs_path("/test-warehouse/{0}.db/".format(unique_database)
                           + get_random_id(5) + ".jar")
    hive_jar = get_fs_path("/test-warehouse/hive-exec.jar")
    check_call(["hadoop", "fs", "-cp", hive_jar, jar_path])
    drop_fn_stmt = (
        "drop function if exists "
        "`{0}`.`pi_missing_jar`()".format(unique_database))
    create_fn_stmt = (
        "create function `{0}`.`pi_missing_jar`() returns double location '{1}' "
        "symbol='org.apache.hadoop.hive.ql.udf.UDFPI'".format(unique_database, jar_path))

    cluster = ImpalaCluster()
    impalad = cluster.get_any_impalad()
    client = impalad.service.create_beeswax_client()
    # Create and drop functions with sync_ddl to make sure they are reflected
    # in every impalad.
    exec_option = copy(vector.get_value('exec_option'))
    exec_option['sync_ddl'] = 1

    self.execute_query_expect_success(client, drop_fn_stmt, exec_option)
    self.execute_query_expect_success(client, create_fn_stmt, exec_option)
    # Delete the udf jar
    check_call(["hadoop", "fs", "-rm", jar_path])

    different_impalad = cluster.get_different_impalad(impalad)
    client = different_impalad.service.create_beeswax_client()
    # Run a query using the udf from an impalad other than the one
    # we used to create the function. This is to bypass loading from
    # the cache
    try:
      self.execute_query_using_client(
          client, "select `{0}`.`pi_missing_jar`()".format(unique_database), vector)
      assert False, "Query expected to fail"
    except ImpalaBeeswaxException as e:
      assert "Failed to get file info" in str(e)
Example #14
  def test_failure_in_prepare(self):
    # Fail the scan node
    verifiers = [MetricVerifier(i.service)
                 for i in ImpalaCluster.get_e2e_test_cluster().impalads]
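    # DEBUG_ACTION appears to encode <node id>:<instance idx>:<phase>:<action>;
    # -1 targets every plan node, so each node fails in its Prepare() phase.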
    self.client.execute("SET DEBUG_ACTION='-1:0:PREPARE:FAIL'");
    try:
      self.client.execute("SELECT COUNT(*) FROM functional.alltypes")
      assert "Query should have thrown an error"
    except ImpalaBeeswaxException:
      pass

    for v in verifiers:
      v.wait_for_metric(self.IN_FLIGHT_FRAGMENTS, 0)
Example #15
  def test_table_is_cached(self, vector):
    cached_read_metric = "impala-server.io-mgr.cached-bytes-read"
    query_string = "select count(*) from tpch.nation"
    expected_bytes_delta = 2199
    impala_cluster = ImpalaCluster.get_e2e_test_cluster()

    # Collect the cached read metric on all the impalads before running the query
    cached_bytes_before = list()
    for impalad in impala_cluster.impalads:
      cached_bytes_before.append(impalad.service.get_metric_value(cached_read_metric))

    # Execute the query.
    result = self.execute_query(query_string)
    assert(len(result.data) == 1)
    assert(result.data[0] == '25')

    # Read the metrics again.
    cached_bytes_after = list()
    for impalad in impala_cluster.impalads:
      cached_bytes_after.append(impalad.service.get_metric_value(cached_read_metric))

    # Verify that the cached bytes increased by the expected number on exactly one of
    # the impalads.
    num_metrics_increased = 0
    assert(len(cached_bytes_before) == len(cached_bytes_after))
    for i in range(0, len(cached_bytes_before)):
      assert(cached_bytes_before[i] == cached_bytes_after[i] or\
             cached_bytes_before[i] + expected_bytes_delta == cached_bytes_after[i])
      if cached_bytes_after[i] > cached_bytes_before[i]:
        num_metrics_increased = num_metrics_increased + 1

    if IS_DOCKERIZED_TEST_CLUSTER:
      assert num_metrics_increased == 0, "HDFS caching is disabled in dockerised cluster."
    elif num_metrics_increased != 1:
      # Test failed; print the metrics for debugging.
      for i in range(0, len(cached_bytes_before)):
        print("%d %d" % (cached_bytes_before[i], cached_bytes_after[i]))
      assert False, "Expected the metric to increase on exactly one impalad"
Example #16
  def test_failure_in_prepare_multi_fragment(self):
    # Test that if one fragment fails that the others are cleaned up during the ensuing
    # cancellation.
    verifiers = [MetricVerifier(i.service)
                 for i in ImpalaCluster.get_e2e_test_cluster().impalads]
    # Fail the scan node
    self.client.execute("SET DEBUG_ACTION='-1:0:PREPARE:FAIL'");

    # Force a query plan that will have three fragments or more.
    try:
      self.client.execute("SELECT COUNT(*) FROM functional.alltypes a JOIN [SHUFFLE] \
        functional.alltypes b on a.id = b.id")
      assert "Query should have thrown an error"
    except ImpalaBeeswaxException:
      pass

    for v in verifiers:
      # Long timeout required because fragments may be blocked while sending data. The
      # default value of --datastream_sender_timeout_ms is 120s before they wake up and
      # cancel themselves.
      #
      # TODO: Fix when we have cancellable RPCs.
      v.wait_for_metric(self.IN_FLIGHT_FRAGMENTS, 0, timeout=125)
Example #17
  def test_views_describe(self, vector, unique_database):
    # IMPALA-6896: Tests that altered views can be described by all impalads.
    impala_cluster = ImpalaCluster.get_e2e_test_cluster()
    impalads = impala_cluster.impalads
    view_name = "%s.test_describe_view" % unique_database
    query_opts = vector.get_value('exec_option')
    first_client = impalads[0].service.create_beeswax_client()
    try:
      # Create a view and verify it's visible.
      self.execute_query_expect_success(first_client,
                                        "create view {0} as "
                                        "select * from functional.alltypes"
                                        .format(view_name), query_opts)
      self._verify_describe_view(vector, view_name, "select * from functional.alltypes")

      # Alter the view and verify the alter is visible.
      self.execute_query_expect_success(first_client,
                                        "alter view {0} as "
                                        "select * from functional.alltypesagg"
                                        .format(view_name), query_opts)
      self._verify_describe_view(vector, view_name,
                                 "select * from functional.alltypesagg")
    finally:
      first_client.close()
Example #18
  def test_get_functions(self, vector, unique_database):
    impala_cluster = ImpalaCluster.get_e2e_test_cluster()
    catalogd = impala_cluster.catalogd.service
    trans_type = 'buffered'
    if pytest.config.option.use_kerberos:
      trans_type = 'kerberos'
    transport = create_transport(host=catalogd.hostname, port=catalogd.service_port,
                                 service='impala', transport_type=trans_type)
    transport.open()
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    catalog_client = CatalogService.Client(protocol)

    request = TGetFunctionsRequest()
    request.db_name = unique_database
    response = catalog_client.GetFunctions(request)
    assert response.status.status_code == TErrorCode.OK
    assert len(response.functions) == 0

    self.client.execute("create function %s.fn() RETURNS int "
                        "LOCATION '%s/libTestUdfs.so' SYMBOL='Fn'"
                        % (unique_database, WAREHOUSE))

    response = catalog_client.GetFunctions(request)
    LOG.debug(response)
    assert len(response.functions) == 1
    assert len(response.functions[0].arg_types) == 0
    assert response.functions[0].name.db_name == unique_database
    assert response.functions[0].name.function_name == 'fn'
    assert response.functions[0].aggregate_fn is None
    assert response.functions[0].scalar_fn is not None
    assert '/test-warehouse/libTestUdfs.so' in response.functions[0].hdfs_location

    # Add another scalar function with overloaded parameters ensure it shows up.
    self.client.execute("create function %s.fn(int) RETURNS double "\
        "LOCATION '%s/libTestUdfs.so' SYMBOL='Fn'" % (unique_database, WAREHOUSE))
    response = catalog_client.GetFunctions(request)
    LOG.debug(response)
    assert response.status.status_code == TErrorCode.OK
    assert len(response.functions) == 2

    functions = [fn for fn in response.functions]

    # Sort by the number of arguments in the function (ascending)
    functions.sort(key=lambda fn: len(fn.arg_types))
    assert len(functions[0].arg_types) == 0
    assert len(functions[1].arg_types) == 1
    assert functions[0].signature == 'fn()'
    assert functions[1].signature == 'fn(INT)'

    # Verify aggregate functions can also be retrieved
    self.client.execute("create aggregate function %s.agg_fn(int, string) RETURNS int "
                        "LOCATION '%s/libTestUdas.so' UPDATE_FN='TwoArgUpdate'"
                        % (unique_database, WAREHOUSE))
    response = catalog_client.GetFunctions(request)
    LOG.debug(response)
    assert response.status.status_code == TErrorCode.OK
    assert len(response.functions) == 3
    functions = [fn for fn in response.functions if fn.aggregate_fn is not None]
    # Should be only 1 aggregate function
    assert len(functions) == 1

    # Negative test cases for database name
    request.db_name = unique_database + "_does_not_exist"
    response = catalog_client.GetFunctions(request)
    LOG.debug(response)
    assert response.status.status_code == TErrorCode.GENERAL
    assert 'Database does not exist: ' in str(response.status)

    request = TGetFunctionsRequest()
    response = catalog_client.GetFunctions(request)
    LOG.debug(response)
    assert response.status.status_code == TErrorCode.GENERAL
    assert 'Database name must be set' in str(response.status)
Example #19
    try:
      self.run_test_case('QueryTest/udf-mem-limit', vector, use_db=unique_database)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException as e:
      self._check_mem_limit_exception(e)

    try:
      self.run_test_case('QueryTest/uda-mem-limit', vector, use_db=unique_database)
      assert False, "Query was expected to fail"
    except ImpalaBeeswaxException as e:
      self._check_mem_limit_exception(e)

    # It takes a long time for Impala to free up memory after this test, especially if
    # ASAN is enabled. Verify that all fragments finish executing before moving on to the
    # next test to make sure that the next test is not affected.
    for impalad in ImpalaCluster.get_e2e_test_cluster().impalads:
      verifier = MetricVerifier(impalad.service)
      verifier.wait_for_metric("impala-server.num-fragments-in-flight", 0)
      verifier.verify_num_unused_buffers()

  def test_udf_constant_folding(self, vector, unique_database):
    """Test that constant folding of UDFs is handled correctly. Uses count_rows(),
    which returns a unique value every time it is evaluated in the same thread."""
    exec_options = copy(vector.get_value('exec_option'))
    # Execute on a single node so that all counter values will be unique.
    exec_options["num_nodes"] = 1
    create_fn_query = """create function {database}.count_rows() returns bigint
                         location '{location}' symbol='Count' prepare_fn='CountPrepare'
                         close_fn='CountClose'"""
    self._load_functions(create_fn_query, vector, unique_database,
        get_fs_path('/test-warehouse/libTestUdfs.so'))
Example #20
  # Kill existing cluster processes based on the current configuration.
  if options.restart_impalad_only:
    cluster_ops.kill_all_impalads(force=options.force_kill)
  elif options.restart_catalogd_only:
    cluster_ops.kill_catalogd(force=options.force_kill)
  elif options.restart_statestored_only:
    cluster_ops.kill_statestored(force=options.force_kill)
  else:
    cluster_ops.kill_all_daemons(force=options.force_kill)

  if options.kill_only:
    sys.exit(0)

  if options.restart_impalad_only:
    impala_cluster = ImpalaCluster()
    if not impala_cluster.statestored or not impala_cluster.catalogd:
      LOG.info("No running statestored or catalogd detected. "
          "Restarting entire cluster.")
      options.restart_impalad_only = False

  try:
    if options.restart_catalogd_only:
      cluster_ops.start_catalogd()
    elif options.restart_statestored_only:
      cluster_ops.start_statestore()
    elif options.restart_impalad_only:
      cluster_ops.start_impalads(options.cluster_size, options.num_coordinators,
                                 options.use_exclusive_coordinators)
    else:
      cluster_ops.start_statestore()
Example #21
  def test_native_functions_race(self, vector, unique_database):
    """ IMPALA-6488: stress concurrent adds, uses, and deletes of native functions.
        Exposes a crash caused by use-after-free in lib-cache."""

    # Native function used by a query. Stresses lib-cache during analysis and
    # backend expressions.
    create_fn_to_use = \
      """create function {0}.use_it(string) returns string
         LOCATION '{1}'
         SYMBOL='_Z8IdentityPN10impala_udf15FunctionContextERKNS_9StringValE'"""
    use_fn = """select * from (select max(int_col) from functional.alltypesagg
                where {0}.use_it(string_col) = 'blah' union all
                (select max(int_col) from functional.alltypesagg
                 where {0}.use_it(String_col) > '1' union all
                (select max(int_col) from functional.alltypesagg
                 where {0}.use_it(string_col) > '1'))) v"""
    # Reference to another native function from the same 'so' file. Creating/dropping
    # stresses lib-cache lookup, add, and refresh.
    create_another_fn = """create function if not exists {0}.other(float)
                           returns float location '{1}' symbol='Identity'"""
    drop_another_fn = """drop function if exists {0}.other(float)"""
    udf_path = get_fs_path('/test-warehouse/libTestUdfs.so')

    # Tracks number of impalads prior to tests to check that none have crashed.
    # All impalads are assumed to be coordinators.
    cluster = ImpalaCluster.get_e2e_test_cluster()
    exp_num_coordinators = cluster.num_responsive_coordinators()

    setup_client = self.create_impala_client()
    setup_query = create_fn_to_use.format(unique_database, udf_path)
    try:
      setup_client.execute(setup_query)
    except Exception as e:
      print "Unable to create initial function: {0}".format(setup_query)
      raise

    errors = []

    def use_fn_method():
      time.sleep(1 + random.random())
      client = self.create_impala_client()
      query = use_fn.format(unique_database)
      try:
        client.execute(query)
      except Exception as e:
        errors.append(e)

    def load_fn_method():
      time.sleep(1 + random.random())
      client = self.create_impala_client()
      drop = drop_another_fn.format(unique_database)
      create = create_another_fn.format(unique_database, udf_path)
      try:
        client.execute(drop)
        client.execute(create)
      except Exception as e:
        errors.append(e)

    # number of uses/loads needed to reliably reproduce the bug.
    num_uses = 200
    num_loads = 200

    # create threads to use native function.
    runner_threads = []
    for i in xrange(num_uses):
      runner_threads.append(threading.Thread(target=use_fn_method))

    # create threads to drop/create native functions.
    for i in xrange(num_loads):
      runner_threads.append(threading.Thread(target=load_fn_method))

    # launch all runner threads.
    for t in runner_threads: t.start()

    # join all threads.
    for t in runner_threads: t.join()

    for e in errors: print(e)

    # Checks that no impalad has crashed.
    assert cluster.num_responsive_coordinators() == exp_num_coordinators
Example #22
 def _run_query_all_impalads(self, exec_options, query, expected):
   impala_cluster = ImpalaCluster.get_e2e_test_cluster()
   for impalad in impala_cluster.impalads:
     client = impalad.service.create_beeswax_client()
     result = self.execute_query_expect_success(client, query, exec_options)
     assert result.data == expected
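A hypothetical call; result rows come back as strings, so the expected value for
a count over the 25-row tpch.nation table is the list ['25']:

self._run_query_all_impalads(
    exec_options, "select count(*) from tpch.nation", ['25'])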
Example #23
  kill_all(force=options.force_kill)

  try:
    import json
    wait_for_cluster = wait_for_cluster_web
  except ImportError:
    print "json module not found, checking for cluster startup through the command-line"
    wait_for_cluster = wait_for_cluster_cmdline

  # If ImpalaCluster cannot be imported, fall back to the command-line to check
  # whether impalads/statestore are up.
  try:
    from tests.common.impala_cluster import ImpalaCluster
    # Make sure the processes have been killed. We loop till we can't detect a single
    # impalad or a statestore process.
    impala_cluster = ImpalaCluster()
    while len(impala_cluster.impalads) != 0 or impala_cluster.statestored or\
          impala_cluster.catalogd:
      impala_cluster.refresh()
  except ImportError:
    print('ImpalaCluster module not found.')
    wait_for_cluster = wait_for_cluster_cmdline

  if options.inprocess:
    # The statestore and the impalads start in the same process. Additionally,
    # the statestore does not have a debug webpage.
    start_mini_impala_cluster(options.cluster_size)
    wait_for_cluster_cmdline()
  else:
    try:
      start_statestore()