Example #1
 def queries_to_be_executed(self):
     '''Generator that outputs query models. They are either generated based on the query
 profile, or they are extracted from an existing report.
 '''
     if self.parent_job:
         # If parent job is specified, get the queries from the parent job report
         with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f:
             parent_report = pickle.load(f)
         for error_type in ['stack', 'row_counts', 'mismatch']:
             for query in parent_report.grouped_results[error_type]:
                 yield query['model']
     else:
         # If parent job is not specified, generate queries with QueryGenerator
         num_unexpected_errors = 0
         while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
             query = None
             try:
                 self.query_generator = QueryGenerator(self.query_profile)
                 query = self.query_generator.create_query(
                     self.common_tables)
             except IndexError as e:
                 # This is a query generator bug that happens extremely rarely
                 LOG.info('Query Generator Choice Problem, {0}'.format(e))
                 continue
             except Exception as e:
                 LOG.info('Unexpected error in queries_to_be_executed, {0}'.
                          format(e))
                 num_unexpected_errors += 1
                 if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
                     LOG.error('Num Unexpected Errors above threshold')
                     raise
                 else:
                     continue
             yield query
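A minimal sketch of how this generator might be consumed, assuming a Job-like object
named job with the run_query() method shown in the other examples on this page:

# Hypothetical driver loop: pull query models until the generator is exhausted.
for query_model in job.queries_to_be_executed():
    result_dict = job.run_query(query_model)
    if result_dict:
        job.result_list.append(result_dict)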
Example #2
def test_hive_analytics_cannot_contain_aggs(analytic_func):
  """
  Tests that the HiveProfile does not allow nested aggs inside specific analytic
  functions. The list of analytic functions that cannot contain aggs is defined by
  QueryProfile.test_hive_analytics_cannot_contain_aggs()
  """

  class FakeDefaultQueryProfile(DefaultProfile):
    """
    A DefaultProfile that forces exactly one nested expression in every expression tree
    """

    def __init__(self):
      super(FakeDefaultQueryProfile, self).__init__()
      self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)})

  class FakeHiveQueryProfile(HiveProfile):
    """
    A HiveProfile that forces exactly one nested expression in every expression tree
    """

    def __init__(self):
      super(FakeHiveQueryProfile, self).__init__()
      self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)})

  # Aggregate functions can only return specific types, such as Int, Number, or Float,
  # while certain AnalyticFuncs, such as FirstValue or LastValue, can return non-numeric
  # types. So for simplicity, these tests are only run against the Int return_type.
  if Int in [signature.return_type for signature in analytic_func.signatures()]:

    # Generate an agg + analytic func tree using the FakeDefaultQueryProfile and ensure
    # the root func is an analytic_func and that the tree contains an aggregate func. An
    # empty list of funcs is passed into the _create_agg_or_analytic_tree so that no
    # basic funcs are created. We can be sure that the root_func is an analytic_func
    # because agg_funcs cannot contain analytic_funcs.

    qgen = QueryGenerator(FakeDefaultQueryProfile())
    func_tree = qgen._create_agg_or_analytic_tree(Int, agg_funcs=AGG_FUNCS,
                                            analytic_funcs=[analytic_func],
                                            basic_funcs=[])
    assert isinstance(func_tree, analytic_func)
    assert func_tree.contains_agg

    # Do the same for the FakeHiveQueryProfile, but now check that the func_tree has no
    # aggregates.

    qgen = QueryGenerator(FakeHiveQueryProfile())
    func_tree = qgen._create_agg_or_analytic_tree(Int, agg_funcs=AGG_FUNCS,
                                              analytic_funcs=[analytic_func],
                                              basic_funcs=[])
    assert isinstance(func_tree, analytic_func)
    assert not func_tree.contains_agg
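Since the test takes analytic_func as a parameter, it is presumably parametrized over
the analytic function classes. A sketch of how that could look with pytest; the
ANALYTIC_FUNCS name is an assumption, not necessarily the module's real list:

import pytest

# Hypothetical parametrization: run the test once per analytic function class.
@pytest.mark.parametrize('analytic_func', ANALYTIC_FUNCS)
def test_hive_analytics_cannot_contain_aggs(analytic_func):
    ...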
Example #3
def generate_random_queries(impala, random_db):
  """Generator function to produce random queries. 'impala' is the Impala service
  object. random_db is the name of the database that queries should be
  generated for."""
  with impala.cursor(db_name=random_db) as cursor:
    tables = [cursor.describe_table(t) for t in cursor.list_table_names()]
  query_generator = QueryGenerator(DefaultProfile())
  model_translator = SqlWriter.create()
  while True:
    query_model = query_generator.generate_statement(tables)
    sql = model_translator.write_query(query_model)
    query = Query()
    query.sql = sql
    query.db_name = random_db
    yield query
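A usage sketch: because the generator loops forever, the caller has to bound it, for
example with itertools.islice. The execution step is an assumption:

from itertools import islice

# Hypothetical consumer: render and run ten random queries against the database.
for query in islice(generate_random_queries(impala, 'random_db'), 10):
    with impala.cursor(db_name=query.db_name) as cursor:
        cursor.execute(query.sql)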
Example #4
def generate_random_queries(impala, random_db):
    """Generator function to produce random queries. 'impala' is the Impala service
  object. random_db is the name of the database that queries should be
  generated for."""
    with impala.cursor(db_name=random_db) as cursor:
        tables = [cursor.describe_table(t) for t in cursor.list_table_names()]
    query_generator = QueryGenerator(DefaultProfile())
    model_translator = SqlWriter.create()
    while True:
        query_model = query_generator.generate_statement(tables)
        sql = model_translator.write_query(query_model)
        query = Query()
        query.sql = sql
        query.db_name = random_db
        yield query
Example #5
 def queries_to_be_executed(self):
   '''Generator that outputs query models. They are either generated based on the query
   profile, or they are extracted from an existing report.
   '''
   if self.parent_job:
     # If parent job is specified, get the queries from the parent job report
     with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f:
       parent_report = pickle.load(f)
     for error_type in ['stack', 'row_counts', 'mismatch']:
       for query in parent_report.grouped_results[error_type]:
         yield query['model']
   else:
     # If parent job is not specified, generate queries with QueryGenerator
     num_unexpected_errors = 0
     while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
       query = None
       try:
         self.query_generator = QueryGenerator(self.query_profile)
         query = self.query_generator.create_query(self.common_tables)
       except IndexError as e:
         # This is a query generator bug that happens extremely rarely
         LOG.info('Query Generator Choice Problem, {0}'.format(e))
         continue
       except Exception as e:
         LOG.info('Unexpected error in queries_to_be_executed, {0}'.format(e))
         num_unexpected_errors += 1
         if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
           LOG.error('Num Unexpected Errors above threshold')
           raise
         else:
           continue
       yield query
Example #6
    def start(self):
        try:
            self.prepare()
            self.query_generator = QueryGenerator(self.query_profile)
            self.common_tables = DbConnection.describe_common_tables(
                [self.ref_connection, self.test_connection])

            for query_model in self.queries_to_be_executed():
                LOG.info('About to execute query.')
                result_dict = self.run_query(query_model)
                LOG.info('Query Executed successfully.')
                if result_dict:
                    self.num_queries_executed += 1
                    self.result_list.append(result_dict)
                LOG.info('Time Left: {0}'.format(self.target_stop_time -
                                                 time()))
                if time() > self.target_stop_time:
                    break
            self.stop_time = time()
            self.save_pickle()
            self.generate_report()
            LOG.info('Generated Report')
        except:
            LOG.exception('Unexpected Exception in start')
            raise
        finally:
            self.impala_env.stop_docker()
            LOG.info('Docker Stopped')
            try:
                os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
                LOG.info('Schedule file removed')
            except OSError:
                LOG.info('Unable to remove schedule file.')
Example #7
def test_hive_create_equality_only_joins():
  """
  Tests that QueryGenerator produces a join condition with only equality functions if the
  HiveProfile is used.
  """

  class FakeHiveQueryProfile(HiveProfile):
    """
    A fake QueryProfile that extends the HiveProfile; various weights are modified in
    order to ensure that this test is deterministic.
    """

    def choose_join_condition_count(self):
      """
      There should be only one operator in the JOIN condition
      """
      return 1

    def choose_conjunct_disjunct_fill_ratio(self):
      """
      There should be no AND or OR operators
      """
      return 0

    def choose_relational_func_fill_ratio(self):
      """
      Force all operators to be relational
      """
      return 1

  query_generator = QueryGenerator(FakeHiveQueryProfile())

  # Create two tables that have one joinable Column
  right_table_expr_list = TableExprList()
  right_table = Table("right_table")
  right_table.add_col(Column("right_table", "right_col", Int))
  right_table_expr_list.append(right_table)

  left_table_expr_list = TableExprList()
  left_table = Table("left_table")
  left_table.add_col(Column("left_table", "left_col", Int))
  left_table_expr_list.append(left_table)

  # Validate that the root predicate is an Equals func
  assert isinstance(query_generator._create_relational_join_condition(
    right_table_expr_list, left_table_expr_list), Equals)
Example #8
def test_use_nested_width_subquery():
  """
  Tests that setting DefaultProfile.use_nested_with to False works properly. Setting this
  method to return False should prevent a WITH clause from being used inside a sub-query.
  """

  class MockQueryProfile(DefaultProfile):
    """
    A mock QueryProfile that sets use_nested_with to False and forces the
    QueryGenerator to create nested queries.
    """

    def __init__(self):
      super(MockQueryProfile, self).__init__()

      # Force the QueryGenerator to create nested queries
      self._bounds['MAX_NESTED_QUERY_COUNT'] = (4, 4)

      # Force the QueryGenerator to use WITH clauses whenever possible
      self._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = 1

      # Force the QueryGenerator to create inline views whenever possible
      self._probabilities['MISC']['INLINE_VIEW'] = 1

    def use_nested_with(self):
      return False

  mock_query_gen = QueryGenerator(MockQueryProfile())

  # Create two tables
  table_expr_list = TableExprList()

  right_table = Table("right_table")
  right_table.add_col(Column("right_table", "right_col", Int))
  table_expr_list.append(right_table)

  left_table = Table("left_table")
  left_table.add_col(Column("left_table", "left_col", Int))
  table_expr_list.append(left_table)

  # Check that each nested_query doesn't have a with clause
  for nested_query in mock_query_gen.generate_statement(table_expr_list).nested_queries:
    assert nested_query.with_clause is None
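The override pattern shown here generalizes: any entry in DefaultProfile's _bounds or
_probabilities dictionaries can be pinned to make generation deterministic. A sketch
that reuses the MAX_NESTED_QUERY_COUNT key from the test above to disable nesting
entirely; whether (0, 0) is an accepted bound is an assumption:

class NoNestedQueryProfile(DefaultProfile):
    '''Hypothetical profile that never generates nested queries.'''

    def __init__(self):
        super(NoNestedQueryProfile, self).__init__()
        self._bounds['MAX_NESTED_QUERY_COUNT'] = (0, 0)  # assumed to mean "no nesting"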
Example #9
def test_use_nested_width_subquery():
  """
  Tests that setting DefaultProfile.use_nested_with to False works properly. Setting this
  method to return False should prevent a WITH clause from being used inside a sub-query.
  """

  class MockQueryProfile(DefaultProfile):
    """
    A mock QueryProfile that sets use_nested_with to False and forces the
    QueryGenerator to create nested queries.
    """

    def __init__(self):
      super(MockQueryProfile, self).__init__()

      # Force the QueryGenerator to create nested queries
      self._bounds['MAX_NESTED_QUERY_COUNT'] = (4, 4)

      # Force the QueryGenerator to use WITH clauses whenever possible
      self._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = 1

      # Force the QueryGenerator to create inline views whenever possible
      self._probabilities['MISC']['INLINE_VIEW'] = 1

    def use_nested_with(self):
      return False

  mock_query_gen = QueryGenerator(MockQueryProfile())

  # Create two tables
  table_expr_list = TableExprList()

  right_table = Table("right_table")
  right_table.add_col(Column("right_table", "right_col", Int))
  table_expr_list.append(right_table)

  left_table = Table("left_table")
  left_table.add_col(Column("left_table", "left_col", Int))
  table_expr_list.append(left_table)

  # Check that each nested_query doesn't have a with clause
  for nested_query in mock_query_gen.create_query(table_expr_list).nested_queries:
    assert nested_query.with_clause is None
Example #10
def test_hive_analytics_cannot_contain_aggs(analytic_func):
    """
  Tests that the HiveProfile does not allow nested aggs inside specific analytic
  functions. The list of analytic functions that cannot contain aggs is defined by
  QueryProfile.test_hive_analytics_cannot_contain_aggs()
  """
    class FakeDefaultQueryProfile(DefaultProfile):
        """
    A DefaultProfile that forces exactly one nested expression in every expression tree
    """
        def __init__(self):
            super(FakeDefaultQueryProfile, self).__init__()
            self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)})

    class FakeHiveQueryProfile(HiveProfile):
        """
    A HiveProfile that forces exactly one nested expression in every expression tree
    """
        def __init__(self):
            super(FakeHiveQueryProfile, self).__init__()
            self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)})

    # Aggregate functions can only return specific types, such as Int, Number, or Float,
    # while certain AnalyticFuncs, such as FirstValue or LastValue, can return
    # non-numeric types. So for simplicity, these tests are only run against the Int
    # return_type.
    if Int in [
            signature.return_type for signature in analytic_func.signatures()
    ]:

        # Generate an agg + analytic func tree using the FakeDefaultQueryProfile and ensure
        # the root func is an analytic_func and that the tree contains an aggregate func. An
        # empty list of funcs is passed into the _create_agg_or_analytic_tree so that no
        # basic funcs are created. We can be sure that the root_func is an analytic_func
        # because agg_funcs cannot contain analytic_funcs.

        qgen = QueryGenerator(FakeDefaultQueryProfile())
        func_tree = qgen._create_agg_or_analytic_tree(
            Int,
            agg_funcs=AGG_FUNCS,
            analytic_funcs=[analytic_func],
            basic_funcs=[])
        assert isinstance(func_tree, analytic_func)
        assert func_tree.contains_agg

        # Do the same for the FakeHiveQueryProfile, but now check that the func_tree has no
        # aggregates.

        qgen = QueryGenerator(FakeHiveQueryProfile())
        func_tree = qgen._create_agg_or_analytic_tree(
            Int,
            agg_funcs=AGG_FUNCS,
            analytic_funcs=[analytic_func],
            basic_funcs=[])
        assert isinstance(func_tree, analytic_func)
        assert not func_tree.contains_agg
Example #11
def test_func_tree_contains_funcs():
    """
  Tests the QueryGenerator.func_tree_contains_funcs() method
  """

    qgen = QueryGenerator(DefaultProfile())

    # Create a simple func_tree with only one function
    and_func = And.create_from_args(Boolean(True), Boolean(True))
    and_func.parent = None
    assert qgen._func_tree_contains_funcs(and_func, [And])
    assert not qgen._func_tree_contains_funcs(and_func, [Or])

    # Create a func_tree that contains one parent, and two children
    equals_func = Equals.create_from_args(Boolean(True), Boolean(True))
    and_func = And.create_from_args(equals_func, equals_func)
    equals_func.parent = and_func
    and_func.parent = None
    assert qgen._func_tree_contains_funcs(equals_func, [And])
    assert qgen._func_tree_contains_funcs(equals_func, [Equals])
    assert not qgen._func_tree_contains_funcs(equals_func, [Or])
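Note that _func_tree_contains_funcs(equals_func, [And]) passes even though And is the
parent of equals_func, so the check evidently follows the parent pointers that the test
wires up rather than only searching the subtree below the given node. A rough sketch of
the upward half of such a walk, under that assumption:

def contains_funcs_upward(func, func_classes):
    # Follow .parent links toward the root, checking each node's class on the way.
    while func is not None:
        if any(isinstance(func, cls) for cls in func_classes):
            return True
        func = func.parent
    return False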
Example #12
  def start(self):
    try:
      self.prepare()
      self.query_generator = QueryGenerator(self.query_profile)
      if NESTED_TYPES_MODE:
        self.common_tables = DbCursor.describe_common_tables(
            [self.test_connection.cursor()])
      else:
        self.common_tables = DbCursor.describe_common_tables(
            [self.test_connection.cursor(), self.ref_connection.cursor()])

      for query_model in self.queries_to_be_executed():
        LOG.info('About to execute query.')
        result_dict = self.run_query(query_model)
        LOG.info('Query Executed successfully.')
        self.num_queries_executed += 1
        if result_dict:
          self.result_list.append(result_dict)
        LOG.info('Time Left: {0}'.format(self.target_stop_time - time()))
        if time() > self.target_stop_time:
          break
      self.stop_time = time()
      self.save_pickle()
      self.generate_report()
      LOG.info('Generated Report')
    except:
      LOG.exception('Unexpected Exception in start')
      raise
    finally:
      self.impala_env.stop_docker()
      LOG.info('Docker Stopped')
      try:
        os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
        LOG.info('Schedule file removed')
      except OSError:
        LOG.info('Unable to remove schedule file.')
Example #13
def test_func_tree_contains_funcs():
  """
  Tests the QueryGenerator.func_tree_contains_funcs() method
  """

  qgen = QueryGenerator(DefaultProfile())

  # Create a simple func_tree with only one function
  and_func = And.create_from_args(Boolean(True), Boolean(True))
  and_func.parent = None
  assert qgen._func_tree_contains_funcs(and_func, [And])
  assert not qgen._func_tree_contains_funcs(and_func, [Or])

  # Create a func_tree that contains one parent, and two children
  equals_func = Equals.create_from_args(Boolean(True), Boolean(True))
  and_func = And.create_from_args(equals_func, equals_func)
  equals_func.parent = and_func
  and_func.parent = None
  assert qgen._func_tree_contains_funcs(equals_func, [And])
  assert qgen._func_tree_contains_funcs(equals_func, [Equals])
  assert not qgen._func_tree_contains_funcs(equals_func, [Or])
Example #14
class Job(object):
  '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it. It can
  execute queries either by generating them based on a provided query profile or by
  extracting queries from an existing report. A report is generated when it finishes
  running.
  '''

  def __init__(self,
      query_profile,
      job_id,
      run_name = 'default',
      time_limit_sec = 24 * 3600,
      git_command = None,
      parent_job = None):
    self.git_hash = ''
    self.job_id = job_id
    self.job_name = run_name
    self.parent_job = parent_job
    self.query_profile = query_profile or (
        ImpalaNestedTypesProfile() if NESTED_TYPES_MODE else DefaultProfile())
    self.ref_connection = None
    self.result_list = []
    self.start_time = time()
    self.stop_time = None
    self.target_stop_time = time() + time_limit_sec
    self.test_connection = None
    self.num_queries_executed = 0
    self.num_queries_returned_correct_data = 0
    self.flatten_dialect = 'POSTGRESQL' if NESTED_TYPES_MODE else None
    self.impala_env = ImpalaDockerEnv(git_command)

  def __getstate__(self):
    '''For pickling'''
    result = {}
    result['job_id'] = self.job_id
    result['job_name'] = self.job_name
    result['parent_job'] = self.parent_job
    result['result_list'] = self.result_list
    result['git_hash'] = self.git_hash
    result['start_time'] = self.start_time
    result['stop_time'] = self.stop_time
    result['num_queries_executed'] = self.num_queries_executed
    result['num_queries_returned_correct_data'] = self.num_queries_returned_correct_data
    return result

  def prepare(self):
    '''Prepares the environment and connects to Postgres and Impala running inside the
    Docker container.
    '''
    LOG.info('Starting Job Preparation')
    self.impala_env.prepare()
    LOG.info('Job Preparation Complete')

    self.ref_connection = PostgresqlConnection(
        user_name=POSTGRES_USER_NAME,
        password=None,
        host_name=self.impala_env.host,
        port=self.impala_env.postgres_port,
        db_name=POSTGRES_DATABASE_NAME)
    LOG.info('Created ref_connection')

    self.start_impala()

    self.git_hash = self.impala_env.get_git_hash()

  def get_stack(self):
    stack_trace = self.impala_env.get_stack()
    LOG.info('Stack Trace: {0}'.format(stack_trace))
    return stack_trace

  def start_impala(self):
    '''Starts impala and creates a connection to it. '''
    self.impala_env.start_impala()

    self.test_connection = ImpalaConnection(
        host_name=self.impala_env.host,
        port=self.impala_env.impala_port,
        user_name=None,
        db_name=DATABASE_NAME)

    self.test_connection.reconnect()
    self.query_result_comparator = QueryResultComparator(
        self.query_profile,
        self.ref_connection,
        self.test_connection,
        query_timeout_seconds=4*60,
        flatten_dialect='POSTGRESQL')
    LOG.info('Created query result comparator')
    LOG.info(str(self.query_result_comparator.__dict__))

  def is_impala_running(self):
    return self.impala_env.is_impala_running()

  def save_pickle(self):
    '''Saves self as pickle. This is normally done when the job finishes running. '''
    with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'w') as f:
      pickle.dump(self, f)
    LOG.info('Saved Completed Job Pickle')

  def queries_to_be_executed(self):
    '''Generator that outputs query models. They are either generated based on the query
    profile, or they are extracted from an existing report.
    '''
    if self.parent_job:
      # If parent job is specified, get the queries from the parent job report
      with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f:
        parent_report = pickle.load(f)
      for error_type in ['stack', 'row_counts', 'mismatch']:
        for query in parent_report.grouped_results[error_type]:
          yield query['model']
    else:
      # If parent job is not specified, generate queries with QueryGenerator
      num_unexpected_errors = 0
      while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
        query = None
        try:
          self.query_generator = QueryGenerator(self.query_profile)
          query = self.query_generator.create_query(self.common_tables)
        except IndexError as e:
          # This is a query generator bug that happens extremely rarely
          LOG.info('Query Generator Choice Problem, {0}'.format(e))
          continue
        except Exception as e:
          LOG.info('Unexpected error in queries_to_be_executed, {0}'.format(e))
          num_unexpected_errors += 1
          if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
            LOG.error('Num Unexpected Errors above threshold')
            raise
          else:
            continue
        yield query

  def generate_report(self):
    '''Generate report and save it into the reports directory. '''
    from report import Report
    rep = Report(self.job_id)
    rep.save_pickle()

  def start(self):
    try:
      self.prepare()
      self.query_generator = QueryGenerator(self.query_profile)
      if NESTED_TYPES_MODE:
        self.common_tables = DbCursor.describe_common_tables(
            [self.test_connection.cursor()])
      else:
        self.common_tables = DbCursor.describe_common_tables(
            [self.test_connection.cursor(), self.ref_connection.cursor()])

      for query_model in self.queries_to_be_executed():
        LOG.info('About to execute query.')
        result_dict = self.run_query(query_model)
        LOG.info('Query Executed successfully.')
        self.num_queries_executed += 1
        if result_dict:
          self.result_list.append(result_dict)
        LOG.info('Time Left: {0}'.format(self.target_stop_time - time()))
        if time() > self.target_stop_time:
          break
      self.stop_time = time()
      self.save_pickle()
      self.generate_report()
      LOG.info('Generated Report')
    except:
      LOG.exception('Unexpected Exception in start')
      raise
    finally:
      self.impala_env.stop_docker()
      LOG.info('Docker Stopped')
      try:
        os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
        LOG.info('Schedule file removed')
      except OSError:
        LOG.info('Unable to remove schedule file.')

  def reproduce_crash(self, query_model):
    '''Check if the given query_model causes a crash. Returns the number of times the
    query had to be run to cause a crash.
    '''
    NUM_TRIES = 5
    self.start_impala()
    for try_num in range(1, NUM_TRIES + 1):
      self.query_result_comparator.compare_query_results(query_model)
      if not self.is_impala_running():
        return try_num

  def run_query(self, query_model):
    '''Runs a single query. '''
    if not self.is_impala_running():
      LOG.info('Impala is not running, starting Impala.')
      self.start_impala()

    def run_query_internal():
      self.comparison_result = self.query_result_comparator.compare_query_results(
          query_model)

    self.comparison_result = None
    internal_thread = Thread(
      target=run_query_internal,
      name='run_query_internal_{0}'.format(self.job_id))
    internal_thread.daemon = True
    internal_thread.start()
    internal_thread.join(timeout=600)
    if internal_thread.is_alive():
      LOG.info('run_query_internal is alive, restarting Impala Environment')
      self.impala_env.stop_docker()
      self.prepare()
      return None
    else:
      LOG.info('run_query_internal is dead as expected')

    comparison_result = self.comparison_result

    if comparison_result.query_timed_out:
      LOG.info('Query Timeout Exception')
      restart_impala = True
    else:
      restart_impala = False

    result_dict = {}

    if self.is_impala_running():
      if comparison_result.error:
        result_dict = self.comparison_result_analysis(comparison_result)
        result_dict['model'] = query_model
      elif comparison_result.query_resulted_in_data:
        self.num_queries_returned_correct_data += 1
    else:
      LOG.info('CRASH OCCURRED')
      result_dict = self.comparison_result_analysis(comparison_result)
      result_dict['model'] = query_model
      result_dict['stack'] = self.get_stack()
      result_dict['num_tries_to_reproduce'] = self.reproduce_crash(query_model)

    if restart_impala:
      self.start_impala()

    return result_dict

  def comparison_result_analysis(self, comparison_result):
    '''Get useful information from the comparison_result. '''
    result_dict = {}
    result_dict['error'] = comparison_result.error
    result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number
    result_dict['mismatch_ref_row'] = comparison_result.ref_row
    result_dict['mismatch_test_row'] = comparison_result.test_row
    result_dict['ref_row_count'] = comparison_result.ref_row_count
    result_dict['ref_sql'] = comparison_result.ref_sql
    result_dict['test_row_count'] = comparison_result.test_row_count
    result_dict['test_sql'] = comparison_result.test_sql
    return result_dict
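A sketch of constructing and running a Job, using only the constructor arguments shown
above; the job_id and run_name values are placeholders:

# Hypothetical invocation: fuzz for two hours against a fresh Docker environment.
job = Job(
    query_profile=DefaultProfile(),
    job_id='job_0001',
    run_name='nightly-fuzz',
    time_limit_sec=2 * 3600,
    git_command=None,
    parent_job=None)
job.start()  # runs queries until the time limit, then saves the pickle and report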
Example #15
  def generate_statement(self, tables, dml_table):
    """
    Return a randomly generated INSERT or UPSERT statement. Note that UPSERTs are very
    similar to INSERTs, which is why this generator handles both.

    tables should be a list of Table objects. A typical source of such a list comes from
    db_connection.DbCursor.describe_common_tables(). This list describes the possible
    "sources" of the INSERT/UPSERT's WITH and FROM/WHERE clauses.

    dml_table is a required Table object. The INSERT/UPSERT will be into this table.
    """
    if not (isinstance(tables, list) and len(tables) > 0 and
            all((isinstance(t, Table) for t in tables))):
      raise Exception('tables must be a non-empty list of Table objects')

    if not isinstance(dml_table, Table):
      raise Exception('dml_table must be a Table')

    self.select_stmt_generator = QueryGenerator(self.profile)

    insert_statement = InsertStatement(execution=StatementExecutionMode.DML_TEST)

    # Choose whether this is a
    #   INSERT/UPSERT INTO table SELECT/VALUES
    # or
    #   INSERT/UPSERT INTO table (col1, col2, ...) SELECT/VALUES
    # If the method returns None, it's the former.
    insert_column_list = self.profile.choose_insert_column_list(dml_table)

    if dml_table.primary_keys:
      # Having primary keys implies the table is a Kudu table, which makes it subject to
      # both INSERTs (with automatic ignoring of primary key duplicates) and UPSERTs.
      conflict_action = self.profile.choose_insert_vs_upsert()
    else:
      conflict_action = InsertClause.CONFLICT_ACTION_DEFAULT
    insert_statement.insert_clause = InsertClause(
        dml_table, column_list=insert_column_list, conflict_action=conflict_action)
    # We still need to internally track the columns we're inserting. Keep in mind None
    # means "all" without an explicit column list. Since we've already created the
    # InsertClause object though, we can fill this in for ourselves.
    if insert_column_list is None:
      insert_column_list = dml_table.cols
    insert_item_data_types = [col.type for col in insert_column_list]

    # Decide whether this is INSERT/UPSERT VALUES or INSERT/UPSERT SELECT
    insert_source_clause = self.profile.choose_insert_source_clause()

    if issubclass(insert_source_clause, Query):
      # Use QueryGenerator()'s public interface to generate the SELECT.
      select_query = self.select_stmt_generator.generate_statement(
          tables, select_item_data_types=insert_item_data_types)
      # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The
      # generator's type system is not nearly sophisticated enough to know how random
      # expressions will be implicitly casted in the databases. This requires less work
      # to implement. IMPALA-4693 considers alternative approaches.
      self._cast_select_items(select_query, insert_column_list)
      insert_statement.with_clause = deepcopy(select_query.with_clause)
      select_query.with_clause = None
      insert_statement.select_query = select_query
    elif issubclass(insert_source_clause, ValuesClause):
      insert_statement.values_clause = self._generate_values_clause(insert_column_list)
    else:
      raise Exception('unsupported INSERT/UPSERT source clause: {0}'.format(
          insert_source_clause))
    return insert_statement
Example #16
class InsertStatementGenerator(object):
  def __init__(self, profile):
    # QueryProfile-like object
    self.profile = profile
    # used to generate SELECT queries for INSERT/UPSERT ... SELECT statements;
    # to ensure state is completely reset, this is created anew with each call to
    # generate_statement()
    self.select_stmt_generator = None

  def generate_statement(self, tables, dml_table):
    """
    Return a randomly generated INSERT or UPSERT statement. Note that UPSERTs are very
    similar to INSERTs, which is why this generator handles both.

    tables should be a list of Table objects. A typical source of such a list comes from
    db_connection.DbCursor.describe_common_tables(). This list describes the possible
    "sources" of the INSERT/UPSERT's WITH and FROM/WHERE clauses.

    dml_table is a required Table object. The INSERT/UPSERT will be into this table.
    """
    if not (isinstance(tables, list) and len(tables) > 0 and
            all((isinstance(t, Table) for t in tables))):
      raise Exception('tables must be a non-empty list of Table objects')

    if not isinstance(dml_table, Table):
      raise Exception('dml_table must be a Table')

    self.select_stmt_generator = QueryGenerator(self.profile)

    insert_statement = InsertStatement(execution=StatementExecutionMode.DML_TEST)

    # Choose whether this is a
    #   INSERT/UPSERT INTO table SELECT/VALUES
    # or
    #   INSERT/UPSERT INTO table (col1, col2, ...) SELECT/VALUES
    # If the method returns None, it's the former.
    insert_column_list = self.profile.choose_insert_column_list(dml_table)

    if dml_table.primary_keys:
      # Having primary keys implies the table is a Kudu table, which makes it subject to
      # both INSERTs (with automatic ignoring of primary key duplicates) and UPSERTs.
      conflict_action = self.profile.choose_insert_vs_upsert()
    else:
      conflict_action = InsertClause.CONFLICT_ACTION_DEFAULT
    insert_statement.insert_clause = InsertClause(
        dml_table, column_list=insert_column_list, conflict_action=conflict_action)
    # We still need to internally track the columns we're inserting. Keep in mind None
    # means "all" without an explicit column list. Since we've already created the
    # InsertClause object though, we can fill this in for ourselves.
    if insert_column_list is None:
      insert_column_list = dml_table.cols
    insert_item_data_types = [col.type for col in insert_column_list]

    # Decide whether this is INSERT/UPSERT VALUES or INSERT/UPSERT SELECT
    insert_source_clause = self.profile.choose_insert_source_clause()

    if issubclass(insert_source_clause, Query):
      # Use QueryGenerator()'s public interface to generate the SELECT.
      select_query = self.select_stmt_generator.generate_statement(
          tables, select_item_data_types=insert_item_data_types)
      # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The
      # generator's type system is not nearly sophisticated enough to know how random
      # expressions will be implicitly casted in the databases. This requires less work
      # to implement. IMPALA-4693 considers alternative approaches.
      self._cast_select_items(select_query, insert_column_list)
      insert_statement.with_clause = deepcopy(select_query.with_clause)
      select_query.with_clause = None
      insert_statement.select_query = select_query
    elif issubclass(insert_source_clause, ValuesClause):
      insert_statement.values_clause = self._generate_values_clause(insert_column_list)
    else:
      raise Exception('unsupported INSERT/UPSERT source clause: {0}'.format(
          insert_source_clause))
    return insert_statement

  def _generate_values_clause(self, columns):
    """
    Return a VALUES clause containing a variable number of rows.

    The values corresponding to primary keys will be non-null constants. Any other
    columns could be null, constants, or function trees that may or may not evaluate to
    null.
    """
    values_rows = []
    for _ in xrange(self.profile.choose_insert_values_row_count()):
      values_row = []
      for col in columns:
        if col.is_primary_key:
          val = self.profile.choose_constant(return_type=col.exact_type, allow_null=False)
        elif 'constant' == self.profile.choose_values_item_expr():
          val = self.profile.choose_constant(return_type=col.exact_type, allow_null=True)
        else:
          func_tree = self.select_stmt_generator.create_func_tree(
              col.type, allow_subquery=False)
          val = self.select_stmt_generator.populate_func_with_vals(func_tree)
          # Only the generic type, not the exact type, of the value will be known. To
          # avoid a lot of failed queries due to precision errors, we cast the val to
          # the exact type of the column. This will still not prevent "out of range"
          # conditions, as we don't try to evaluate the random expressions.
          val = CastFunc(val, col.exact_type)
        values_row.append(val)
      values_rows.append(ValuesRow(values_row))
    return ValuesClause(values_rows)

  def _cast_select_items(self, select_query, column_list):
    """
    For a given Query select_query and a column_list (list of Columns), cast each select
    item in select_query to the exact type of the column.

    A Query may have a UNION, recursively do this down the line.
    """
    for col_idx, select_item in enumerate(select_query.select_clause.items):
      cast_val_expr = CastFunc(select_item.val_expr, column_list[col_idx].exact_type)
      select_item.val_expr = cast_val_expr
    if select_query.union_clause:
      self._cast_select_items(select_query.union_clause.query, column_list)
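A usage sketch for the class above; the tables list and the assumption that SqlWriter
can render an InsertStatement mirror the other examples on this page:

# Hypothetical: generate one INSERT/UPSERT targeting the first common table.
gen = InsertStatementGenerator(DefaultProfile())
insert_statement = gen.generate_statement(tables, dml_table=tables[0])
sql = SqlWriter.create().write_query(insert_statement)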
Example #17
    def search(self, number_of_test_queries, stop_on_result_mismatch,
               stop_on_crash):
        if exists(self.query_shelve_path):
            # Ensure a clean shelve will be created
            remove(self.query_shelve_path)

        start_time = time()
        impala_sql_writer = SqlWriter.create(dialect=IMPALA)
        reference_sql_writer = SqlWriter.create(
            dialect=self.reference_connection.db_type)
        query_result_comparator = QueryResultComparator(
            self.impala_connection, self.reference_connection)
        query_generator = QueryGenerator()
        query_count = 0
        queries_resulted_in_data_count = 0
        mismatch_count = 0
        query_timeout_count = 0
        known_error_count = 0
        impala_crash_count = 0
        last_error = None
        repeat_error_count = 0
        with open(self.query_log_path, 'w') as impala_query_log:
            impala_query_log.write('--\n' '-- Starting new run\n' '--\n')
            while number_of_test_queries > query_count:
                query = query_generator.create_query(self.common_tables)
                impala_sql = impala_sql_writer.write_query(query)
                if 'FULL OUTER JOIN' in impala_sql and self.reference_connection.db_type == MYSQL:
                    # Not supported by MySQL
                    continue

                query_count += 1
                LOG.info('Running query #%s', query_count)
                impala_query_log.write(impala_sql + ';\n')
                result = query_result_comparator.compare_query_results(query)
                if result.query_resulted_in_data:
                    queries_resulted_in_data_count += 1
                if result.error:
                    # TODO: These first two come from psycopg2, the postgres driver. Maybe we should
                    #       try a different driver? Or maybe the usage of the driver isn't correct.
                    #       Anyhow ignore these failures.
                    if 'division by zero' in result.error \
                        or 'out of range' in result.error \
                        or 'Too much data' in result.error:
                        LOG.debug('Ignoring error: %s', result.error)
                        query_count -= 1
                        continue

                    if result.is_known_error:
                        known_error_count += 1
                    elif result.query_timed_out:
                        query_timeout_count += 1
                    else:
                        mismatch_count += 1
                        with closing(open_shelve(
                                self.query_shelve_path)) as query_shelve:
                            query_shelve[str(query_count)] = query

                    print('---Impala Query---\n')
                    print(
                        impala_sql_writer.write_query(query, pretty=True) +
                        '\n')
                    print('---Reference Query---\n')
                    print(
                        reference_sql_writer.write_query(query, pretty=True) +
                        '\n')
                    print('---Error---\n')
                    print(result.error + '\n')
                    print('------\n')

                    if 'Could not connect' in result.error \
                        or "Couldn't open transport for" in result.error:
                        # if stop_on_crash:
                        #   break
                        # Assume Impala crashed and try restarting
                        impala_crash_count += 1
                        LOG.info('Restarting Impala')
                        call([
                            join(getenv('IMPALA_HOME'),
                                 'bin/start-impala-cluster.py'),
                            '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")
                        ])
                        self.impala_connection.reconnect()
                        query_result_comparator.impala_cursor = self.impala_connection.create_cursor(
                        )
                        result = query_result_comparator.compare_query_results(
                            query)
                        if result.error:
                            LOG.info('Restarting Impala')
                            call([
                                join(getenv('IMPALA_HOME'),
                                     'bin/start-impala-cluster.py'),
                                '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")
                            ])
                            self.impala_connection.reconnect()
                            query_result_comparator.impala_cursor = self.impala_connection.create_cursor(
                            )
                        else:
                            break

                    if stop_on_result_mismatch and \
                        not (result.is_known_error or result.query_timed_out):
                        break

                    if last_error == result.error \
                        and not (result.is_known_error or result.query_timed_out):
                        repeat_error_count += 1
                        if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT:
                            break
                    else:
                        last_error = result.error
                        repeat_error_count = 0
                else:
                    if result.query_resulted_in_data:
                        LOG.info('Results matched (%s rows)',
                                 result.impala_row_count)
                    else:
                        LOG.info('Query did not produce meaningful data')
                    last_error = None
                    repeat_error_count = 0

            return SearchResults(query_count, queries_resulted_in_data_count,
                                 mismatch_count, query_timeout_count,
                                 known_error_count, impala_crash_count,
                                 time() - start_time)
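A sketch of invoking this method, assuming the surrounding searcher object and its
connections have already been prepared:

# Hypothetical: run 100 generated queries, stopping early only on a crash.
results = searcher.search(
    number_of_test_queries=100,
    stop_on_result_mismatch=False,
    stop_on_crash=True)
# results is the SearchResults summary constructed at the end of search().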
Example #18
  def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash):
    '''Returns an instance of SearchResults, which is a summary report. This method
       oversees the generation, execution, and comparison of queries.

      number_of_test_queries should be an integer indicating the maximum number of queries
      to generate and execute.
    '''
    start_time = time()
    query_result_comparator = QueryResultComparator(
        self.ref_connection, self.test_connection)
    query_generator = QueryGenerator(self.query_profile)
    query_count = 0
    queries_resulted_in_data_count = 0
    mismatch_count = 0
    query_timeout_count = 0
    known_error_count = 0
    test_crash_count = 0
    last_error = None
    repeat_error_count = 0
    while number_of_test_queries > query_count:
      query = query_generator.create_query(self.common_tables)
      query.execution = self.query_profile.get_query_execution()
      query_count += 1
      LOG.info('Running query #%s', query_count)
      result = query_result_comparator.compare_query_results(query)
      if result.query_resulted_in_data:
        queries_resulted_in_data_count += 1
      if isinstance(result.exception, DataLimitExceeded) \
          or isinstance(result.exception, TypeOverflow):
        continue
      if result.error:
        # TODO: These first two come from psycopg2, the postgres driver. Maybe we should
        #       try a different driver? Or maybe the usage of the driver isn't correct.
        #       Anyhow ignore these failures.
        if 'division by zero' in result.error \
            or 'out of range' in result.error:
          LOG.debug('Ignoring error: %s', result.error)
          query_count -= 1
          continue

        if result.is_known_error:
          known_error_count += 1
        elif result.query_timed_out:
          query_timeout_count += 1
        else:
          mismatch_count += 1

        print('---Test Query---\n')
        print(result.test_sql + '\n')
        print('---Reference Query---\n')
        print(result.ref_sql + '\n')
        print('---Error---\n')
        print(result.error + '\n')
        print('------\n')

        if 'Could not connect' in result.error \
            or "Couldn't open transport for" in result.error:
          if stop_on_crash:
            break
          # Assume Impala crashed and try restarting
          test_crash_count += 1
          LOG.info('Restarting Impala')
          call([join_path(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'),
                          '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")])
          self.test_connection.reconnect()
          query_result_comparator.test_cursor = self.test_connection.create_cursor()
          result = query_result_comparator.compare_query_results(query)
          if result.error:
            LOG.info('Restarting Impala')
            call([join_path(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'),
                            '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")])
            self.test_connection.reconnect()
            query_result_comparator.test_cursor = self.test_connection.create_cursor()
          else:
            break

        if stop_on_result_mismatch and \
            not (result.is_known_error or result.query_timed_out):
          break

        if last_error == result.error \
            and not (result.is_known_error or result.query_timed_out):
          repeat_error_count += 1
          if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT:
            break
        else:
          last_error = result.error
          repeat_error_count = 0
      else:
        if result.query_resulted_in_data:
          LOG.info('Results matched (%s rows)', result.test_row_count)
        else:
          LOG.info('Query did not produce meaningful data')
        last_error = None
        repeat_error_count = 0

    return SearchResults(
        query_count,
        queries_resulted_in_data_count,
        mismatch_count,
        query_timeout_count,
        known_error_count,
        test_crash_count,
        time() - start_time)
Example #19
class Job(object):
    '''Represents a Query Generator Job. One ImpalaDockerEnv is associated with it. It can
  execute queries either by generating them based on a provided query profile or by
  extracting queries from an existing report. A report is generated when it finishes
  running.
  '''
    def __init__(self,
                 query_profile,
                 job_id,
                 run_name='default',
                 time_limit_sec=24 * 3600,
                 git_command=None,
                 parent_job=None):
        self.git_hash = ''
        self.job_id = job_id
        self.job_name = run_name
        self.parent_job = parent_job
        self.query_profile = query_profile or (ImpalaNestedTypesProfile()
                                               if NESTED_TYPES_MODE else
                                               DefaultProfile())
        self.ref_connection = None
        self.result_list = []
        self.start_time = time()
        self.stop_time = None
        self.target_stop_time = time() + time_limit_sec
        self.test_connection = None
        self.num_queries_executed = 0
        self.num_queries_returned_correct_data = 0
        self.flatten_dialect = 'POSTGRESQL' if NESTED_TYPES_MODE else None
        self.impala_env = ImpalaDockerEnv(git_command)

    def __getstate__(self):
        '''For pickling'''
        result = {}
        result['job_id'] = self.job_id
        result['job_name'] = self.job_name
        result['parent_job'] = self.parent_job
        result['result_list'] = self.result_list
        result['git_hash'] = self.git_hash
        result['start_time'] = self.start_time
        result['stop_time'] = self.stop_time
        result['num_queries_executed'] = self.num_queries_executed
        result[
            'num_queries_returned_correct_data'] = self.num_queries_returned_correct_data
        return result

    def prepare(self):
        '''Prepares the environment and connects to Postgres and Impala running inside the
    Docker container.
    '''
        LOG.info('Starting Job Preparation')
        self.impala_env.prepare()
        LOG.info('Job Preparation Complete')

        self.ref_connection = PostgresqlConnection(
            user_name=POSTGRES_USER_NAME,
            password=None,
            host_name=self.impala_env.host,
            port=self.impala_env.postgres_port,
            db_name=POSTGRES_DATABASE_NAME)
        LOG.info('Created ref_connection')

        self.start_impala()

        self.git_hash = self.impala_env.get_git_hash()

    def get_stack(self):
        stack_trace = self.impala_env.get_stack()
        LOG.info('Stack Trace: {0}'.format(stack_trace))
        return stack_trace

    def start_impala(self):
        '''Starts impala and creates a connection to it. '''
        self.impala_env.start_impala()

        self.test_connection = ImpalaConnection(
            host_name=self.impala_env.host,
            port=self.impala_env.impala_port,
            user_name=None,
            db_name=DATABASE_NAME)

        self.test_connection.reconnect()
        self.query_result_comparator = QueryResultComparator(
            self.query_profile,
            self.ref_connection,
            self.test_connection,
            query_timeout_seconds=4 * 60,
            flatten_dialect='POSTGRESQL')
        LOG.info('Created query result comparator')
        LOG.info(str(self.query_result_comparator.__dict__))

    def is_impala_running(self):
        return self.impala_env.is_impala_running()

    def save_pickle(self):
        '''Saves self as pickle. This is normally done when the job finishes running. '''
        with open(join_path(PATH_TO_FINISHED_JOBS, self.job_id), 'w') as f:
            pickle.dump(self, f)
        LOG.info('Saved Completed Job Pickle')

    def queries_to_be_executed(self):
        '''Generator that outputs query models. They are either generated based on the query
    profile, or they are extracted from an existing report.
    '''
        if self.parent_job:
            # If parent job is specified, get the queries from the parent job report
            with open(join_path(PATH_TO_REPORTS, self.parent_job), 'r') as f:
                parent_report = pickle.load(f)
            for error_type in ['stack', 'row_counts', 'mismatch']:
                for query in parent_report.grouped_results[error_type]:
                    yield query['model']
        else:
            # If parent job is not specified, generate queries with QueryGenerator
            num_unexpected_errors = 0
            while num_unexpected_errors < NUM_UNEXPECTED_ERRORS_THRESHOLD:
                query = None
                try:
                    self.query_generator = QueryGenerator(self.query_profile)
                    query = self.query_generator.create_query(
                        self.common_tables)
                except IndexError as e:
                    # This is a query generator bug that happens extremely rarely
                    LOG.info('Query Generator Choice Problem, {0}'.format(e))
                    continue
                except Exception as e:
                    LOG.info('Unexpected error in queries_to_be_executed, {0}'.
                             format(e))
                    num_unexpected_errors += 1
                    if num_unexpected_errors > NUM_UNEXPECTED_ERRORS_THRESHOLD:
                        LOG.error('Num Unexpected Errors above threshold')
                        raise
                    else:
                        continue
                yield query

    def generate_report(self):
        '''Generate report and save it into the reports directory. '''
        from report import Report
        rep = Report(self.job_id)
        rep.save_pickle()

    def start(self):
        try:
            self.prepare()
            self.query_generator = QueryGenerator(self.query_profile)
            if NESTED_TYPES_MODE:
                self.common_tables = DbCursor.describe_common_tables(
                    [self.test_connection.cursor()])
            else:
                self.common_tables = DbCursor.describe_common_tables([
                    self.test_connection.cursor(),
                    self.ref_connection.cursor()
                ])

            for query_model in self.queries_to_be_executed():
                LOG.info('About to execute query.')
                result_dict = self.run_query(query_model)
                LOG.info('Query Executed successfully.')
                self.num_queries_executed += 1
                if result_dict:
                    self.result_list.append(result_dict)
                LOG.info('Time Left: {0}'.format(self.target_stop_time -
                                                 time()))
                if time() > self.target_stop_time:
                    break
            self.stop_time = time()
            self.save_pickle()
            self.generate_report()
            LOG.info('Generated Report')
        except:
            LOG.exception('Unexpected Exception in start')
            raise
        finally:
            self.impala_env.stop_docker()
            LOG.info('Docker Stopped')
            try:
                os.remove(join_path(PATH_TO_SCHEDULE, self.job_id))
                LOG.info('Schedule file removed')
            except OSError:
                LOG.info('Unable to remove schedule file.')

    def reproduce_crash(self, query_model):
        '''Check if the given query_model causes a crash. Returns the number of times the
    query had to be run to cause a crash.
    '''
        NUM_TRIES = 5
        self.start_impala()
        for try_num in range(1, NUM_TRIES + 1):
            self.query_result_comparator.compare_query_results(query_model)
            if not self.is_impala_running():
                return try_num

    def run_query(self, query_model):
        '''Runs a single query. '''
        if not self.is_impala_running():
            LOG.info('Impala is not running, starting Impala.')
            self.start_impala()

        def run_query_internal():
            self.comparison_result = self.query_result_comparator.compare_query_results(
                query_model)

        self.comparison_result = None
        internal_thread = Thread(target=run_query_internal,
                                 name='run_query_internal_{0}'.format(
                                     self.job_id))
        internal_thread.daemon = True
        internal_thread.start()
        internal_thread.join(timeout=600)
        if internal_thread.is_alive():
            LOG.info(
                'run_query_internal is alive, restarting Impala Environment')
            self.impala_env.stop_docker()
            self.prepare()
            return None
        else:
            LOG.info('run_query_internal is dead as expected')

        comparison_result = self.comparison_result

        if comparison_result.query_timed_out:
            LOG.info('Query Timeout Exception')
            restart_impala = True
        else:
            restart_impala = False

        result_dict = {}

        if self.is_impala_running():
            if comparison_result.error:
                result_dict = self.comparison_result_analysis(
                    comparison_result)
                result_dict['model'] = query_model
            elif comparison_result.query_resulted_in_data:
                self.num_queries_returned_correct_data += 1
        else:
            LOG.info('CRASH OCCURRED')
            result_dict = self.comparison_result_analysis(comparison_result)
            result_dict['model'] = query_model
            result_dict['stack'] = self.get_stack()
            result_dict['num_tries_to_reproduce'] = self.reproduce_crash(
                query_model)

        if restart_impala:
            self.start_impala()

        return result_dict

    def comparison_result_analysis(self, comparison_result):
        '''Get useful information from the comparison_result. '''
        result_dict = {}
        result_dict['error'] = comparison_result.error
        result_dict['mismatch_col'] = comparison_result.mismatch_at_col_number
        result_dict['mismatch_ref_row'] = comparison_result.ref_row
        result_dict['mismatch_test_row'] = comparison_result.test_row
        result_dict['ref_row_count'] = comparison_result.ref_row_count
        result_dict['ref_sql'] = comparison_result.ref_sql
        result_dict['test_row_count'] = comparison_result.test_row_count
        result_dict['test_sql'] = comparison_result.test_sql
        return result_dict

    def search(self, number_of_test_queries, stop_on_result_mismatch,
               stop_on_crash, query_timeout_seconds):
        '''Returns an instance of SearchResults, which is a summary report. This method
    oversees the generation, execution, and comparison of queries.

    number_of_test_queries should be an integer indicating the maximum number of queries
    to generate and execute.
    '''
        start_time = time()
        query_result_comparator = QueryResultComparator(
            self.query_profile, self.ref_connection, self.test_connection,
            query_timeout_seconds)
        query_generator = QueryGenerator(self.query_profile)
        query_count = 0
        queries_resulted_in_data_count = 0
        mismatch_count = 0
        query_timeout_count = 0
        known_error_count = 0
        test_crash_count = 0
        last_error = None
        repeat_error_count = 0
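        # last_error/repeat_error_count track consecutive identical errors so a
        # persistently broken setup aborts early (see ABORT_ON_REPEAT_ERROR_COUNT below).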
        while number_of_test_queries > query_count:
            query = query_generator.create_query(self.common_tables)
            query.execution = self.query_profile.get_query_execution()
            query_count += 1
            LOG.info('Running query #%s', query_count)
            result = query_result_comparator.compare_query_results(query)
            if result.query_resulted_in_data:
                queries_resulted_in_data_count += 1
            if isinstance(result.exception,
                          (DataLimitExceeded, TypeOverflow)):
                continue
            if result.error:
                # TODO: These first two come from psycopg2, the postgres driver. Maybe we should
                #       try a different driver? Or maybe the usage of the driver isn't correct.
                #       Anyhow ignore these failures.
                if 'division by zero' in result.error \
                    or 'out of range' in result.error:
                    LOG.debug('Ignoring error: %s', result.error)
                    query_count -= 1
                    continue

                if result.is_known_error:
                    known_error_count += 1
                elif result.query_timed_out:
                    query_timeout_count += 1
                else:
                    mismatch_count += 1

                print('---Test Query---\n')
                print(result.test_sql + '\n')
                print('---Reference Query---\n')
                print(result.ref_sql + '\n')
                print('---Error---\n')
                print(result.error + '\n')
                print('------\n')

                if 'Could not connect' in result.error \
                    or "Couldn't open transport for" in result.error:
                    if stop_on_crash:
                        break
                    # Assume Impala crashed and try restarting
                    test_crash_count += 1
                    LOG.info('Restarting Impala')
                    call([
                        join_path(getenv('IMPALA_HOME'),
                                  'bin/start-impala-cluster.py'),
                        '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")
                    ])
                    self.test_connection.reconnect()
                    query_result_comparator.test_cursor = \
                        self.test_connection.create_cursor()
                    result = query_result_comparator.compare_query_results(
                        query)
                    if result.error:
                        LOG.info('Restarting Impala')
                        call([
                            join_path(getenv('IMPALA_HOME'),
                                      'bin/start-impala-cluster.py'),
                            '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")
                        ])
                        self.test_connection.reconnect()
                        query_result_comparator.test_cursor = \
                            self.test_connection.create_cursor()
                    else:
                        break

                if stop_on_result_mismatch and \
                    not (result.is_known_error or result.query_timed_out):
                    break

                if last_error == result.error \
                    and not (result.is_known_error or result.query_timed_out):
                    repeat_error_count += 1
                    if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT:
                        break
                else:
                    last_error = result.error
                    repeat_error_count = 0
            else:
                if result.query_resulted_in_data:
                    LOG.info('Results matched (%s rows)',
                             result.test_row_count)
                else:
                    LOG.info('Query did not produce meaningful data')
                last_error = None
                repeat_error_count = 0

        return SearchResults(query_count, queries_resulted_in_data_count,
                             mismatch_count, query_timeout_count,
                             known_error_count, test_crash_count,
                             time() - start_time)
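
run_query() above relies on a watchdog pattern: do the work on a daemon thread, join()
with a timeout, and treat a still-alive thread as a hung query. A minimal, self-contained
sketch of that pattern, assuming nothing from the harness (run_with_timeout and slow_task
are illustrative names, not part of the original code):

from threading import Thread
from time import sleep

def run_with_timeout(target, timeout_seconds):
    '''Run target() on a daemon thread; return True if it finished in time.'''
    worker = Thread(target=target)
    # A daemon thread will not keep the process alive if the task never returns.
    worker.daemon = True
    worker.start()
    worker.join(timeout=timeout_seconds)
    return not worker.is_alive()

def slow_task():
    sleep(2)

print(run_with_timeout(slow_task, timeout_seconds=1))  # False: treated as hung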
Example #21
class InsertStatementGenerator(object):
    def __init__(self, profile):
        # QueryProfile-like object
        self.profile = profile
        # used to generate SELECT queries for INSERT ... SELECT statements;
        # to ensure state is completely reset, this is created anew with each call to
        # generate_statement()
        self.select_stmt_generator = None

    def generate_statement(self, tables, dml_table):
        """
    Return a randomly generated INSERT statement.

    tables should be a list of Table objects. A typical source of such a list comes from
    db_connection.DbCursor.describe_common_tables(). This list describes the possible
    "sources" of the INSERT's WITH and FROM/WHERE clauses.

    dml_table is a required Table object. The INSERT will be into this table.
    """
        if not (isinstance(tables, list) and tables and
                all(isinstance(t, Table) for t in tables)):
            raise Exception('tables must be a non-empty list of Table objects')

        if not isinstance(dml_table, Table):
            raise Exception('dml_table must be a Table')

        self.select_stmt_generator = QueryGenerator(self.profile)

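        # Randomly generated rows are likely to collide on primary keys, so for keyed
        # tables generate an INSERT that ignores conflicting rows instead of failing.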
        if dml_table.primary_keys:
            insert_statement = InsertStatement(
                conflict_action=InsertStatement.CONFLICT_ACTION_IGNORE)
        else:
            insert_statement = InsertStatement(
                conflict_action=InsertStatement.CONFLICT_ACTION_DEFAULT)

        insert_statement.execution = StatementExecutionMode.DML_TEST

        # Choose whether this is a
        #   INSERT INTO table SELECT/VALUES
        # or
        #   INSERT INTO table (col1, col2, ...) SELECT/VALUES
        # If the method returns None, it's the former.
        insert_column_list = self.profile.choose_insert_column_list(dml_table)
        insert_statement.insert_clause = InsertClause(
            dml_table, column_list=insert_column_list)
        # We still need to internally track the columns we're inserting. Keep in mind None
        # means "all" without an explicit column list. Since we've already created the
        # InsertClause object though, we can fill this in for ourselves.
        if insert_column_list is None:
            insert_column_list = dml_table.cols
        insert_item_data_types = [col.type for col in insert_column_list]

        # Decide whether this is INSERT VALUES or INSERT SELECT
        insert_source_clause = self.profile.choose_insert_source_clause()

        if issubclass(insert_source_clause, Query):
            # Use QueryGenerator()'s public interface to generate the SELECT.
            select_query = self.select_stmt_generator.generate_statement(
                tables, select_item_data_types=insert_item_data_types)
            # To avoid many loss-of-precision errors, explicitly cast the SelectItems. The
            # generator's type system is nowhere near sophisticated enough to know how
            # random expressions will be implicitly cast in the databases. This requires
            # less work to implement. IMPALA-4693 considers alternative approaches.
            self._cast_select_items(select_query, insert_column_list)
            insert_statement.with_clause = deepcopy(select_query.with_clause)
            select_query.with_clause = None
            insert_statement.select_query = select_query
        elif issubclass(insert_source_clause, ValuesClause):
            insert_statement.values_clause = self._generate_values_clause(
                insert_column_list)
        else:
            raise Exception('unsupported INSERT source clause: {0}'.format(
                insert_source_clause))
        return insert_statement

    def _generate_values_clause(self, columns):
        """
    Return a VALUES clause containing a variable number of rows.

    The values corresponding to primary keys will be non-null constants. Any other
    columns could be null, constants, or function trees that may or may not evaluate to
    null.
    """
        values_rows = []
        for _ in xrange(self.profile.choose_insert_values_row_count()):
            values_row = []
            for col in columns:
                if col.is_primary_key:
                    val = self.profile.choose_constant(
                        return_type=col.exact_type, allow_null=False)
                elif 'constant' == self.profile.choose_values_item_expr():
                    val = self.profile.choose_constant(
                        return_type=col.exact_type, allow_null=True)
                else:
                    func_tree = self.select_stmt_generator.create_func_tree(
                        col.type, allow_subquery=False)
                    val = self.select_stmt_generator.populate_func_with_vals(
                        func_tree)
                    # Only the generic type, not the exact type, of the value will be known. To
                    # avoid a lot of failed queries due to precision errors, we cast the val to
                    # the exact type of the column. This will still not prevent "out of range"
                    # conditions, as we don't try to evaluate the random expressions.
                    val = CastFunc(val, col.exact_type)
                values_row.append(val)
            values_rows.append(ValuesRow(values_row))
        return ValuesClause(values_rows)

    def _cast_select_items(self, select_query, column_list):
        """
    For a given Query select_query and a column_list (list of Columns), cast each select
    item in select_query to the exact type of the column.

    A Query may have a UNION, recursively do this down the line.
    """
        for col_idx, select_item in enumerate(
                select_query.select_clause.items):
            cast_val_expr = CastFunc(select_item.val_expr,
                                     column_list[col_idx].exact_type)
            select_item.val_expr = cast_val_expr
        if select_query.union_clause:
            self._cast_select_items(select_query.union_clause.query,
                                    column_list)
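
_cast_select_items() above recurses down a chain of UNION-ed queries, rewriting every
select item along the way. A toy, self-contained sketch of the same traversal; ToyQuery
and wrap_all are simplified stand-ins for illustration, not the harness's real model
classes:

class ToyQuery(object):
    def __init__(self, items, union=None):
        self.items = items  # the select items of this query block
        self.union = union  # the next ToyQuery in the UNION chain, or None

def wrap_all(query, wrap):
    '''Apply wrap() to every item of every query block in the UNION chain.'''
    query.items = [wrap(item) for item in query.items]
    if query.union:
        wrap_all(query.union, wrap)

q = ToyQuery(['a'], union=ToyQuery(['b']))
wrap_all(q, lambda item: 'CAST({0} AS INT)'.format(item))
print('{0} {1}'.format(q.items, q.union.items))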
Example #23
  def search(self, number_of_test_queries, stop_on_result_mismatch, stop_on_crash):
    if exists(self.query_shelve_path):
      # Ensure a clean shelve will be created
      remove(self.query_shelve_path)

    start_time = time()
    impala_sql_writer = SqlWriter.create(dialect=IMPALA)
    reference_sql_writer = SqlWriter.create(
        dialect=self.reference_connection.db_type)
    query_result_comparator = QueryResultComparator(
        self.impala_connection, self.reference_connection)
    query_generator = QueryGenerator()
    query_count = 0
    queries_resulted_in_data_count = 0
    mismatch_count = 0
    query_timeout_count = 0
    known_error_count = 0
    impala_crash_count = 0
    last_error = None
    repeat_error_count = 0
    with open(self.query_log_path, 'w') as impala_query_log:
      impala_query_log.write(
         '--\n'
         '-- Starting new run\n'
         '--\n')
      while number_of_test_queries > query_count:
        query = query_generator.create_query(self.common_tables)
        impala_sql = impala_sql_writer.write_query(query)
        if 'FULL OUTER JOIN' in impala_sql and self.reference_connection.db_type == MYSQL:
          # Not supported by MySQL
          continue

        query_count += 1
        LOG.info('Running query #%s', query_count)
        impala_query_log.write(impala_sql + ';\n')
        result = query_result_comparator.compare_query_results(query)
        if result.query_resulted_in_data:
          queries_resulted_in_data_count += 1
        if result.error:
          # TODO: These first two come from psycopg2, the postgres driver. Maybe we should
          #       try a different driver? Or maybe the usage of the driver isn't correct.
          #       Anyhow ignore these failures.
          if 'division by zero' in result.error \
              or 'out of range' in result.error \
              or 'Too much data' in result.error:
            LOG.debug('Ignoring error: %s', result.error)
            query_count -= 1
            continue

          if result.is_known_error:
            known_error_count += 1
          elif result.query_timed_out:
            query_timeout_count += 1
          else:
            mismatch_count += 1
            with closing(open_shelve(self.query_shelve_path)) as query_shelve:
              query_shelve[str(query_count)] = query

          print('---Impala Query---\n')
          print(impala_sql_writer.write_query(query, pretty=True) + '\n')
          print('---Reference Query---\n')
          print(reference_sql_writer.write_query(query, pretty=True) + '\n')
          print('---Error---\n')
          print(result.error + '\n')
          print('------\n')

          if 'Could not connect' in result.error \
              or "Couldn't open transport for" in result.error:
            if stop_on_crash:
              break
            # Assume Impala crashed and try restarting
            impala_crash_count += 1
            LOG.info('Restarting Impala')
            call([join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'),
                            '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")])
            self.impala_connection.reconnect()
            query_result_comparator.impala_cursor = self.impala_connection.create_cursor()
            result = query_result_comparator.compare_query_results(query)
            if result.error:
              LOG.info('Restarting Impala')
              call([join(getenv('IMPALA_HOME'), 'bin/start-impala-cluster.py'),
                              '--log_dir=%s' % getenv('LOG_DIR', "/tmp/")])
              self.impala_connection.reconnect()
              query_result_comparator.impala_cursor = self.impala_connection.create_cursor()
            else:
              break

          if stop_on_result_mismatch and \
              not (result.is_known_error or result.query_timed_out):
            break

          if last_error == result.error \
              and not (result.is_known_error or result.query_timed_out):
            repeat_error_count += 1
            if repeat_error_count == self.ABORT_ON_REPEAT_ERROR_COUNT:
              break
          else:
            last_error = result.error
            repeat_error_count = 0
        else:
          if result.query_resulted_in_data:
            LOG.info('Results matched (%s rows)', result.impala_row_count)
          else:
            LOG.info('Query did not produce meaningful data')
          last_error = None
          repeat_error_count = 0

      return SearchResults(
          query_count,
          queries_resulted_in_data_count,
          mismatch_count,
          query_timeout_count,
          known_error_count,
          impala_crash_count,
          time() - start_time)
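
Both search() variants share the same repeat-error guard: the run aborts once the same
unexplained error shows up ABORT_ON_REPEAT_ERROR_COUNT times in a row. A minimal,
self-contained sketch of that bookkeeping (should_abort and the threshold value are
illustrative, not taken from the original code):

def should_abort(errors, threshold=2):
    '''Return True once the same non-None error repeats `threshold` times in a row.'''
    last_error, repeat_count = None, 0
    for error in errors:
        if error is not None and error == last_error:
            repeat_count += 1
            if repeat_count == threshold:
                return True
        else:
            last_error, repeat_count = error, 0
    return False

print(should_abort(['boom', 'boom', 'boom']))  # True: the same error three times in a row
print(should_abort(['boom', None, 'boom']))    # False: the success in between resets it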