def hash_result_impl():
  """Fetch all rows for the in-flight query on 'cursor', folding each row into
  current_thread().result and mirroring rows to a result log file for debugging.

  Intended to run on a separate fetch thread; reads the closure variables 'query',
  'query_id', 'cursor' and 'should_continue'. Errors are stored in
  current_thread().error rather than raised, so the joining thread can inspect them.
  """
  result_log_filename = None
  try:
    with self._open_result_file(query, query_id) as result_log:
      result_log.write(query.sql)
      result_log.write("\n")
      result_log_filename = result_log.name
      current_thread().result = 1
      # 'should_continue' is cleared by the spawning thread on timeout so this
      # thread stops fetching instead of running forever.
      while should_continue.value:
        LOG.debug(
            "Fetching result for query with id %s",
            op_handle_to_query_id(
                cursor._last_operation.handle if cursor._last_operation else None))
        rows = cursor.fetchmany(self.BATCH_SIZE)
        if not rows:
          LOG.debug(
              "No more results for query with id %s",
              op_handle_to_query_id(
                  cursor._last_operation.handle if cursor._last_operation else None))
          return
        for row in rows:
          current_thread().result = _add_row_to_hash(row, current_thread().result)
          if result_log:
            result_log.write(",".join([str(val) for val in row]))
            result_log.write("\thash=")
            result_log.write(str(current_thread().result))
            result_log.write("\n")
  except Exception as e:
    # Surface the failure to the joining thread via the thread object.
    current_thread().error = e
  finally:
    # Only retain result files with incorrect results: delete the file when the
    # query finished without error AND produced the expected hash.
    # BUG FIX: was 'current_thread().error is not None', which retained files for
    # correct results and deleted them for errored runs - the opposite of the
    # stated intent (and of the behavior of the non-threaded rewrite).
    if (result_log_filename is not None and
        current_thread().error is None and
        current_thread().result == query.result_hash):
      os.remove(result_log_filename)
def hash_result_impl():
  """Fetch all rows for the in-flight query on 'cursor', hashing each value into
  current_thread().result and writing values to a result log file for debugging.

  Intended to run on a separate fetch thread; reads the closure variables 'query',
  'query_id', 'cursor' and 'should_continue'. Errors are stored in
  current_thread().error rather than raised, so the joining thread can inspect them.
  """
  result_log = None
  try:
    file_name = '_'.join([query.logical_query_id, query_id.replace(":", "_")])
    if query.result_hash is None:
      # First run of this query: the file records the "golden" result.
      file_name += "_initial"
    file_name += "_results.txt"
    result_log = open(
        os.path.join(self.results_dir, RESULT_HASHES_DIR, file_name), "w")
    result_log.write(query.sql)
    result_log.write("\n")
    current_thread().result = 1
    # 'should_continue' is cleared by the spawning thread on timeout so this
    # thread stops fetching instead of running forever.
    while should_continue.value:
      LOG.debug(
          "Fetching result for query with id %s",
          op_handle_to_query_id(
              cursor._last_operation.handle if cursor._last_operation else None))
      rows = cursor.fetchmany(self.BATCH_SIZE)
      if not rows:
        LOG.debug(
            "No more results for query with id %s",
            op_handle_to_query_id(
                cursor._last_operation.handle if cursor._last_operation else None))
        return
      for row in rows:
        for idx, val in enumerate(row):
          if val is None:
            # The hash() of None can change from run to run since it's based on
            # a memory address. A chosen value will be used instead.
            val = 38463209
          elif isinstance(val, float):
            # Floats returned by Impala may not be deterministic, the ending
            # insignificant digits may differ. Only the first 6 digits will be
            # used after rounding.
            sval = "%f" % val
            dot_idx = sval.find(".")
            val = round(val, 6 - dot_idx)
          current_thread().result += (idx + 1) * hash(val)
          # Modulo the result to keep it "small" otherwise the math ops can be
          # slow since python does infinite precision math.
          current_thread().result %= maxint
          if result_log:
            result_log.write(str(val))
            result_log.write("\t")
            result_log.write(str(current_thread().result))
            result_log.write("\n")
  except Exception as e:
    # Surface the failure to the joining thread via the thread object.
    current_thread().error = e
  finally:
    if result_log is not None:
      result_log.close()
      # Only retain the result file when the result was incorrect.
      # BUG FIX: was 'current_thread().error is not None', which retained files
      # for correct results and deleted them for errored runs - inverted relative
      # to the enclosing method's docstring ("if the result is not as expected a
      # log file will be left").
      if (current_thread().error is None and
          current_thread().result == query.result_hash):
        os.remove(result_log.name)
def _fetch_and_hash_result(self, cursor, timeout_unix_time, query):
  """Fetches results from 'cursor' and returns a hash that is independent of row
  order. Raises QueryTimeout() if we couldn't fetch all rows from the query before
  time() reaches 'timeout_unix_time'.
  'query' is only used for debug logging purposes (if the result is not as expected
  a log file will be left in RESULTS_DIR for investigation).
  """
  query_id = op_handle_to_query_id(cursor._last_operation.handle
                                   if cursor._last_operation else None)

  # A value of 1 indicates that the hash thread should continue to work.
  should_continue = Value("i", 1)

  def hash_result_impl():
    # Runs on the fetch thread: streams rows, folds each into
    # current_thread().result, and mirrors them to a result file for debugging.
    # Errors are stored in current_thread().error for the joining thread.
    result_log_filename = None
    try:
      with self._open_result_file(query, query_id) as result_log:
        result_log.write(query.sql)
        result_log.write("\n")
        result_log_filename = result_log.name
        current_thread().result = 1
        while should_continue.value:
          LOG.debug(
              "Fetching result for query with id %s",
              op_handle_to_query_id(
                  cursor._last_operation.handle if cursor._last_operation else None))
          rows = cursor.fetchmany(self.BATCH_SIZE)
          if not rows:
            LOG.debug(
                "No more results for query with id %s",
                op_handle_to_query_id(
                    cursor._last_operation.handle if cursor._last_operation
                    else None))
            return
          for row in rows:
            current_thread().result = _add_row_to_hash(row, current_thread().result)
            if result_log:
              result_log.write(",".join([str(val) for val in row]))
              result_log.write("\thash=")
              result_log.write(str(current_thread().result))
              result_log.write("\n")
    except Exception as e:
      current_thread().error = e
    finally:
      # Only retain result files with incorrect results: delete the file when the
      # query finished without error AND produced the expected hash.
      # BUG FIX: was 'current_thread().error is not None', which retained files
      # for correct results and deleted them for errored runs - the opposite of
      # the stated intent.
      if (result_log_filename is not None and
          current_thread().error is None and
          current_thread().result == query.result_hash):
        os.remove(result_log_filename)

  # TODO: IMPALA-6326: don't fork a thread here
  hash_thread = create_and_start_daemon_thread(
      hash_result_impl, "Fetch Results %s" % query_id)
  # Wait no longer than the remaining time budget for the fetch to finish.
  hash_thread.join(max(timeout_unix_time - time(), 0))
  if hash_thread.is_alive():
    # Ask the daemon thread to stop fetching, then report the timeout.
    should_continue.value = 0
    raise QueryTimeout()
  if hash_thread.error:
    raise hash_thread.error
  return hash_thread.result
def _fetch_and_hash_result(self, cursor, timeout_unix_time, query):
  """Fetches results from 'cursor' and returns an order-independent hash of them.

  Raises QueryTimeout() if the rows could not all be fetched before time() reaches
  'timeout_unix_time'. 'query' is only used for debug logging purposes (if the
  result is not as expected a log file will be left in RESULTS_DIR for
  investigation).
  """
  query_id = op_handle_to_query_id(
      cursor._last_operation.handle if cursor._last_operation else None)
  log_path = None
  running_hash = 1
  try:
    with self._open_result_file(query, query_id) as result_log:
      result_log.write(query.sql)
      result_log.write("\n")
      log_path = result_log.name
      while True:
        # Check for timeout before each fetch call. Fetching can block indefinitely,
        # e.g. if the query is a selective scan, so we may miss the timeout, but we
        # will live with that limitation for now.
        # TODO: IMPALA-8289: use a non-blocking fetch when support is added
        if time() >= timeout_unix_time:
          LOG.debug("Hit timeout before fetch for query with id {0} {1} >= {2}".format(
              query_id, time(), timeout_unix_time))
          raise QueryTimeout()
        LOG.debug("Fetching result for query with id {0}".format(query_id))
        batch = cursor.fetchmany(self.BATCH_SIZE)
        if not batch:
          LOG.debug("No more results for query with id {0}".format(query_id))
          break
        for row in batch:
          running_hash = _add_row_to_hash(row, running_hash)
          if result_log:
            result_log.write(",".join(str(col) for col in row))
            result_log.write("\thash=")
            result_log.write(str(running_hash))
            result_log.write("\n")
  except Exception:
    # A failed query's result file is of no use - drop it and propagate.
    if log_path is not None:
      os.remove(log_path)
    raise
  # Keep the file only when the result disagrees with the expected hash.
  if log_path is not None and running_hash == query.result_hash:
    os.remove(log_path)
  return running_hash
def run_query(self, query, mem_limit_mb, run_set_up=False,
              timeout_secs=maxint, should_cancel=False, retain_profile=False):
  """Run a query and return an execution report. If 'run_set_up' is True, set up sql
  will be executed before the main query. This should be the case during the binary
  search phase of the stress test.
  If 'should_cancel' is True, don't get the query profile for timed out queries because
  the query was purposely cancelled by setting the query timeout too short to complete,
  rather than having some problem that needs to be investigated.
  """
  # connect() must have populated self.impalad_conn before queries can run.
  if not self.impalad_conn:
    raise Exception("connect() must first be called")

  timeout_unix_time = time() + timeout_secs
  report = QueryReport(query)
  try:
    with self.impalad_conn.cursor() as cursor:
      start_time = time()
      # Switch to the query's database and apply per-query options/mem limit.
      self._set_db_and_options(cursor, query, run_set_up, mem_limit_mb,
                               timeout_secs)
      error = None
      try:
        # Prepend a comment recording the mem limit and coordinator so the SQL
        # text itself identifies this run in profiles/logs.
        cursor.execute_async(
            "/* Mem: %s MB. Coordinator: %s. */\n" % (
                mem_limit_mb, self.impalad.host_name) + query.sql)
        report.query_id = op_handle_to_query_id(cursor._last_operation.handle
                                                if cursor._last_operation else None)
        LOG.debug("Query id is %s", report.query_id)
        # Returns False if the query timed out (or was cancelled) before rows
        # became fetchable; the report is returned as-is in that case.
        if not self._wait_until_fetchable(cursor, report, timeout_unix_time,
                                          should_cancel):
          return report

        if query.query_type == QueryType.SELECT:
          try:
            report.result_hash = self._fetch_and_hash_result(cursor,
                                                             timeout_unix_time,
                                                             query)
            # Fetch the profile when explicitly requested, or when the result
            # hash disagrees with the expected one (for investigation).
            if retain_profile or \
               query.result_hash and report.result_hash != query.result_hash:
              fetch_and_set_profile(cursor, report)
          except QueryTimeout:
            # TODO: IMPALA-6326: remove this cancel, which can race with the thread
            # in _fetch_and_hash_result() and cause crashes and other errors.
            self._cancel(cursor, report)
            return report
        else:
          # If query is in error state, this will raise an exception
          cursor._wait_to_finish()
      except Exception as error:
        report.query_id = op_handle_to_query_id(cursor._last_operation.handle
                                                if cursor._last_operation else None)
        LOG.debug("Error running query with id %s: %s", report.query_id, error)
        # Classifies mem-limit-exceeded style failures into the report; other
        # errors fall through so runtime/profile handling below still happens.
        self._check_for_memory_errors(report, cursor, error)
        if report.has_query_error():
          return report
      report.runtime_secs = time() - start_time
      # Fetch the profile to detect spilling (and on failed execution).
      if cursor.execution_failed() or self.check_if_mem_was_spilled:
        fetch_and_set_profile(cursor, report)
        report.mem_was_spilled = any([
            pattern.search(report.profile) is not None
            for pattern in QueryRunner.SPILLED_PATTERNS])
        report.not_enough_memory = "Memory limit exceeded" in report.profile
  except Exception as error:
    # A mem limit error would have been caught above, no need to check for that here.
    report.other_error = error
  return report
def run_query(self, query, mem_limit_mb, run_set_up=False, timeout_secs=maxint,
              cancel_mech=None, retain_profile=False):
  """Run a query and return an execution report. If 'run_set_up' is True, set up sql
  will be executed before the main query. This should be the case during the binary
  search phase of the stress test.
  'cancel_mech' is optionally a CancelMechanism value that should be used to cancel
  the query after timeout_secs.
  If 'cancel_mech' is provided, don't get the query profile for timed out queries
  because the query was purposely cancelled, rather than having some problem that
  needs investigation.
  """
  assert self.impalad_conn, "connect() must be called before run_query()"
  assert cancel_mech is None or cancel_mech in CancelMechanism.ALL_MECHS

  timeout_unix_time = time() + timeout_secs
  report = QueryReport(query)
  try:
    with self.impalad_conn.cursor() as cursor:
      start_time = time()
      # Switch to the query's database and apply per-query options/mem limit.
      self._set_db_and_options(cursor, query, run_set_up, mem_limit_mb,
                               timeout_secs, cancel_mech)
      try:
        # Prepend a comment recording the mem limit and coordinator so the SQL
        # text itself identifies this run in profiles/logs.
        cursor.execute_async(
            "/* Mem: %s MB. Coordinator: %s. */\n" % (
                mem_limit_mb, self.impalad.host_name) + query.sql)
        report.query_id = op_handle_to_query_id(cursor._last_operation.handle
                                                if cursor._last_operation else None)
        LOG.debug("Query id is %s", report.query_id)
        self._wait_until_fetchable(cursor, report, timeout_unix_time)
        if query.query_type == QueryType.SELECT:
          report.result_hash = self._fetch_and_hash_result(cursor,
                                                           timeout_unix_time, query)
        else:
          # If query is in error state, this will raise an exception
          cursor._wait_to_finish()
        # Fetch the profile when explicitly requested, or when the result hash
        # disagrees with the expected one (for investigation).
        if (retain_profile or
            query.result_hash and report.result_hash != query.result_hash):
          fetch_and_set_profile(cursor, report)
      except QueryTimeout:
        # An unexpected timeout (no cancel mechanism configured) is worth
        # investigating - grab the profile before cancelling.
        if not cancel_mech:
          fetch_and_set_profile(cursor, report)
        # Cancel from this client if a) we hit the timeout unexpectedly or b) we're
        # deliberately cancelling the query via the client.
        if not cancel_mech or cancel_mech == CancelMechanism.VIA_CLIENT:
          self._cancel(cursor, report)
        else:
          # Wait until the query is cancelled by a different mechanism.
          self._wait_until_cancelled(cursor, report.query_id)
        report.timed_out = True
        return report
      except Exception as error:
        report.query_id = op_handle_to_query_id(cursor._last_operation.handle
                                                if cursor._last_operation else None)
        LOG.debug("Error running query with id %s: %s", report.query_id, error)
        if (cancel_mech == CancelMechanism.VIA_OPTION and
            is_exec_time_limit_error(error)):
          # Timeout via query option counts as a timeout.
          report.timed_out = True
        else:
          self._check_for_memory_errors(report, cursor, error)
        if report.has_query_error():
          return report
      report.runtime_secs = time() - start_time
      # Fetch the profile to detect spilling (and on failed execution).
      if cursor.execution_failed() or self.check_if_mem_was_spilled:
        fetch_and_set_profile(cursor, report)
        report.mem_was_spilled = any([
            pattern.search(report.profile) is not None
            for pattern in QueryRunner.SPILLED_PATTERNS])
        # TODO: is this needed? Memory errors are generally caught by the try/except.
        report.not_enough_memory = "Memory limit exceeded" in report.profile
  except Exception as error:
    # A mem limit error would have been caught above, no need to check for that here.
    # BUG FIX: was LOG.debug("Caught error", error) - an extra argument with no
    # matching format placeholder, which makes the logging module report a
    # formatting error instead of logging the message.
    LOG.debug("Caught error %s", error)
    report.other_error = error
  return report
def _hash_result(self, cursor, timeout_unix_time, query):
  """Returns a hash that is independent of row order. 'query' is only used for debug
  logging purposes (if the result is not as expected a log file will be left for
  investigations).
  """
  query_id = op_handle_to_query_id(
      cursor._last_operation.handle if cursor._last_operation else None)

  # A value of 1 indicates that the hash thread should continue to work.
  should_continue = Value("i", 1)

  def hash_result_impl():
    # Runs on the fetch thread; accumulates the hash in current_thread().result
    # and reports failures through current_thread().error.
    result_log = None
    try:
      file_name = '_'.join([query.logical_query_id, query_id.replace(":", "_")])
      if query.result_hash is None:
        # First run of this query: the file records the "golden" result.
        file_name += "_initial"
      file_name += "_results.txt"
      result_log = open(
          os.path.join(self.results_dir, RESULT_HASHES_DIR, file_name), "w")
      result_log.write(query.sql)
      result_log.write("\n")
      current_thread().result = 1
      while should_continue.value:
        LOG.debug(
            "Fetching result for query with id %s",
            op_handle_to_query_id(
                cursor._last_operation.handle if cursor._last_operation else None))
        rows = cursor.fetchmany(self.BATCH_SIZE)
        if not rows:
          LOG.debug(
              "No more results for query with id %s",
              op_handle_to_query_id(
                  cursor._last_operation.handle if cursor._last_operation
                  else None))
          return
        for row in rows:
          for idx, val in enumerate(row):
            if val is None:
              # The hash() of None can change from run to run since it's based on
              # a memory address. A chosen value will be used instead.
              val = 38463209
            elif isinstance(val, float):
              # Floats returned by Impala may not be deterministic, the ending
              # insignificant digits may differ. Only the first 6 digits will be
              # used after rounding.
              sval = "%f" % val
              dot_idx = sval.find(".")
              val = round(val, 6 - dot_idx)
            current_thread().result += (idx + 1) * hash(val)
            # Modulo the result to keep it "small" otherwise the math ops can be
            # slow since python does infinite precision math.
            current_thread().result %= maxint
            if result_log:
              result_log.write(str(val))
              result_log.write("\t")
              result_log.write(str(current_thread().result))
              result_log.write("\n")
    except Exception as e:
      current_thread().error = e
    finally:
      if result_log is not None:
        result_log.close()
        # Only retain the result file when the result was incorrect.
        # BUG FIX: was 'current_thread().error is not None', which retained
        # files for correct results and deleted them for errored runs -
        # inverted relative to the docstring's promise above.
        if (current_thread().error is None and
            current_thread().result == query.result_hash):
          os.remove(result_log.name)

  hash_thread = create_and_start_daemon_thread(
      hash_result_impl, "Fetch Results %s" % query_id)
  # Wait no longer than the remaining time budget for the fetch to finish.
  hash_thread.join(max(timeout_unix_time - time(), 0))
  if hash_thread.is_alive():
    # Ask the daemon thread to stop fetching, then report the timeout.
    should_continue.value = 0
    raise QueryTimeout()
  if hash_thread.error:
    raise hash_thread.error
  return hash_thread.result