def __init__(self, server_hostname, server_port, test_id, test_index, logger_name, log_dir, debug): signal.signal(signal.SIGTERM, self._sigterm_handler) # register a SIGTERM handler self.serde = SerDe() self.logger = test_logger(logger_name, log_dir, debug) self.runner_port = server_port self.test_id = test_id self.test_index = test_index self.id = "test-runner-%d-%d" % (os.getpid(), id(self)) self.message = ClientEventFactory(self.test_id, self.test_index, self.id) self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger) ready_reply = self.sender.send(self.message.ready()) self.session_context = ready_reply["session_context"] self.test_metadata = ready_reply["test_metadata"] self.cluster = ready_reply["cluster"] # Wait to instantiate the test object until running the test self.test = None self.test_context = None
class RunnerClient(object): """Run a single test""" def __init__(self, server_hostname, server_port, test_id, test_index, logger_name, log_dir, debug): signal.signal(signal.SIGTERM, self._sigterm_handler) # register a SIGTERM handler self.serde = SerDe() self.logger = test_logger(logger_name, log_dir, debug) self.runner_port = server_port self.test_id = test_id self.test_index = test_index self.id = "test-runner-%d-%d" % (os.getpid(), id(self)) self.message = ClientEventFactory(self.test_id, self.test_index, self.id) self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger) ready_reply = self.sender.send(self.message.ready()) self.session_context = ready_reply["session_context"] self.test_metadata = ready_reply["test_metadata"] self.cluster = ready_reply["cluster"] # Wait to instantiate the test object until running the test self.test = None self.test_context = None def send(self, event): return self.sender.send(event) def _sigterm_handler(self, signum, frame): """Translate SIGTERM to SIGINT on this process python will treat SIGINT as a Keyboard exception. Exception handling does the rest. """ os.kill(os.getpid(), signal.SIGINT) def _collect_test_context(self, directory, file_name, cls_name, method_name, injected_args): loader = TestLoader(self.session_context, self.logger, injected_args=injected_args, cluster=self.cluster) loaded_context_list = loader.discover(directory, file_name, cls_name, method_name) assert len(loaded_context_list) == 1 test_context = loaded_context_list[0] test_context.cluster = self.cluster return test_context def run(self): self.log(logging.INFO, "Loading test %s" % str(self.test_metadata)) self.test_context = self._collect_test_context(**self.test_metadata) self.test_context.test_index = self.test_index self.send(self.message.running()) if self.test_context.ignore: # Skip running this test, but keep track of the fact that we ignored it result = TestResult(self.test_context, self.test_index, self.session_context, test_status=IGNORE, start_time=time.time(), stop_time=time.time()) result.report() # Tell the server we are finished self.send(self.message.finished(result=result)) return # Results from this test, as well as logs will be dumped here mkdir_p(TestContext.results_dir(self.test_context, self.test_index)) start_time = -1 stop_time = -1 test_status = PASS summary = "" data = None try: # Instantiate test self.test = self.test_context.cls(self.test_context) self.log(logging.DEBUG, "Checking if there are enough nodes...") min_cluster_spec = self.test.min_cluster_spec() # Check test resource msg = self.cluster.all().nodes.attempt_remove_spec(min_cluster_spec) if len(msg) > 0: raise RuntimeError("There are not enough nodes available in the cluster to run this test. " + msg) # Run the test unit start_time = time.time() self.setup_test() data = self.run_test() test_status = PASS self.log(logging.INFO, "PASS") except BaseException as e: err_trace = str(e) + "\n" + traceback.format_exc(limit=16) self.log(logging.INFO, "FAIL: " + err_trace) test_status = FAIL summary += err_trace finally: self.teardown_test(teardown_services=not self.session_context.no_teardown, test_status=test_status) stop_time = time.time() if hasattr(self, "services"): service_errors = self.test_context.services.errors() if service_errors: summary += "\n\n" + service_errors result = TestResult( self.test_context, self.test_index, self.session_context, test_status, summary, data, start_time, stop_time) self.log(logging.INFO, "Summary: %s" % str(result.summary)) self.log(logging.INFO, "Data: %s" % str(result.data)) result.report() # Tell the server we are finished self._do_safely(lambda: self.send(self.message.finished(result=result)), "Problem sending FINISHED message:") # Release test_context resources only after creating the result and finishing logging activity # The Sender object uses the same logger, so we postpone closing until after the finished message is sent self.test_context.close() self.test_context = None self.test = None def setup_test(self): """start services etc""" self.log(logging.INFO, "Setting up...") self.test.setup() def run_test(self): """Run the test! We expect test_context.function to be a function or unbound method which takes an instantiated test object as its argument. """ self.log(logging.INFO, "Running...") return self.test_context.function(self.test) def _do_safely(self, action, err_msg): try: action() except BaseException as e: self.log(logging.WARN, err_msg + " " + str(e) + "\n" + traceback.format_exc(limit=16)) def teardown_test(self, teardown_services=True, test_status=None): """teardown method which stops services, gathers log data, removes persistent state, and releases cluster nodes. Catch all exceptions so that every step in the teardown process is tried, but signal that the test runner should stop if a keyboard interrupt is caught. """ self.log(logging.INFO, "Tearing down...") if not self.test: self.log(logging.WARN, "%s failed to instantiate" % self.test_id) self.test_context.close() return services = self.test_context.services if teardown_services: self._do_safely(self.test.teardown, "Error running teardown method:") # stop services self._do_safely(services.stop_all, "Error stopping services:") # always collect service logs whether or not we tear down # logs are typically removed during "clean" phase, so collect logs before cleaning self.log(logging.DEBUG, "Copying logs from services...") self._do_safely(lambda: self.test.copy_service_logs(test_status), "Error copying service logs:") # clean up stray processes and persistent state if teardown_services: self.log(logging.DEBUG, "Cleaning up services...") self._do_safely(services.clean_all, "Error cleaning services:") self.log(logging.DEBUG, "Freeing nodes...") self._do_safely(self.test.free_nodes, "Error freeing nodes:") def log(self, log_level, msg, *args, **kwargs): """Log to the service log and the test log of the current test.""" if self.test_context is None: msg = "%s: %s" % (self.__class__.__name__, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) else: msg = "%s: %s: %s" % (self.__class__.__name__, self.test_context.test_name, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) self.send(self.message.log(msg, level=log_level))
def ready_response(self, client_id, port): sender_event_factory = ClientEventFactory("test_1", 0, client_id) sender = Sender(server_host='localhost', server_port=port, message_supplier=sender_event_factory, logger=logging) sender.send(sender_event_factory.ready())
class RunnerClient(object): """Run a single test""" def __init__(self, server_hostname, server_port, test_id, test_index, logger_name, log_dir, debug): signal.signal(signal.SIGTERM, self._sigterm_handler) # register a SIGTERM handler self.serde = SerDe() self.logger = test_logger(logger_name, log_dir, debug) self.runner_port = server_port self.test_id = test_id self.test_index = test_index self.id = "test-runner-%d-%d" % (os.getpid(), id(self)) self.message = ClientEventFactory(self.test_id, self.test_index, self.id) self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger) ready_reply = self.sender.send(self.message.ready()) self.session_context = ready_reply["session_context"] self.test_metadata = ready_reply["test_metadata"] self.cluster = ready_reply["cluster"] # Wait to instantiate the test object until running the test self.test = None self.test_context = None def send(self, event): return self.sender.send(event) def _sigterm_handler(self, signum, frame): """Translate SIGTERM to SIGINT on this process python will treat SIGINT as a Keyboard exception. Exception handling does the rest. """ os.kill(os.getpid(), signal.SIGINT) def _collect_test_context(self, directory, file_name, cls_name, method_name, injected_args): loader = TestLoader(self.session_context, self.logger, injected_args=injected_args, cluster=self.cluster) loaded_context_list = loader.discover(directory, file_name, cls_name, method_name) assert len(loaded_context_list) == 1 test_context = loaded_context_list[0] test_context.cluster = self.cluster return test_context def run(self): self.log(logging.INFO, "Loading test %s" % str(self.test_metadata)) self.test_context = self._collect_test_context(**self.test_metadata) self.test_context.test_index = self.test_index self.send(self.message.running()) if self.test_context.ignore: # Skip running this test, but keep track of the fact that we ignored it result = TestResult(self.test_context, self.test_index, self.session_context, test_status=IGNORE, start_time=time.time(), stop_time=time.time()) result.report() # Tell the server we are finished self.send(self.message.finished(result=result)) return # Results from this test, as well as logs will be dumped here mkdir_p(TestContext.results_dir(self.test_context, self.test_index)) start_time = -1 stop_time = -1 test_status = PASS summary = "" data = None try: # Instantiate test self.test = self.test_context.cls(self.test_context) self.log(logging.DEBUG, "Checking if there are enough nodes...") min_cluster_spec = self.test.min_cluster_spec() os_to_num_nodes = {} for node_spec in min_cluster_spec: if not os_to_num_nodes.get(node_spec.operating_system): os_to_num_nodes[node_spec.operating_system] = 1 else: os_to_num_nodes[node_spec.operating_system] = os_to_num_nodes[node_spec.operating_system] + 1 for (operating_system, node_count) in os_to_num_nodes.iteritems(): num_avail = len(list(self.cluster.all().nodes.elements(operating_system=operating_system))) if node_count > num_avail: raise RuntimeError( "There are not enough nodes available in the cluster to run this test. " "Cluster size for %s: %d, Need at least: %d. Services currently registered: %s" % (operating_system, num_avail, node_count, self.test_context.services)) # Run the test unit start_time = time.time() self.setup_test() data = self.run_test() test_status = PASS self.log(logging.INFO, "PASS") except BaseException as e: err_trace = str(e.message) + "\n" + traceback.format_exc(limit=16) self.log(logging.INFO, "FAIL: " + err_trace) test_status = FAIL summary += err_trace finally: self.teardown_test(teardown_services=not self.session_context.no_teardown, test_status=test_status) stop_time = time.time() service_errors = self.test_context.services.errors() if service_errors: summary += "\n\n" + service_errors result = TestResult( self.test_context, self.test_index, self.session_context, test_status, summary, data, start_time, stop_time) self.log(logging.INFO, "Summary: %s" % str(result.summary)) self.log(logging.INFO, "Data: %s" % str(result.data)) result.report() # Tell the server we are finished self._do_safely(lambda: self.send(self.message.finished(result=result)), "Problem sending FINISHED message:") # Release test_context resources only after creating the result and finishing logging activity # The Sender object uses the same logger, so we postpone closing until after the finished message is sent self.test_context.close() self.test_context = None self.test = None def setup_test(self): """start services etc""" self.log(logging.INFO, "Setting up...") self.test.setup() def run_test(self): """Run the test! We expect test_context.function to be a function or unbound method which takes an instantiated test object as its argument. """ self.log(logging.INFO, "Running...") return self.test_context.function(self.test) def _do_safely(self, action, err_msg): try: action() except BaseException as e: self.log(logging.WARN, err_msg + " " + e.message + "\n" + traceback.format_exc(limit=16)) def teardown_test(self, teardown_services=True, test_status=None): """teardown method which stops services, gathers log data, removes persistent state, and releases cluster nodes. Catch all exceptions so that every step in the teardown process is tried, but signal that the test runner should stop if a keyboard interrupt is caught. """ self.log(logging.INFO, "Tearing down...") if not self.test: self.log(logging.WARN, "%s failed to instantiate" % self.test_id) self.test_context.close() return services = self.test_context.services if teardown_services: self._do_safely(self.test.teardown, "Error running teardown method:") # stop services self._do_safely(services.stop_all, "Error stopping services:") # always collect service logs whether or not we tear down # logs are typically removed during "clean" phase, so collect logs before cleaning self.log(logging.DEBUG, "Copying logs from services...") self._do_safely(lambda: self.test.copy_service_logs(test_status), "Error copying service logs:") # clean up stray processes and persistent state if teardown_services: self.log(logging.DEBUG, "Cleaning up services...") self._do_safely(services.clean_all, "Error cleaning services:") self.log(logging.DEBUG, "Freeing nodes...") self._do_safely(self.test.free_nodes, "Error freeing nodes:") def log(self, log_level, msg, *args, **kwargs): """Log to the service log and the test log of the current test.""" if self.test_context is None: msg = "%s: %s" % (self.__class__.__name__, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) else: msg = "%s: %s: %s" % (self.__class__.__name__, self.test_context.test_name, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) self.send(self.message.log(msg, level=log_level))
class RunnerClient(object): """Run a single test""" def __init__(self, server_hostname, server_port, test_id, test_index, logger_name, log_dir, debug, fail_bad_cluster_utilization, deflake_num): signal.signal(signal.SIGTERM, self._sigterm_handler) # register a SIGTERM handler self.serde = SerDe() self.logger = test_logger(logger_name, log_dir, debug) self.runner_port = server_port self.fail_bad_cluster_utilization = fail_bad_cluster_utilization self.test_id = test_id self.test_index = test_index self.id = "test-runner-%d-%d" % (os.getpid(), id(self)) self.message = ClientEventFactory(self.test_id, self.test_index, self.id) self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger) ready_reply = self.sender.send(self.message.ready()) self.session_context = ready_reply["session_context"] self.test_metadata = ready_reply["test_metadata"] self.cluster = ready_reply["cluster"] self.deflake_num = deflake_num # Wait to instantiate the test object until running the test self.test = None self.test_context = None self.all_services = None def send(self, event): return self.sender.send(event) def _sigterm_handler(self, signum, frame): """Translate SIGTERM to SIGINT on this process python will treat SIGINT as a Keyboard exception. Exception handling does the rest. """ os.kill(os.getpid(), signal.SIGINT) def _collect_test_context(self, directory, file_name, cls_name, method_name, injected_args): loader = TestLoader(self.session_context, self.logger, injected_args=injected_args, cluster=self.cluster) # TODO: deal with this in a more graceful fashion. # In an unlikely even that discover either raises the exception or fails to find exactly one test # we should probably continue trying other tests rather than killing this process loaded_context_list = loader.discover(directory, file_name, cls_name, method_name) assert len(loaded_context_list) == 1 test_context = loaded_context_list[0] test_context.cluster = self.cluster return test_context def run(self): self.log(logging.INFO, "Loading test %s" % str(self.test_metadata)) self.test_context = self._collect_test_context(**self.test_metadata) self.test_context.test_index = self.test_index self.send(self.message.running()) if self.test_context.ignore: # Skip running this test, but keep track of the fact that we ignored it result = TestResult(self.test_context, self.test_index, self.session_context, test_status=IGNORE, start_time=time.time(), stop_time=time.time()) result.report() # Tell the server we are finished self.send(self.message.finished(result=result)) return start_time = -1 stop_time = -1 test_status = FAIL summary = [] data = None self.all_services = ServiceRegistry() num_runs = 0 try: while test_status == FAIL and num_runs < self.deflake_num: num_runs += 1 self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num)) start_time = time.time() test_status, summary, data = self._do_run(num_runs) if test_status == PASS and num_runs > 1: test_status = FLAKY msg = str(test_status.to_json()) if summary: msg += ": {}".format(summary) if num_runs != self.deflake_num: msg += "\n" + "~" * max(len(line) for line in summary.split('\n')) self.log(logging.INFO, msg) finally: stop_time = time.time() test_status, summary = self._check_cluster_utilization(test_status, summary) if num_runs > 1: # for reporting purposes report all services self.test_context.services = self.all_services # for flaky tests, we report the start and end time of the successful run, and not the whole run period result = TestResult( self.test_context, self.test_index, self.session_context, test_status, summary, data, start_time, stop_time) self.log(logging.INFO, "Data: %s" % str(result.data)) result.report() # Tell the server we are finished self._do_safely(lambda: self.send(self.message.finished(result=result)), "Problem sending FINISHED message for " + str(self.test_metadata) + ":\n") # Release test_context resources only after creating the result and finishing logging activity # The Sender object uses the same logger, so we postpone closing until after the finished message is sent self.test_context.close() self.all_services = None self.test_context = None self.test = None def _do_run(self, num_runs): test_status = FAIL summary = [] data = None sid_factory = MultiRunServiceIdFactory(num_runs) if self.deflake_num > 1 else service_id_factory try: # Results from this test, as well as logs will be dumped here mkdir_p(TestContext.results_dir(self.test_context, self.test_index)) # Instantiate test self.test = self.test_context.cls(self.test_context) # Check if there are enough nodes self._check_min_cluster_spec() # Run the test unit self.setup_test() data = self.run_test() test_status = PASS except BaseException as e: # mark the test as failed before doing anything else test_status = FAIL err_trace = self._exc_msg(e) summary.append(err_trace) finally: for service in self.test_context.services: service.service_id_factory = sid_factory self.all_services.append(service) self.teardown_test(teardown_services=not self.session_context.no_teardown, test_status=test_status) if hasattr(self.test_context, "services"): service_errors = self.test_context.services.errors() if service_errors: summary.extend(["\n\n", service_errors]) # free nodes if self.test: self.log(logging.DEBUG, "Freeing nodes...") self._do_safely(self.test.free_nodes, "Error freeing nodes:") return test_status, "".join(summary), data def _check_min_cluster_spec(self): self.log(logging.DEBUG, "Checking if there are enough nodes...") min_cluster_spec = self.test.min_cluster_spec() os_to_num_nodes = {} for node_spec in min_cluster_spec: if not os_to_num_nodes.get(node_spec.operating_system): os_to_num_nodes[node_spec.operating_system] = 1 else: os_to_num_nodes[node_spec.operating_system] = os_to_num_nodes[node_spec.operating_system] + 1 for (operating_system, node_count) in iteritems(os_to_num_nodes): num_avail = len(list(self.cluster.all().nodes.elements(operating_system=operating_system))) if node_count > num_avail: raise RuntimeError( "There are not enough nodes available in the cluster to run this test. " "Cluster size for %s: %d, Need at least: %d. Services currently registered: %s" % (operating_system, num_avail, node_count, self.test_context.services)) def _check_cluster_utilization(self, result, summary): """Checks if the number of nodes used by a test is less than the number of nodes requested by the test. If this is the case and we wish to fail on bad cluster utilization, the result value is failed. Will also print a warning if the test passes and the node utilization doesn't match. """ max_used = self.cluster.max_used() total = len(self.cluster.all()) if max_used < total: message = "Test requested %d nodes, used only %d" % (total, max_used) if self.fail_bad_cluster_utilization: # only check node utilization on test pass if result == PASS or result == FLAKY: self.log(logging.INFO, "FAIL: " + message) result = FAIL summary += message else: self.log(logging.WARN, message) return result, summary def setup_test(self): """start services etc""" self.log(logging.INFO, "Setting up...") self.test.setup() def run_test(self): """Run the test! We expect test_context.function to be a function or unbound method which takes an instantiated test object as its argument. """ self.log(logging.INFO, "Running...") return self.test_context.function(self.test) def _exc_msg(self, e): return repr(e) + "\n" + traceback.format_exc(limit=16) def _do_safely(self, action, err_msg): try: action() except BaseException as e: self.log(logging.WARN, err_msg + " " + self._exc_msg(e)) def teardown_test(self, teardown_services=True, test_status=None): """teardown method which stops services, gathers log data, removes persistent state, and releases cluster nodes. Catch all exceptions so that every step in the teardown process is tried, but signal that the test runner should stop if a keyboard interrupt is caught. """ self.log(logging.INFO, "Tearing down...") if not self.test: self.log(logging.WARN, "%s failed to instantiate" % self.test_id) self.test_context.close() return services = self.test_context.services if teardown_services: self._do_safely(self.test.teardown, "Error running teardown method:") # stop services self._do_safely(services.stop_all, "Error stopping services:") # always collect service logs whether or not we tear down # logs are typically removed during "clean" phase, so collect logs before cleaning self.log(logging.DEBUG, "Copying logs from services...") self._do_safely(lambda: self.test.copy_service_logs(test_status), "Error copying service logs:") # clean up stray processes and persistent state if teardown_services: self.log(logging.DEBUG, "Cleaning up services...") self._do_safely(services.clean_all, "Error cleaning services:") def log(self, log_level, msg, *args, **kwargs): """Log to the service log and the test log of the current test.""" if self.test_context is None: msg = "%s: %s" % (self.__class__.__name__, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) else: msg = "%s: %s: %s" % (self.__class__.__name__, self.test_context.test_name, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) self.send(self.message.log(msg, level=log_level))