def __init__(self, session_context, module=None, cls=None, function=None, injected_args=None): """ :type session_context: ducktape.tests.session.SessionContext """ self.module = module self.cls = cls self.function = function self.injected_args = injected_args self.session_context = session_context self.services = ServiceRegistry() # dict for toggling service log collection on/off self.log_collect = {} # Individual test results go here mkdir_p(self.results_dir) self._logger_configured = False self.configure_logger()
def __init__(self, **kwargs): """ :param session_context :param module :param cls :param function :param injected_args :param service_registry :param cluster_use_metadata """ self.session_context = kwargs.get("session_context") self.module = kwargs.get("module") self.cls = kwargs.get("cls") self.function = kwargs.get("function") self.injected_args = kwargs.get("injected_args") self.ignore = kwargs.get("ignore", False) # cluster_use_metadata is a dict containing information about how this test will use cluster resources # to date, this only includes "num_nodes" self.cluster_use_metadata = copy.copy(kwargs.get("cluster_use_metadata", {})) self.services = ServiceRegistry() # dict for toggling service log collection on/off self.log_collect = {}
def __init__(self, **kwargs): """ :param session_context :param module :param cls :param function :param injected_args """ self.session_context = kwargs.get("session_context", None) self.module = kwargs.get("module", None) self.cls = kwargs.get("cls", None) self.function = kwargs.get("function", None) self.injected_args = kwargs.get("injected_args", None) self.ignore = kwargs.get("ignore", False) self.services = ServiceRegistry() # dict for toggling service log collection on/off self.log_collect = {}
def __init__(self, **kwargs): """ :param session_context: :param cluster: the cluster object which will be used by this test :param module: name of the module containing the test class/method :param cls: class object containing the test method :param function: the test method :param file: file containing this module :param injected_args: a dict containing keyword args which will be passed to the test method :param cluster_use_metadata: dict containing information about how this test will use cluster resources, to date, this only includes "num_nodes" """ self.session_context = kwargs.get("session_context") self.cluster = kwargs.get("cluster") self.module = kwargs.get("module") if kwargs.get("file") is not None: self.file = os.path.abspath(kwargs.get("file")) else: self.file = None self.cls = kwargs.get("cls") self.function = kwargs.get("function") self.injected_args = kwargs.get("injected_args") self.ignore = kwargs.get("ignore", False) # cluster_use_metadata is a dict containing information about how this test will use cluster resources # to date, this only includes "num_nodes" self.cluster_use_metadata = copy.copy( kwargs.get("cluster_use_metadata", {})) self.services = ServiceRegistry() self.test_index = None # dict for toggling service log collection on/off self.log_collect = {} self._logger = None self._local_scratch_dir = None
def __init__(self, session_context, module=None, cls=None, function=None, config=None): """ :type session_context: ducktape.tests.session.SessionContext """ self.module = module self.cls = cls self.function = function self.config = config self.session_context = session_context self.cluster = session_context.cluster self.services = ServiceRegistry() # dict for toggling service log collection on/off self.log_collect = {} self.results_dir = self.session_context.results_dir if self.cls is not None: self.results_dir = os.path.join(self.results_dir, self.cls.__name__) if self.function is not None: self.results_dir = os.path.join(self.results_dir, self.function.__name__) mkdir_p(self.results_dir) self._logger_configured = False self.configure_logger()
def run(self): self.log(logging.INFO, "Loading test %s" % str(self.test_metadata)) self.test_context = self._collect_test_context(**self.test_metadata) self.test_context.test_index = self.test_index self.send(self.message.running()) if self.test_context.ignore: # Skip running this test, but keep track of the fact that we ignored it result = TestResult(self.test_context, self.test_index, self.session_context, test_status=IGNORE, start_time=time.time(), stop_time=time.time()) result.report() # Tell the server we are finished self.send(self.message.finished(result=result)) return start_time = -1 stop_time = -1 test_status = FAIL summary = [] data = None self.all_services = ServiceRegistry() num_runs = 0 try: while test_status == FAIL and num_runs < self.deflake_num: num_runs += 1 self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num)) start_time = time.time() test_status, summary, data = self._do_run(num_runs) if test_status == PASS and num_runs > 1: test_status = FLAKY msg = str(test_status.to_json()) if summary: msg += ": {}".format(summary) if num_runs != self.deflake_num: msg += "\n" + "~" * max(len(line) for line in summary.split('\n')) self.log(logging.INFO, msg) finally: stop_time = time.time() test_status, summary = self._check_cluster_utilization(test_status, summary) if num_runs > 1: # for reporting purposes report all services self.test_context.services = self.all_services # for flaky tests, we report the start and end time of the successful run, and not the whole run period result = TestResult( self.test_context, self.test_index, self.session_context, test_status, summary, data, start_time, stop_time) self.log(logging.INFO, "Data: %s" % str(result.data)) result.report() # Tell the server we are finished self._do_safely(lambda: self.send(self.message.finished(result=result)), "Problem sending FINISHED message for " + str(self.test_metadata) + ":\n") # Release test_context resources only after creating the result and finishing logging activity # The Sender object uses the same logger, so we postpone closing until after the finished message is sent self.test_context.close() self.all_services = None self.test_context = None self.test = None
class RunnerClient(object): """Run a single test""" def __init__(self, server_hostname, server_port, test_id, test_index, logger_name, log_dir, debug, fail_bad_cluster_utilization, deflake_num): signal.signal(signal.SIGTERM, self._sigterm_handler) # register a SIGTERM handler self.serde = SerDe() self.logger = test_logger(logger_name, log_dir, debug) self.runner_port = server_port self.fail_bad_cluster_utilization = fail_bad_cluster_utilization self.test_id = test_id self.test_index = test_index self.id = "test-runner-%d-%d" % (os.getpid(), id(self)) self.message = ClientEventFactory(self.test_id, self.test_index, self.id) self.sender = Sender(server_hostname, str(self.runner_port), self.message, self.logger) ready_reply = self.sender.send(self.message.ready()) self.session_context = ready_reply["session_context"] self.test_metadata = ready_reply["test_metadata"] self.cluster = ready_reply["cluster"] self.deflake_num = deflake_num # Wait to instantiate the test object until running the test self.test = None self.test_context = None self.all_services = None def send(self, event): return self.sender.send(event) def _sigterm_handler(self, signum, frame): """Translate SIGTERM to SIGINT on this process python will treat SIGINT as a Keyboard exception. Exception handling does the rest. """ os.kill(os.getpid(), signal.SIGINT) def _collect_test_context(self, directory, file_name, cls_name, method_name, injected_args): loader = TestLoader(self.session_context, self.logger, injected_args=injected_args, cluster=self.cluster) # TODO: deal with this in a more graceful fashion. # In an unlikely even that discover either raises the exception or fails to find exactly one test # we should probably continue trying other tests rather than killing this process loaded_context_list = loader.discover(directory, file_name, cls_name, method_name) assert len(loaded_context_list) == 1 test_context = loaded_context_list[0] test_context.cluster = self.cluster return test_context def run(self): self.log(logging.INFO, "Loading test %s" % str(self.test_metadata)) self.test_context = self._collect_test_context(**self.test_metadata) self.test_context.test_index = self.test_index self.send(self.message.running()) if self.test_context.ignore: # Skip running this test, but keep track of the fact that we ignored it result = TestResult(self.test_context, self.test_index, self.session_context, test_status=IGNORE, start_time=time.time(), stop_time=time.time()) result.report() # Tell the server we are finished self.send(self.message.finished(result=result)) return start_time = -1 stop_time = -1 test_status = FAIL summary = [] data = None self.all_services = ServiceRegistry() num_runs = 0 try: while test_status == FAIL and num_runs < self.deflake_num: num_runs += 1 self.log(logging.INFO, "on run {}/{}".format(num_runs, self.deflake_num)) start_time = time.time() test_status, summary, data = self._do_run(num_runs) if test_status == PASS and num_runs > 1: test_status = FLAKY msg = str(test_status.to_json()) if summary: msg += ": {}".format(summary) if num_runs != self.deflake_num: msg += "\n" + "~" * max(len(line) for line in summary.split('\n')) self.log(logging.INFO, msg) finally: stop_time = time.time() test_status, summary = self._check_cluster_utilization(test_status, summary) if num_runs > 1: # for reporting purposes report all services self.test_context.services = self.all_services # for flaky tests, we report the start and end time of the successful run, and not the whole run period result = TestResult( self.test_context, self.test_index, self.session_context, test_status, summary, data, start_time, stop_time) self.log(logging.INFO, "Data: %s" % str(result.data)) result.report() # Tell the server we are finished self._do_safely(lambda: self.send(self.message.finished(result=result)), "Problem sending FINISHED message for " + str(self.test_metadata) + ":\n") # Release test_context resources only after creating the result and finishing logging activity # The Sender object uses the same logger, so we postpone closing until after the finished message is sent self.test_context.close() self.all_services = None self.test_context = None self.test = None def _do_run(self, num_runs): test_status = FAIL summary = [] data = None sid_factory = MultiRunServiceIdFactory(num_runs) if self.deflake_num > 1 else service_id_factory try: # Results from this test, as well as logs will be dumped here mkdir_p(TestContext.results_dir(self.test_context, self.test_index)) # Instantiate test self.test = self.test_context.cls(self.test_context) # Check if there are enough nodes self._check_min_cluster_spec() # Run the test unit self.setup_test() data = self.run_test() test_status = PASS except BaseException as e: # mark the test as failed before doing anything else test_status = FAIL err_trace = self._exc_msg(e) summary.append(err_trace) finally: for service in self.test_context.services: service.service_id_factory = sid_factory self.all_services.append(service) self.teardown_test(teardown_services=not self.session_context.no_teardown, test_status=test_status) if hasattr(self.test_context, "services"): service_errors = self.test_context.services.errors() if service_errors: summary.extend(["\n\n", service_errors]) # free nodes if self.test: self.log(logging.DEBUG, "Freeing nodes...") self._do_safely(self.test.free_nodes, "Error freeing nodes:") return test_status, "".join(summary), data def _check_min_cluster_spec(self): self.log(logging.DEBUG, "Checking if there are enough nodes...") min_cluster_spec = self.test.min_cluster_spec() os_to_num_nodes = {} for node_spec in min_cluster_spec: if not os_to_num_nodes.get(node_spec.operating_system): os_to_num_nodes[node_spec.operating_system] = 1 else: os_to_num_nodes[node_spec.operating_system] = os_to_num_nodes[node_spec.operating_system] + 1 for (operating_system, node_count) in iteritems(os_to_num_nodes): num_avail = len(list(self.cluster.all().nodes.elements(operating_system=operating_system))) if node_count > num_avail: raise RuntimeError( "There are not enough nodes available in the cluster to run this test. " "Cluster size for %s: %d, Need at least: %d. Services currently registered: %s" % (operating_system, num_avail, node_count, self.test_context.services)) def _check_cluster_utilization(self, result, summary): """Checks if the number of nodes used by a test is less than the number of nodes requested by the test. If this is the case and we wish to fail on bad cluster utilization, the result value is failed. Will also print a warning if the test passes and the node utilization doesn't match. """ max_used = self.cluster.max_used() total = len(self.cluster.all()) if max_used < total: message = "Test requested %d nodes, used only %d" % (total, max_used) if self.fail_bad_cluster_utilization: # only check node utilization on test pass if result == PASS or result == FLAKY: self.log(logging.INFO, "FAIL: " + message) result = FAIL summary += message else: self.log(logging.WARN, message) return result, summary def setup_test(self): """start services etc""" self.log(logging.INFO, "Setting up...") self.test.setup() def run_test(self): """Run the test! We expect test_context.function to be a function or unbound method which takes an instantiated test object as its argument. """ self.log(logging.INFO, "Running...") return self.test_context.function(self.test) def _exc_msg(self, e): return repr(e) + "\n" + traceback.format_exc(limit=16) def _do_safely(self, action, err_msg): try: action() except BaseException as e: self.log(logging.WARN, err_msg + " " + self._exc_msg(e)) def teardown_test(self, teardown_services=True, test_status=None): """teardown method which stops services, gathers log data, removes persistent state, and releases cluster nodes. Catch all exceptions so that every step in the teardown process is tried, but signal that the test runner should stop if a keyboard interrupt is caught. """ self.log(logging.INFO, "Tearing down...") if not self.test: self.log(logging.WARN, "%s failed to instantiate" % self.test_id) self.test_context.close() return services = self.test_context.services if teardown_services: self._do_safely(self.test.teardown, "Error running teardown method:") # stop services self._do_safely(services.stop_all, "Error stopping services:") # always collect service logs whether or not we tear down # logs are typically removed during "clean" phase, so collect logs before cleaning self.log(logging.DEBUG, "Copying logs from services...") self._do_safely(lambda: self.test.copy_service_logs(test_status), "Error copying service logs:") # clean up stray processes and persistent state if teardown_services: self.log(logging.DEBUG, "Cleaning up services...") self._do_safely(services.clean_all, "Error cleaning services:") def log(self, log_level, msg, *args, **kwargs): """Log to the service log and the test log of the current test.""" if self.test_context is None: msg = "%s: %s" % (self.__class__.__name__, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) else: msg = "%s: %s: %s" % (self.__class__.__name__, self.test_context.test_name, str(msg)) self.logger.log(log_level, msg, *args, **kwargs) self.send(self.message.log(msg, level=log_level))