def _reuse_dev_cluster(self) -> clusterlib.ClusterLib:
    """Return a cluster object for a cluster started outside of the test framework.

    Cluster instance 0 is always used. The method records that instance number on
    the cluster manager, sets the cluster environment for it, makes sure the
    instance directory exists under the lock dir, creates the test (faucet)
    addresses if their data file is not present yet, and finally lets
    `_reload_cluster_obj` decide whether data need to be reloaded.

    Returns:
        The cached cluster object if there is one, otherwise a freshly created one.
    """
    instance_num = 0
    self.cm._cluster_instance_num = instance_num
    cluster_nodes.set_cluster_env(instance_num)
    state_dir = cluster_nodes.get_cluster_env().state_dir

    # the instance dir (and its parents) may not exist yet
    (self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}").mkdir(
        exist_ok=True, parents=True
    )

    # prefer the cached object, fall back to creating a new one
    cluster_obj = (
        self.cm.cache.cluster_obj or cluster_nodes.get_cluster_type().get_cluster_obj()
    )

    # faucet addresses are set up only once - when the data file is missing
    addrs_data_file = state_dir / cluster_nodes.ADDRS_DATA
    if not addrs_data_file.exists():
        addrs_dir = state_dir / "addrs_data"
        addrs_dir.mkdir(exist_ok=True, parents=True)
        cluster_nodes.setup_test_addrs(cluster_obj, addrs_dir)

    # reload data if that turns out to be necessary
    self._reload_cluster_obj(state_dir=state_dir)

    return cluster_obj
def stop_all_clusters(self) -> None:
    """Stop every cluster instance that is marked as running.

    An instance is considered running when its directory contains the
    "cluster running" status file and does not contain the "cluster stopped"
    status file. For each such instance the stop script is executed (errors
    are only logged), CLI coverage and cluster artifacts are saved, and the
    "cluster stopped" status file is created.
    """
    self._log("called `stop_all_clusters`")

    for instance_num in range(self.num_of_instances):
        instance_dir = self.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
        stopped_file = instance_dir / CLUSTER_STOPPED_FILE

        is_running = (
            instance_dir / CLUSTER_RUNNING_FILE
        ).exists() and not stopped_file.exists()
        if not is_running:
            self._log(f"cluster instance {instance_num} not running")
            continue

        prepare_scripts = cluster_nodes.get_cluster_type().cluster_scripts.prepare_scripts_files
        startup_files = prepare_scripts(
            destdir=self._create_startup_files_dir(instance_num),
            instance_num=instance_num,
        )
        cluster_nodes.set_cluster_env(instance_num)
        self._log(
            f"stopping cluster instance {instance_num} with `{startup_files.stop_script}`"
        )

        state_dir = cluster_nodes.get_cluster_env().state_dir

        # best effort: a failing stop script must not prevent saving of artifacts
        try:
            cluster_nodes.stop_cluster(cmd=str(startup_files.stop_script))
        except Exception as exc:
            LOGGER.error(f"While stopping cluster: {exc}")

        start_cmds_log = state_dir / CLUSTER_START_CMDS_LOG
        cli_coverage.save_start_script_coverage(
            log_file=start_cmds_log,
            pytest_config=self.pytest_config,
        )
        cluster_nodes.save_cluster_artifacts(
            artifacts_dir=self.pytest_tmp_dir,
            clean=True,
        )

        # create the "stopped" status file (left untouched if it already exists)
        with open(stopped_file, "a"):
            pass
        self._log(f"stopped cluster instance {instance_num}")
def get(  # noqa: C901
    self,
    mark: str = "",
    lock_resources: Iterable[str] = (),
    use_resources: Iterable[str] = (),
    cleanup: bool = False,
    start_cmd: str = "",
) -> clusterlib.ClusterLib:
    """Return the `clusterlib.ClusterLib` instance once we can start the test.

    It checks current conditions and waits if the conditions don't allow to start the test
    right away.

    The method loops until a cluster instance is found on which the test can run. Every
    iteration takes the global cluster lock, walks over the cluster instances and skips
    an instance (possibly adjusting the sleep delay used before the next iteration)
    whenever one of the checks fails. A cluster restart requested during the checks is
    performed at the top of the next iteration, outside of the lock.

    Args:
        mark: Name of the mark shared by a group of tests. When set, the instance is
            selected through `_marked_select_instance` and the test waits while
            unmarked tests are running and no marked test is running yet.
        lock_resources: Names of resources the test wants to lock. Must not be a
            single string. These names are removed from `use_resources`.
        use_resources: Names of resources the test wants to use. Must not be a single
            string. `Resources.CLUSTER` is always added to them.
        cleanup: Passed to `ClusterGetStatus`. Forced to `True` when `start_cmd` is set.
        start_cmd: Custom cluster start command, passed to `_restart`. Allowed only
            together with `mark` or when `Resources.CLUSTER` is in `lock_resources`.

    Returns:
        The cluster object cached on the cluster manager. Unless the test was found to
        be already running, `cluster_id` and `_cluster_manager` are set on it first.

    Raises:
        RuntimeError: When `start_cmd` is used while `FORBID_RESTART` is set, or when
            `start_cmd` is used without `mark` and without locking `Resources.CLUSTER`.
        AssertionError: When `lock_resources` or `use_resources` is a string, or when
            there is no cached cluster object at the point it is expected.
    """
    # pylint: disable=too-many-statements,too-many-branches
    # a plain string is iterable too, it would be silently treated as sequence of characters
    assert not isinstance(
        lock_resources, str), "`lock_resources` must be sequence of strings"
    assert not isinstance(
        use_resources, str), "`use_resources` must be sequence of strings"

    if configuration.DEV_CLUSTER_RUNNING:
        if start_cmd:
            LOGGER.warning(
                f"Ignoring the '{start_cmd}' cluster start command as "
                "'DEV_CLUSTER_RUNNING' is set.")

        # check if the development cluster instance is ready by now so we don't need to obtain
        # cluster lock when it is not necessary
        if not self._is_dev_cluster_ready():
            with locking.FileLockIfXdist(self.cm.cluster_lock):
                self._setup_dev_cluster()

    if configuration.FORBID_RESTART and start_cmd:
        raise RuntimeError(
            "Cannot use custom start command when 'FORBID_RESTART' is set."
        )

    if start_cmd:
        # NOTE(review): the error message talks about "singleton"; presumably a test that
        # locks `Resources.CLUSTER` is what is meant by singleton here - confirm
        if not (mark or (Resources.CLUSTER in lock_resources)):
            raise RuntimeError(
                "Custom start command can be used only together with singleton or `mark`."
            )
        # always clean after test(s) that started cluster with custom configuration
        cleanup = True

    # Add `Resources.CLUSTER` to `use_resources`. Filter out `lock_resources` from the
    # list of `use_resources`.
    use_resources = list(
        set(use_resources).union({Resources.CLUSTER}) - set(lock_resources))

    # state shared with the helper methods called in the loop below
    cget_status = ClusterGetStatus(
        mark=mark,
        lock_resources=lock_resources,
        use_resources=use_resources,
        cleanup=cleanup,
        start_cmd=start_cmd,
        current_test=os.environ.get("PYTEST_CURRENT_TEST") or "",
    )
    marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

    self.cm._log(f"want to run test '{cget_status.current_test}'")

    # iterate until it is possible to start the test
    while True:
        # restart was requested in previous iteration; it is performed here, i.e. before
        # the cluster lock is taken
        if cget_status.restart_ready:
            self._restart(start_cmd=start_cmd)

        # sleep for randomized amount of time before every iteration except the first one
        if not cget_status.first_iteration:
            xdist_sleep(random.uniform(0.6, 1.2) * cget_status.sleep_delay)

        # nothing time consuming can go under this lock as all other workers will need to wait
        with locking.FileLockIfXdist(self.cm.cluster_lock):
            if self._is_already_running(cget_status):
                if not self.cm.cache.cluster_obj:
                    raise AssertionError(
                        "`cluster_obj` not available, that cannot happen")
                return self.cm.cache.cluster_obj

            # needs to be set here, before the first `continue`
            cget_status.first_iteration = False
            self.cm._cluster_instance_num = -1

            # try all existing cluster instances
            for instance_num in range(self.cm.num_of_instances):
                # there's only one cluster instance when `DEV_CLUSTER_RUNNING` is set
                if configuration.DEV_CLUSTER_RUNNING and instance_num != 0:
                    continue

                # if instance to run the test on was already decided, skip all other instances
                # pylint: disable=consider-using-in
                if (cget_status.selected_instance != -1
                        and instance_num != cget_status.selected_instance):
                    continue

                cget_status.instance_num = instance_num
                cget_status.instance_dir = (
                    self.cm.pytest_tmp_dir /
                    f"{CLUSTER_DIR_TEMPLATE}{instance_num}")
                cget_status.instance_dir.mkdir(exist_ok=True)

                # cleanup cluster instance where attempt to start cluster failed repeatedly
                if (cget_status.instance_dir / CLUSTER_DEAD_FILE).exists():
                    self._cleanup_dead_clusters(cget_status)
                    continue

                # cluster restart planned or in progress, so no new tests can start
                if self._restarted_by_other_worker(cget_status):
                    cget_status.sleep_delay = 5
                    continue

                # are there tests already running on this cluster instance?
                cget_status.started_tests_sfiles = list(
                    cget_status.instance_dir.glob(
                        f"{TEST_RUNNING_GLOB}_*"))

                # "marked tests" = group of tests marked with a specific mark.
                # While these tests are running, no unmarked test can start.
                cget_status.marked_starting_sfiles = list(
                    cget_status.instance_dir.glob(
                        f"{TEST_MARK_STARTING_GLOB}_*"))
                cget_status.marked_running_sfiles = list(
                    cget_status.instance_dir.glob(
                        f"{TEST_CURR_MARK_GLOB}_*"))

                # if marked tests are already running, update their status
                self._update_marked_tests(
                    marked_tests_cache=marked_tests_cache,
                    cget_status=cget_status)

                # test has mark
                if mark:
                    # select this instance for running marked tests if possible
                    if not self._marked_select_instance(cget_status):
                        cget_status.sleep_delay = 2
                        continue

                    # check if we need to wait until unmarked tests are finished
                    if (not cget_status.marked_running_sfiles
                            and cget_status.started_tests_sfiles):
                        cget_status.sleep_delay = 10
                        continue

                    self.cm._log(
                        f"c{instance_num}: in marked tests branch, "
                        f"I have required mark '{mark}'")

                # no unmarked test can run while marked tests are starting or running
                elif cget_status.marked_running_sfiles or cget_status.marked_starting_sfiles:
                    self.cm._log(
                        f"c{instance_num}: marked tests starting or running, "
                        f"I don't have mark")
                    cget_status.sleep_delay = 2
                    continue

                # check availability of the required resources
                if not self._are_resources_available(cget_status):
                    cget_status.sleep_delay = 5
                    continue

                # if restart is needed, indicate that the cluster will be restarted
                # (after all currently running tests are finished)
                if not self._init_restart(cget_status):
                    continue

                # we've found suitable cluster instance
                cget_status.selected_instance = instance_num
                self.cm._cluster_instance_num = instance_num
                self.cm._log(
                    f"c{instance_num}: can run test '{cget_status.current_test}'"
                )
                # set environment variables that are needed when restarting the cluster
                # and running tests
                cluster_nodes.set_cluster_env(instance_num)

                # if needed, finish restart related actions
                if not self._finish_restart(cget_status):
                    continue

                # from this point on, all conditions needed to start the test are met
                break
            else:
                # if the test cannot start on any instance, return to top-level loop
                continue

            self._create_test_status_files(cget_status)

            # Check if it is necessary to reload data. This still needs to happen under
            # global lock.
            state_dir = cluster_nodes.get_cluster_env().state_dir
            self._reload_cluster_obj(state_dir=state_dir)

            # cluster is ready, we can start the test
            break

    cluster_obj = self.cm.cache.cluster_obj
    if not cluster_obj:
        raise AssertionError(
            "`cluster_obj` not available, that cannot happen")
    # `instance_num` is the loop variable of the `for` loop above; the loop was left
    # through `break`, so it holds the number of the selected cluster instance
    cluster_obj.cluster_id = instance_num
    cluster_obj._cluster_manager = self.cm  # type: ignore

    return cluster_obj
def get(  # noqa: C901
    self,
    singleton: bool = False,
    mark: str = "",
    lock_resources: UnpackableSequence = (),
    use_resources: UnpackableSequence = (),
    cleanup: bool = False,
    start_cmd: str = "",
) -> clusterlib.ClusterLib:
    """Return the `clusterlib.ClusterLib` instance once we can start the test.

    It checks current conditions and waits if the conditions don't allow to start the test
    right away.

    The method loops until a cluster instance is found on which the test can run. Every
    iteration takes the global cluster lock, walks over the cluster instances and skips
    an instance (possibly adjusting the sleep delay used before the next iteration)
    whenever one of the checks fails. Coordination with other workers is done through
    status files created in and globbed from the cluster instance directories.

    Args:
        singleton: When true, the test can start only when no other test is running on
            the cluster instance, and a singleton status file is created that keeps
            other tests from starting there.
        mark: Name of the mark shared by a group of tests. Tests with the same mark are
            kept on a single cluster instance and unmarked tests cannot start while
            marked tests are starting or running.
        lock_resources: Names of resources the test wants to lock. A "locked" status
            file is created for each of them.
        use_resources: Names of resources the test wants to use. An "in use" status
            file is created for each of them. Names that are also in `lock_resources`
            are filtered out.
        cleanup: When true, a status file requesting cluster restart is created
            ("restart after mark" for marked tests, "restart needed" otherwise).
            Forced to `True` when `start_cmd` is set.
        start_cmd: Custom cluster start command, passed to `_restart`. Allowed only
            together with `singleton` or `mark`. Ignored (with a warning) when
            `DEV_CLUSTER_RUNNING` is set.

    Returns:
        The cluster object: the one returned by `_reuse_dev_cluster` when
        `DEV_CLUSTER_RUNNING` is set, otherwise the one cached on the cluster manager,
        or a newly created one if nothing is cached.

    Raises:
        RuntimeError: When `start_cmd` is used while `FORBID_RESTART` is set, or when
            every cluster instance has the "dead" status file.
        AssertionError: When `start_cmd` is used without `singleton` or `mark`.
    """
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    # don't start new cluster if it was already started outside of test framework
    if DEV_CLUSTER_RUNNING:
        if start_cmd:
            LOGGER.warning(
                f"Ignoring the '{start_cmd}' cluster start command as "
                "'DEV_CLUSTER_RUNNING' is set.")
        return self._reuse_dev_cluster()

    if FORBID_RESTART and start_cmd:
        raise RuntimeError(
            "Cannot use custom start command when 'FORBID_RESTART' is set."
        )

    # state that is carried over between iterations of the `while` loop below
    selected_instance = -1  # instance this worker is bound to, -1 = not decided yet
    restart_here = False  # this worker is the one that restarts the cluster
    restart_ready = False  # `_restart` is to be called at the top of the next iteration
    first_iteration = True
    sleep_delay = 1
    marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

    if start_cmd:
        if not (singleton or mark):
            raise AssertionError(
                "Custom start command can be used only together with `singleton` or `mark`"
            )
        # always clean after test(s) that started cluster with custom configuration
        cleanup = True

    # iterate until it is possible to start the test
    while True:
        # the restart is performed here, i.e. before the cluster lock is taken
        if restart_ready:
            self._restart(start_cmd=start_cmd)

        # sleep for randomized amount of time before every iteration except the first one
        if not first_iteration:
            helpers.xdist_sleep(random.random() * sleep_delay)

        # nothing time consuming can go under this lock as it will block all other workers
        with helpers.FileLockIfXdist(self.cm.cluster_lock):
            # "test running" status files of this worker, on any cluster instance
            test_on_worker = list(
                self.cm.lock_dir.glob(
                    f"{CLUSTER_DIR_TEMPLATE}*/{TEST_RUNNING_GLOB}_{self.cm.worker_id}"
                ))

            # test is already running, nothing to set up
            if (first_iteration and test_on_worker
                    and self.cm._cluster_instance_num != -1
                    and self.cm.cache.cluster_obj):
                self.cm._log(f"{test_on_worker[0]} already exists")
                return self.cm.cache.cluster_obj

            # needs to be set here, before the first `continue`
            first_iteration = False
            self.cm._cluster_instance_num = -1

            # try all existing cluster instances
            for instance_num in range(self.cm.num_of_instances):
                # if instance to run the test on was already decided, skip all other instances
                # pylint: disable=consider-using-in
                if selected_instance != -1 and instance_num != selected_instance:
                    continue

                instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                instance_dir.mkdir(exist_ok=True)

                # if the selected instance failed to start, move on to other instance
                if (instance_dir / CLUSTER_DEAD_FILE).exists():
                    # forget everything that was decided for this instance
                    selected_instance = -1
                    restart_here = False
                    restart_ready = False
                    # remove status files that are checked by other workers
                    for sf in (
                            *instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"),
                            *instance_dir.glob(
                                f"{TEST_MARK_STARTING_GLOB}_*"),
                    ):
                        os.remove(sf)

                    # give up when there is no cluster instance left to try
                    dead_clusters = list(
                        self.cm.lock_dir.glob(
                            f"{CLUSTER_DIR_TEMPLATE}*/{CLUSTER_DEAD_FILE}")
                    )
                    if len(dead_clusters) == self.cm.num_of_instances:
                        raise RuntimeError(
                            "All clusters are dead, cannot run.")
                    continue

                # singleton test is running, so no other test can be started
                if (instance_dir / TEST_SINGLETON_FILE).exists():
                    self.cm._log(
                        f"c{instance_num}: singleton test in progress, cannot run"
                    )
                    sleep_delay = 5
                    continue

                restart_in_progress = list(
                    instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"))
                # cluster restart planned (by other worker), no new tests can start
                if not restart_here and restart_in_progress:
                    # no log message here, it would be too many of them
                    sleep_delay = 5
                    continue

                # are there tests already running on this cluster instance?
                started_tests = list(
                    instance_dir.glob(f"{TEST_RUNNING_GLOB}_*"))

                # "marked tests" = group of tests marked with a specific mark.
                # While these tests are running, no unmarked test can start.
                marked_starting = list(
                    instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*"))
                marked_running = list(
                    instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"))

                if mark:
                    # "my" = status files that carry the same mark as this test
                    marked_running_my = (
                        instance_dir / f"{TEST_CURR_MARK_GLOB}_{mark}").exists()
                    marked_starting_my = list(
                        instance_dir.glob(
                            f"{TEST_MARK_STARTING_GLOB}_{mark}_*"))

                    marked_running_my_anywhere = list(
                        self.cm.lock_dir.glob(
                            f"{CLUSTER_DIR_TEMPLATE}*/{TEST_CURR_MARK_GLOB}_{mark}"
                        ))
                    # check if tests with my mark are running on some other cluster instance
                    if not marked_running_my and marked_running_my_anywhere:
                        self.cm._log(
                            f"c{instance_num}: tests marked with my mark '{mark}' "
                            "already running on other cluster instance, cannot run"
                        )
                        continue

                    marked_starting_my_anywhere = list(
                        self.cm.lock_dir.glob(
                            f"{CLUSTER_DIR_TEMPLATE}*/{TEST_MARK_STARTING_GLOB}_{mark}_*"
                        ))
                    # check if tests with my mark are starting on some other cluster instance
                    if not marked_starting_my and marked_starting_my_anywhere:
                        self.cm._log(
                            f"c{instance_num}: tests marked with my mark '{mark}' starting "
                            "on other cluster instance, cannot run")
                        continue

                    # check if this test has the same mark as currently running marked tests
                    if marked_running_my or marked_starting_my:
                        # lock to this cluster instance
                        selected_instance = instance_num
                    elif marked_running or marked_starting:
                        self.cm._log(
                            f"c{instance_num}: tests marked with other mark starting "
                            f"or running, I have different mark '{mark}'")
                        continue

                    # check if needs to wait until marked tests can run
                    if marked_starting_my and started_tests:
                        self.cm._log(
                            f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                        )
                        sleep_delay = 2
                        continue

                # no unmarked test can run while marked tests are starting or running
                elif marked_running or marked_starting:
                    self.cm._log(
                        f"c{instance_num}: marked tests starting or running, "
                        f"I don't have mark")
                    sleep_delay = 5
                    continue

                # is this the first marked test that wants to run?
                initial_marked_test = bool(mark and not marked_running)

                # indicate that it is planned to start marked tests as soon as
                # all currently running tests are finished or the cluster is restarted
                if initial_marked_test:
                    # lock to this cluster instance
                    selected_instance = instance_num
                    mark_starting_file = (
                        instance_dir /
                        f"{TEST_MARK_STARTING_GLOB}_{mark}_{self.cm.worker_id}"
                    )
                    if not mark_starting_file.exists():
                        open(
                            mark_starting_file,
                            "a",
                        ).close()
                    if started_tests:
                        self.cm._log(
                            f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                        )
                        sleep_delay = 3
                        continue

                # get marked tests status
                marked_tests_status = self._get_marked_tests_status(
                    cache=marked_tests_cache, instance_num=instance_num)

                # marked tests are already running
                if marked_running:
                    active_mark_file = marked_running[0].name

                    # update marked tests status
                    self._update_marked_tests(
                        marked_tests_status=marked_tests_status,
                        active_mark_name=active_mark_file,
                        started_tests=started_tests,
                        instance_num=instance_num,
                    )

                    self.cm._log(
                        f"c{instance_num}: in marked tests branch, "
                        f"I have required mark '{mark}'")

                # reset counter of cycles with no marked test running
                # NOTE(review): nesting of this statement relative to the
                # `if marked_running:` block above should be confirmed
                marked_tests_status.no_marked_tests_iter = 0

                # this test is a singleton - no other test can run while this one is running
                if singleton and started_tests:
                    self.cm._log(
                        f"c{instance_num}: tests are running, cannot start singleton"
                    )
                    sleep_delay = 5
                    continue

                # this test wants to lock some resources, check if these are not
                # locked or in use
                if lock_resources:
                    res_usable = self._are_resources_usable(
                        resources=lock_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if not res_usable:
                        sleep_delay = 5
                        continue

                # filter out `lock_resources` from the list of `use_resources`
                if use_resources and lock_resources:
                    use_resources = list(
                        set(use_resources) - set(lock_resources))

                # this test wants to use some resources, check if these are not locked
                if use_resources:
                    res_locked = self._are_resources_locked(
                        resources=use_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if res_locked:
                        sleep_delay = 5
                        continue

                # indicate that the cluster will be restarted
                new_cmd_restart = bool(start_cmd
                                       and (initial_marked_test or singleton))
                if not restart_here and (
                        new_cmd_restart
                        or self._is_restart_needed(instance_num)):
                    if started_tests:
                        self.cm._log(
                            f"c{instance_num}: tests are running, cannot restart"
                        )
                        continue

                    # Cluster restart will be performed by this worker.
                    # By setting `restart_here`, we make sure this worker continue on
                    # this cluster instance after restart. It is important because
                    # the `start_cmd` used for starting the cluster might be specific
                    # to the test.
                    restart_here = True
                    self.cm._log(
                        f"c{instance_num}: setting to restart cluster")
                    selected_instance = instance_num
                    restart_in_progress_file = (
                        instance_dir /
                        f"{RESTART_IN_PROGRESS_GLOB}_{self.cm.worker_id}")
                    if not restart_in_progress_file.exists():
                        open(restart_in_progress_file, "a").close()

                # we've found suitable cluster instance
                selected_instance = instance_num
                self.cm._cluster_instance_num = instance_num
                cluster_nodes.set_cluster_env(instance_num)

                if restart_here:
                    if restart_ready:
                        # The cluster was already restarted if we are here and
                        # `restart_ready` is still True.
                        restart_ready = False

                        # Remove status files that are no longer valid after restart.
                        for f in instance_dir.glob(
                                f"{RESTART_IN_PROGRESS_GLOB}_*"):
                            os.remove(f)
                        for f in instance_dir.glob(
                                f"{RESTART_NEEDED_GLOB}_*"):
                            os.remove(f)
                    else:
                        self.cm._log(f"c{instance_num}: calling restart")
                        # the actual `_restart` function will be called outside
                        # of global lock
                        restart_ready = True
                        continue

                # from this point on, all conditions needed to start the test are met

                # NOTE(review): the status files below are created in `self.cm.instance_dir`,
                # not in the local `instance_dir`; presumably both point to the same
                # directory once `_cluster_instance_num` is set - confirm

                # this test is a singleton
                if singleton:
                    self.cm._log(f"c{instance_num}: starting singleton")
                    open(self.cm.instance_dir / TEST_SINGLETON_FILE,
                         "a").close()

                # this test is a first marked test
                if initial_marked_test:
                    self.cm._log(
                        f"c{instance_num}: starting '{mark}' tests")
                    open(
                        self.cm.instance_dir /
                        f"{TEST_CURR_MARK_GLOB}_{mark}", "a").close()
                    # marked tests are running now, "starting" status files are obsolete
                    for sf in marked_starting:
                        os.remove(sf)

                # create status file for each in-use resource
                _ = [
                    open(
                        self.cm.instance_dir /
                        f"{RESOURCE_IN_USE_GLOB}_{r}_{self.cm.worker_id}",
                        "a",
                    ).close() for r in use_resources
                ]

                # create status file for each locked resource
                _ = [
                    open(
                        self.cm.instance_dir /
                        f"{RESOURCE_LOCKED_GLOB}_{r}_{self.cm.worker_id}",
                        "a",
                    ).close() for r in lock_resources
                ]

                # cleanup = cluster restart after test (group of tests) is finished
                if cleanup:
                    # cleanup after group of test that are marked with a marker
                    if mark:
                        self.cm._log(f"c{instance_num}: cleanup and mark")
                        open(
                            self.cm.instance_dir /
                            f"{RESTART_AFTER_MARK_GLOB}_{self.cm.worker_id}",
                            "a",
                        ).close()
                    # cleanup after single test (e.g. singleton)
                    else:
                        self.cm._log(
                            f"c{instance_num}: cleanup and not mark")
                        open(
                            self.cm.instance_dir /
                            f"{RESTART_NEEDED_GLOB}_{self.cm.worker_id}",
                            "a",
                        ).close()

                break
            else:
                # if the test cannot start on any instance, return to top-level loop
                continue

            # indicate that a test is running on this worker
            test_running_file = (
                self.cm.instance_dir /
                f"{TEST_RUNNING_GLOB}_{self.cm.worker_id}")
            self.cm._log(
                f"c{self.cm.cluster_instance_num}: creating {test_running_file}"
            )
            open(test_running_file, "a").close()

            # check if it is necessary to reload data
            state_dir = cluster_nodes.get_cluster_env().state_dir
            self._reload_cluster_obj(state_dir=state_dir)

            cluster_obj = self.cm.cache.cluster_obj
            if not cluster_obj:
                cluster_obj = cluster_nodes.get_cluster_type(
                ).get_cluster_obj()

            # `cluster_obj` is ready, we can start the test
            break

    return cluster_obj