@classmethod
def conn(cls) -> psycopg2.extensions.connection:
    instance_num = cluster_nodes.get_cluster_env().instance_num
    conn = cls.conn_cache.get(instance_num)
    # reconnect if there is no cached connection, or the cached one is closed or
    # broken (psycopg2 reports `closed` as nonzero in both cases)
    if conn is None or conn.closed != 0:
        conn = psycopg2.connect(f"dbname={DBSYNC_DB}{instance_num}")
        cls.conn_cache[instance_num] = conn
    return conn
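# A minimal usage sketch (names assumed): the cache keys connections by cluster
# instance, so repeated queries against the same db-sync database reuse a single
# connection. `DBSync` stands in for the class that owns `conn` and `conn_cache`,
# and the SQL is hypothetical.
def query_db_sync(sql: str) -> list:
    with DBSync.conn().cursor() as cur:
        cur.execute(sql)
        return cur.fetchall()

# e.g. query_db_sync("SELECT block_no FROM block ORDER BY id DESC LIMIT 1")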
def _reuse_dev_cluster(self) -> clusterlib.ClusterLib:
    """Reuse cluster that was already started outside of test framework."""
    instance_num = 0
    self.cm._cluster_instance_num = instance_num
    cluster_nodes.set_cluster_env(instance_num)
    state_dir = cluster_nodes.get_cluster_env().state_dir

    # make sure instance dir exists
    instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
    instance_dir.mkdir(exist_ok=True, parents=True)

    cluster_obj = self.cm.cache.cluster_obj
    if not cluster_obj:
        cluster_obj = cluster_nodes.get_cluster_type().get_cluster_obj()

    # setup faucet addresses
    if not (state_dir / cluster_nodes.ADDRS_DATA).exists():
        tmp_path = state_dir / "addrs_data"
        tmp_path.mkdir(exist_ok=True, parents=True)
        cluster_nodes.setup_test_addrs(cluster_obj, tmp_path)

    # check if it is necessary to reload data
    self._reload_cluster_obj(state_dir=state_dir)

    return cluster_obj
def add_ignore_rule(files_glob: str, regex: str) -> None:
    """Add ignore rule for expected errors."""
    with helpers.FileLockIfXdist(f"{helpers.get_basetemp()}/ignore_rules.lock"):
        state_dir = cluster_nodes.get_cluster_env().state_dir
        rules_file = state_dir / ERRORS_RULES_FILE_NAME
        with open(rules_file, "a") as outfile:
            outfile.write(f"{files_glob};;{regex}\n")
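# The rules file stores one rule per line as "<files_glob>;;<regex>". A sketch of
# how such a line could be split back into its two fields when the rules are read
# (`parse_ignore_rule` is a hypothetical helper; assumes `from typing import Tuple`):
def parse_ignore_rule(line: str) -> Tuple[str, str]:
    files_glob, regex = line.rstrip("\n").split(";;", maxsplit=1)
    return files_glob, regex

# parse_ignore_rule("*.stdout;;connection refused") -> ("*.stdout", "connection refused")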
def _is_dev_cluster_ready(self) -> bool:
    """Check if development cluster instance is ready to be used."""
    work_dir = cluster_nodes.get_cluster_env().work_dir
    state_dir = work_dir / f"{cluster_nodes.STATE_CLUSTER}0"
    return (state_dir / cluster_nodes.ADDRS_DATA).exists()
def stop_all_clusters(self) -> None:
    """Stop all cluster instances."""
    self._log("called `stop_all_clusters`")
    for instance_num in range(self.num_of_instances):
        instance_dir = self.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
        if (
            not (instance_dir / CLUSTER_RUNNING_FILE).exists()
            or (instance_dir / CLUSTER_STOPPED_FILE).exists()
        ):
            self._log(f"cluster instance {instance_num} not running")
            continue

        startup_files = cluster_nodes.get_cluster_type().cluster_scripts.prepare_scripts_files(
            destdir=self._create_startup_files_dir(instance_num),
            instance_num=instance_num,
        )
        cluster_nodes.set_cluster_env(instance_num)
        self._log(
            f"stopping cluster instance {instance_num} with `{startup_files.stop_script}`"
        )

        state_dir = cluster_nodes.get_cluster_env().state_dir

        try:
            cluster_nodes.stop_cluster(cmd=str(startup_files.stop_script))
        except Exception as exc:
            LOGGER.error(f"While stopping cluster: {exc}")

        cli_coverage.save_start_script_coverage(
            log_file=state_dir / CLUSTER_START_CMDS_LOG,
            pytest_config=self.pytest_config,
        )
        cluster_nodes.save_cluster_artifacts(artifacts_dir=self.pytest_tmp_dir, clean=True)

        open(instance_dir / CLUSTER_STOPPED_FILE, "a").close()
        self._log(f"stopped cluster instance {instance_num}")
def cleanup(
    cluster_obj: clusterlib.ClusterLib,
    location: FileType,
) -> None:
    """Cleanup a testnet with the help of testing artifacts."""
    cluster_env = cluster_nodes.get_cluster_env()
    faucet_addr_file = cluster_env.state_dir / "shelley" / "faucet.addr"
    faucet_payment = create_addr_record(faucet_addr_file)
    files_found = group_files(find_files(location))

    def _run(files: List[Path]) -> None:
        for fpath in files:
            # add random sleep for < 1s to prevent
            # "Network.Socket.connect: <socket: 11>: resource exhausted"
            time.sleep(random.random())

            f_name = fpath.name
            if f_name == "faucet.addr":
                continue
            if f_name.endswith("_stake.addr"):
                payment_addr = fpath.parent / f_name.replace("_stake.addr", ".addr")
                try:
                    payment = create_addr_record(payment_addr)
                    stake = create_addr_record(fpath)
                except ValueError as exc:
                    LOGGER.warning(f"Skipping '{fpath}':\n'{exc}'")
                    continue

                pool_user = clusterlib.PoolUser(payment=payment, stake=stake)
                deregister_stake_addr(
                    cluster_obj=cluster_obj, pool_user=pool_user, name_template=f_name
                )
                withdraw_reward(
                    cluster_obj=cluster_obj,
                    stake_addr_record=stake,
                    dst_addr_record=payment,
                    name_template=f_name,
                )
            else:
                try:
                    payment = create_addr_record(fpath)
                except ValueError as exc:
                    LOGGER.warning(f"Skipping '{fpath}':\n'{exc}'")
                    continue
                return_funds_to_faucet(
                    cluster_obj=cluster_obj,
                    src_addr=payment,
                    faucet_addr=faucet_payment.address,
                    tx_name=f_name,
                )

    # run cleanup in parallel
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(_run, f) for f in files_found]
        concurrent.futures.wait(futures)
def add_ignore_rule(files_glob: str, regex: str, ignore_file_id: str) -> None:
    """Add ignore rule for expected errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (
        temptools.get_basetemp() / f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock"
    )

    with locking.FileLockIfXdist(lock_file), open(rules_file, "a", encoding="utf-8") as outfile:
        outfile.write(f"{files_glob};;{regex}\n")
@contextlib.contextmanager
def expect_errors(regex_pairs: List[Tuple[str, str]]) -> Iterator[None]:
    """Make sure expected errors are present in logs.

    Args:
        regex_pairs: [(glob, regex)] - list of regexes that need to be present in files
            described by the glob
    """
    state_dir = cluster_nodes.get_cluster_env().state_dir

    glob_list = []
    for files_glob, regex in regex_pairs:
        add_ignore_rule(files_glob, regex)  # don't report errors that are expected
        glob_list.append(files_glob)
    # resolve the globs
    _expanded_paths = [list(state_dir.glob(glob_item)) for glob_item in glob_list]
    # flatten the list
    expanded_paths = list(itertools.chain.from_iterable(_expanded_paths))
    # record each end-of-file as a starting offset for searching the log file
    seek_offsets = {str(p): helpers.get_eof_offset(p) for p in expanded_paths}

    timestamp = time.time()

    yield

    for files_glob, regex in regex_pairs:
        regex_comp = re.compile(regex)
        # get list of records (file names and offsets) for given glob
        matching_files = fnmatch.filter(seek_offsets, f"{state_dir}/{files_glob}")
        for logfile in matching_files:
            # skip if the log file is a rotated log, it will be handled by `get_rotated_logs`
            if ROTATED_RE.match(logfile):
                continue

            # search for the expected error
            seek = seek_offsets.get(logfile) or 0
            line_found = False
            for logfile_rec in get_rotated_logs(
                logfile=Path(logfile), seek=seek, timestamp=timestamp
            ):
                with open(logfile_rec.logfile) as infile:
                    # each record carries the offset for its own file
                    infile.seek(logfile_rec.seek)
                    for line in infile:
                        if regex_comp.search(line):
                            line_found = True
                            break
                if line_found:
                    break
            else:
                raise AssertionError(f"No line matching `{regex}` found in '{logfile}'.")
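# A usage sketch, assuming `expect_errors` is applied as a context manager in a test
# (the test name, log glob and regex below are hypothetical):
def test_produces_expected_error():
    with expect_errors([("pool1.stdout", r"MuxError")]):
        ...  # action that is expected to emit the matching line into the node's log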
def test_available_metrics(
    self,
    wait_epochs,
):
    """Test that available EKG metrics match the expected schema."""
    # pylint: disable=unused-argument
    ekg_port = (
        cluster_nodes.get_cluster_type()
        .cluster_scripts.get_instance_ports(cluster_nodes.get_cluster_env().instance_num)
        .ekg_pool1
    )

    response = get_ekg_metrics(ekg_port)
    model_ekg.Model.validate(response.json())
def clean_ignore_rules(ignore_file_id: str) -> None:
    """Cleanup relevant ignore rules file.

    Delete ignore file identified by `ignore_file_id` when it is no longer valid.
    """
    cluster_env = cluster_nodes.get_cluster_env()
    rules_file = cluster_env.state_dir / f"{ERRORS_IGNORE_FILE_NAME}_{ignore_file_id}"
    lock_file = (
        temptools.get_basetemp() / f"{ERRORS_IGNORE_FILE_NAME}_{cluster_env.instance_num}.lock"
    )

    with locking.FileLockIfXdist(lock_file):
        rules_file.unlink(missing_ok=True)
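# A sketch of how the pair of functions could be used together (the wrapper name
# and rule are hypothetical): rules registered under one `ignore_file_id` live only
# as long as the code that added them.
def with_ignored_error(ignore_file_id: str) -> None:
    add_ignore_rule("*.stdout", r"expected benign error", ignore_file_id)
    try:
        ...  # run the scenario that triggers the benign error
    finally:
        clean_ignore_rules(ignore_file_id)  # the rules are no longer valid here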
def testenv_setup_teardown(
    tmp_path_factory: TempdirFactory, worker_id: str, request: FixtureRequest
) -> Generator[None, None, None]:
    pytest_root_tmp = temptools.get_pytest_root_tmp(tmp_path_factory)

    with locking.FileLockIfXdist(f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save environment info for Allure
        if not list(pytest_root_tmp.glob(".started_session_*")):
            _save_env_for_allure(request.config)

        helpers.touch(pytest_root_tmp / f".started_session_{worker_id}")

    yield

    with locking.FileLockIfXdist(f"{pytest_root_tmp}/{cluster_management.CLUSTER_LOCK}"):
        # save CLI coverage to dir specified by `--cli-coverage-dir`
        cluster_manager_obj = cluster_management.ClusterManager(
            tmp_path_factory=tmp_path_factory, worker_id=worker_id, pytest_config=request.config
        )
        cluster_manager_obj.save_worker_cli_coverage()

        # perform cleanup if this is the last running pytest worker
        (pytest_root_tmp / f".started_session_{worker_id}").unlink()
        if not list(pytest_root_tmp.glob(".started_session_*")):
            # perform testnet cleanup
            _testnet_cleanup(pytest_root_tmp=pytest_root_tmp)

            if configuration.DEV_CLUSTER_RUNNING:
                # save cluster artifacts
                artifacts_base_dir = request.config.getoption("--artifacts-base-dir")
                if artifacts_base_dir:
                    state_dir = cluster_nodes.get_cluster_env().state_dir
                    artifacts.save_cluster_artifacts(
                        save_dir=pytest_root_tmp, state_dir=state_dir
                    )
            else:
                # stop all cluster instances, save artifacts
                _stop_all_cluster_instances(
                    tmp_path_factory=tmp_path_factory,
                    worker_id=worker_id,
                    pytest_config=request.config,
                )

            # copy collected artifacts to dir specified by `--artifacts-base-dir`
            artifacts.copy_artifacts(pytest_tmp_dir=pytest_root_tmp, pytest_config=request.config)
def test_available_metrics(
    self,
    wait_epochs,
):
    """Test that list of available metrics == list of expected metrics."""
    # pylint: disable=unused-argument
    prometheus_port = (
        cluster_nodes.get_cluster_type()
        .cluster_scripts.get_instance_ports(cluster_nodes.get_cluster_env().instance_num)
        .prometheus_pool1
    )

    response = get_prometheus_metrics(prometheus_port)

    metrics = response.text.strip().split("\n")
    metrics_keys = sorted(m.split(" ")[0] for m in metrics)
    assert metrics_keys == EXPECTED_METRICS, "Metrics differ"
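# Illustration of the key extraction above: each line of the Prometheus exposition
# format is "<name> <value>", so the field before the first space is the metric
# name. The sample payload is made up for the example.
sample_text = "cardano_node_metrics_epoch_int 5\ncardano_node_metrics_blockNum_int 1234"
sample_keys = sorted(line.split(" ")[0] for line in sample_text.strip().split("\n"))
assert sample_keys == [
    "cardano_node_metrics_blockNum_int",
    "cardano_node_metrics_epoch_int",
]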
def _setup_dev_cluster(self) -> None:
    """Set up cluster instance that was already started outside of test framework."""
    work_dir = cluster_nodes.get_cluster_env().work_dir
    state_dir = work_dir / f"{cluster_nodes.STATE_CLUSTER}0"
    if (state_dir / cluster_nodes.ADDRS_DATA).exists():
        return

    self.cm._log("c0: setting up dev cluster")

    # Create "addrs_data" directly in the cluster state dir, so it can be reused
    # (in normal non-`DEV_CLUSTER_RUNNING` setup we want "addrs_data" stored among
    # tests artifacts, so it can be used during cleanup etc.).
    tmp_path = state_dir / "addrs_data"
    tmp_path.mkdir(exist_ok=True, parents=True)
    cluster_obj = cluster_nodes.get_cluster_type().get_cluster_obj()
    cluster_nodes.setup_test_addrs(cluster_obj=cluster_obj, destination_dir=tmp_path)
def stop_all_clusters(self) -> None:
    """Stop all cluster instances."""
    self._log("called `stop_all_clusters`")

    # don't stop cluster if it was started outside of test framework
    if configuration.DEV_CLUSTER_RUNNING:
        LOGGER.warning("Ignoring request to stop clusters as 'DEV_CLUSTER_RUNNING' is set.")
        return

    work_dir = cluster_nodes.get_cluster_env().work_dir

    for instance_num in range(self.num_of_instances):
        instance_dir = self.pytest_tmp_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
        if (
            not (instance_dir / CLUSTER_RUNNING_FILE).exists()
            or (instance_dir / CLUSTER_STOPPED_FILE).exists()
        ):
            self._log(f"c{instance_num}: cluster instance not running")
            continue

        state_dir = work_dir / f"{cluster_nodes.STATE_CLUSTER}{instance_num}"

        stop_script = state_dir / cluster_scripts.STOP_SCRIPT
        if not stop_script.exists():
            self._log(f"c{instance_num}: stop script doesn't exist!")
            continue

        self._log(f"c{instance_num}: stopping cluster instance with `{stop_script}`")
        try:
            helpers.run_command(str(stop_script))
        except Exception as err:
            self._log(f"c{instance_num}: failed to stop cluster:\n{err}")

        artifacts.save_start_script_coverage(
            log_file=state_dir / CLUSTER_START_CMDS_LOG,
            pytest_config=self.pytest_config,
        )
        artifacts.save_cluster_artifacts(save_dir=self.pytest_tmp_dir, state_dir=state_dir)

        shutil.rmtree(state_dir, ignore_errors=True)

        helpers.touch(instance_dir / CLUSTER_STOPPED_FILE)
        self._log(f"c{instance_num}: stopped cluster instance")
def search_cluster_artifacts() -> List[Tuple[Path, str]]:
    """Search cluster artifacts for errors."""
    cluster_env = cluster_nodes.get_cluster_env()
    lock_file = temptools.get_basetemp() / f"search_artifacts_{cluster_env.instance_num}.lock"

    with locking.FileLockIfXdist(lock_file):
        ignore_rules = _get_ignore_rules(cluster_env=cluster_env)

        errors = []
        for logfile in cluster_env.state_dir.glob("*.std*"):
            # skip if the log file is status file or rotated log
            if logfile.name.endswith(".offset") or ROTATED_RE.match(logfile.name):
                continue

            # read seek offset (from where to start searching) and timestamp of last search
            offset_file = logfile.parent / f".{logfile.name}.offset"
            if offset_file.exists():
                seek = _get_seek(offset_file)
                timestamp = os.path.getmtime(offset_file)
            else:
                seek = 0
                timestamp = 0.0

            errors_ignored = _get_ignore_regex(
                ignore_rules=ignore_rules, regexes=ERRORS_IGNORED, logfile=logfile
            )
            errors_ignored_re = re.compile(errors_ignored)

            # record offset for the "live" log file
            with open(offset_file, "w", encoding="utf-8") as outfile:
                outfile.write(str(helpers.get_eof_offset(logfile)))

            for logfile_rec in _get_rotated_logs(logfile=logfile, seek=seek, timestamp=timestamp):
                with open(logfile_rec.logfile, encoding="utf-8") as infile:
                    # each record carries the offset for its own file
                    infile.seek(logfile_rec.seek)
                    for line in infile:
                        if ERRORS_RE.search(line) and not (
                            errors_ignored and errors_ignored_re.search(line)
                        ):
                            errors.append((logfile, line))

    return errors
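# A sketch of the offset-file convention used above, assuming `_get_seek` simply
# reads back the integer previously written: each live log "<name>" gets a hidden
# companion ".<name>.offset" holding the end-of-file position at the last search.
def _get_seek(offset_file: Path) -> int:
    with open(offset_file, encoding="utf-8") as infile:
        return int(infile.readline().strip())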
def get(  # noqa: C901
    self,
    mark: str = "",
    lock_resources: Iterable[str] = (),
    use_resources: Iterable[str] = (),
    cleanup: bool = False,
    start_cmd: str = "",
) -> clusterlib.ClusterLib:
    """Return the `clusterlib.ClusterLib` instance once we can start the test.

    It checks current conditions and waits if the conditions don't allow to start the
    test right away.
    """
    # pylint: disable=too-many-statements,too-many-branches
    assert not isinstance(lock_resources, str), "`lock_resources` must be sequence of strings"
    assert not isinstance(use_resources, str), "`use_resources` must be sequence of strings"

    if configuration.DEV_CLUSTER_RUNNING:
        if start_cmd:
            LOGGER.warning(
                f"Ignoring the '{start_cmd}' cluster start command as "
                "'DEV_CLUSTER_RUNNING' is set."
            )
        # check if the development cluster instance is ready by now so we don't need to obtain
        # cluster lock when it is not necessary
        if not self._is_dev_cluster_ready():
            with locking.FileLockIfXdist(self.cm.cluster_lock):
                self._setup_dev_cluster()

    if configuration.FORBID_RESTART and start_cmd:
        raise RuntimeError("Cannot use custom start command when 'FORBID_RESTART' is set.")

    if start_cmd:
        if not (mark or (Resources.CLUSTER in lock_resources)):
            raise RuntimeError(
                "Custom start command can be used only together with singleton or `mark`."
            )
        # always clean after test(s) that started cluster with custom configuration
        cleanup = True

    # Add `Resources.CLUSTER` to `use_resources`. Filter out `lock_resources` from the
    # list of `use_resources`.
    use_resources = list(set(use_resources).union({Resources.CLUSTER}) - set(lock_resources))

    cget_status = ClusterGetStatus(
        mark=mark,
        lock_resources=lock_resources,
        use_resources=use_resources,
        cleanup=cleanup,
        start_cmd=start_cmd,
        current_test=os.environ.get("PYTEST_CURRENT_TEST") or "",
    )
    marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

    self.cm._log(f"want to run test '{cget_status.current_test}'")

    # iterate until it is possible to start the test
    while True:
        if cget_status.restart_ready:
            self._restart(start_cmd=start_cmd)

        if not cget_status.first_iteration:
            xdist_sleep(random.uniform(0.6, 1.2) * cget_status.sleep_delay)

        # nothing time consuming can go under this lock as all other workers will need to wait
        with locking.FileLockIfXdist(self.cm.cluster_lock):
            if self._is_already_running(cget_status):
                if not self.cm.cache.cluster_obj:
                    raise AssertionError("`cluster_obj` not available, that cannot happen")
                return self.cm.cache.cluster_obj

            # needs to be set here, before the first `continue`
            cget_status.first_iteration = False
            self.cm._cluster_instance_num = -1

            # try all existing cluster instances
            for instance_num in range(self.cm.num_of_instances):
                # there's only one cluster instance when `DEV_CLUSTER_RUNNING` is set
                if configuration.DEV_CLUSTER_RUNNING and instance_num != 0:
                    continue

                # if instance to run the test on was already decided, skip all other instances
                # pylint: disable=consider-using-in
                if (
                    cget_status.selected_instance != -1
                    and instance_num != cget_status.selected_instance
                ):
                    continue

                cget_status.instance_num = instance_num
                cget_status.instance_dir = (
                    self.cm.pytest_tmp_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                )
                cget_status.instance_dir.mkdir(exist_ok=True)

                # cleanup cluster instance where attempt to start cluster failed repeatedly
                if (cget_status.instance_dir / CLUSTER_DEAD_FILE).exists():
                    self._cleanup_dead_clusters(cget_status)
                    continue

                # cluster restart planned or in progress, so no new tests can start
                if self._restarted_by_other_worker(cget_status):
                    cget_status.sleep_delay = 5
                    continue

                # are there tests already running on this cluster instance?
                cget_status.started_tests_sfiles = list(
                    cget_status.instance_dir.glob(f"{TEST_RUNNING_GLOB}_*")
                )

                # "marked tests" = group of tests marked with a specific mark.
                # While these tests are running, no unmarked test can start.
                cget_status.marked_starting_sfiles = list(
                    cget_status.instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*")
                )
                cget_status.marked_running_sfiles = list(
                    cget_status.instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*")
                )

                # if marked tests are already running, update their status
                self._update_marked_tests(
                    marked_tests_cache=marked_tests_cache, cget_status=cget_status
                )

                # test has mark
                if mark:
                    # select this instance for running marked tests if possible
                    if not self._marked_select_instance(cget_status):
                        cget_status.sleep_delay = 2
                        continue

                    # check if we need to wait until unmarked tests are finished
                    if (
                        not cget_status.marked_running_sfiles
                        and cget_status.started_tests_sfiles
                    ):
                        cget_status.sleep_delay = 10
                        continue

                    self.cm._log(
                        f"c{instance_num}: in marked tests branch, "
                        f"I have required mark '{mark}'"
                    )

                # no unmarked test can run while marked tests are starting or running
                elif cget_status.marked_running_sfiles or cget_status.marked_starting_sfiles:
                    self.cm._log(
                        f"c{instance_num}: marked tests starting or running, "
                        f"I don't have mark"
                    )
                    cget_status.sleep_delay = 2
                    continue

                # check availability of the required resources
                if not self._are_resources_available(cget_status):
                    cget_status.sleep_delay = 5
                    continue

                # if restart is needed, indicate that the cluster will be restarted
                # (after all currently running tests are finished)
                if not self._init_restart(cget_status):
                    continue

                # we've found suitable cluster instance
                cget_status.selected_instance = instance_num
                self.cm._cluster_instance_num = instance_num
                self.cm._log(f"c{instance_num}: can run test '{cget_status.current_test}'")
                # set environment variables that are needed when restarting the cluster
                # and running tests
                cluster_nodes.set_cluster_env(instance_num)

                # if needed, finish restart related actions
                if not self._finish_restart(cget_status):
                    continue

                # from this point on, all conditions needed to start the test are met
                break
            else:
                # if the test cannot start on any instance, return to top-level loop
                continue

            self._create_test_status_files(cget_status)

            # Check if it is necessary to reload data. This still needs to happen under
            # global lock.
            state_dir = cluster_nodes.get_cluster_env().state_dir
            self._reload_cluster_obj(state_dir=state_dir)

            # cluster is ready, we can start the test
            break

    cluster_obj = self.cm.cache.cluster_obj
    if not cluster_obj:
        raise AssertionError("`cluster_obj` not available, that cannot happen")
    cluster_obj.cluster_id = instance_num
    cluster_obj._cluster_manager = self.cm  # type: ignore

    return cluster_obj
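# A usage sketch (fixture and resource names are assumed): a test obtains its
# cluster through `ClusterManager.get`, locking the resources it mutates;
# `Resources.CLUSTER` is added to the used resources automatically, as noted above.
def test_update_pool(cluster_manager):
    cluster = cluster_manager.get(lock_resources=["pool1"])  # hypothetical resource name
    ...  # test body runs against `cluster`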
def _restart(self, start_cmd: str = "", stop_cmd: str = "") -> bool:  # noqa: C901
    """Restart cluster.

    Not called under global lock!
    """
    # pylint: disable=too-many-branches
    cluster_running_file = self.cm.instance_dir / CLUSTER_RUNNING_FILE

    # don't restart cluster if it was started outside of test framework
    if configuration.DEV_CLUSTER_RUNNING:
        self.cm._log(
            f"c{self.cm.cluster_instance_num}: ignoring restart, dev cluster is running"
        )
        if cluster_running_file.exists():
            LOGGER.warning("Ignoring requested cluster restart as 'DEV_CLUSTER_RUNNING' is set.")
        else:
            helpers.touch(cluster_running_file)
        return True

    # fail if cluster restart is forbidden and it was already started
    if configuration.FORBID_RESTART and cluster_running_file.exists():
        raise RuntimeError("Cannot restart cluster when 'FORBID_RESTART' is set.")

    self.cm._log(
        f"c{self.cm.cluster_instance_num}: called `_restart`, start_cmd='{start_cmd}', "
        f"stop_cmd='{stop_cmd}'"
    )

    startup_files = cluster_nodes.get_cluster_type().cluster_scripts.prepare_scripts_files(
        destdir=self.cm._create_startup_files_dir(self.cm.cluster_instance_num),
        instance_num=self.cm.cluster_instance_num,
        start_script=start_cmd,
        stop_script=stop_cmd,
    )

    state_dir = cluster_nodes.get_cluster_env().state_dir

    self.cm._log(
        f"c{self.cm.cluster_instance_num}: in `_restart`, new files "
        f"start_cmd='{startup_files.start_script}', "
        f"stop_cmd='{startup_files.stop_script}'"
    )

    excp: Optional[Exception] = None
    for i in range(2):
        if i > 0:
            self.cm._log(
                f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\nretrying"
            )
            time.sleep(0.2)

        try:
            LOGGER.info(f"Stopping cluster with `{startup_files.stop_script}`.")
            helpers.run_command(str(startup_files.stop_script))
        except Exception as err:
            self.cm._log(f"c{self.cm.cluster_instance_num}: failed to stop cluster:\n{err}")

        # save artifacts only when produced during this test run
        if cluster_running_file.exists():
            artifacts.save_start_script_coverage(
                log_file=state_dir / CLUSTER_START_CMDS_LOG,
                pytest_config=self.cm.pytest_config,
            )
            artifacts.save_cluster_artifacts(save_dir=self.cm.pytest_tmp_dir, state_dir=state_dir)

        shutil.rmtree(state_dir, ignore_errors=True)

        with contextlib.suppress(Exception):
            _kill_supervisor(self.cm.cluster_instance_num)

        try:
            cluster_obj = cluster_nodes.start_cluster(
                cmd=str(startup_files.start_script), args=startup_files.start_script_args
            )
        except Exception as err:
            LOGGER.error(f"Failed to start cluster: {err}")
            excp = err
        else:
            break
    else:
        self.cm._log(
            f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\ncluster dead"
        )
        if not configuration.IS_XDIST:
            pytest.exit(msg=f"Failed to start cluster, exception: {excp}", returncode=1)
        helpers.touch(self.cm.instance_dir / CLUSTER_DEAD_FILE)
        return False

    # Create temp dir for faucet addresses data.
    # Pytest's mktemp adds number to the end of the dir name, so keep the trailing '_'
    # as separator. Resulting dir name is e.g. 'addrs_data_ci3_0'.
    tmp_path = Path(
        self.cm.tmp_path_factory.mktemp(f"addrs_data_ci{self.cm.cluster_instance_num}_")
    )
    # setup faucet addresses
    cluster_nodes.setup_test_addrs(cluster_obj=cluster_obj, destination_dir=tmp_path)

    # create file that indicates that the cluster is running
    if not cluster_running_file.exists():
        helpers.touch(cluster_running_file)

    return True
def get(  # noqa: C901
    self,
    singleton: bool = False,
    mark: str = "",
    lock_resources: UnpackableSequence = (),
    use_resources: UnpackableSequence = (),
    cleanup: bool = False,
    start_cmd: str = "",
) -> clusterlib.ClusterLib:
    """Return the `clusterlib.ClusterLib` instance once we can start the test.

    It checks current conditions and waits if the conditions don't allow to start the
    test right away.
    """
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    # don't start new cluster if it was already started outside of test framework
    if DEV_CLUSTER_RUNNING:
        if start_cmd:
            LOGGER.warning(
                f"Ignoring the '{start_cmd}' cluster start command as "
                "'DEV_CLUSTER_RUNNING' is set."
            )
        return self._reuse_dev_cluster()

    if FORBID_RESTART and start_cmd:
        raise RuntimeError("Cannot use custom start command when 'FORBID_RESTART' is set.")

    selected_instance = -1
    restart_here = False
    restart_ready = False
    first_iteration = True
    sleep_delay = 1
    marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

    if start_cmd:
        if not (singleton or mark):
            raise AssertionError(
                "Custom start command can be used only together with `singleton` or `mark`"
            )
        # always clean after test(s) that started cluster with custom configuration
        cleanup = True

    # iterate until it is possible to start the test
    while True:
        if restart_ready:
            self._restart(start_cmd=start_cmd)

        if not first_iteration:
            helpers.xdist_sleep(random.random() * sleep_delay)

        # nothing time consuming can go under this lock as it will block all other workers
        with helpers.FileLockIfXdist(self.cm.cluster_lock):
            test_on_worker = list(
                self.cm.lock_dir.glob(
                    f"{CLUSTER_DIR_TEMPLATE}*/{TEST_RUNNING_GLOB}_{self.cm.worker_id}"
                )
            )

            # test is already running, nothing to set up
            if (
                first_iteration
                and test_on_worker
                and self.cm._cluster_instance_num != -1
                and self.cm.cache.cluster_obj
            ):
                self.cm._log(f"{test_on_worker[0]} already exists")
                return self.cm.cache.cluster_obj

            first_iteration = False  # needs to be set here, before the first `continue`
            self.cm._cluster_instance_num = -1

            # try all existing cluster instances
            for instance_num in range(self.cm.num_of_instances):
                # if instance to run the test on was already decided, skip all other instances
                # pylint: disable=consider-using-in
                if selected_instance != -1 and instance_num != selected_instance:
                    continue

                instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                instance_dir.mkdir(exist_ok=True)

                # if the selected instance failed to start, move on to other instance
                if (instance_dir / CLUSTER_DEAD_FILE).exists():
                    selected_instance = -1
                    restart_here = False
                    restart_ready = False
                    # remove status files that are checked by other workers
                    for sf in (
                        *instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"),
                        *instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*"),
                    ):
                        os.remove(sf)

                    dead_clusters = list(
                        self.cm.lock_dir.glob(f"{CLUSTER_DIR_TEMPLATE}*/{CLUSTER_DEAD_FILE}")
                    )
                    if len(dead_clusters) == self.cm.num_of_instances:
                        raise RuntimeError("All clusters are dead, cannot run.")
                    continue

                # singleton test is running, so no other test can be started
                if (instance_dir / TEST_SINGLETON_FILE).exists():
                    self.cm._log(f"c{instance_num}: singleton test in progress, cannot run")
                    sleep_delay = 5
                    continue

                restart_in_progress = list(instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"))
                # cluster restart planned, no new tests can start
                if not restart_here and restart_in_progress:
                    # no log message here, it would be too many of them
                    sleep_delay = 5
                    continue

                started_tests = list(instance_dir.glob(f"{TEST_RUNNING_GLOB}_*"))

                # "marked tests" = group of tests marked with a specific mark.
                # While these tests are running, no unmarked test can start.
                marked_starting = list(instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*"))
                marked_running = list(instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"))

                if mark:
                    marked_running_my = (
                        instance_dir / f"{TEST_CURR_MARK_GLOB}_{mark}"
                    ).exists()
                    marked_starting_my = list(
                        instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_{mark}_*")
                    )

                    marked_running_my_anywhere = list(
                        self.cm.lock_dir.glob(
                            f"{CLUSTER_DIR_TEMPLATE}*/{TEST_CURR_MARK_GLOB}_{mark}"
                        )
                    )
                    # check if tests with my mark are running on some other cluster instance
                    if not marked_running_my and marked_running_my_anywhere:
                        self.cm._log(
                            f"c{instance_num}: tests marked with my mark '{mark}' "
                            "already running on other cluster instance, cannot run"
                        )
                        continue

                    marked_starting_my_anywhere = list(
                        self.cm.lock_dir.glob(
                            f"{CLUSTER_DIR_TEMPLATE}*/{TEST_MARK_STARTING_GLOB}_{mark}_*"
                        )
                    )
                    # check if tests with my mark are starting on some other cluster instance
                    if not marked_starting_my and marked_starting_my_anywhere:
                        self.cm._log(
                            f"c{instance_num}: tests marked with my mark '{mark}' starting "
                            "on other cluster instance, cannot run"
                        )
                        continue

                    # check if this test has the same mark as currently running marked tests
                    if marked_running_my or marked_starting_my:
                        # lock to this cluster instance
                        selected_instance = instance_num
                    elif marked_running or marked_starting:
                        self.cm._log(
                            f"c{instance_num}: tests marked with other mark starting "
                            f"or running, I have different mark '{mark}'"
                        )
                        continue

                    # check if we need to wait until marked tests can run
                    if marked_starting_my and started_tests:
                        self.cm._log(
                            f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                        )
                        sleep_delay = 2
                        continue

                # no unmarked test can run while marked tests are starting or running
                elif marked_running or marked_starting:
                    self.cm._log(
                        f"c{instance_num}: marked tests starting or running, "
                        f"I don't have mark"
                    )
                    sleep_delay = 5
                    continue

                # is this the first marked test that wants to run?
                initial_marked_test = bool(mark and not marked_running)

                # indicate that it is planned to start marked tests as soon as
                # all currently running tests are finished or the cluster is restarted
                if initial_marked_test:
                    # lock to this cluster instance
                    selected_instance = instance_num
                    mark_starting_file = (
                        instance_dir / f"{TEST_MARK_STARTING_GLOB}_{mark}_{self.cm.worker_id}"
                    )
                    if not mark_starting_file.exists():
                        open(mark_starting_file, "a").close()
                    if started_tests:
                        self.cm._log(
                            f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                        )
                        sleep_delay = 3
                        continue

                # get marked tests status
                marked_tests_status = self._get_marked_tests_status(
                    cache=marked_tests_cache, instance_num=instance_num
                )

                # marked tests are already running
                if marked_running:
                    active_mark_file = marked_running[0].name

                    # update marked tests status
                    self._update_marked_tests(
                        marked_tests_status=marked_tests_status,
                        active_mark_name=active_mark_file,
                        started_tests=started_tests,
                        instance_num=instance_num,
                    )

                    self.cm._log(
                        f"c{instance_num}: in marked tests branch, "
                        f"I have required mark '{mark}'"
                    )

                    # reset counter of cycles with no marked test running
                    marked_tests_status.no_marked_tests_iter = 0

                # this test is a singleton - no other test can run while this one is running
                if singleton and started_tests:
                    self.cm._log(f"c{instance_num}: tests are running, cannot start singleton")
                    sleep_delay = 5
                    continue

                # this test wants to lock some resources, check if these are not
                # locked or in use
                if lock_resources:
                    res_usable = self._are_resources_usable(
                        resources=lock_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if not res_usable:
                        sleep_delay = 5
                        continue

                # filter out `lock_resources` from the list of `use_resources`
                if use_resources and lock_resources:
                    use_resources = list(set(use_resources) - set(lock_resources))

                # this test wants to use some resources, check if these are not locked
                if use_resources:
                    res_locked = self._are_resources_locked(
                        resources=use_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if res_locked:
                        sleep_delay = 5
                        continue

                # indicate that the cluster will be restarted
                new_cmd_restart = bool(start_cmd and (initial_marked_test or singleton))
                if not restart_here and (
                    new_cmd_restart or self._is_restart_needed(instance_num)
                ):
                    if started_tests:
                        self.cm._log(f"c{instance_num}: tests are running, cannot restart")
                        continue

                    # Cluster restart will be performed by this worker.
                    # By setting `restart_here`, we make sure this worker continues on
                    # this cluster instance after restart. It is important because
                    # the `start_cmd` used for starting the cluster might be specific
                    # to the test.
                    restart_here = True
                    self.cm._log(f"c{instance_num}: setting to restart cluster")
                    selected_instance = instance_num
                    restart_in_progress_file = (
                        instance_dir / f"{RESTART_IN_PROGRESS_GLOB}_{self.cm.worker_id}"
                    )
                    if not restart_in_progress_file.exists():
                        open(restart_in_progress_file, "a").close()

                # we've found suitable cluster instance
                selected_instance = instance_num
                self.cm._cluster_instance_num = instance_num
                cluster_nodes.set_cluster_env(instance_num)

                if restart_here:
                    if restart_ready:
                        # The cluster was already restarted if we are here and
                        # `restart_ready` is still True.
                        restart_ready = False

                        # Remove status files that are no longer valid after restart.
                        for f in instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"):
                            os.remove(f)
                        for f in instance_dir.glob(f"{RESTART_NEEDED_GLOB}_*"):
                            os.remove(f)
                    else:
                        self.cm._log(f"c{instance_num}: calling restart")
                        # the actual `_restart` function will be called outside
                        # of global lock
                        restart_ready = True
                        continue

                # from this point on, all conditions needed to start the test are met

                # this test is a singleton
                if singleton:
                    self.cm._log(f"c{instance_num}: starting singleton")
                    open(self.cm.instance_dir / TEST_SINGLETON_FILE, "a").close()

                # this test is a first marked test
                if initial_marked_test:
                    self.cm._log(f"c{instance_num}: starting '{mark}' tests")
                    open(self.cm.instance_dir / f"{TEST_CURR_MARK_GLOB}_{mark}", "a").close()
                    for sf in marked_starting:
                        os.remove(sf)

                # create status file for each in-use resource
                _ = [
                    open(
                        self.cm.instance_dir / f"{RESOURCE_IN_USE_GLOB}_{r}_{self.cm.worker_id}",
                        "a",
                    ).close()
                    for r in use_resources
                ]

                # create status file for each locked resource
                _ = [
                    open(
                        self.cm.instance_dir / f"{RESOURCE_LOCKED_GLOB}_{r}_{self.cm.worker_id}",
                        "a",
                    ).close()
                    for r in lock_resources
                ]

                # cleanup = cluster restart after test (group of tests) is finished
                if cleanup:
                    # cleanup after group of tests that are marked with a marker
                    if mark:
                        self.cm._log(f"c{instance_num}: cleanup and mark")
                        open(
                            self.cm.instance_dir
                            / f"{RESTART_AFTER_MARK_GLOB}_{self.cm.worker_id}",
                            "a",
                        ).close()
                    # cleanup after single test (e.g. singleton)
                    else:
                        self.cm._log(f"c{instance_num}: cleanup and not mark")
                        open(
                            self.cm.instance_dir
                            / f"{RESTART_NEEDED_GLOB}_{self.cm.worker_id}",
                            "a",
                        ).close()

                break
            else:
                # if the test cannot start on any instance, return to top-level loop
                continue

            test_running_file = (
                self.cm.instance_dir / f"{TEST_RUNNING_GLOB}_{self.cm.worker_id}"
            )
            self.cm._log(f"c{self.cm.cluster_instance_num}: creating {test_running_file}")
            open(test_running_file, "a").close()

            # check if it is necessary to reload data
            state_dir = cluster_nodes.get_cluster_env().state_dir
            self._reload_cluster_obj(state_dir=state_dir)

            cluster_obj = self.cm.cache.cluster_obj
            if not cluster_obj:
                cluster_obj = cluster_nodes.get_cluster_type().get_cluster_obj()

            # `cluster_obj` is ready, we can start the test
            break

    return cluster_obj
def _restart(self, start_cmd: str = "", stop_cmd: str = "") -> bool:  # noqa: C901
    """Restart cluster.

    Not called under global lock!
    """
    # pylint: disable=too-many-branches
    cluster_running_file = self.cm.instance_dir / CLUSTER_RUNNING_FILE

    # don't restart cluster if it was started outside of test framework
    if DEV_CLUSTER_RUNNING:
        if cluster_running_file.exists():
            LOGGER.warning("Ignoring requested cluster restart as 'DEV_CLUSTER_RUNNING' is set.")
        else:
            open(cluster_running_file, "a").close()
        return True

    # fail if cluster restart is forbidden and it was already started
    if FORBID_RESTART and cluster_running_file.exists():
        raise RuntimeError("Cannot restart cluster when 'FORBID_RESTART' is set.")

    self.cm._log(
        f"c{self.cm.cluster_instance_num}: called `_restart`, start_cmd='{start_cmd}', "
        f"stop_cmd='{stop_cmd}'"
    )

    startup_files = cluster_nodes.get_cluster_type().cluster_scripts.prepare_scripts_files(
        destdir=self.cm._create_startup_files_dir(self.cm.cluster_instance_num),
        instance_num=self.cm.cluster_instance_num,
        start_script=start_cmd,
        stop_script=stop_cmd,
    )

    state_dir = cluster_nodes.get_cluster_env().state_dir

    self.cm._log(
        f"c{self.cm.cluster_instance_num}: in `_restart`, new files "
        f"start_cmd='{startup_files.start_script}', "
        f"stop_cmd='{startup_files.stop_script}'"
    )

    excp: Optional[Exception] = None
    for i in range(2):
        if i > 0:
            self.cm._log(
                f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\nretrying"
            )
            time.sleep(0.2)

        try:
            cluster_nodes.stop_cluster(cmd=str(startup_files.stop_script))
        except Exception as err:
            self.cm._log(f"c{self.cm.cluster_instance_num}: failed to stop cluster:\n{err}")

        # save artifacts only when produced during this test run
        if cluster_running_file.exists():
            cli_coverage.save_start_script_coverage(
                log_file=state_dir / CLUSTER_START_CMDS_LOG,
                pytest_config=self.cm.pytest_config,
            )
            self._restart_save_cluster_artifacts(clean=True)

        try:
            _kill_supervisor(self.cm.cluster_instance_num)
        except Exception:
            pass

        try:
            cluster_obj = cluster_nodes.start_cluster(
                cmd=str(startup_files.start_script), args=startup_files.start_script_args
            )
        except Exception as err:
            LOGGER.error(f"Failed to start cluster: {err}")
            excp = err
        else:
            break
    else:
        self.cm._log(
            f"c{self.cm.cluster_instance_num}: failed to start cluster:\n{excp}\ncluster dead"
        )
        if not helpers.IS_XDIST:
            pytest.exit(msg=f"Failed to start cluster, exception: {excp}", returncode=1)
        open(self.cm.instance_dir / CLUSTER_DEAD_FILE, "a").close()
        return False

    # setup faucet addresses
    tmp_path = Path(self.cm.tmp_path_factory.mktemp("addrs_data"))
    cluster_nodes.setup_test_addrs(cluster_obj, tmp_path)

    # create file that indicates that the cluster is running
    if not cluster_running_file.exists():
        open(cluster_running_file, "a").close()

    return True