def short_kes_start_cluster(tmp_path_factory: TempdirFactory) -> Path:
    """Update *slotsPerKESPeriod* and *maxKESEvolutions*."""
    pytest_globaltemp = helpers.get_pytest_globaltemp(tmp_path_factory)

    # need to lock because this same fixture can run on several workers in parallel
    with helpers.FileLockIfXdist(f"{pytest_globaltemp}/startup_files_short_kes.lock"):
        destdir = pytest_globaltemp / "startup_files_short_kes"
        destdir.mkdir(exist_ok=True)

        # return existing script if it was already generated by another worker
        destdir_ls = list(destdir.glob("start-cluster*"))
        if destdir_ls:
            return destdir_ls[0]

        startup_files = cluster_nodes.get_cluster_type().cluster_scripts.copy_scripts_files(
            destdir=destdir
        )
        with open(startup_files.genesis_spec) as fp_in:
            genesis_spec = json.load(fp_in)

        genesis_spec["slotsPerKESPeriod"] = 700
        genesis_spec["maxKESEvolutions"] = 5

        with open(startup_files.genesis_spec, "w") as fp_out:
            json.dump(genesis_spec, fp_out)

        return startup_files.start_script
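# A minimal usage sketch (hypothetical, not part of this module): the fixture
# returns the path to a customized start script, which can be passed as
# `start_cmd` to the cluster manager's `get` (see below). Per that code, a
# custom start command must be combined with `singleton` or `mark`.
@pytest.fixture
def cluster_short_kes(
    cluster_manager: cluster_management.ClusterManager,
    short_kes_start_cluster: Path,
) -> clusterlib.ClusterLib:
    return cluster_manager.get(
        singleton=True, cleanup=True, start_cmd=str(short_kes_start_cluster)
    )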
def set_needs_restart(self) -> None:
    """Indicate that the cluster needs restart."""
    with helpers.FileLockIfXdist(self.cluster_lock):
        self._log(f"c{self.cluster_instance_num}: called `set_needs_restart`")
        open(self.instance_dir / f"{RESTART_NEEDED_GLOB}_{self.worker_id}", "a").close()
def epoch_length_start_cluster(tmp_path_factory: TempdirFactory) -> Path:
    """Update *epochLength* to 1500."""
    pytest_globaltemp = helpers.get_pytest_globaltemp(tmp_path_factory)

    # need to lock because this same fixture can run on several workers in parallel
    with helpers.FileLockIfXdist(f"{pytest_globaltemp}/startup_files_epoch_1500.lock"):
        destdir = pytest_globaltemp / "startup_files_epoch_1500"
        destdir.mkdir(exist_ok=True)

        # return existing script if it was already generated by another worker
        destdir_ls = list(destdir.glob("start-cluster*"))
        if destdir_ls:
            return destdir_ls[0]

        startup_files = cluster_nodes.get_cluster_type().cluster_scripts.copy_scripts_files(
            destdir=destdir
        )
        with open(startup_files.genesis_spec) as fp_in:
            genesis_spec = json.load(fp_in)

        genesis_spec["epochLength"] = 1500

        with open(startup_files.genesis_spec, "w") as fp_out:
            json.dump(genesis_spec, fp_out)

        return startup_files.start_script
def return_funds_to_faucet(
    *src_addrs: clusterlib.AddressRecord,
    cluster_obj: clusterlib.ClusterLib,
    faucet_addr: str,
    amount: int = -1,
    tx_name: Optional[str] = None,
    destination_dir: FileType = ".",
) -> None:
    """Send `amount` from all `src_addrs` to `faucet_addr`.

    The amount of "-1" means all available funds.
    """
    tx_name = tx_name or helpers.get_timestamped_rand_str()
    tx_name = f"{tx_name}_return_funds"

    with helpers.FileLockIfXdist(f"{helpers.get_basetemp()}/{faucet_addr}.lock"):
        try:
            logging.disable(logging.ERROR)
            for src in src_addrs:
                fund_dst = [clusterlib.TxOut(address=faucet_addr, amount=amount)]
                fund_tx_files = clusterlib.TxFiles(signing_key_files=[src.skey_file])
                # try to return funds; don't mind if there's not enough funds for fees etc.
                try:
                    cluster_obj.send_funds(
                        src_address=src.address,
                        destinations=fund_dst,
                        tx_name=tx_name,
                        tx_files=fund_tx_files,
                        destination_dir=destination_dir,
                    )
                except Exception:
                    pass
        finally:
            logging.disable(logging.NOTSET)
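# Illustrative call (`payment_addrs`, `cluster` and `faucet_address` are
# assumed test objects, not definitions from this module): return all
# remaining funds from test addresses to the faucet after a test finishes.
return_funds_to_faucet(
    *payment_addrs,
    cluster_obj=cluster,
    faucet_addr=faucet_address,
    amount=-1,  # -1 means "all available funds"
    tx_name="cleanup",
)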
def add_ignore_rule(files_glob: str, regex: str) -> None:
    """Add ignore rule for expected errors."""
    with helpers.FileLockIfXdist(f"{helpers.get_basetemp()}/ignore_rules.lock"):
        state_dir = cluster_nodes.get_cluster_env().state_dir
        rules_file = state_dir / ERRORS_RULES_FILE_NAME
        with open(rules_file, "a") as outfile:
            outfile.write(f"{files_glob};;{regex}\n")
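# Example: each rule is stored as a "files_glob;;regex" line in the rules file,
# so this call ignores the given error message in all checked log files. The
# regex here is illustrative only.
add_ignore_rule(files_glob="*", regex="connection to node lost")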
def update_params(
    cluster_obj: clusterlib.ClusterLib,
    src_addr_record: clusterlib.AddressRecord,
    update_proposals: List[UpdateProposal],
) -> None:
    """Update params using update proposal."""
    _cli_args = [(u.arg, str(u.value)) for u in update_proposals]
    cli_args = list(itertools.chain.from_iterable(_cli_args))

    with helpers.FileLockIfXdist(f"{helpers.get_basetemp()}/update_params.lock"):
        LOGGER.info("Waiting for new epoch to submit proposal.")
        cluster_obj.wait_for_new_epoch()

        cluster_obj.submit_update_proposal(
            cli_args=cli_args,
            src_address=src_addr_record.address,
            src_skey_file=src_addr_record.skey_file,
            tx_name=helpers.get_timestamped_rand_str(),
        )

        LOGGER.info(f"Update proposal submitted ({cli_args})")
        cluster_obj.wait_for_new_epoch()

        protocol_params = cluster_obj.get_protocol_params()
        for u in update_proposals:
            # TODO: handle nested dictionaries
            if not u.name:
                continue
            updated_value = protocol_params[u.name]
            if str(updated_value) != str(u.value):
                raise AssertionError(
                    f"Cluster update proposal failed! Param value for {u.name}: {updated_value}.\n"
                    f"Expected: {u.value}\n"
                    f"Tip: {cluster_obj.get_tip()}"
                )
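# Illustrative usage, assuming `UpdateProposal` is a record with `arg`, `value`
# and `name` fields (as implied by the attribute access above); the parameter
# names and values here are examples only.
update_proposals = [
    UpdateProposal(arg="--max-block-body-size", value=65_536, name="maxBlockBodySize"),
    UpdateProposal(arg="--max-tx-size", value=16_384, name="maxTxSize"),
]
update_params(
    cluster_obj=cluster,
    src_addr_record=payment_addr,
    update_proposals=update_proposals,
)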
def _log(self, msg: str) -> None:
    """Log message."""
    if not self.manager_log.is_file():
        return
    with helpers.FileLockIfXdist(self.log_lock):
        with open(self.manager_log, "a") as logfile:
            logfile.write(f"{datetime.datetime.now()} on {self.worker_id}: {msg}\n")
def add_ignore_rule(files_glob: str, regex: str) -> None:
    """Add ignore rule for expected errors."""
    with helpers.FileLockIfXdist(f"{helpers.TEST_TEMP_DIR}/ignore_rules.lock"):
        cluster_env = devops_cluster.get_cluster_env()
        state_dir = Path(cluster_env["state_dir"])
        rules_file = state_dir / ERRORS_RULES_FILE_NAME
        with open(rules_file, "a") as outfile:
            outfile.write(f"{files_glob};;{regex}\n")
def cluster_cleanup(
    tmp_path_factory: TempdirFactory, worker_id: str, request: FixtureRequest
) -> Generator[None, None, None]:
    pytest_tmp_dir = Path(tmp_path_factory.getbasetemp())

    if not worker_id or worker_id == "master":
        # if cluster was started outside of test framework, do nothing
        if cluster_management.DEV_CLUSTER_RUNNING:
            # TODO: check that socket is open and print error if not
            yield
            return

        yield

        cluster_manager_obj = cluster_management.ClusterManager(
            tmp_path_factory=tmp_path_factory, worker_id=worker_id, pytest_config=request.config
        )
        cluster_manager_obj.save_worker_cli_coverage()

        _stop_all_cluster_instances(
            tmp_path_factory=tmp_path_factory,
            worker_id=worker_id,
            pytest_config=request.config,
            pytest_tmp_dir=pytest_tmp_dir,
        )
        return

    # under pytest-xdist, each worker's base temp dir lives inside the shared one,
    # so the parent dir is common to all workers
    lock_dir = pytest_tmp_dir = pytest_tmp_dir.parent

    # pylint: disable=consider-using-with
    open(lock_dir / f".started_session_{worker_id}", "a").close()

    yield

    with helpers.FileLockIfXdist(f"{lock_dir}/{cluster_management.CLUSTER_LOCK}"):
        cluster_manager_obj = cluster_management.ClusterManager(
            tmp_path_factory=tmp_path_factory, worker_id=worker_id, pytest_config=request.config
        )
        cluster_manager_obj.save_worker_cli_coverage()

        os.remove(lock_dir / f".started_session_{worker_id}")
        if not list(lock_dir.glob(".started_session_*")):
            _stop_all_cluster_instances(
                tmp_path_factory=tmp_path_factory,
                worker_id=worker_id,
                pytest_config=request.config,
                pytest_tmp_dir=pytest_tmp_dir,
            )
def search_cluster_artifacts() -> List[Tuple[Path, str]]:
    """Search cluster artifacts for errors."""
    state_dir = cluster_nodes.get_cluster_env().state_dir
    rules_file = state_dir / ERRORS_RULES_FILE_NAME

    with helpers.FileLockIfXdist(f"{helpers.get_basetemp()}/ignore_rules.lock"):
        ignore_rules = get_ignore_rules(rules_file)

    errors = []
    for logfile in state_dir.glob("*.std*"):
        # skip if the log file is a status file or a rotated log
        if logfile.name.endswith(".offset") or ROTATED_RE.match(logfile.name):
            continue

        # read seek offset (from where to start searching) and timestamp of last search
        offset_file = logfile.parent / f".{logfile.name}.offset"
        if offset_file.exists():
            seek = _get_seek(offset_file)
            timestamp = os.path.getmtime(offset_file)
        else:
            seek = 0
            timestamp = 0.0

        errors_ignored = get_ignore_regex(
            ignore_rules=ignore_rules, regexes=ERRORS_IGNORED, logfile=logfile
        )
        errors_ignored_re = re.compile(errors_ignored)

        # record offset for the "live" log file
        with open(offset_file, "w") as outfile:
            outfile.write(str(helpers.get_eof_offset(logfile)))

        for logfile_rec in get_rotated_logs(logfile=logfile, seek=seek, timestamp=timestamp):
            with open(logfile_rec.logfile) as infile:
                infile.seek(logfile_rec.seek)
                for line in infile:
                    if ERRORS_RE.search(line) and not (
                        errors_ignored and errors_ignored_re.search(line)
                    ):
                        errors.append((logfile, line))

    return errors
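# A sketch of what `_get_seek` is assumed to do (it is referenced above but not
# defined in this section): read back the integer offset previously written
# with `helpers.get_eof_offset`, so the next search resumes where the last one
# ended.
def _get_seek(fpath: Path) -> int:
    with open(fpath) as infile:
        return int(infile.readline().strip())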
def cluster_cleanup(
    tmp_path_factory: TempdirFactory, worker_id: str, request: FixtureRequest
) -> Generator:
    pytest_tmp_dir = Path(tmp_path_factory.getbasetemp())

    if not worker_id or worker_id == "master":
        yield

        cluster_manager_obj = parallel_run.ClusterManager(
            tmp_path_factory=tmp_path_factory, worker_id=worker_id, pytest_config=request.config
        )
        cluster_manager_obj.save_worker_cli_coverage()

        _stop_all_cluster_instances(
            tmp_path_factory=tmp_path_factory,
            worker_id=worker_id,
            pytest_config=request.config,
            pytest_tmp_dir=pytest_tmp_dir,
        )
        return

    lock_dir = pytest_tmp_dir = pytest_tmp_dir.parent

    open(lock_dir / f".started_session_{worker_id}", "a").close()

    yield

    with helpers.FileLockIfXdist(f"{lock_dir}/{parallel_run.CLUSTER_LOCK}"):
        cluster_manager_obj = parallel_run.ClusterManager(
            tmp_path_factory=tmp_path_factory, worker_id=worker_id, pytest_config=request.config
        )
        cluster_manager_obj.save_worker_cli_coverage()

        os.remove(lock_dir / f".started_session_{worker_id}")
        if not list(lock_dir.glob(".started_session_*")):
            _stop_all_cluster_instances(
                tmp_path_factory=tmp_path_factory,
                worker_id=worker_id,
                pytest_config=request.config,
                pytest_tmp_dir=pytest_tmp_dir,
            )
def on_test_stop(self) -> None:
    """Perform actions after the test finished."""
    if self._cluster_instance_num == -1:
        return

    with helpers.FileLockIfXdist(self.cluster_lock):
        self._log(f"c{self.cluster_instance_num}: called `on_test_stop`")

        # remove resource locking files created by the worker
        resource_locking_files = list(
            self.instance_dir.glob(f"{RESOURCE_LOCKED_GLOB}_*_{self.worker_id}")
        )
        for f in resource_locking_files:
            os.remove(f)

        # remove "resource in use" files created by the worker
        resource_in_use_files = list(
            self.instance_dir.glob(f"{RESOURCE_IN_USE_GLOB}_*_{self.worker_id}")
        )
        for f in resource_in_use_files:
            os.remove(f)

        # remove file that indicates that a test is running on the worker
        try:
            os.remove(self.instance_dir / f"{TEST_RUNNING_GLOB}_{self.worker_id}")
        except FileNotFoundError:
            pass

        # remove file that indicates the test was a singleton
        try:
            os.remove(self.instance_dir / TEST_SINGLETON_FILE)
        except FileNotFoundError:
            pass

        # search for errors in cluster logfiles
        errors = logfiles.search_cluster_artifacts()
        if errors:
            logfiles.report_artifacts_errors(errors)
def get(  # noqa: C901
    self,
    singleton: bool = False,
    mark: str = "",
    lock_resources: UnpackableSequence = (),
    use_resources: UnpackableSequence = (),
    cleanup: bool = False,
    start_cmd: str = "",
) -> clusterlib.ClusterLib:
    """Return the `clusterlib.ClusterLib` instance once we can start the test.

    It checks current conditions and waits if the conditions don't allow the test
    to start right away.
    """
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals

    # don't start a new cluster if it was already started outside of the test framework
    if DEV_CLUSTER_RUNNING:
        if start_cmd:
            LOGGER.warning(
                f"Ignoring the '{start_cmd}' cluster start command as "
                "'DEV_CLUSTER_RUNNING' is set."
            )
        return self._reuse_dev_cluster()

    if FORBID_RESTART and start_cmd:
        raise RuntimeError("Cannot use custom start command when 'FORBID_RESTART' is set.")

    selected_instance = -1
    restart_here = False
    restart_ready = False
    first_iteration = True
    sleep_delay = 1
    marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

    if start_cmd:
        if not (singleton or mark):
            raise AssertionError(
                "Custom start command can be used only together with `singleton` or `mark`"
            )
        # always clean up after test(s) that started the cluster with a custom configuration
        cleanup = True

    # iterate until it is possible to start the test
    while True:
        if restart_ready:
            self._restart(start_cmd=start_cmd)

        if not first_iteration:
            helpers.xdist_sleep(random.random() * sleep_delay)

        # nothing time consuming can go under this lock as it will block all other workers
        with helpers.FileLockIfXdist(self.cm.cluster_lock):
            test_on_worker = list(
                self.cm.lock_dir.glob(
                    f"{CLUSTER_DIR_TEMPLATE}*/{TEST_RUNNING_GLOB}_{self.cm.worker_id}"
                )
            )

            # test is already running, nothing to set up
            if (
                first_iteration
                and test_on_worker
                and self.cm._cluster_instance_num != -1
                and self.cm.cache.cluster_obj
            ):
                self.cm._log(f"{test_on_worker[0]} already exists")
                return self.cm.cache.cluster_obj

            first_iteration = False  # needs to be set here, before the first `continue`
            self.cm._cluster_instance_num = -1

            # try all existing cluster instances
            for instance_num in range(self.cm.num_of_instances):
                # if instance to run the test on was already decided, skip all other instances
                # pylint: disable=consider-using-in
                if selected_instance != -1 and instance_num != selected_instance:
                    continue

                instance_dir = self.cm.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                instance_dir.mkdir(exist_ok=True)

                # if the selected instance failed to start, move on to another instance
                if (instance_dir / CLUSTER_DEAD_FILE).exists():
                    selected_instance = -1
                    restart_here = False
                    restart_ready = False

                    # remove status files that are checked by other workers
                    for sf in (
                        *instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"),
                        *instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*"),
                    ):
                        os.remove(sf)

                    dead_clusters = list(
                        self.cm.lock_dir.glob(f"{CLUSTER_DIR_TEMPLATE}*/{CLUSTER_DEAD_FILE}")
                    )
                    if len(dead_clusters) == self.cm.num_of_instances:
                        raise RuntimeError("All clusters are dead, cannot run.")
                    continue

                # singleton test is running, so no other test can be started
                if (instance_dir / TEST_SINGLETON_FILE).exists():
                    self.cm._log(f"c{instance_num}: singleton test in progress, cannot run")
                    sleep_delay = 5
                    continue

                restart_in_progress = list(instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"))
                # cluster restart planned, no new tests can start
                if not restart_here and restart_in_progress:
                    # no log message here, there would be too many of them
                    sleep_delay = 5
                    continue

                started_tests = list(instance_dir.glob(f"{TEST_RUNNING_GLOB}_*"))

                # "marked tests" = group of tests marked with a specific mark.
                # While these tests are running, no unmarked test can start.
                marked_starting = list(instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*"))
                marked_running = list(instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"))

                if mark:
                    marked_running_my = (
                        instance_dir / f"{TEST_CURR_MARK_GLOB}_{mark}"
                    ).exists()
                    marked_starting_my = list(
                        instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_{mark}_*")
                    )

                    marked_running_my_anywhere = list(
                        self.cm.lock_dir.glob(
                            f"{CLUSTER_DIR_TEMPLATE}*/{TEST_CURR_MARK_GLOB}_{mark}"
                        )
                    )
                    # check if tests with my mark are running on some other cluster instance
                    if not marked_running_my and marked_running_my_anywhere:
                        self.cm._log(
                            f"c{instance_num}: tests marked with my mark '{mark}' "
                            "already running on other cluster instance, cannot run"
                        )
                        continue

                    marked_starting_my_anywhere = list(
                        self.cm.lock_dir.glob(
                            f"{CLUSTER_DIR_TEMPLATE}*/{TEST_MARK_STARTING_GLOB}_{mark}_*"
                        )
                    )
                    # check if tests with my mark are starting on some other cluster instance
                    if not marked_starting_my and marked_starting_my_anywhere:
                        self.cm._log(
                            f"c{instance_num}: tests marked with my mark '{mark}' starting "
                            "on other cluster instance, cannot run"
                        )
                        continue

                    # check if this test has the same mark as currently running marked tests
                    if marked_running_my or marked_starting_my:
                        # lock to this cluster instance
                        selected_instance = instance_num
                    elif marked_running or marked_starting:
                        self.cm._log(
                            f"c{instance_num}: tests marked with other mark starting "
                            f"or running, I have different mark '{mark}'"
                        )
                        continue

                    # check if we need to wait until marked tests can run
                    if marked_starting_my and started_tests:
                        self.cm._log(
                            f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                        )
                        sleep_delay = 2
                        continue

                # no unmarked test can run while marked tests are starting or running
                elif marked_running or marked_starting:
                    self.cm._log(
                        f"c{instance_num}: marked tests starting or running, "
                        f"I don't have mark"
                    )
                    sleep_delay = 5
                    continue

                # is this the first marked test that wants to run?
                initial_marked_test = bool(mark and not marked_running)

                # indicate that it is planned to start marked tests as soon as
                # all currently running tests are finished or the cluster is restarted
                if initial_marked_test:
                    # lock to this cluster instance
                    selected_instance = instance_num

                    mark_starting_file = (
                        instance_dir / f"{TEST_MARK_STARTING_GLOB}_{mark}_{self.cm.worker_id}"
                    )
                    if not mark_starting_file.exists():
                        open(mark_starting_file, "a").close()

                    if started_tests:
                        self.cm._log(
                            f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                        )
                        sleep_delay = 3
                        continue

                # get marked tests status
                marked_tests_status = self._get_marked_tests_status(
                    cache=marked_tests_cache, instance_num=instance_num
                )

                # marked tests are already running
                if marked_running:
                    active_mark_file = marked_running[0].name

                    # update marked tests status
                    self._update_marked_tests(
                        marked_tests_status=marked_tests_status,
                        active_mark_name=active_mark_file,
                        started_tests=started_tests,
                        instance_num=instance_num,
                    )

                    self.cm._log(
                        f"c{instance_num}: in marked tests branch, "
                        f"I have required mark '{mark}'"
                    )

                    # reset counter of cycles with no marked test running
                    marked_tests_status.no_marked_tests_iter = 0

                # this test is a singleton - no other test can run while this one is running
                if singleton and started_tests:
                    self.cm._log(f"c{instance_num}: tests are running, cannot start singleton")
                    sleep_delay = 5
                    continue

                # this test wants to lock some resources, check if these are not
                # locked or in use
                if lock_resources:
                    res_usable = self._are_resources_usable(
                        resources=lock_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if not res_usable:
                        sleep_delay = 5
                        continue

                # filter out `lock_resources` from the list of `use_resources`
                if use_resources and lock_resources:
                    use_resources = list(set(use_resources) - set(lock_resources))

                # this test wants to use some resources, check if these are not locked
                if use_resources:
                    res_locked = self._are_resources_locked(
                        resources=use_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if res_locked:
                        sleep_delay = 5
                        continue

                # indicate that the cluster will be restarted
                new_cmd_restart = bool(start_cmd and (initial_marked_test or singleton))
                if not restart_here and (
                    new_cmd_restart or self._is_restart_needed(instance_num)
                ):
                    if started_tests:
                        self.cm._log(f"c{instance_num}: tests are running, cannot restart")
                        continue

                    # Cluster restart will be performed by this worker.
                    # By setting `restart_here`, we make sure this worker continues on
                    # this cluster instance after restart. It is important because
                    # the `start_cmd` used for starting the cluster might be specific
                    # to the test.
                    restart_here = True
                    self.cm._log(f"c{instance_num}: setting to restart cluster")
                    selected_instance = instance_num
                    restart_in_progress_file = (
                        instance_dir / f"{RESTART_IN_PROGRESS_GLOB}_{self.cm.worker_id}"
                    )
                    if not restart_in_progress_file.exists():
                        open(restart_in_progress_file, "a").close()

                # we've found a suitable cluster instance
                selected_instance = instance_num
                self.cm._cluster_instance_num = instance_num
                cluster_nodes.set_cluster_env(instance_num)

                if restart_here:
                    if restart_ready:
                        # The cluster was already restarted if we are here and
                        # `restart_ready` is still True.
                        restart_ready = False

                        # Remove status files that are no longer valid after restart.
                        for f in instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"):
                            os.remove(f)
                        for f in instance_dir.glob(f"{RESTART_NEEDED_GLOB}_*"):
                            os.remove(f)
                    else:
                        self.cm._log(f"c{instance_num}: calling restart")
                        # the actual `_restart` function will be called outside
                        # of global lock
                        restart_ready = True
                        continue

                # from this point on, all conditions needed to start the test are met

                # this test is a singleton
                if singleton:
                    self.cm._log(f"c{instance_num}: starting singleton")
                    open(self.cm.instance_dir / TEST_SINGLETON_FILE, "a").close()

                # this test is a first marked test
                if initial_marked_test:
                    self.cm._log(f"c{instance_num}: starting '{mark}' tests")
                    open(self.cm.instance_dir / f"{TEST_CURR_MARK_GLOB}_{mark}", "a").close()
                    for sf in marked_starting:
                        os.remove(sf)

                # create status file for each in-use resource
                _ = [
                    open(
                        self.cm.instance_dir
                        / f"{RESOURCE_IN_USE_GLOB}_{r}_{self.cm.worker_id}",
                        "a",
                    ).close()
                    for r in use_resources
                ]

                # create status file for each locked resource
                _ = [
                    open(
                        self.cm.instance_dir
                        / f"{RESOURCE_LOCKED_GLOB}_{r}_{self.cm.worker_id}",
                        "a",
                    ).close()
                    for r in lock_resources
                ]

                # cleanup = cluster restart after test (group of tests) is finished
                if cleanup:
                    # cleanup after group of tests that are marked with a marker
                    if mark:
                        self.cm._log(f"c{instance_num}: cleanup and mark")
                        open(
                            self.cm.instance_dir
                            / f"{RESTART_AFTER_MARK_GLOB}_{self.cm.worker_id}",
                            "a",
                        ).close()
                    # cleanup after single test (e.g. singleton)
                    else:
                        self.cm._log(f"c{instance_num}: cleanup and not mark")
                        open(
                            self.cm.instance_dir
                            / f"{RESTART_NEEDED_GLOB}_{self.cm.worker_id}",
                            "a",
                        ).close()

                break
            else:
                # if the test cannot start on any instance, return to the top-level loop
                continue

            test_running_file = (
                self.cm.instance_dir / f"{TEST_RUNNING_GLOB}_{self.cm.worker_id}"
            )
            self.cm._log(f"c{self.cm.cluster_instance_num}: creating {test_running_file}")
            open(test_running_file, "a").close()

            # check if it is necessary to reload data
            state_dir = cluster_nodes.get_cluster_env().state_dir
            self._reload_cluster_obj(state_dir=state_dir)

            cluster_obj = self.cm.cache.cluster_obj
            if not cluster_obj:
                cluster_obj = cluster_nodes.get_cluster_type().get_cluster_obj()

            # `cluster_obj` is ready, we can start the test
            break

    return cluster_obj
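# Hedged usage sketch of the scheduling logic above, assuming `get` is exposed
# on a `cluster_manager` object provided to tests (a hypothetical fixture, not
# defined here); the resource names are illustrative. Locked resources are
# exclusive to the test, in-use resources can be shared with other tests.
@pytest.fixture
def cluster_locked_pool(
    cluster_manager: cluster_management.ClusterManager,
) -> clusterlib.ClusterLib:
    return cluster_manager.get(
        lock_resources=["node-pool1"], use_resources=["node-pool2"]
    )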
def fund_from_genesis(
    *dst_addrs: str,
    cluster_obj: clusterlib.ClusterLib,
    amount: int = 2_000_000,
    tx_name: Optional[str] = None,
    destination_dir: FileType = ".",
) -> None:
    """Send `amount` from genesis addr to all `dst_addrs`."""
    fund_dst = [
        clusterlib.TxOut(address=d, amount=amount)
        for d in dst_addrs
        if cluster_obj.get_address_balance(d) < amount
    ]
    if not fund_dst:
        return

    with helpers.FileLockIfXdist(
        f"{helpers.get_basetemp()}/{cluster_obj.genesis_utxo_addr}.lock"
    ):
        tx_name = tx_name or helpers.get_timestamped_rand_str()
        tx_name = f"{tx_name}_genesis_funding"
        fund_tx_files = clusterlib.TxFiles(
            signing_key_files=[
                *cluster_obj.genesis_keys.delegate_skeys,
                cluster_obj.genesis_keys.genesis_utxo_skey,
            ]
        )

        cluster_obj.send_funds(
            src_address=cluster_obj.genesis_utxo_addr,
            destinations=fund_dst,
            tx_name=tx_name,
            tx_files=fund_tx_files,
            destination_dir=destination_dir,
        )
def fund_from_genesis(
    *dst_addrs: str,
    cluster_obj: clusterlib.ClusterLib,
    amount: int = 2_000_000,
    tx_name: Optional[str] = None,
    destination_dir: FileType = ".",
) -> None:
    """Send `amount` from genesis addr to all `dst_addrs`."""
    fund_dst = [
        clusterlib.TxOut(address=d, amount=amount)
        for d in dst_addrs
        if cluster_obj.get_address_balance(d) < amount
    ]
    if not fund_dst:
        return

    with helpers.FileLockIfXdist(
        f"{helpers.TEST_TEMP_DIR}/{cluster_obj.genesis_utxo_addr}.lock"
    ):
        tx_name = tx_name or get_timestamped_rand_str()
        tx_name = f"{tx_name}_genesis_funding"
        fund_tx_files = clusterlib.TxFiles(
            signing_key_files=[*cluster_obj.delegate_skeys, cluster_obj.genesis_utxo_skey]
        )

        cluster_obj.send_funds(
            src_address=cluster_obj.genesis_utxo_addr,
            destinations=fund_dst,
            tx_name=tx_name,
            tx_files=fund_tx_files,
            destination_dir=destination_dir,
        )
        cluster_obj.wait_for_new_block(new_blocks=2)
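# Illustrative call (`payment_addrs` and `cluster` are assumed test objects):
# top up addresses from the genesis address; per the filtering above, only
# addresses whose balance is below `amount` receive funds.
fund_from_genesis(
    *[a.address for a in payment_addrs],
    cluster_obj=cluster,
    amount=10_000_000,
    tx_name="fund_test_addrs",
)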
def get(  # noqa: C901
    self,
    singleton: bool = False,
    mark: str = "",
    lock_resources: UnpackableSequence = (),
    use_resources: UnpackableSequence = (),
    cleanup: bool = False,
    start_cmd: str = "",
) -> clusterlib.ClusterLib:
    """Return the `clusterlib.ClusterLib` instance once we can start the test.

    It checks current conditions and waits if the conditions don't allow the test
    to start right away.
    """
    # pylint: disable=too-many-statements,too-many-branches,too-many-locals
    selected_instance = -1
    restart_here = False
    restart_ready = False
    mark_start_here = False
    first_iteration = True
    sleep_delay = 1
    marked_tests_cache: Dict[int, MarkedTestsStatus] = {}

    if start_cmd:
        if not (singleton or mark):
            raise AssertionError(
                "Custom start command can be used only together with `singleton` or `mark`"
            )
        # always clean up after test(s) that started the cluster with a custom configuration
        cleanup = True

    # iterate until it is possible to start the test
    while True:
        if restart_ready:
            self._restart(start_cmd=start_cmd)

        if not first_iteration:
            helpers.xdist_sleep(random.random() * sleep_delay)

        # nothing time consuming can go under this lock as it will block all other workers
        with helpers.FileLockIfXdist(self.cluster_lock):
            test_on_worker = list(
                self.lock_dir.glob(
                    f"{CLUSTER_DIR_TEMPLATE}*/{TEST_RUNNING_GLOB}_{self.worker_id}"
                )
            )

            # test is already running, nothing to set up
            if (
                first_iteration
                and test_on_worker
                and self._cluster_instance != -1
                and self.cache.cluster_obj
            ):
                self._log(f"{test_on_worker[0]} already exists")
                return self.cache.cluster_obj

            first_iteration = False  # needs to be set here, before the first `continue`
            self._cluster_instance = -1

            # try all existing cluster instances
            for instance_num in range(self.num_of_instances):
                # if instance to run the test on was already decided, skip all other instances
                # pylint: disable=consider-using-in
                if selected_instance != -1 and instance_num != selected_instance:
                    continue

                instance_dir = self.lock_dir / f"{CLUSTER_DIR_TEMPLATE}{instance_num}"
                instance_dir.mkdir(exist_ok=True)

                # singleton test is running, so no other test can be started
                if (instance_dir / TEST_SINGLETON_FILE).exists():
                    self._log(f"c{instance_num}: singleton test in progress, cannot run")
                    sleep_delay = 5
                    continue

                restart_in_progress = list(instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"))
                # cluster restart planned, no new tests can start
                if not restart_here and restart_in_progress:
                    self._log(f"c{instance_num}: restart in progress, cannot run")
                    continue

                started_tests = list(instance_dir.glob(f"{TEST_RUNNING_GLOB}_*"))

                # "marked tests" = group of tests marked with a specific mark.
                # While these tests are running, no unmarked test can start.
                # Check if it is indicated that marked tests will start next.
                marked_tests_starting = list(
                    instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_*")
                )
                marked_tests_starting_my = list(
                    instance_dir.glob(f"{TEST_MARK_STARTING_GLOB}_{mark}_*")
                )
                if not mark_start_here and marked_tests_starting_my:
                    self._log(
                        f"c{instance_num}: marked tests starting with my mark, cannot run"
                    )
                    selected_instance = instance_num
                    sleep_delay = 2
                    continue
                if not mark_start_here and marked_tests_starting:
                    self._log(f"c{instance_num}: marked tests starting, cannot run")
                    sleep_delay = 2
                    continue
                if mark_start_here and marked_tests_starting:
                    if started_tests:
                        self._log(
                            f"c{instance_num}: unmarked tests running, cannot start marked test"
                        )
                        sleep_delay = 2
                        continue
                    os.remove(marked_tests_starting[0])
                    mark_start_here = False

                test_curr_mark = list(instance_dir.glob(f"{TEST_CURR_MARK_GLOB}_*"))
                first_marked_test = bool(mark and not test_curr_mark)

                # indicate that it is planned to start marked tests as soon as
                # all currently running tests are finished
                if first_marked_test and started_tests:
                    self._log(
                        f"c{instance_num}: unmarked tests running, wants to start '{mark}'"
                    )
                    mark_start_here = True
                    selected_instance = instance_num
                    open(
                        instance_dir / f"{TEST_MARK_STARTING_GLOB}_{mark}_{self.worker_id}",
                        "a",
                    ).close()
                    sleep_delay = 2
                    continue

                # get marked tests status
                marked_tests_status = self._get_marked_tests_status(
                    cache=marked_tests_cache, instance_num=instance_num
                )

                # marked tests are already running
                if test_curr_mark:
                    active_mark_file = test_curr_mark[0].name

                    self._update_marked_tests(
                        marked_tests_status=marked_tests_status,
                        active_mark_name=active_mark_file,
                        started_tests=started_tests,
                        instance_num=instance_num,
                    )

                    if not mark:
                        self._log(
                            f"c{instance_num}: marked tests running, I don't have mark"
                        )
                        sleep_delay = 5
                        continue

                    # check if this test has the same mark as currently running marked tests,
                    # so it can run
                    if f"{TEST_CURR_MARK_GLOB}_{mark}" not in active_mark_file:
                        self._log(
                            f"c{instance_num}: marked tests running, "
                            f"I have different mark - {mark}"
                        )
                        sleep_delay = 5
                        continue

                    self._log(
                        f"c{instance_num}: in marked tests branch, "
                        f"I have required mark '{mark}'"
                    )
                    # reset counter of cycles with no marked test running
                    marked_tests_status.no_marked_tests_iter = 0

                # this test is a singleton - no other test can run while this one is running
                if singleton and started_tests:
                    self._log(f"c{instance_num}: tests are running, cannot start singleton")
                    sleep_delay = 5
                    continue

                # this test wants to lock some resources, check if these are not
                # locked or in use
                if lock_resources:
                    res_usable = self._are_resources_usable(
                        resources=lock_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if not res_usable:
                        sleep_delay = 5
                        continue

                # filter out `lock_resources` from the list of `use_resources`
                if use_resources and lock_resources:
                    use_resources = list(set(use_resources) - set(lock_resources))

                # this test wants to use some resources, check if these are not locked
                if use_resources:
                    res_locked = self._are_resources_locked(
                        resources=use_resources,
                        instance_dir=instance_dir,
                        instance_num=instance_num,
                    )
                    if res_locked:
                        sleep_delay = 5
                        continue

                # indicate that the cluster will be restarted
                new_cmd_restart = bool(start_cmd and (first_marked_test or singleton))
                if not restart_here and (
                    new_cmd_restart or self._is_restart_needed(instance_num)
                ):
                    self._log(f"c{instance_num}: setting to restart cluster")
                    restart_here = True
                    selected_instance = instance_num
                    open(
                        instance_dir / f"{RESTART_IN_PROGRESS_GLOB}_{self.worker_id}",
                        "a",
                    ).close()

                # cluster restart will be performed by this worker
                if restart_here and started_tests:
                    self._log(f"c{instance_num}: tests are running, cannot restart")
                    sleep_delay = 2
                    continue

                # we've found a suitable cluster instance
                self._cluster_instance = instance_num
                cluster_instances.set_cardano_node_socket_path(instance_num)

                if restart_here:
                    if restart_ready:
                        # The cluster was already restarted if we are here and
                        # `restart_ready` is still True.
                        restart_ready = False

                        # Remove status files that are no longer valid after restart.
                        for f in instance_dir.glob(f"{RESTART_IN_PROGRESS_GLOB}_*"):
                            os.remove(f)
                        for f in instance_dir.glob(f"{RESTART_NEEDED_GLOB}_*"):
                            os.remove(f)
                    else:
                        self._log(f"c{instance_num}: calling restart")
                        # the actual `_restart` function will be called outside
                        # of global lock
                        restart_ready = True
                        continue

                # from this point on, all conditions needed to start the test are met

                # this test is a singleton
                if singleton:
                    self._log(f"c{instance_num}: starting singleton")
                    open(self.instance_dir / TEST_SINGLETON_FILE, "a").close()

                # this test is a first marked test
                if first_marked_test:
                    self._log(f"c{instance_num}: starting '{mark}' tests")
                    open(self.instance_dir / f"{TEST_CURR_MARK_GLOB}_{mark}", "a").close()

                # create status file for each in-use resource
                _ = [
                    open(
                        self.instance_dir / f"{RESOURCE_IN_USE_GLOB}_{r}_{self.worker_id}",
                        "a",
                    ).close()
                    for r in use_resources
                ]

                # create status file for each locked resource
                _ = [
                    open(
                        self.instance_dir / f"{RESOURCE_LOCKED_GLOB}_{r}_{self.worker_id}",
                        "a",
                    ).close()
                    for r in lock_resources
                ]

                # cleanup = cluster restart after test (group of tests) is finished
                if cleanup:
                    # cleanup after group of tests that are marked with a marker
                    if mark:
                        self._log(f"c{instance_num}: cleanup and mark")
                        open(
                            self.instance_dir / f"{RESTART_AFTER_MARK_GLOB}_{self.worker_id}",
                            "a",
                        ).close()
                    # cleanup after single test (e.g. singleton)
                    else:
                        self._log(f"c{instance_num}: cleanup and not mark")
                        open(
                            self.instance_dir / f"{RESTART_NEEDED_GLOB}_{self.worker_id}",
                            "a",
                        ).close()

                break
            else:
                # if the test cannot run on any instance, return to the top-level loop
                continue

            test_running_file = self.instance_dir / f"{TEST_RUNNING_GLOB}_{self.worker_id}"
            self._log(f"c{self.cluster_instance}: creating {test_running_file}")
            open(test_running_file, "a").close()

            cluster_env = devops_cluster.get_cluster_env()
            state_dir = Path(cluster_env["state_dir"])

            # check if it is necessary to reload data
            self._reload_cluster_obj(state_dir=state_dir)

            cluster_obj = self.cache.cluster_obj
            if not cluster_obj:
                cluster_obj = devops_cluster.get_cluster_obj()

            # `cluster_obj` is ready, we can start the test
            break

    return cluster_obj