def wait_for_nodes_status(node_names=None, status=constants.NODE_READY, timeout=180):
    """
    Wait until all the given nodes reach the desired status.

    Nodes are polled every 3 seconds; each node that reaches the desired
    status is dropped from the polling set, so the wait narrows down to
    the remaining nodes only.

    Args:
        node_names (list): The node names to wait for to reach the desired
            state. If None, will wait for all cluster nodes.
        status (str): The node status to wait for
            (e.g. 'Ready', 'NotReady', 'SchedulingDisabled')
        timeout (int): The number in seconds to wait for the nodes to reach
            the status

    Raises:
        ResourceWrongStatusException: In case one or more nodes haven't
            reached the desired state within the timeout.

    """
    if not node_names:
        node_names = [node.name for node in get_node_objs()]
    else:
        # Work on a copy: the loop below removes names as nodes reach the
        # desired status, and that must not mutate the caller's list.
        node_names = list(node_names)
    log.info(f"Waiting for nodes {node_names} to reach status {status}")
    try:
        for sample in TimeoutSampler(timeout, 3, get_node_objs, node_names):
            for node in sample:
                if node.ocp.get_resource_status(node.name) == status:
                    node_names.remove(node.name)
            # All nodes accounted for - stop sampling before the timeout.
            if not node_names:
                break
    except TimeoutExpiredError:
        log.error(f"The following nodes haven't reached status {status}: {node_names}")
        raise exceptions.ResourceWrongStatusException(
            node_names, [n.describe() for n in get_node_objs(node_names)]
        )
def wait_for_wl_to_finish(self, timeout=18000, sleep=300):
    """
    Wait until the benchmark workload pod finishes and collect its log.

    Polls the benchmark namespace for the `client` pod (the pod that runs
    the benchmark itself, as opposed to the `server` I/O pods), tolerating
    up to 3 pod restarts (each restart resets the timeout budget). On
    success the pod log is saved to a file under the logs directory.

    Args:
        timeout (int): time in seconds to wait for the benchmark to complete
        sleep (int): sleep interval in seconds between status polls

    Raises:
        Exception: no `client` pod was found, or the benchmark restarted
            more than 3 times.
        ResourceWrongStatusException: the test pod ended in a Failed /
            Error state (any phase other than Running / Pending / Succeeded).
        TimeoutExpiredError: the test did not complete on time.

    """
    log.info(f"Waiting for {self.client_pod_name} to complete")

    Finished = 0
    restarts = 0
    # Remaining wait budget; reset to `timeout` whenever the pod restarts.
    total_time = timeout
    while not Finished and total_time > 0:
        # List all pods in the benchmark-operator namespace as
        # "<name> <phase>" lines (no headers).
        results = run_oc_command(
            "get pod --no-headers -o custom-columns=:metadata.name,:status.phase",
            namespace=benchmark_operator.BMO_NAME,
        )
        (fname, status) = ["", ""]
        for name in results:
            # Looking for the pod which runs the benchmark (not the I/O).
            # This pod contains `client` in its name, and there is only one
            # pod like this; other pods have `server` in the name.
            (fname, status) = name.split()
            if re.search("client", fname):
                break
        else:
            # Loop ended without a break: no `client` pod in the listing,
            # so discard whatever the last line happened to be.
            (fname, status) = ["", ""]

        if fname == "":  # there is no `client` pod !
            err_msg = f"{self.client_pod} Failed to run !!!"
            log.error(err_msg)
            raise Exception(err_msg)

        if not fname == self.client_pod:
            # The client pod name differs from the previous check - the pod
            # was restarted; track the new pod and count the restart.
            log.info(
                f"The pod {self.client_pod} was restart. the new client pod is {fname}"
            )
            self.client_pod = fname
            restarts += 1
            # in case of restarting the benchmark, reset the timeout as well
            total_time = timeout

        if restarts > 3:  # we are tolerating only 3 restarts
            err_msg = f"Too much restarts of the benchmark ({restarts})"
            log.error(err_msg)
            raise Exception(err_msg)

        if status == "Succeeded":
            # Getting the end time of the benchmark - for reporting.
            self.end_time = self.get_time()
            # Capture the full pod log before leaving the loop; it is
            # written to a file below.
            self.test_logs = self.pod_obj.exec_oc_cmd(
                f"logs {self.client_pod}", out_yaml_format=False
            )
            log.info(f"{self.client_pod} completed successfully")
            Finished = 1
        elif (
            status != constants.STATUS_RUNNING
            and status != constants.STATUS_PENDING
        ):
            # If the benchmark pod is not in Running state (and not
            # Succeeded/Pending), there is no need to wait for the timeout.
            # Note: the pod can be in Pending state in case of a restart.
            err_msg = f"{self.client_pod} Failed to run - ({status})"
            log.error(err_msg)
            raise exceptions.ResourceWrongStatusException(
                self.client_pod,
                describe_out=err_msg,
                column="Status",
                expected="Succeeded",
                got=status,
            )
        else:
            # Still Running or Pending - keep waiting and burn down the
            # remaining time budget.
            log.info(
                f"{self.client_pod} is in {status} State, and wait to Succeeded State."
                f" wait another {sleep} sec. for benchmark to complete"
            )
            time.sleep(sleep)
            total_time -= sleep

    if not Finished:
        # The while loop exhausted the time budget without success.
        err_msg = (
            f"{self.client_pod} did not completed on time, "
            f"maybe timeout ({timeout}) need to be increase"
        )
        log.error(err_msg)
        raise exceptions.TimeoutExpiredError(
            self.client_pod, custom_message=err_msg
        )

    # Saving the benchmark internal log into a file at the logs directory.
    # Best-effort: a failure to write the file is logged but not raised.
    log_file_name = f"{self.full_log_path}/test-pod.log"
    try:
        with open(log_file_name, "w") as f:
            f.write(self.test_logs)
        log.info(f"The Test log can be found at : {log_file_name}")
    except Exception:
        log.warning(f"Cannot write the log to the file {log_file_name}")

    log.info(f"The {self.benchmark_name} benchmark complete")