def finalizer():
    # Validate all nodes are schedulable
    scheduling_disabled_nodes = [
        n.name
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name)
        == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)

    # Validate all nodes are in READY state
    not_ready_nodes = [
        n
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    log.warning(
        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
    )
    if not_ready_nodes:
        nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
        wait_for_nodes_status()

    log.info("All nodes are in Ready status")

    assert prometheus_health_check(), "Prometheus health is degraded"
def finalizer():
    not_ready_nodes = [
        n
        for n in node.get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    logger.warning(
        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
    )
    if not_ready_nodes:
        nodes.restart_nodes(not_ready_nodes)
        node.wait_for_nodes_status()

    # Restart the node if an osd pod stays in CLBO state
    osd_pods_obj_list = get_osd_pods()
    for pod in osd_pods_obj_list:
        if (
            pod.get().get("status").get("containerStatuses")[0].get("state")
            == constants.STATUS_CLBO
        ):
            node_obj = get_pod_node(pod)
            nodes.restart_nodes([node_obj])
            node.wait_for_nodes_status([node_obj.name])

    # Verify OSD encrypted
    if config.ENV_DATA.get("encryption_at_rest"):
        osd_encryption_verification()

    logger.info("Clear crash warnings and osd removal leftovers")
    clear_crash_warning_and_osd_removal_leftovers()
def get_node_by_attached_volume(self, volume):
    """
    Get the node OCS object of the EC2 instance that has the volume attached to it

    Args:
        volume (Volume): The volume to get the EC2 instance for

    Returns:
        OCS: The OCS object of the EC2 instance

    """
    instance_ids = [at.get('InstanceId') for at in volume.attachments]
    assert instance_ids, (
        f"EBS Volume {volume.id} is not attached to any EC2 instance"
    )
    instance_id = instance_ids[0]
    all_nodes = get_node_objs()
    nodes = [
        n for n in all_nodes
        if instance_id in n.get().get('spec').get('providerID')
    ]
    assert nodes, (
        f"Failed to find the OCS object for EC2 instance {instance_id}"
    )
    return nodes[0]
def finalizer():
    scheduling_disabled_nodes = [
        n.name
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name)
        == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)
def noobaa_running_node_restart(pod_name):
    """
    Restart the node on which the given noobaa pod is running

    Args:
        pod_name (str): Name of noobaa pod

    """
    nb_pod_obj = pod.get_pod_obj(
        (
            get_pod_name_by_pattern(
                pattern=pod_name, namespace=constants.OPENSHIFT_STORAGE_NAMESPACE
            )
        )[0],
        namespace=constants.OPENSHIFT_STORAGE_NAMESPACE,
    )
    nb_node_name = pod.get_pod_node(nb_pod_obj).name
    factory = platform_nodes.PlatformNodesFactory()
    nodes = factory.get_nodes_platform()
    nb_nodes = get_node_objs(node_names=nb_node_name)
    log.info(f"{pod_name} is running on {nb_node_name}")
    log.info(f"Restarting node: {nb_node_name}....")
    nodes.restart_nodes_by_stop_and_start(nodes=nb_nodes, force=True)

    # Validate nodes are up and running
    wait_for_nodes_status()
    ceph_health_check(tries=30, delay=60)
    helpers.wait_for_resource_state(
        nb_pod_obj, constants.STATUS_RUNNING, timeout=180
    )
def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
    """
    Test couchbase workload with node reboot
    """
    if pod_name_of_node == 'couchbase':
        node_list = self.cb.get_couchbase_nodes()
    elif pod_name_of_node == 'osd':
        node_list = get_osd_running_nodes()
    elif pod_name_of_node == 'master':
        node_list = get_master_nodes()
    node_1 = get_node_objs(node_list[random.randint(0, len(node_list) - 1)])

    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type='worker', print_table=True
    )
    get_node_resource_utilization_from_adm_top(
        node_type='master', print_table=True
    )

    # Restart relevant node
    nodes.restart_nodes(node_1)
    for sample in TimeoutSampler(300, 5, self.cb.result.done):
        if sample:
            break
        else:
            logging.info("#### ....Waiting for couchbase threads to complete...")
    self.sanity_helpers.health_check()
def test_rgw_host_node_failure(
    self, nodes, node_restart_teardown, mcg_obj, bucket_factory
):
    """
    Test case to fail the node hosting both the RGW and noobaa-db-0 pods
    and verify that new pods spin up on a healthy node
    """
    # Get rgw pods
    rgw_pod_obj = get_rgw_pods()

    # Get noobaa pods
    noobaa_pod_obj = get_noobaa_pods()

    # Get the node where noobaa-db is hosted
    for noobaa_pod in noobaa_pod_obj:
        if noobaa_pod.name == "noobaa-db-0":
            noobaa_pod_node = get_pod_node(noobaa_pod)

    for rgw_pod in rgw_pod_obj:
        pod_node = rgw_pod.get().get("spec").get("nodeName")
        if pod_node == noobaa_pod_node.name:
            # Stop the node
            log.info(
                f"Stopping node {pod_node} where"
                f" rgw pod {rgw_pod.name} and noobaa-db-0 are hosted"
            )
            node_obj = get_node_objs(node_names=[pod_node])
            nodes.stop_nodes(node_obj)

            # Validate that the old rgw pod reached Terminating state
            wait_for_resource_state(
                resource=rgw_pod, state=constants.STATUS_TERMINATING, timeout=720
            )

            # Validate that a new rgw pod spun up
            ocp_obj = OCP(
                kind=constants.POD, namespace=defaults.ROOK_CLUSTER_NAMESPACE
            )
            ocp_obj.wait_for_resource(
                condition=constants.STATUS_RUNNING,
                resource_count=len(rgw_pod_obj),
                selector=constants.RGW_APP_LABEL,
            )

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-1")

            # Start the node
            nodes.start_nodes(node_obj)

            # Create OBC and read and write
            self.create_obc_creation(bucket_factory, mcg_obj, "Object-key-2")

    # Verify cluster health
    self.sanity_helpers.health_check()
    # Verify all storage pods are running
    wait_for_storage_pods()
def finalizer():
    """
    Make sure that all cluster's nodes are in 'Ready' state and if not,
    change them back to 'Ready' state by marking them as schedulable
    """
    scheduling_disabled_nodes = [
        n.name
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name)
        == constants.NODE_READY_SCHEDULING_DISABLED
    ]
    if scheduling_disabled_nodes:
        schedule_nodes(scheduling_disabled_nodes)

    # Remove label created for DC app pods on all worker nodes
    node_objs = get_node_objs()
    for node_obj in node_objs:
        if "dc" in node_obj.get().get("metadata").get("labels").keys():
            remove_label_from_worker_node([node_obj.name], label_key="dc")
def test_nodes_restart(self, nodes, pvc_factory, pod_factory, force):
    """
    Test nodes restart (from the platform layer, i.e., EC2 instances, VMware VMs)
    """
    ocp_nodes = get_node_objs()
    nodes.restart_nodes(nodes=ocp_nodes, force=force)
    self.sanity_helpers.health_check()
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
def finalizer():
    ocp_nodes = get_node_objs()
    for n in ocp_nodes:
        recover_node_to_ready_state(n)

    logger.info("Switch to the original cluster index")
    config.switch_ctx(self.orig_index)
    ceph_health_check()
def test_rolling_nodes_restart(self, nodes, pvc_factory, pod_factory):
    """
    Test restart nodes one after the other and check health status in between
    """
    ocp_nodes = get_node_objs()
    for node in ocp_nodes:
        nodes.restart_nodes(nodes=[node], wait=False)
        self.sanity_helpers.health_check(cluster_check=False, tries=60)

    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
def test_get_vm_status():
    """
    Test of RHV get_vm_status() method implementation
    A VM of a healthy OCS cluster has 'up' status by default.
    """
    rhv_depl = RHVIPI()
    vm = rhv_depl.rhv_util.get_rhv_vm_instance(
        get_node_objs()[0].get().get("metadata").get("name")
    )
    logger.info(f"vm name is: {vm.name}")
    status = rhv_depl.rhv_util.get_vm_status(vm)
    assert "up" == str(status), f"Status of {vm.name} is {status}"
def finalizer():
    not_ready_nodes = [
        n
        for n in node.get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    logger.warning(
        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
    )
    if not_ready_nodes:
        nodes.restart_nodes(not_ready_nodes)
        node.wait_for_nodes_status()
def test_stop_and_start_rhv_vms():
    """
    Test of RHV stop_rhv_vms() method implementation
    A VM has 'down' status after shutdown and 'up' after power on
    """
    rhv_depl = RHVIPI()
    vm = rhv_depl.rhv_util.get_rhv_vm_instance(
        get_node_objs()[0].get().get("metadata").get("name")
    )
    logger.info(f"vm name is: {vm.name}")
    rhv_depl.rhv_util.stop_rhv_vms([vm])
    status = rhv_depl.rhv_util.get_vm_status(vm)
    assert "down" == str(status), f"Status of {vm.name} is {status}"
def check_automated_recovery_from_stopped_node(nodes):
    """
    1) Stop a node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) The node should power on automatically, or, if it was removed from the cluster,
       a new node should be created automatically.
    4) The new osd pods with the same ids should start on the stopped node after it
       powers on, or on the new osd node.
    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node_name
    )

    nodes.stop_nodes([osd_node], wait=True)
    log.info(f"Successfully powered off node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], pod_names_expected_to_terminate
    )
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    try:
        log.info(f"Wait for the node: {osd_node_name} to power on")
        wait_for_nodes_status([osd_node_name])
        log.info(f"Successfully powered on node {osd_node_name}")
    except ResourceWrongStatusException as e:
        log.info(
            f"The worker node {osd_node_name} didn't start due to the exception {str(e)}. "
            f"Probably it has been removed from the cluster. Waiting for a new node to come up..."
        )
        new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)
        osd_node_name = new_wnode.name

    assert wait_for_osd_ids_come_up_on_node(
        osd_node_name, old_osd_pod_ids, timeout=300
    )
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
    )
def get_pod_node(pod_obj):
    """
    Get the node that the pod is running on

    Args:
        pod_obj (OCS): The pod object

    Returns:
        ocs_ci.ocs.ocp.OCP: The node object

    """
    node_name = pod_obj.get().get('spec').get('nodeName')
    return node.get_node_objs(node_names=node_name)[0]
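# A minimal usage sketch, not taken from the source repo: it combines
# get_pod_node() above with get_osd_pods(), a platform nodes object, and
# wait_for_nodes_status(), the same way the finalizers in the other examples
# do, to restart only the node hosting a given OSD pod. The helper names are
# assumed to be importable from the surrounding ocs-ci modules.
def restart_node_of_first_osd_pod(nodes):
    # Pick an OSD pod, resolve its hosting node, and restart that node only
    osd_pod = get_osd_pods()[0]
    osd_node = get_pod_node(osd_pod)
    nodes.restart_nodes([osd_node])
    wait_for_nodes_status([osd_node.name])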
def test_run_couchbase_node_reboot(self, cb_setup, nodes, pod_name_of_node):
    """
    Test couchbase workload with node reboot
    """
    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type="worker", print_table=True
    )
    get_node_resource_utilization_from_adm_top(
        node_type="master", print_table=True
    )

    if pod_name_of_node == "couchbase":
        node_list = self.cb.get_couchbase_nodes()
    elif pod_name_of_node == "osd":
        node_list = get_osd_running_nodes()
    elif pod_name_of_node == "master":
        master_node = get_nodes(pod_name_of_node, num_of_nodes=1)

    # Restart relevant node
    if pod_name_of_node == "master":
        nodes.restart_nodes(master_node, wait=False)
        waiting_time = 40
        log.info(f"Waiting {waiting_time} seconds...")
        time.sleep(waiting_time)
    else:
        restart_node = get_node_objs(
            node_list[random.randint(0, len(node_list) - 1)]
        )
        nodes.restart_nodes(restart_node)

    # Validate all nodes and services are in READY state and up
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(ocp.wait_for_cluster_connectivity)(tries=400)
    retry(
        (CommandFailed, TimeoutError, AssertionError, ResourceWrongStatusException),
        tries=60,
        delay=15,
    )(wait_for_nodes_status)(timeout=1800)

    bg_handler = flowtest.BackgroundOps()
    bg_ops = [self.cb.result]
    retry(CommandFailed, tries=60, delay=15)(
        bg_handler.wait_for_bg_operations
    )(bg_ops, timeout=3600)
    self.sanity_helpers.health_check(tries=40)
def finalizer():
    # Validate all nodes are in READY state
    not_ready_nodes = [
        n
        for n in get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    log.warning(
        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
    )
    if not_ready_nodes:
        nodes.restart_nodes_by_stop_and_start(not_ready_nodes)
        wait_for_nodes_status()

    log.info("All nodes are in Ready status")
def test_p_stop_and_start():
    """
    Test of RHV stop_nodes & start_nodes methods implementation
    """
    rhv_plfrm = RHVNodes()
    nodes = get_node_objs()
    logger.info(f"nodes are: {nodes}")
    node = [nodes[4]]
    rhv_plfrm.stop_nodes(node)
    vm_name = node[0].get().get("metadata").get("name")
    vm_obj = rhv_plfrm.rhv.get_rhv_vm_instance(vm_name)
    status = rhv_plfrm.rhv.get_vm_status(vm_obj)
    assert "down" == str(status), f"Status of {vm_name} is {status}"
    status = rhv_plfrm.rhv.get_vm_status(vm_obj)
    logger.info(f"Status of {vm_name} is {status}")
def ec2_instances(request, aws_obj):
    """
    Get cluster instances

    Returns:
        dict: The ID keys and the name values of the instances

    """
    # Get all cluster nodes objects
    nodes = node.get_node_objs()

    # Get the cluster nodes ec2 instances
    ec2_instances = aws.get_instances_ids_and_names(nodes)
    assert ec2_instances, (
        f"Failed to get ec2 instances for node {[n.name for n in nodes]}"
    )

    def finalizer():
        """
        Make sure all instances are running
        """
        # Getting the instances that are in status 'stopping' (if there are any),
        # to wait for them to get to status 'stopped' so it will be possible
        # to start them
        stopping_instances = {
            key: val
            for key, val in ec2_instances.items()
            if aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPING
        }

        # Waiting for the instances that are in status 'stopping'
        # (if there are any) to reach 'stopped'
        if stopping_instances:
            for stopping_instance in stopping_instances:
                instance = aws_obj.get_ec2_instance(stopping_instance)
                instance.wait_until_stopped()

        stopped_instances = {
            key: val
            for key, val in ec2_instances.items()
            if aws_obj.get_instances_status_by_id(key) == constants.INSTANCE_STOPPED
        }

        # Start the instances
        if stopped_instances:
            aws_obj.start_ec2_instances(instances=stopped_instances, wait=True)

    request.addfinalizer(finalizer)
    return ec2_instances
def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
    """
    Test pgsql workload
    """
    # Create pgbench benchmark
    pgsql.create_pgbench_benchmark(
        replicas=3, transactions=transactions, clients=3
    )

    # Start measuring time
    start_time = datetime.now()

    # Wait for pgbench pod to reach running state
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Choose a node based on the pod it contains
    if pod_name == 'postgres':
        node_list = pgsql.get_pgsql_nodes()
    elif pod_name == 'osd':
        node_list = get_osd_running_nodes()
    node_1 = get_node_objs(node_list[random.randint(0, len(node_list) - 1)])

    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type='worker', print_table=True
    )

    # Restart relevant node
    nodes.restart_nodes(node_1)

    # Wait for pg_bench pod to complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Calculate the time from running state to completed state
    end_time = datetime.now()
    diff_time = end_time - start_time
    log.info(
        f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
    )

    # Get pgbench pods
    pgbench_pods = pgsql.get_pgbench_pods()

    # Validate pgbench run and parse logs
    pgsql.validate_pgbench_run(pgbench_pods)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_run_pgsql_reboot_node(self, pgsql, nodes, transactions, pod_name):
    """
    Test pgsql workload
    """
    # Create pgbench benchmark
    pgsql.create_pgbench_benchmark(replicas=1, transactions=transactions)

    # Start measuring time
    start_time = datetime.now()

    # Wait for pgbench pod to reach running state
    pgsql.wait_for_pgbench_status(status=constants.STATUS_RUNNING)

    # Select a node where pgbench is not running and reboot it
    osd_nodes_list = get_osd_running_nodes()
    node_list = pgsql.filter_pgbench_nodes_from_nodeslist(osd_nodes_list)
    node_1 = get_node_objs(node_list[random.randint(0, len(node_list) - 1)])
    log.info(f"Selected node {node_1} for reboot operation")

    # Check worker node utilization (adm_top)
    get_node_resource_utilization_from_adm_top(
        node_type="worker", print_table=True
    )

    # Restart relevant node
    nodes.restart_nodes(node_1)

    # Wait for pg_bench pod to complete
    pgsql.wait_for_pgbench_status(status=constants.STATUS_COMPLETED)

    # Calculate the time from running state to completed state
    end_time = datetime.now()
    diff_time = end_time - start_time
    log.info(
        f"\npgbench pod reached completed state after {diff_time.seconds} seconds\n"
    )

    # Get pgbench pods
    pgbench_pods = pgsql.get_pgbench_pods()

    # Validate pgbench run and parse logs
    pgsql.validate_pgbench_run(pgbench_pods)

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check(tries=40)
def test_run_jenkins_node_reboot(
    self, jenkins, nodes, node_type, num_projects, num_of_builds
):
    """
    Test Node Reboot jenkins
    """
    # Init number of projects
    jenkins.number_projects = num_projects

    # Create app jenkins
    jenkins.create_app_jenkins()

    # Create jenkins pvc
    jenkins.create_jenkins_pvc()

    # Create jenkins build config
    jenkins.create_jenkins_build_config()

    # Wait for the jenkins deploy pod to reach the Completed state
    jenkins.wait_for_jenkins_deploy_status(status=STATUS_COMPLETED)

    # Get relevant node
    nodes_reboot = jenkins.get_node_name_where_jenkins_pod_not_hosted(
        node_type=node_type, num_of_nodes=1
    )

    # Init number of builds per project
    jenkins.number_builds_per_project = num_of_builds

    # Start Builds
    jenkins.start_build()

    if len(nodes_reboot) > 0:
        # Restart Node
        nodes.restart_nodes(get_node_objs(nodes_reboot))
    else:
        log.info('No node was rebooted')

    # Wait for builds to reach 'Complete' state
    jenkins.wait_for_build_to_complete()

    # Print table of builds
    jenkins.print_completed_builds_results()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()
def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
    """
    1) Restart one of the osd nodes.
    2) Check that the osd pods associated with the node change to a Terminating state.
    3) Wait for the node to reach Ready state.
    4) Check that the new osd pods with the same ids start on the same node.
    5) Check the worker nodes security groups.
    """
    # This is a workaround due to the issue
    # https://github.com/red-hat-storage/ocs-ci/issues/6162
    if is_ms_consumer_cluster():
        logger.info(
            "The test is applicable only for an MS provider cluster. "
            "Switching to the provider cluster..."
        )
        config.switch_to_provider()

    self.create_resources()

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    logger.info(f"osd pod ids: {old_osd_pod_ids}")
    node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
    node_osd_pod_names = [p.name for p in node_osd_pods]

    logger.info(f"Going to restart the node {osd_node_name}")
    nodes.restart_nodes(nodes=[osd_node], wait=False)

    logger.info("Verify the node osd pods go into a Terminating state")
    res = pod.wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], node_osd_pod_names
    )
    assert res, "Not all the node osd pods are in a Terminating state"

    wait_for_nodes_status(node_names=[osd_node_name])
    assert wait_for_osd_ids_come_up_on_node(
        osd_node_name, old_osd_pod_ids, timeout=300
    )
    logger.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
    )

    logger.info("Verify the worker nodes security groups on the provider...")
    assert verify_worker_nodes_security_groups()
def restart_nodes_teardown(self):
    """
    Make sure all EC2 instances are up. To be used in the test teardown
    """
    # Get all cluster nodes objects
    ocp_nodes = get_node_objs()

    # Get the cluster nodes ec2 instances
    ec2_instances = self.get_ec2_instances(ocp_nodes)
    assert ec2_instances, (
        f"Failed to get ec2 instances for node {[n.name for n in ocp_nodes]}"
    )

    logger.info(
        "Getting the instances that are in status 'stopping' (if there are any), "
        "and wait for them to get to status 'stopped', "
        "so it will be possible to start them"
    )
    stopping_instances = {
        key: val
        for key, val in ec2_instances.items()
        if self.aws.get_instances_status_by_id(key) == constants.INSTANCE_STOPPING
    }

    logger.info(
        "Waiting for the instances that are in status 'stopping' "
        "(if there are any) to reach 'stopped'"
    )
    if stopping_instances:
        for stopping_instance in stopping_instances:
            instance = self.aws.get_ec2_instance(stopping_instance)
            instance.wait_until_stopped()

    stopped_instances = {
        key: val
        for key, val in ec2_instances.items()
        if self.aws.get_instances_status_by_id(key) == constants.INSTANCE_STOPPED
    }

    # Start the instances
    if stopped_instances:
        self.aws.start_ec2_instances(instances=stopped_instances, wait=True)
def finalizer():
    not_ready_nodes = [
        n
        for n in node.get_node_objs()
        if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
    ]
    logger.warning(
        f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
    )
    if not_ready_nodes:
        nodes.restart_nodes(not_ready_nodes)
        node.wait_for_nodes_status()

    # Restart the node if an osd pod stays in CLBO state
    osd_pods_obj_list = get_osd_pods()
    for pod in osd_pods_obj_list:
        if (
            pod.get().get("status").get("containerStatuses")[0].get("state")
            == constants.STATUS_CLBO
        ):
            node_obj = get_pod_node(pod)
            nodes.restart_nodes([node_obj])
            node.wait_for_nodes_status([node_obj.name])
def cycle_nodes(cluster_path, action):
    """
    Start/Stop AWS nodes to save costs when not in use.

    Args:
        cluster_path (str): location of the cluster path that has the auth files
        action (str): action to perform, either 'start' or 'stop'

    """
    node_obj_file = os.path.join(cluster_path, NODE_OBJ_FILE)
    nodes_file = os.path.join(cluster_path, NODE_FILE)
    instance_file = os.path.join(cluster_path, INSTANCE_FILE)
    if action == 'stop':
        ceph = CephCluster()
        ceph.set_noout()
        node_objs = get_node_objs()
        kls = platform_nodes.PlatformNodesFactory()
        nodes = kls.get_nodes_platform()
        with open(instance_file, "wb") as instance_file:
            log.info("Storing ocs instances objects")
            pickle.dump(nodes.get_ec2_instances(nodes=node_objs), instance_file)
        with open(nodes_file, "wb") as node_file:
            log.info("Storing ocp nodes objects")
            pickle.dump(nodes, node_file)
        with open(node_obj_file, "wb") as node_obj_file:
            log.info("Stopping all nodes")
            pickle.dump(node_objs, node_obj_file)
        nodes.stop_nodes(nodes=node_objs)
    elif action == 'start':
        with open(instance_file, "rb") as instance_file:
            log.info("Reading instance objects")
            instances = pickle.load(instance_file)
        with open(nodes_file, "rb") as node_file:
            log.info("Reading ocp nodes object")
            nodes = pickle.load(node_file)
        with open(node_obj_file, "rb") as node_obj_file:
            log.info("Starting ocs nodes")
            node_objs = pickle.load(node_obj_file)
        nodes.start_nodes(instances=instances, nodes=node_objs)
        unset_noout()
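# A hypothetical invocation sketch for cycle_nodes() above: stop the AWS nodes
# of an idle cluster, then start them again later from the pickled state files.
# The cluster path below is an assumption for illustration; it must point at
# the directory holding the cluster's auth files.
cycle_nodes(cluster_path="/home/user/clusters/mycluster", action="stop")
# ... later, to bring the same cluster back up:
cycle_nodes(cluster_path="/home/user/clusters/mycluster", action="start")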
def finalizer():
    # Start the powered off nodes
    nodes.restart_nodes_teardown()
    try:
        node.wait_for_nodes_status(status=constants.NODE_READY)
    except ResourceWrongStatusException:
        # Restart the nodes if in NotReady state
        not_ready_nodes = [
            n
            for n in node.get_node_objs()
            if n.ocp.get_resource_status(n.name) == constants.NODE_NOT_READY
        ]
        if not_ready_nodes:
            logger.info(
                f"Nodes in NotReady status found: {[n.name for n in not_ready_nodes]}"
            )
            nodes.restart_nodes(not_ready_nodes)
            node.wait_for_nodes_status(status=constants.NODE_READY)

    # Check ceph health
    assert ceph_health_check(), "Ceph cluster health is not OK"
    logger.info("Ceph cluster health is OK")
def check_automated_recovery_from_terminated_node(nodes):
    """
    1) Terminate a node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) A new node should be created automatically.
    4) The new osd pods with the same ids as those of the terminated node should
       start on the new osd node.
    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node.name
    )

    nodes.terminate_nodes([osd_node], wait=True)
    log.info(f"Successfully terminated the node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses(
        [constants.STATUS_TERMINATING], pod_names_expected_to_terminate
    )
    assert res, "Not all the node rook ceph pods are in a Terminating state"

    new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)

    wait_for_osd_ids_come_up_on_node(new_wnode.name, old_osd_pod_ids, timeout=300)
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {new_wnode.name}"
    )
def test_automated_recovery_from_failed_nodes_IPI_reactive(
    self, nodes, pvc_factory, pod_factory, failure, dc_pod_factory, interface
):
    """
    Knip-678 Automated recovery from failed nodes
    Reactive case - IPI
    """
    # Get OSD running nodes
    osd_running_nodes = get_osd_running_nodes()
    log.info(f"OSDs are running on nodes {osd_running_nodes}")

    # Label osd nodes with fedora app
    label_worker_node(osd_running_nodes, label_key='dc', label_value='fedora')

    # Create DC app pods
    log.info("Creating DC based app pods")
    if interface == 'rbd':
        interface = constants.CEPHBLOCKPOOL
    elif interface == 'cephfs':
        interface = constants.CEPHFILESYSTEM
    dc_pod_obj = []
    for i in range(2):
        dc_pod = dc_pod_factory(interface=interface, node_selector={'dc': 'fedora'})
        self.threads.append(pod.run_io_in_bg(dc_pod, fedora_dc=True))
        dc_pod_obj.append(dc_pod)

    # Get app pods running nodes
    dc_pod_node_name = get_app_pod_running_nodes(dc_pod_obj)
    log.info(f"DC app pod running nodes are {dc_pod_node_name}")

    # Get the nodes running both an osd and an app pod
    common_nodes = get_both_osd_and_app_pod_running_node(
        osd_running_nodes, dc_pod_node_name
    )
    log.info(f"Both OSD and app pods are running on nodes {common_nodes}")

    # Get the machine name using the node name
    machine_name = machine.get_machine_from_node_name(common_nodes[0])
    log.info(f"{common_nodes[0]} associated machine is {machine_name}")

    # Get the machineset name using the machine name
    machineset_name = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"{common_nodes[0]} associated machineset is {machineset_name}")

    # Add a new node and label it
    add_new_node_and_label_it(machineset_name)

    # Get the failure node obj
    failure_node_obj = get_node_objs(node_names=[common_nodes[0]])

    # Induce failure on the selected failure node
    log.info(f"Inducing failure on node {failure_node_obj[0].name}")
    if failure == "shutdown":
        nodes.stop_nodes(failure_node_obj, wait=True)
        log.info(f"Successfully powered off node: {failure_node_obj[0].name}")
    elif failure == "terminate":
        nodes.terminate_nodes(failure_node_obj, wait=True)
        log.info(
            f"Successfully terminated node: {failure_node_obj[0].name} instance"
        )

    try:
        # DC app pods on the failed node will get automatically created on
        # another running node. Wait for all dc app pods to reach running state
        pod.wait_for_dc_app_pods_to_reach_running_state(dc_pod_obj, timeout=720)
        log.info("All the dc pods reached running state")
        pod.wait_for_storage_pods()
    except ResourceWrongStatusException:
        if failure == "shutdown":
            nodes.terminate_nodes(failure_node_obj, wait=True)
            log.info(
                f"Successfully terminated node: {failure_node_obj[0].name} instance"
            )
        raise

    # Check basic cluster functionality by creating resources
    # (pools, storageclasses, PVCs, pods - both CephFS and RBD),
    # run IO and delete the resources
    self.sanity_helpers.create_resources(pvc_factory, pod_factory)
    self.sanity_helpers.delete_resources()

    # Perform cluster and Ceph health checks
    self.sanity_helpers.health_check()