Example #1
def check_automated_recovery_from_stopped_node(nodes):
    """
    1) Stop a node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) The node should power on automatically, or if it has been removed from the cluster,
       a new node should be created automatically.
    4) The new osd pods with the same ids should start on the stopped node after it powers
       back on, or on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

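    # Pick a random worker node that is currently running OSD pods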
    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

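    # Find the machine and machineset that own this node, so a replacement worker (IPI) can be detected later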
    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node_name)

    nodes.stop_nodes([osd_node], wait=True)
    log.info(f"Successfully powered off node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

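    # Two possible recovery paths: the node powers back on, or it has been removed and the machineset creates a new worker node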
    try:
        log.info(f"Wait for the node: {osd_node_name} to power on")
        wait_for_nodes_status([osd_node_name])
        log.info(f"Successfully powered on node {osd_node_name}")
    except ResourceWrongStatusException as e:
        log.info(
            f"The worker node {osd_node_name} didn't start due to the exception: {str(e)}. "
            f"It has probably been removed from the cluster. Waiting for a new node to come up..."
        )
        new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)
        osd_node_name = new_wnode.name

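    # The OSD pods should come back with the same ids, either on the recovered node or on its replacement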
    assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                            old_osd_pod_ids,
                                            timeout=300)
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
    )
Example #2
def check_automated_recovery_from_drain_node(nodes):
    """
    1) Drain one worker node.
    2) Delete the OSD pods associated with the node.
    3) The new OSD pods with the same ids that come up should be in a Pending state.
    4) Schedule the worker node.
    5) The OSD pods associated with the node should go back into a Running state and come up
        on the same node.

    """
    osd_node_name = random.choice(get_osd_running_nodes())
    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")
    node_osd_pods = get_osd_pods_having_ids(old_osd_pod_ids)

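    # Cordon the node so that the re-created OSD pods cannot be scheduled back on it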
    unschedule_nodes([osd_node_name])
    log.info(f"Successfully unschedule the node: {osd_node_name}")

    log.info("Delete the node osd pods")
    delete_pods(node_osd_pods)

    new_osd_pods = wait_for_osd_pods_having_ids(osd_ids=old_osd_pod_ids)
    new_osd_pod_names = [p.name for p in new_osd_pods]

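    # With only 3 workers the re-created OSD pods are expected to stay Pending while the
    # node is cordoned; with additional workers they are expected to reach Running elsewhere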
    wnodes = get_worker_nodes()
    if len(wnodes) <= 3:
        expected_pods_status = constants.STATUS_PENDING
    else:
        expected_pods_status = constants.STATUS_RUNNING

    log.info(
        f"Verify the new osd pods {new_osd_pod_names} go into a {expected_pods_status} state"
    )
    res = wait_for_pods_to_be_in_statuses(
        [expected_pods_status],
        new_osd_pod_names,
        raise_pod_not_found_error=True,
    )
    assert res, f"Not all the node osd pods are in a {expected_pods_status} state"

    log.info(f"Wait for the node: {osd_node_name} to be scheduled")
    schedule_nodes([osd_node_name])
    log.info(f"Successfully scheduled the node {osd_node_name}")

    if len(wnodes) <= 3:
        assert wait_for_osd_ids_come_up_on_node(osd_node_name, old_osd_pod_ids)
        log.info(
            f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
        )
Example #3
def delete_and_create_osd_node(osd_node_name):
    """
    Delete an osd node, and create a new one to replace it

    Args:
        osd_node_name (str): The osd node name to delete

    """
    new_node_name = None
    old_osd_ids = node.get_node_osd_ids(osd_node_name)

    old_osd_node_names = node.get_osd_running_nodes()

    # error message for invalid deployment configuration
    msg_invalid = ("ocs-ci config 'deployment_type' value "
                   f"'{config.ENV_DATA['deployment_type']}' is not valid, "
                   f"results of this test run are all invalid.")

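    # Dispatch to the deployment-type and platform specific node replacement helper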
    if config.ENV_DATA["deployment_type"] == "ipi":
        if is_lso_cluster():
            # TODO: Implement functionality for Internal-Attached devices mode
            # once ocs-ci issue #4545 is resolved
            # https://github.com/red-hat-storage/ocs-ci/issues/4545
            pytest.skip(
                "Functionality not implemented for this deployment mode")
        else:
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)

    elif config.ENV_DATA["deployment_type"] == "upi":
        if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
            new_node_name = node.delete_and_create_osd_node_aws_upi(
                osd_node_name)
        elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
            if is_lso_cluster():
                new_node_name = node.delete_and_create_osd_node_vsphere_upi_lso(
                    osd_node_name, use_existing_node=False)
            else:
                new_node_name = node.delete_and_create_osd_node_vsphere_upi(
                    osd_node_name, use_existing_node=False)
    else:
        log.error(msg_invalid)
        pytest.fail(msg_invalid)

    log.info("Start node replacement verification steps...")
    check_node_replacement_verification_steps(osd_node_name, new_node_name,
                                              old_osd_node_names, old_osd_ids)
Example #4
def delete_and_create_osd_node(osd_node_name):
    """
    Delete an osd node, and create a new one to replace it

    Args:
        osd_node_name (str): The osd node name to delete

    """
    new_node_name = None
    old_osd_ids = node.get_node_osd_ids(osd_node_name)

    old_osd_node_names = node.get_osd_running_nodes()

    # error message for invalid deployment configuration
    msg_invalid = ("ocs-ci config 'deployment_type' value "
                   f"'{config.ENV_DATA['deployment_type']}' is not valid, "
                   f"results of this test run are all invalid.")
    # TODO: refactor this so that AWS is not a "special" platform
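    # Dispatch on the platform first, then on the deployment type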
    if config.ENV_DATA["platform"].lower() == constants.AWS_PLATFORM:
        if config.ENV_DATA["deployment_type"] == "ipi":
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)

        elif config.ENV_DATA["deployment_type"] == "upi":
            new_node_name = node.delete_and_create_osd_node_aws_upi(
                osd_node_name)
        else:
            log.error(msg_invalid)
            pytest.fail(msg_invalid)
    elif config.ENV_DATA["platform"].lower() in constants.CLOUD_PLATFORMS:
        if config.ENV_DATA["deployment_type"] == "ipi":
            new_node_name = node.delete_and_create_osd_node_ipi(osd_node_name)
        else:
            log.error(msg_invalid)
            pytest.fail(msg_invalid)
    elif config.ENV_DATA["platform"].lower() == constants.VSPHERE_PLATFORM:
        if is_lso_cluster():
            new_node_name = node.delete_and_create_osd_node_vsphere_upi_lso(
                osd_node_name, use_existing_node=False)

        else:
            new_node_name = node.delete_and_create_osd_node_vsphere_upi(
                osd_node_name, use_existing_node=False)

    log.info("Start node replacement verification steps...")
    check_node_replacement_verification_steps(osd_node_name, new_node_name,
                                              old_osd_node_names, old_osd_ids)
Example #5
    def test_osd_node_restart_and_check_osd_pods_status(self, nodes):
        """
        1) Restart one of the osd nodes.
        2) Check that the osd pods associated with the node should change to a Terminating state.
        3) Wait for the node to reach Ready state.
        4) Check that the new osd pods with the same ids start on the same node.
        5) Check the worker nodes security groups.
        """
        # This is a workaround due to the issue https://github.com/red-hat-storage/ocs-ci/issues/6162
        if is_ms_consumer_cluster():
            logger.info(
                "The test is applicable only for an MS provider cluster. "
                "Switching to the provider cluster...")
            config.switch_to_provider()

        self.create_resources()

        osd_node_name = random.choice(get_osd_running_nodes())
        osd_node = get_node_objs([osd_node_name])[0]

        old_osd_pod_ids = get_node_osd_ids(osd_node_name)
        logger.info(f"osd pod ids: {old_osd_pod_ids}")
        node_osd_pods = pod.get_osd_pods_having_ids(old_osd_pod_ids)
        node_osd_pod_names = [p.name for p in node_osd_pods]

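        # Restart the node without waiting, so the Terminating state of its OSD pods can be observed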
        logger.info(f"Going to restart the node {osd_node_name}")
        nodes.restart_nodes(nodes=[osd_node], wait=False)

        logger.info("Verify the node osd pods go into a Terminating state")
        res = pod.wait_for_pods_to_be_in_statuses(
            [constants.STATUS_TERMINATING], node_osd_pod_names)
        assert res, "Not all the node osd pods are in a Terminating state"

        wait_for_nodes_status(node_names=[osd_node_name])
        assert wait_for_osd_ids_come_up_on_node(osd_node_name,
                                                old_osd_pod_ids,
                                                timeout=300)
        logger.info(
            f"The osd ids {old_osd_pod_ids} successfully came up on the node {osd_node_name}"
        )

        logger.info(
            "Verify the worker nodes security groups on the provider...")
        assert verify_worker_nodes_security_groups()
Example #6
def check_automated_recovery_from_terminated_node(nodes):
    """
    1) Terminate a node.
    2) The rook ceph pods associated with the node should change to a Terminating state.
    3) A new node should be created automatically.
    4) The new osd pods with the same ids as on the terminated node should start on the new osd node.

    """
    old_wnodes = get_worker_nodes()
    log.info(f"Current worker nodes: {old_wnodes}")

    osd_node_name = random.choice(get_osd_running_nodes())
    osd_node = get_node_objs([osd_node_name])[0]

    machine_name = machine.get_machine_from_node_name(osd_node_name)
    machineset = machine.get_machineset_from_machine_name(machine_name)
    log.info(f"machineset name: {machineset}")

    old_osd_pod_ids = get_node_osd_ids(osd_node_name)
    log.info(f"osd pod ids: {old_osd_pod_ids}")

    pod_names_expected_to_terminate = get_node_pod_names_expected_to_terminate(
        osd_node.name)

    nodes.terminate_nodes([osd_node], wait=True)
    log.info(f"Successfully terminated the node: {osd_node_name}")

    log.info("Verify the node rook ceph pods go into a Terminating state")
    res = wait_for_pods_to_be_in_statuses([constants.STATUS_TERMINATING],
                                          pod_names_expected_to_terminate)
    assert res, "Not all the node rook ceph pods are in a Terminating state"

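    # In an IPI deployment, the machine API is expected to provision a replacement worker node automatically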
    new_wnode = wait_for_new_worker_node_ipi(machineset, old_wnodes)

    assert wait_for_osd_ids_come_up_on_node(new_wnode.name,
                                            old_osd_pod_ids,
                                            timeout=300)
    log.info(
        f"The osd ids {old_osd_pod_ids} successfully came up on the node {new_wnode.name}"
    )
Example #7
    def test_check_pods_status_after_node_failure(self, nodes,
                                                  node_restart_teardown):
        """
        Test the pod statuses after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(
            node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name],
                              status=constants.NODE_NOT_READY)
        log.info(
            f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status"
        )

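        # First wait for the rook ceph pod statuses on the failed node to change, then verify the remaining pods recover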
        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout)
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info(
            "Check the rook ceph pods are in 'Running' or 'Completed' state")
        timeout = 480
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node,
            timeout=timeout,
            sleep=30)
        assert are_pods_running, f"The pods are not 'Running' after {timeout} seconds"

        # Get the rook ceph pods, excluding the osd and mon pods that still have the old node ids
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name
            for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name
            for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set)
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(rook_ceph_pod_names_set -
                                       new_node_osd_mon_id_names_set)

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20)
        assert (are_new_pods_running
                ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")
        log.info(f"Starting the node '{node_name}' again...")
        nodes.start_nodes(nodes=[ocs_node])
        wait_for_nodes_status(node_names=[node_name])

        log.info(
            "Waiting for all the pods to be running and cluster health to be OK..."
        )
        wait_for_pods_to_be_running(timeout=600)
        self.sanity_helpers.health_check(tries=40)
Example #8
    def test_check_pods_status_after_node_failure(self, nodes, node_restart_teardown):
        """
        Test the pod statuses after a node failure event.
        All the rook ceph pods should be in "Running" or "Completed"
        state after a node failure event.

        """
        ocs_nodes = get_ocs_nodes()
        if not ocs_nodes:
            pytest.skip("We don't have ocs nodes in the cluster")

        wnodes = get_worker_nodes()

        ocs_node = random.choice(ocs_nodes)
        node_name = ocs_node.name
        log.info(f"Selected node is '{node_name}'")
        # Save the rook ceph pods, the osd ids, and the mon ids before shutting down the node
        rook_ceph_pod_names_not_in_node = get_rook_ceph_pod_names_not_in_node(node_name)
        node_osd_ids = get_node_osd_ids(node_name)
        node_mon_ids = get_node_mon_ids(node_name)

        log.info(f"Shutting down node '{node_name}'")
        nodes.stop_nodes([ocs_node])
        wait_for_nodes_status(node_names=[node_name], status=constants.NODE_NOT_READY)
        log.info(f"The node '{node_name}' reached '{constants.NODE_NOT_READY}' status")

        log.info("Wait for a change in the rook ceph pod statuses...")
        timeout = 480
        is_rook_ceph_pods_status_changed = wait_for_change_in_rook_ceph_pods(
            node_name, timeout=timeout
        )
        assert (
            is_rook_ceph_pods_status_changed
        ), f"Rook Ceph pods status didn't change after {timeout} seconds"

        log.info("Check the rook ceph pods are in 'Running' or 'Completed' state")
        previous_timeout = 480
        timeout = 600
        are_pods_running = wait_for_pods_to_be_running(
            pod_names=rook_ceph_pod_names_not_in_node, timeout=timeout, sleep=30
        )
        assert are_pods_running, (
            f"The pods are not 'Running' even after increasing the timeout "
            f"from {previous_timeout} to {timeout} seconds"
        )

        # Get the rook ceph pods, excluding the osd and mon pods that still have the old node ids
        osd_pods = get_osd_pods()
        new_node_osd_id_names_set = {
            p.name for p in osd_pods if get_osd_pod_id(p) in node_osd_ids
        }
        mon_pods = get_mon_pods()
        new_node_mon_id_names_set = {
            p.name for p in mon_pods if get_mon_pod_id(p) in node_mon_ids
        }

        new_node_osd_mon_id_names_set = new_node_osd_id_names_set.union(
            new_node_mon_id_names_set
        )
        rook_ceph_pod_names_set = set(get_rook_ceph_pod_names())
        new_rook_ceph_pod_names = list(
            rook_ceph_pod_names_set - new_node_osd_mon_id_names_set
        )

        log.info(
            "Verify that the new rook ceph pods are in 'Running' or 'Completed' state"
        )
        timeout = 300
        are_new_pods_running = wait_for_pods_to_be_running(
            pod_names=new_rook_ceph_pod_names, timeout=timeout, sleep=20
        )
        assert (
            are_new_pods_running
        ), f"The new pods are not 'Running' after {timeout} seconds"

        log.info("All the pods are in 'Running' or 'Completed' state")

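        # A managed service cluster is expected to recover the failed worker automatically; otherwise start the node manually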
        if is_managed_service_cluster():
            log.info(
                "When we use the managed service, the worker node should recover automatically "
                "by starting the node or removing it, and creating a new one."
                "Waiting for all the worker nodes to be ready..."
            )
            wait_for_node_count_to_reach_status(node_count=len(wnodes), timeout=900)
            log.info("Waiting for all the pods to be running")
            assert check_pods_after_node_replacement(), "Not all the pods are running"
        else:
            log.info(f"Starting the node '{node_name}' again...")
            nodes.start_nodes(nodes=[ocs_node])
            wait_for_nodes_status(node_names=[node_name])
            log.info("Waiting for all the pods to be running")
            wait_for_pods_to_be_running(timeout=600)

        log.info("Checking that the cluster health is OK...")
        self.sanity_helpers.health_check(tries=40)