Example #1
def ceph_precheck():
    """
    Run test pre-checks before running CEPH tests.

    """

    LOG.info('Verify the health of the CEPH cluster')
    if not storage_helper.is_ceph_healthy():
        skip('CEPH cluster is not healthy')
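One plausible way to wire a pre-check like this into a test module (an assumption, not shown in the source) is an autouse pytest fixture; the fixture name and scope below are illustrative only.

import pytest

@pytest.fixture(scope='module', autouse=True)
def _ceph_precheck_fixture():
    # Hypothetical wrapper: run the pre-check above once per test module.
    ceph_precheck()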
Example #2
def ceph_backend_installed():
    """Return ceph backend info, or skip the test if ceph is absent or unhealthy."""
    ceph_info = get_ceph_backend_info()
    if not ceph_info:
        skip("No ceph system installed in the lab")
    if not storage_helper.is_ceph_healthy():
        skip("Ceph health not OK")

    return ceph_info
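As a usage sketch (an assumption, not taken from the source), the helper above maps naturally onto a skip-or-provide pytest fixture that dependent tests can request; the fixture and test names here are hypothetical.

import pytest

@pytest.fixture(scope='module')
def ceph_backend():
    # Skips dependent tests when no healthy ceph backend exists; otherwise
    # provides the backend info returned by the helper above.
    return ceph_backend_installed()


def test_uses_ceph_backend(ceph_backend):
    # Hypothetical consumer: only runs when a healthy ceph backend is present.
    assert ceph_backend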
Example #3
def test_apply_storage_profile_negative(create_storage_profile, personality):
    """
    Negative test: attempt to apply a storage profile to a host whose disks or
    personality do not match the profile, and verify that the CLI rejects it.
    """

    if personality == 'controller':
        host_name = system_helper.get_standby_controller_name()
        assert host_name, "No standby controller available on system"
    else:
        host_name = host_helper.get_up_hypervisors()[0]

    # For storage systems, skip test if ceph isn't healthy
    if len(system_helper.get_storage_nodes()) > 0:
        ceph_healthy = storage_helper.is_ceph_healthy()
        if not ceph_healthy:
            skip('Skipping due to ceph not being healthy')

    profile_name = create_storage_profile['profile_name']
    origin_disk_num = create_storage_profile['disk_num']
    disks_num = len(storage_helper.get_host_disks(host_name, 'device_node'))

    expt_err_list = [
        "Please check if host's disks match profile criteria",
        "Failed to create storage function. Host personality must be 'storage'",
    ]
    if disks_num < origin_disk_num - 1:
        expt_err_list.append("profile has more disks than host does")

    positional_arg = host_name + ' ' + profile_name

    HostsToRecover.add(host_name)
    host_helper.lock_host(host_name, swact=True)
    exitcode, output = cli.system('host-apply-storprofile',
                                  positional_arg,
                                  fail_ok=True)
    host_helper.unlock_host(host_name)

    assert exitcode == 1 and any(expt in output for expt in expt_err_list), \
        "Unexpected exit code {} or error message; output: {}".format(exitcode, output)
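The rejection check above (non-zero exit code plus one of several acceptable error strings) is a small pattern worth isolating. A standalone helper written here only as an illustration, not part of the source suite:

def assert_cli_rejected(exitcode, output, expected_errors):
    """Assert a CLI call failed and its output mentions one of the expected errors."""
    assert exitcode == 1, "Command unexpectedly succeeded, output: {}".format(output)
    assert any(err in output for err in expected_errors), \
        "None of {} found in output: {}".format(expected_errors, output)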
Example #4
def _test_storage_profile(personality, from_backing, to_backing):
    """
    This test creates a storage profile and then applies it to a node with
    identical hardware, assuming one exists.

    Storage profiles do not apply on controller nodes.  Storage profiles can be
    applied on controller+compute nodes, compute nodes and storage nodes.

    Arguments:
    - personality (string) - controller, compute or storage
    - from_backing (string) - image, remote or None
    - to_backing (string) - image, remote or None

    Test Steps:
    1.  Query system and determine which nodes have compatible hardware.
    2.  Create a storage profile on one of those nodes
    3.  Apply the created storage profile on a compatible node*
    4.  Ensure the storage profiles have been successfully applied.

    * If the node is a compute node or a controller+compute, we will also change
      the backend if required for additional coverage.

    Returns:
    - Nothing
    """

    global PROFILES_TO_DELETE
    PROFILES_TO_DELETE = []

    # Skip if test is not applicable to hardware under test
    if personality == 'controller' and not system_helper.is_aio_system():
        skip("Test does not apply to controller hosts without subtype compute")

    hosts = system_helper.get_hosts(personality=personality)
    if not hosts:
        skip("No hosts of type {} available".format(personality))

    if (from_backing == "remote" or to_backing
            == "remote") and not system_helper.is_storage_system():
        skip("This test doesn't apply to systems without storage hosts")

    LOG.tc_step("Identify hardware compatible hosts")
    hash_to_hosts = get_hw_compatible_hosts(hosts)

    # Pick the hardware group that has the most compatible hosts
    current_size = 0
    candidate_hosts = []
    for hosts_in_group in hash_to_hosts.values():
        if len(hosts_in_group) > current_size:
            current_size = len(hosts_in_group)
            candidate_hosts = hosts_in_group
    LOG.info(
        "This is the total set of candidate hosts: {}".format(candidate_hosts))

    if len(candidate_hosts) < 2:
        skip("Insufficient hardware compatible hosts to run test")

    # Rsync lab setup dot files between controllers
    con_ssh = ControllerClient.get_active_controller()
    _rsync_files_to_con1(con_ssh=con_ssh, file_to_check="force.txt")

    # Take the hardware compatible hosts and check if any of them already have
    # the backend that we want.  This will save us test time.
    new_to_backing = None
    if personality == "compute":
        from_hosts = []
        to_hosts = []
        for host in candidate_hosts:
            host_backing = host_helper.get_host_instance_backing(host)
            if host_backing == from_backing:
                from_hosts.append(host)
            elif host_backing == to_backing:
                to_hosts.append(host)
        LOG.info(
            "Candidate hosts that already have the right from backing {}: {}".
            format(from_backing, from_hosts))
        LOG.info(
            "Candidate hosts that already have the right to backing {}: {}".
            format(to_backing, to_hosts))

        # Determine what hosts to use
        if not from_hosts and to_hosts:
            to_host = random.choice(to_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        elif not to_hosts and from_hosts:
            from_host = random.choice(from_hosts)
            candidate_hosts.remove(from_host)
            to_host = random.choice(candidate_hosts)
        elif not to_hosts and not from_hosts:
            to_host = random.choice(candidate_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        else:
            to_host = random.choice(to_hosts)
            from_host = random.choice(from_hosts)

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step(
            "Check from host backing and convert to {} if necessary".format(
                from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)
        system_helper.wait_for_host_values(
            from_host,
            availability=HostAvailState.AVAILABLE,
            timeout=120,
            fail_ok=False)

        LOG.tc_step(
            "Check to host backing and convert to {} if necessary".format(
                to_backing))
        new_to_backing = host_helper.set_host_storage_backing(
            to_host, to_backing)
    elif personality == "controller":
        # For now, we don't want to reinstall controller-0 since it will default
        # to pxeboot, but this could be examined as a possible enhancement.
        from_host = "controller-0"
        to_host = "controller-1"

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step(
            "Check from host backing and convert to {} if necessary".format(
                from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)

        LOG.tc_step(
            "Check to host backing and convert to {} if necessary".format(
                to_backing))
        new_to_backing = host_helper.set_host_storage_backing(
            to_host, to_backing)
    else:
        # Backing doesn't apply to storage nodes so just pick from compatible hardware
        from_host = random.choice(candidate_hosts)
        candidate_hosts.remove(from_host)
        to_host = random.choice(candidate_hosts)

    LOG.tc_step(
        "Create storage and interface profiles on the from host {}".format(
            from_host))
    prof_name = 'storprof_{}_{}'.format(
        from_host, time.strftime('%Y%m%d_%H%M%S', time.localtime()))
    storage_helper.create_storage_profile(from_host, profile_name=prof_name)
    PROFILES_TO_DELETE.append(prof_name)

    # Deleting VMs in case the remaining host(s) cannot handle all VMs
    # migrating on lock, particularly important in the case of AIO-DX systems.
    LOG.tc_step(
        "Delete all VMs and lock the host before applying the storage profile")
    vm_helper.delete_vms()
    HostsToRecover.add(to_host, scope='function')
    system_helper.wait_for_host_values(from_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120,
                                       fail_ok=False)
    system_helper.wait_for_host_values(to_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120,
                                       fail_ok=False)

    # Negative test #1 - attempt to apply profile on unlocked host (should be rejected)
    LOG.tc_step('Apply the storage-profile {} onto unlocked host:{}'.format(
        prof_name, to_host))
    cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
    rc, msg = cli.system(cmd, fail_ok=True)
    assert rc != 0, msg
    host_helper.lock_host(to_host, swact=True)

    # 3 conditions to watch for: no partitions, ready partitions and in-use
    # partitions on the compute.  If in-use, delete and freshly install host.
    # If ready, delete all ready partitions to make room for potentially new
    # partitions.  If no partitions, just delete nova-local lvg.
    if personality == "compute":

        # Negative test #2 - attempt to apply profile onto host with existing
        # nova-local (should be rejected)
        LOG.tc_step(
            'Apply the storage-profile {} onto host with existing nova-local:{}'
            .format(prof_name, to_host))
        cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
        rc, msg = cli.system(cmd, fail_ok=True)
        assert rc != 0, msg

        # If we were simply switching backing (without applying a storage
        # profile), the nova-local lvg deletion can be omitted according to design
        LOG.tc_step("Delete nova-local lvg on to host {}".format(to_host))
        cli.system("host-lvg-delete {} nova-local".format(to_host))

        in_use = storage_helper.get_host_partitions(to_host, "In-Use")

        if in_use:

            # Negative test #3 - attempt to apply profile onto host with existing
            # in-use partitions (should be rejected)
            LOG.tc_step('Apply the storage-profile {} onto host with existing '
                        'in-use partitions: {}'.format(prof_name, to_host))
            cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
            rc, msg = cli.system(cmd, fail_ok=True)
            assert rc != 0, msg

            LOG.tc_step(
                "In-use partitions found.  Must delete the host and freshly install before proceeding."
            )
            LOG.info("Host {} has in-use partitions {}".format(
                to_host, in_use))
            lab = InstallVars.get_install_var("LAB")
            lab.update(create_node_dict(lab['compute_nodes'], 'compute'))
            lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
            install_helper.open_vlm_console_thread(to_host)

            LOG.tc_step("Delete the host {}".format(to_host))
            cli.system("host-bulk-export")
            cli.system("host-delete {}".format(to_host))
            assert len(
                system_helper.get_controllers()) > 1, "Host deletion failed"

            cli.system("host-bulk-add hosts.xml")
            system_helper.wait_for_host_values(
                to_host, timeout=6000, availability=HostAvailState.ONLINE)

            wait_for_disks(to_host)

        ready = storage_helper.get_host_partitions(to_host, "Ready")
        if ready:
            LOG.tc_step(
                "Ready partitions have been found.  Must delete them before profile application"
            )
            LOG.info("Host {} has Ready partitions {}".format(to_host, ready))
            for uuid in reversed(ready):
                storage_helper.delete_host_partition(to_host, uuid)
            # Don't bother restoring in this case since the system should be
            # functional after profile is applied.

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(
            prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(
            new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, "Host backing was not changed on storage profile application"

    if personality == "storage":
        if not storage_helper.is_ceph_healthy():
            skip("Cannot run test when ceph is not healthy")

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host,
                                           timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step('Apply the storage-profile {} onto host:{}'.format(
            prof_name, to_host))
        cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name))

        # Re-provision interfaces through lab_setup.sh
        LOG.tc_step("Reprovision the host as necessary")
        files = ['interfaces']
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

    if personality == "controller":

        # Note, install helper doesn't work on all labs.  Some labs don't
        # display BIOS type which causes install helper to fail
        lab = InstallVars.get_install_var("LAB")
        lab.update(create_node_dict(lab['controller_nodes'], 'controller'))
        lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
        install_helper.open_vlm_console_thread(to_host)

        LOG.tc_step("Delete the host {}".format(to_host))
        cli.system("host-bulk-export")
        cli.system("host-delete {}".format(to_host))
        assert len(system_helper.get_controllers()) > 1, "Host deletion failed"

        cli.system("host-bulk-add hosts.xml")
        system_helper.wait_for_host_values(to_host,
                                           timeout=6000,
                                           availability=HostAvailState.ONLINE)

        wait_for_disks(to_host)

        LOG.tc_step("Apply the storage-profile {} onto host:{}".format(
            prof_name, to_host))
        cli.system("host-apply-storprofile {} {}".format(to_host, prof_name))

        # Need to re-provision everything on node through lab_setup (except storage)
        LOG.tc_step("Reprovision the host as necessary")
        files = [
            'interfaces', 'cinder_device', 'vswitch_cpus', 'shared_cpus',
            'extend_cgts_vg', 'addresses'
        ]
        con_ssh = ControllerClient.get_active_controller()
        delete_lab_setup_files(con_ssh, to_host, files)

        rc, msg = install_helper.run_lab_setup()
        assert rc == 0, msg

        LOG.tc_step("Unlock to host")
        host_helper.unlock_host(to_host)

        to_host_backing = host_helper.get_host_instance_backing(to_host)
        LOG.info("To host backing was {} and is now {}".format(
            new_to_backing, to_host_backing))
        assert to_host_backing == from_backing, "Host backing was not changed on storage profile application"
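The candidate-selection loop near the top of this test keeps the largest hardware-compatible group. Assuming get_hw_compatible_hosts() returns a dict mapping a hardware hash to a list of host names (as the loop implies), the same selection can be written more compactly; this is an illustrative alternative, not the suite's code.

def pick_largest_hw_group(hash_to_hosts):
    # Return the largest group of hardware-compatible hosts, or [] if none.
    if not hash_to_hosts:
        return []
    return max(hash_to_hosts.values(), key=len)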
Example #5
def test_storgroup_semantic_checks():
    """
    This test validates CEPH semantic checks as it applies to storage nodes in
    a replication group.

    Args:
        - None

    Setup:
        - Requires a system with storage nodes (minimum of 2)
        - Requires TiS Release 3 and up

    Test Steps:
        1.  Lock one storage node in a storage node pair
        2.  Check the appropriate alarms are raised
        3.  Check OSDs are down on the storage node
        4.  Check that CEPH is no longer healthy
        5.  Attempt to lock the other node and ensure it is rejected
        6.  Attempt to force lock the other node and ensure it is rejected
        7.  If the storage node is a storage monitor, attempt to lock and force
            lock the controllers
        8.  Unlock the storage node in the storage node pair
        9.  Check that the alarms are cleared
        10.  Check that OSDs are up
        11.  Check that CEPH is healthy

    Defects this addresses:
        1.  CGTS-4286 Unexpected allowing lock action on storage node peergroup
            when redundancy lost
        2.  CGTS-3494 Some OSDs observed to be up on locked storage node
        3.  CGTS-3643 Able to lock standby controller despite only two CEPH
            monitors being available
        4.  CGTS-2690 Storage: Force locking a controller should be rejected when storage
            is locked.
    """

    con_ssh = ControllerClient.get_active_controller()

    table_ = table_parser.table(cli.system('storage-backend-show ceph-store')[1])
    capabilities = table_parser.get_value_two_col_table(table_, 'capabilities')
    replication_factor = capabilities[1]
    LOG.info("The replication factor is: {}".format(replication_factor))

    # We want to test storage-0 since it is a ceph monitor
    # Then we want to test another storage host in another group.  The choice
    # depends on the replication factor.
    storage_nodes = ["storage-0"]
    if replication_factor == "3":
        storage_nodes.append("storage-3")

    if replication_factor == "2" and len(storage_nodes) > 2:
        storage_nodes.append("storage-2")

    LOG.info("Storage hosts under test are: {}".format(storage_nodes))

    for host in storage_nodes:
        LOG.tc_step('Lock {}:'.format(host))
        HostsToRecover.add(host, scope='function')
        rtn_code, out = host_helper.lock_host(host)
        assert rtn_code == 0, out

        LOG.tc_step("Verify CEPH cluster health reflects the OSD being down")
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check that alarms are raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the ceph health warning alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        hosts = []
        if host == 'storage-0':
            hosts.append('controller-0')
            hosts.append('controller-1')

        for node in hosts:
            LOG.tc_step('Attempt to lock the {}'.format(node))
            HostsToRecover.add(node)
            rtn_code, out = host_helper.lock_host(node, fail_ok=True)
            assert 1 == rtn_code, out

            LOG.tc_step('Attempt to force lock {}'.format(node))
            rtn_code, out = host_helper.lock_host(node, force=True, fail_ok=True)
            assert 1 == rtn_code, out

        LOG.tc_step('Unlock storage host {}'.format(host))
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        LOG.info("Check if alarms have cleared")
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg
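The per-OSD loops in this test (and in the tests that follow) repeat the same "check every OSD on a host" pattern. A hypothetical polling helper built on the same storage_helper calls the tests already use could express both the down and up checks; the function name, defaults, and timeout values are assumptions.

import time

def wait_for_osds_state(host, con_ssh=None, up=True, timeout=60, interval=5):
    # Poll until every OSD on 'host' reports the desired up/down state.
    end_time = time.time() + timeout
    while time.time() < end_time:
        osd_ids = storage_helper.get_osds(host, con_ssh)
        if osd_ids and all(bool(storage_helper.is_osd_up(osd_id, con_ssh)) is up
                           for osd_id in osd_ids):
            return True
        time.sleep(interval)
    return False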
Example #6
def test_lock_cont_check_mon_down():
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that we alarm when a CEPH monitor goes
    down.  This test is specifically for controller hosts.

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock controller node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs stay up
            - Check that the appropriate alarms are raised:
              - controller-X is locked
              - ceph mon down
        3.  Unlock controller node
            - ensure CEPH is HEALTH_OK
            - Check that alarms are cleared

    Enhancements:
       1.  Should we do both controllers?  This will require a swact.
    """

    con_ssh = ControllerClient.get_active_controller()

    host = system_helper.get_standby_controller_name()
    LOG.tc_step('Lock standby controller node {}'.format(host))
    HostsToRecover.add(host, scope='function')
    rtn_code, out = host_helper.lock_host(host)
    assert rtn_code == 0, out

    LOG.tc_step('Check that storage degrade alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
        "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
        "Alarm {} not raised".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check OSDs are still up after lock')
    osd_list = storage_helper.get_osds(con_ssh=con_ssh)
    for osd_id in osd_list:
        osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
        msg = 'OSD ID {} should be up but is not'.format(osd_id)
        assert osd_up, msg
        msg = 'OSD ID {} is up'.format(osd_id)
        LOG.info(msg)

    LOG.tc_step('Unlock standby controller node {}'.format(host))
    rtn_code, out = host_helper.unlock_host(host, available_only=True)
    assert rtn_code == 0, out

    LOG.tc_step('Check that the host locked alarm is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
        "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check that the Storage Alarm Condition is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
        "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check health of CEPH cluster')
    end_time = time.time() + 40
    while time.time() < end_time:
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        if ceph_healthy:
            break
    else:
        assert 0, "ceph did not become healthy within 40 seconds"
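Several tests in this set end with the same 40-second health poll. A convenience wrapper is sketched below, relying only on the storage_helper.is_ceph_healthy call already used throughout; the wrapper itself is not part of the source helpers.

import time

def wait_for_ceph_healthy(con_ssh=None, timeout=40, interval=5):
    # Poll ceph health until it reports healthy or the timeout expires.
    end_time = time.time() + timeout
    while time.time() < end_time:
        if storage_helper.is_ceph_healthy(con_ssh):
            return True
        time.sleep(interval)
    return False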
Example #7
def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked storage
    node.  There are two variants:

    1.  Lock 'storage-0' which is a ceph monitor
    2.  Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
        - host: storage host to lock, or 'any' to pick a random storage node
          other than 'storage-0'

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock storage node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs on the locked storage node are down
            - Check that the appropriate alarms are raised:
        3.  Unlock storage node
            - ensure CEPH is HEALTH_OK
            - ensure all OSDs on unlocked node are up
            - Check that alarms are cleared

    Note: If the storage node to be locked is monitor, we also expect to see
    the mon down alarm.

    What defects this addresses:
        1.  CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
        - Updated test to write to disk to add I/O load on system

    """

    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System storage nodes: {}'.format(storage_nodes))
        storage_nodes.remove('storage-0')
        host = random.choice(storage_nodes)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")
    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host, strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, \
            "ceph is unexpectedly healthy after locking {}".format(host)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed
        # DO NOT REMOVE.  This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host, strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = False
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break
        assert ceph_healthy, "ceph did not become healthy within 40 seconds"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if test failed in the middle, otherwise thread will not end
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
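The try/finally discipline around the dd writer threads above is the important part: the end event must be set and the threads joined even when an assertion fails mid-test. A condensed sketch of that lifecycle, reusing the same vm_helper.write_in_vm API and Events class the test uses; the wrapper function itself is hypothetical.

def run_with_vm_writers(vms, body, expect_timeout=40):
    # Start a disk-writer thread per VM, run the test body, then always stop
    # and join the writers so they cannot outlive the test.
    end_event = Events("End dd in vms")
    vm_threads = [vm_helper.write_in_vm(vm, end_event=end_event,
                                        expect_timeout=expect_timeout)
                  for vm in vms]
    try:
        body()
        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)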
Example #8
def test_ceph_reboot_storage_node(stx_openstack_required):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon and OSD states recover after storage nodes are
    rebooted.

    Args:
        - Nothing

    Setup:
        - Requires system with storage nodes

    Test Steps:
        0.  Run CEPH pre-check fixture to check:
            - system has storage nodes
            - health of the ceph cluster is okay
            - that we have OSDs provisioned
        1.  Delete existing VMs
        2.  Boot new VMs and run dd on them
        3.  Reboot storage node and ensure both:
            - mon state goes down (if storage-0)
            - OSD state goes down
        4.  Ensure mon and OSD state recover afterwards
        5.  Cleanup VMs

    Potential rework:
        1.  Add the alarms checks for raise and clear
        2.  Maybe we don't want to reboot all storage nodes

    What defects this addresses:
        1.  CGTS-2975

    Update:
        This test was updated for the Storage and Robustness feature.
    """
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        storage_nodes = system_helper.get_storage_nodes(con_ssh)

        for host in storage_nodes:
            LOG.tc_step('Reboot {}'.format(host))
            HostsToRecover.add(host, scope='function')
            host_helper.reboot_hosts(host, wait_for_offline=True, wait_for_reboot_finish=False)

            LOG.tc_step('Check health of CEPH cluster')
            ceph_healthy = True
            end_time = time.time() + 10
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if not ceph_healthy:
                    break

            assert not ceph_healthy, \
                "ceph is unexpectedly healthy after rebooting {}".format(host)

            LOG.tc_step('Check that OSDs are down')
            osd_list = storage_helper.get_osds(host, con_ssh)
            all_osds_up = True
            up_list = osd_list.copy()
            end_time = time.time() + 60
            while time.time() < end_time and all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if not osd_up:
                        msg = 'OSD ID {} is down as expected'.format(osd_id)
                        LOG.info(msg)
                        up_list.remove(osd_id)
                if len(up_list) > 0:
                    osd_list = up_list.copy()
                else:
                    msg = 'All OSDs are down as expected'
                    LOG.info(msg)
                    all_osds_up = False

            assert not all_osds_up, " One or more OSD(s) {}  is(are) up but should be down".format(up_list)

            system_helper.wait_for_host_values(host, availability='available')

            LOG.tc_step('Check that OSDs are up')
            osd_list = storage_helper.get_osds(host, con_ssh)
            down_list = osd_list.copy()
            all_osds_up = False
            end_time = time.time() + 60
            while time.time() < end_time and not all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if osd_up:
                        msg = 'OSD ID {} is up as expected'.format(osd_id)
                        LOG.info(msg)
                        down_list.remove(osd_id)
                if len(down_list) > 0:
                    osd_list = down_list.copy()
                else:
                    msg = 'All OSDs are up as expected'
                    LOG.info(msg)
                    all_osds_up = True

            assert all_osds_up, " One or more OSD(s) {}  is(are) down but should be up".format(down_list)

            LOG.tc_step('Check health of CEPH cluster')
            end_time = time.time() + 40
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if ceph_healthy is True:
                    break

            assert ceph_healthy, "ceph is not healthy"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
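With the hypothetical wait_for_osds_state helper sketched after Example #5, the two OSD-settling loops in this test could collapse to two calls. Shown only as a usage illustration under that assumption:

def check_osds_cycle(host, con_ssh=None):
    # Sketch: all OSDs on the rebooted host should go down, then come back up
    # once the host is available again.
    assert wait_for_osds_state(host, con_ssh, up=False, timeout=60), \
        "One or more OSDs stayed up after rebooting {}".format(host)
    system_helper.wait_for_host_values(host, availability='available')
    assert wait_for_osds_state(host, con_ssh, up=True, timeout=60), \
        "One or more OSDs stayed down after {} recovered".format(host)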