Exemplo n.º 1
0
def test_storgroup_semantic_checks():
    """
    This test validates CEPH semantic checks as it applies to storage nodes in
    a replication group.

    Args:
        - None

    Setup:
        - Requires a system with storage nodes (minimum of 2)
        - Requires TiS Release 3 and up

    Test Steps:
        1.  Lock one storage node in a storage node pair
        2.  Check the appropriate alarms are raised
        3.  Check OSDs are down on the storage node
        4.  Check that CEPH is no longer healthy
        5.  Attempt to lock the other node and ensure it is rejected
        6.  Attempt to force lock the other node and ensure it is rejected
        7.  If the storage node is a storage monitor, attempt to lock and force
            lock the controllers
        8.  Unlock the storage node in the storage node pair
        9.  Check that the alarms are cleared
        10.  Check that OSDs are up
        11.  Check that CEPH is healthy

    Defects this addresses:
        1.  CGTS-4286 Unexpected allowing lock action on storage node peergroup
            when redundancy lost
        2.  CGTS-3494 Some OSDs observed to be up on locked storage node
        3.  CGTS-3643 Able to lock standby controller despite only two CEPH
            monitors being available
        4.  CGTS-2690 Storage: Force locking a controller should be rejected when storage
            is locked.
    """

    con_ssh = ControllerClient.get_active_controller()

    table_ = table_parser.table(cli.system('storage-backend-show ceph-store')[1])
    capabilities = table_parser.get_value_two_col_table(table_, 'capabilities')
    replication_factor = capabilities[1]
    LOG.info("The replication factor is: {}".format(replication_factor))

    # We want to test storage-0 since it is a ceph monitor
    # Then we want to test another storage host in another group.  The choice
    # depends on the replication factor.
    storage_nodes = ["storage-0"]
    if replication_factor == "3":
        storage_nodes.append("storage-3")

    if replication_factor == "2" and len(storage_nodes) > 2:
        storage_nodes.append("storage-2")

    LOG.info("Storage hosts under test are: {}".format(storage_nodes))

    for host in storage_nodes:
        LOG.tc_step('Lock {}:'.format(host))
        HostsToRecover.add(host, scope='function')
        rtn_code, out = host_helper.lock_host(host)
        assert rtn_code == 0, out

        LOG.tc_step("Verify CEPH cluster health reflects the OSD being down")
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check that alarms are raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raise')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the ceph health warning alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        hosts = []
        if host == 'storage-0':
            hosts.append('controller-0')
            hosts.append('controller-1')

        for node in hosts:
            LOG.tc_step('Attempt to lock the {}'.format(node))
            HostsToRecover.add(node)
            rtn_code, out = host_helper.lock_host(node, fail_ok=True)
            assert 1 == rtn_code, out

            LOG.tc_step('Attempt to force lock {}'.format(node))
            rtn_code, out = host_helper.lock_host(node, force=True, fail_ok=True)
            assert 1 == rtn_code, out

        LOG.tc_step('Unlock storage host {}'.format(host))
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        LOG.info("Check if alarms have cleared")
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg
Exemplo n.º 2
0
def test_lock_cont_check_mon_down():
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that we alarm when a CEPH monitor goes
    down.  This test is specifically for controller hosts.

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock controller node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs stay up
            - Check that the appropriate alarms are raised:
              - controller-X is locked
              - ceph mon down
        3.  Unlock controller node
            - ensure CEPH is HEALTH_OK
            - Check that alarms are cleared

    Enhancements:
       1.  Should we do both controllers?  This will require a swact.
    """

    con_ssh = ControllerClient.get_active_controller()

    host = system_helper.get_standby_controller_name()
    LOG.tc_step('Lock standby controller node {}'.format(host))
    HostsToRecover.add(host, scope='function')
    rtn_code, out = host_helper.lock_host(host)
    assert rtn_code == 0, out

    LOG.tc_step('Check that storage degrade alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
        "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
        "Alarm {} not raised".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check OSDs are still up after lock')
    osd_list = storage_helper.get_osds(con_ssh=con_ssh)
    for osd_id in osd_list:
        osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
        msg = 'OSD ID {} should be up but is not'.format(osd_id)
        assert osd_up, msg
        msg = 'OSD ID {} is up'.format(osd_id)
        LOG.info(msg)

    LOG.tc_step('Unlock standby controller node {}'.format(host))
    rtn_code, out = host_helper.unlock_host(host, available_only=True)
    assert rtn_code == 0, out

    LOG.tc_step('Check that the host locked alarm is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
        "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check that the Storage Alarm Condition is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
        "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check health of CEPH cluster')
    msg = ''
    end_time = time.time() + 40
    while time.time() < end_time:
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        if ceph_healthy:
            break
    else:
        assert 0, "ceph is not healthy"
Exemplo n.º 3
0
def test_ceph_reboot_storage_node(stx_openstack_required):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon processes recover when they are killed
    nodes.

    Args:
        - Nothing

    Setup:
        - Requires system with storage nodes

    Test Steps:
        0.  Run CEPH pre-check fixture to check:
            - system has storage nodes
            - health of the ceph cluster is okay
            - that we have OSDs provisioned
        1.  Delete existing VMs
        2.  Boot new VMs and run dd on them
        3.  Reboot storage node and ensure both:
            - mon state goes down (if storage-0)
            - OSD state goes down
        4.  Ensure mon and OSD state recover afterwards
        5.  Cleanup VMs

    Potential rework:
        1.  Add the alarms checks for raise and clear
        2.  Maybe we don't want to reboot all storage nodes

    What defects this addresses:
        1.  CGTS-2975

    Update:
        This test was updated for the Storage and Robustness feature.
    """
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")

    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        storage_nodes = system_helper.get_storage_nodes(con_ssh)

        for host in storage_nodes:
            LOG.tc_step('Reboot {}'.format(host))
            HostsToRecover.add(host, scope='function')
            host_helper.reboot_hosts(host, wait_for_offline=True, wait_for_reboot_finish=False)

            LOG.tc_step('Check health of CEPH cluster')
            ceph_healthy = True
            msg = None
            end_time = time.time() + 10
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if not ceph_healthy:
                    break

            assert not ceph_healthy, "ceph is not healthy"
            LOG.info(msg)

            LOG.tc_step('Check that OSDs are down')
            osd_list = storage_helper.get_osds(host, con_ssh)
            all_osds_up = True
            up_list = osd_list.copy()
            end_time = time.time() + 60
            while time.time() < end_time and all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if not osd_up:
                        msg = 'OSD ID {} is down as expected'.format(osd_id)
                        LOG.info(msg)
                        up_list.remove(osd_id)
                if len(up_list) > 0:
                    osd_list = up_list.copy()
                else:
                    msg = ' All OSDs are down as expected'
                    LOG.info(msg)
                    all_osds_up = False

            assert not all_osds_up, " One or more OSD(s) {}  is(are) up but should be down".format(up_list)

            system_helper.wait_for_host_values(host, availability='available')

            LOG.tc_step('Check that OSDs are up')
            osd_list = storage_helper.get_osds(host, con_ssh)
            down_list = osd_list.copy()
            all_osds_up = False
            end_time = time.time() + 60
            while time.time() < end_time and not all_osds_up:
                for osd_id in osd_list:
                    osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
                    if osd_up:
                        msg = 'OSD ID {} is up as expected'.format(osd_id)
                        LOG.info(msg)
                        down_list.remove(osd_id)
                if len(down_list) > 0:
                    osd_list = down_list.copy()
                else:
                    msg = ' All OSDs are up as expected'
                    LOG.info(msg)
                    all_osds_up = True

            assert all_osds_up, " One or more OSD(s) {}  is(are) down but should be up".format(down_list)

            LOG.tc_step('Check health of CEPH cluster')
            end_time = time.time() + 40
            while time.time() < end_time:
                ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
                if ceph_healthy is True:
                    break

            assert ceph_healthy, "ceph is not healthy"

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
Exemplo n.º 4
0
def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked storage
    node.  There are two variants:

    1.  Lock 'storage-0' which is a ceph monitor
    2.  Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock storage node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs on the locked storage node are down
            - Check that the appropriate alarms are raised:
        3.  Unlock storage node
            - ensure CEPH is HEALTH_OK
            - ensure all OSDs on unlocked node are up
            - Check that alarms are cleared

    Note: If the storage node to be locked is monitor, we also expect to see
    the mon down alarm.

    What defects this addresses:
        1.  CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
        - Updated test to write to disk to add I/O load on system

    """

    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System has {} storage nodes:'.format(storage_nodes))
        storage_nodes.remove('storage-0')
        node_id = random.randint(0, len(storage_nodes) - 1)
        host = storage_nodes[node_id]

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")
    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host, strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed
        # DO NOT REMOVE.  This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host, strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if test failed in the middle, otherwise thread will not end
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
Exemplo n.º 5
0
def test_horizon_storage_overview_display(storage_overview_pg):
    """
    Tests the storage overview display:

    Setups:
        - Login as Admin
        - Go to Admin > Platform > Storage Overview

    Teardown:
        - Logout

    Test Steps:
        - Test Storage cluster UUID, Health Status and Details display
        - Test host and rank table display
        - Test osd.# table and status display
    """
    con_ssh = ControllerClient.get_active_controller()
    standby = system_helper.get_standby_controller_name(con_ssh=con_ssh)
    if standby == 'controller-1':
        LOG.info('Workaround for CGTS-16739')
        host_helper.swact_host(con_ssh=con_ssh)

    LOG.tc_step('Check storage cluster UUID, ceph health and storage usage display')
    cli_storage_service_info = []

    uuid = system_helper.get_clusters(field='cluster_uuid')[0]
    cli_storage_service_info.append(uuid)

#   'ceph health' cmd output sample:
#   HEALTH_ERR 1728 pgs are stuck inactive for more than 300 seconds; 1728 pgs stuck inactive; 1728 pgs stuck unclean;\
#   1 mons down, quorum 0,1 controller-0,controller-1
    health_details = con_ssh.exec_cmd('ceph health')[1]
    health_status = health_details.split(' ')[0]
    cli_storage_service_info.append(health_status)

    if health_status == 'HEALTH_ERR':
        health_details = health_details.split('HEALTH_ERR ')[1]
    elif health_status == 'HEALTH_WARN':
        health_details = health_details.split('HEALTH_WARN ')[1]
    cli_storage_service_info.append(health_details)

    horizon_ceph_info = storage_overview_pg.storage_service_info.get_content()
    for info in cli_storage_service_info:
        assert info in horizon_ceph_info.values(), 'Horizon storage cluster info does not match to cli info'
    LOG.tc_step('Storage service details display correct')

    LOG.info('Test host and rank table display')
    ceph_mon_status = eval(con_ssh.exec_cmd('ceph mon_status')[1])
    mon_map = ceph_mon_status.get('monmap')
    cli_ceph_monitor = {}
    # mon_map.get('mons') returns a dict list
    for mon_info_dict in mon_map.get('mons'):
        host_name = mon_info_dict.get('name')
        host_rank = mon_info_dict.get('rank')
        cli_ceph_monitor[host_name] = str(host_rank)

    for host_name in cli_ceph_monitor.keys():
        cli_rank_val = cli_ceph_monitor[host_name]
        horizon_rank_val = storage_overview_pg.get_storage_overview_monitor_info(host_name, 'Rank')
        assert horizon_rank_val == cli_rank_val, '{} rank display incorrectly'.format(host_name)

    LOG.info('Host and rank table display correct')

    LOG.tc_step('Test osd table and status display')
    osd_list = storage_helper.get_osds()
    for osd_id in osd_list:
        if osd_id is not None:
            expt_horizon = {}
            for header in storage_overview_pg.osds_table.column_names:
                host_name = storage_helper.get_osd_host(osd_id)
                osd_name = 'osd.{}'.format(osd_id)
                expt_horizon['Host'] = host_name
                expt_horizon['Name'] = osd_name
                expt_horizon['Status'] = 'up'
                if not storage_helper.is_osd_up(osd_id, con_ssh):
                    expt_horizon['Status'] = 'down'
                horizon_val = storage_overview_pg.get_storage_overview_osd_info(osd_name, header)
                assert expt_horizon[header] == horizon_val, '{}{} display incorrect'.format(osd_name, header)
    LOG.info('Osd table display correct')
    horizon.test_result = True