Example No. 1
def test_reapply_openstack():
    container_helper.wait_for_apps_status(apps="stx-openstack",
                                          status=AppStatus.APPLIED,
                                          timeout=600,
                                          check_interval=60)
    container_helper.remove_app(app_name="stx-openstack", check_first=True)
    alarm_id = EventLogID.CONFIG_OUT_OF_DATE
    if system_helper.wait_for_alarm(alarm_id=alarm_id,
                                    entity_id='controller',
                                    timeout=15,
                                    fail_ok=True)[0]:
        system_helper.wait_for_alarm_gone(alarm_id=alarm_id,
                                          entity_id='controller',
                                          timeout=120,
                                          check_interval=10)
    container_helper.apply_app(app_name="stx-openstack",
                               check_first=False,
                               check_interval=300,
                               applied_timeout=5400)
    provider_network_setup(PHYSNET0, PHYSNET1)
    tenant_networking_setup(physnet0=PHYSNET0,
                            physnet1=PHYSNET1,
                            externalnet=EXTERNALNET,
                            publicnet=PUBLICNET,
                            privatenet=PRIVATENET,
                            internalnet=INTERNALNET,
                            publicsubnet=PUBLICSUBNET,
                            privatesubnet=PRIVATESUBNET,
                            internalsubnet=INTERNALSUBNET,
                            externalsubnet=EXTERNALSUBNET,
                            publicrouter=PUBLICROUTER,
                            privaterouter=PRIVATEROUTER)
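The raise-then-clear handling of the config out-of-date alarm seen above recurs in several examples below (see also apply_app in Example No. 13). A minimal sketch of that pattern factored into its own helper, assuming system_helper and EventLogID are importable exactly as in the example and that wait_for_alarm returns a tuple whose first element indicates whether the alarm was found; the helper name and default values are hypothetical:

def wait_for_config_alarm_cycle(entity_id='controller', raise_timeout=15,
                                clear_timeout=120):
    # If the config out-of-date alarm appears for entity_id, wait for it to clear.
    alarm_id = EventLogID.CONFIG_OUT_OF_DATE
    raised = system_helper.wait_for_alarm(alarm_id=alarm_id,
                                          entity_id=entity_id,
                                          timeout=raise_timeout,
                                          fail_ok=True)[0]
    if raised:
        system_helper.wait_for_alarm_gone(alarm_id=alarm_id,
                                          entity_id=entity_id,
                                          timeout=clear_timeout,
                                          check_interval=10)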
Example No. 2
def __clear_config_out_of_date_alarms(hosts):
    LOG.info("Check config out-of-date alarms are raised against the nodes and lock unlock them to clear alarms")
    for node in hosts:
        system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE, entity_id="host={}".format(node))

    LOG.info("Wait 60 seconds to ensure the service parameter is applied")
    time.sleep(60)

    host_helper.lock_unlock_hosts(hosts=hosts)
    for node in hosts:
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE, entity_id="host={}".format(node))
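A hedged usage sketch for the helper above; the controller names are placeholders rather than values taken from the original test:

# Hypothetical usage with placeholder host names.
__clear_config_out_of_date_alarms(hosts=['controller-0', 'controller-1'])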
Example No. 3
def test_enable_tpm(swact_first):
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step('Check if TPM is already configured')
    code, cert_id, cert_type = get_tpm_status(con_ssh)

    if code == 0:
        LOG.info('TPM already configured on the lab, cert_id:{}, cert_type:{}'.
                 format(cert_id, cert_type))

        LOG.tc_step('disable TPM first in order to test enabling TPM')
        code, output = remove_cert_from_tpm(con_ssh,
                                            fail_ok=False,
                                            check_first=False)
        assert 0 == code, 'failed to disable TPM'
        time.sleep(30)

        LOG.info('Waiting for the config out-of-date alarm to clear')
        system_helper.wait_for_alarm_gone(EventLogID.CONFIG_OUT_OF_DATE)

    else:
        LOG.info('TPM is NOT configured on the lab')
        LOG.info('-code:{}, cert_id:{}, cert_type:{}'.format(
            code, cert_id, cert_type))

    if swact_first:
        LOG.tc_step('Swact the active controller as instructed')

        if len(system_helper.get_controllers()) < 2:
            LOG.info('Less than 2 controllers, skip swact')
        else:
            host_helper.swact_host(fail_ok=False)
            copy_config_from_local(
                con_ssh, local_conf_backup_dir,
                os.path.join(HostLinuxUser.get_home(), conf_backup_dir))

    LOG.tc_step('Install HTTPS Certificate into TPM')
    code, output = store_cert_into_tpm(
        con_ssh,
        check_first=False,
        fail_ok=False,
        pem_password=HostLinuxUser.get_password())
    assert 0 == code, 'Failed to install certificate into TPM, cert-file'

    LOG.info('OK, certificate is installed into TPM')

    LOG.info('Wait for the config out-of-date alarm to clear')
    system_helper.wait_for_alarm_gone(EventLogID.CONFIG_OUT_OF_DATE)

    LOG.tc_step(
        'Verify the configuration changes for impacted components, expecting all changes to exist'
    )
    verify_configuration_changes(expected=True, connection=con_ssh)
Example No. 4
def clear_config_out_of_date_alarm():
    active, standby = system_helper.get_active_standby_controllers()
    for host in (standby, active):
        if host and system_helper.wait_for_alarm(
                alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                timeout=5,
                entity_id=host,
                fail_ok=True)[0]:
            host_helper.lock_host(host, swact=True)
            time.sleep(60)
            host_helper.unlock_host(host)
            system_helper.wait_for_alarm_gone(
                alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                entity_id=host,
                fail_ok=False)
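Since these examples are pytest-based, the helper above could also be wired in as teardown. A minimal sketch, assuming pytest is available and the helper is importable from a conftest or shared module; the fixture name is hypothetical:

import pytest

@pytest.fixture(scope='module')
def clear_config_alarms():
    # Hypothetical fixture: run the test(s), then clear lingering 250.001 alarms.
    yield
    clear_config_out_of_date_alarm()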
Example No. 5
def test_dead_office_recovery(reserve_unreserve_all_hosts_module):
    """
    Test dead office recovery with vms
    Args:
        reserve_unreserve_all_hosts_module: test fixture to reserve/unreserve all vlm nodes for the lab under test

    Setups:
        - Reserve all nodes in vlm

    Test Steps:
        - Boot 5 vms with various boot_source, disks, etc and ensure they can be reached from NatBox
        - Power off all nodes in vlm using multi-processing to simulate a power outage
        - Power on all nodes
        - Wait for nodes to become online/available
        - Check vms are recovered after hosts come back up and vms can be reached from NatBox

    """
    LOG.tc_step("Boot 5 vms with various boot_source, disks, etc")
    vms = vm_helper.boot_vms_various_types()

    hosts = system_helper.get_hosts()
    hosts_to_check = system_helper.get_hosts(availability=['available', 'online'])

    LOG.info("Online or Available hosts before power-off: {}".format(hosts_to_check))
    LOG.tc_step("Powering off hosts in multi-processes to simulate power outage: {}".format(hosts))
    region = None
    if ProjVar.get_var('IS_DC'):
        region = ProjVar.get_var('PRIMARY_SUBCLOUD')

    try:
        vlm_helper.power_off_hosts_simultaneously(hosts, region=region)
    finally:
        LOG.tc_step("Wait for 60 seconds and power on hosts: {}".format(hosts))
        time.sleep(60)
        LOG.info("Hosts to check after power-on: {}".format(hosts_to_check))
        vlm_helper.power_on_hosts(hosts, reserve=False, reconnect_timeout=HostTimeout.REBOOT+HostTimeout.REBOOT,
                                  hosts_to_check=hosts_to_check, region=region)

    LOG.tc_step("Check vms are recovered after dead office recovery")
    vm_helper.wait_for_vms_values(vms, fail_ok=False, timeout=600)
    for vm in vms:
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm, timeout=VMTimeout.DHCP_RETRY)
    computes = host_helper.get_hypervisors()
    if len(computes) >= 4:
        system_helper.wait_for_alarm(alarm_id=EventLogID.MULTI_NODE_RECOVERY, timeout=120)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.MULTI_NODE_RECOVERY, check_interval=60, timeout=1200)
Example No. 6
def wait_after_change_sysadmin_password():
    total_wait_time = MAX_WAIT_FOR_ALARM
    each_wait_time = 120
    waited_time = 0

    time.sleep(30)

    alarm_id = ALARM_ID_OUTOF_CONFIG
    while waited_time < total_wait_time:
        waited_time += each_wait_time

        found = system_helper.wait_for_alarm(alarm_id=alarm_id,
                                             fail_ok=True,
                                             timeout=each_wait_time)[0]
        if found:
            LOG.info('OK, found alarm for password change, alarm-id:{}'.format(
                alarm_id))
            alarm_gone = system_helper.wait_for_alarm_gone(
                alarm_id, fail_ok=True, timeout=each_wait_time)
            if alarm_gone:
                LOG.info(
                    'OK, found alarms were cleared for password change, alarm-id:{}'
                    .format(alarm_id))
                break
    else:
        assert False, 'Failed to find alarms/or alarms not cleared for password change within {} seconds, ' \
                      'expecting alarm-id:{}'.format(waited_time, alarm_id)
    return True
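A hedged usage sketch; the call simply blocks until the password-change alarm has been raised and cleared, and the surrounding password-change step is assumed to happen elsewhere in the test:

# Hypothetical usage after changing the sysadmin password earlier in the test.
wait_after_change_sysadmin_password()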
Example No. 7
def test_increase_controllerfs():
    """
    This test increases the size of the various controllerfs filesystems all at
    once.

    Arguments:
    - None

    Test Steps:
    - Query each filesystem for its current size
    - Increase the size of each filesystem at once

    Assumptions:
    - There is sufficient free space to allow for an increase, otherwise skip
      test.

    """
    drbdfs_val = {}
    LOG.tc_step("Determine the space available for each drbd filesystem")
    for fs in DRBDFS:
        drbdfs_val[fs] = storage_helper.get_controllerfs_values(fs)[0]
        LOG.info("Current value of {} is {}".format(fs, drbdfs_val[fs]))
        drbdfs_val[fs] = drbdfs_val[fs] + 1
        LOG.info("Will attempt to increase the value of {} to {}".format(
            fs, drbdfs_val[fs]))

    LOG.tc_step("Increase the size of all filesystems")
    storage_helper.modify_controllerfs(**drbdfs_val)
    # Need to wait until the change takes effect before checking the
    # filesystems
    hosts = system_helper.get_controllers()
    for host in hosts:
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host),
            timeout=600)

    LOG.tc_step(
        "Confirm the underlying filesystem size matches what is expected")
    storage_helper.check_controllerfs(**drbdfs_val)
Example No. 8
def alarm_summary_add_and_del(subcloud):
    try:
        # Test adding alarm on subcloud
        ssh_client = ControllerClient.get_active_controller(name=subcloud)
        LOG.info("Wait for alarm raised on subcloud {}".format(subcloud))
        system_helper.wait_for_alarm(
            alarm_id=EventLogID.PROVIDER_NETWORK_FAILURE, con_ssh=ssh_client)
        LOG.tc_step(
            "Ensure alarm summary match nn Central with subcloud: {}".format(
                subcloud))
        check_alarm_summary_match_subcloud(subcloud)

        # Test clearing alarm on subcloud
        LOG.tc_step("Clear alarm on subcloud: {}".format(subcloud))
        ssh_client.exec_cmd('fmClientCli -D host=testhost-0', fail_ok=False)
        LOG.info("Wait for alarm clear on subcloud {}".format(subcloud))
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.PROVIDER_NETWORK_FAILURE, con_ssh=ssh_client)
        check_alarm_summary_match_subcloud(subcloud)
    finally:
        ssh_client = ControllerClient.get_active_controller(name=subcloud)
        LOG.info("Clear alarm on subcloud: {}".format(subcloud))
        ssh_client.exec_cmd('fmClientCli -D host=testhost-0')
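A possible call site for the helper above; the subcloud name is a placeholder, not a value from the original code:

# Hypothetical usage; 'subcloud-1' is a placeholder subcloud name.
alarm_summary_add_and_del('subcloud-1')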
Example No. 9
def test_disable_tpm(swact_first):
    ssh_client = ControllerClient.get_active_controller()

    LOG.tc_step('Check if TPM is already configured')
    code, cert_id, cert_type = get_tpm_status(ssh_client)

    if code == 0:
        LOG.info('TPM is configured on the lab')

        if swact_first:
            LOG.tc_step('Swact the active controller as instructed')
            if len(system_helper.get_controllers()) < 2:
                LOG.info('Less than 2 controllers, skip swact')
            else:
                host_helper.swact_host(fail_ok=False)
                copy_config_from_local(
                    ssh_client, local_conf_backup_dir,
                    os.path.join(HostLinuxUser.get_home(), conf_backup_dir))

        LOG.tc_step('Disabling TPM')
        code, output = remove_cert_from_tpm(ssh_client,
                                            fail_ok=False,
                                            check_first=False)
        assert 0 == code, 'failed to disable TPM'

        LOG.info('Wait for the config out-of-date alarm to clear')
        system_helper.wait_for_alarm_gone(EventLogID.CONFIG_OUT_OF_DATE)

        LOG.tc_step(
            'Verify the configuration changes for impacted components, expecting none of the changes to be present'
        )
        verify_configuration_changes(expected=False, connection=ssh_client)

    else:
        LOG.info('TPM is NOT configured on the lab, skip the test')
        skip('TPM is NOT configured on the lab, skip the test')
Example No. 10
def _test_modify_oam_ips(restore_oam, oam_ips):
    """
    Change OAM IPs using CLI

    Verify that oam IPs on both standby and active controller can be modified by cli

    Test Steps:
        - verify there is no 250.001 alarm
        - modify oam IPs
        - verify oam IPs have been changed
        - verify 250.001 Configuration out-of-date alarms are raised for the controllers
        - lock/unlock the standby controller
        - verify the standby controller's 250.001 alarm is cleared
        - swact the controllers
        - lock/unlock the other controller
        - verify there are no 250.001 alarms
        - verify all controllers are in good status

    Teardown:
        - Revert oam ips if modified

    """

    # make sure there is no 250.001 alarm in alarm-list
    if not system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE, fail_ok=False):
        skip("250.001 Alarms did not clear at the beginning of the test")

    LOG.tc_step("Modify OAM IPs to new IPs")
    new_oam_ip0 = SpareIP.NEW_OAM_IP0
    new_oam_ip1 = SpareIP.NEW_OAM_IP1
    new_oam_ip2 = SpareIP.NEW_OAM_IP2

    kwargs = {}
    if 'c0' in oam_ips:
        kwargs['oam_c0_ip'] = new_oam_ip0
    if 'c1' in oam_ips:
        kwargs['oam_c1_ip'] = new_oam_ip1
    if 'floating' in oam_ips:
        kwargs['oam_floating_ip'] = new_oam_ip2

    system_helper.modify_oam_ips(**kwargs)
Example No. 11
def activate_upgrade(con_ssh=None, fail_ok=False):
    """
    Activates upgrade
    Args:
        con_ssh (SSHClient):
        fail_ok (bool):

    Returns (tuple):
        (0, dict/list) - success
        (1, <stderr>)   # cli returns stderr, applicable if fail_ok is true

    """
    rc, output = cli.system('upgrade-activate', ssh_client=con_ssh, fail_ok=True)
    if rc != 0:
        err_msg = "CLI system upgrade-activate failed: {}".format(output)
        LOG.warning(err_msg)
        if fail_ok:
            return rc, output
        else:
            raise exceptions.CLIRejected(err_msg)

    if not system_helper.wait_for_alarm_gone("250.001", con_ssh=con_ssh, timeout=900, check_interval=60, fail_ok=True):

        alarms = system_helper.get_alarms(alarm_id="250.001")
        err_msg = "After activating upgrade alarms are not cleared : {}".format(alarms)
        LOG.warning(err_msg)
        if fail_ok:
            return 1, err_msg
        else:
            raise exceptions.HostError(err_msg)

    if not wait_for_upgrade_activate_complete(fail_ok=True):
        err_msg = "Upgrade activate failed"
        LOG.warning(err_msg)
        if fail_ok:
            return 1, err_msg
        else:
            raise exceptions.HostError(err_msg)

    LOG.info("Upgrade activation complete")
    return 0, None
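A hedged usage sketch for activate_upgrade(), exercising the fail_ok return codes described in its docstring; the surrounding upgrade workflow is assumed:

# Hypothetical usage: tolerate failure and log it instead of raising.
code, output = activate_upgrade(fail_ok=True)
if code != 0:
    LOG.warning("Upgrade activation did not complete: {}".format(output))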
Example No. 12
def test_ceph_mon_process_kill(monitor, ceph_monitors):
    """
    us69932_tc2_ceph_mon_process_kill from us69932_ceph_monitoring.odt

    Verify that ceph mon processes recover when they are killed.

    Args:
        - Nothing

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Run CEPH pre-check fixture to check:
            - system has storage nodes
            - health of the ceph cluster is okay
            - that we have OSDs provisioned
        2.  Pick one ceph monitor and remove it from the quorum
        3.  Kill the monitor process
        4.  Check that the appropriate alarms are raised
        5.  Restore the monitor to the quorum
        6.  Check that the alarms clear
        7.  Ensure the ceph monitor is restarted under a different pid

    Potential flaws:
        1.  We're not checking if unexpected alarms are raised (TODO)

    Teardown:
        - None

    What defects this addresses:
        1.  CGTS-2975

    """
    if monitor not in ceph_monitors:
        skip("{} is not a ceph monitor".format(monitor))

    LOG.tc_step('Get process ID of ceph monitor')
    mon_pid = storage_helper.get_mon_pid(monitor)

    with host_helper.ssh_to_host(monitor) as host_ssh:
        with host_ssh.login_as_root() as root_ssh:
            LOG.tc_step('Remove the monitor')
            cmd = 'ceph mon remove {}'.format(monitor)
            root_ssh.exec_cmd(cmd)

            LOG.tc_step('Stop the ceph monitor')
            cmd = 'service ceph stop mon.{}'.format('controller' if system_helper.is_aio_duplex() else monitor)
            root_ssh.exec_cmd(cmd)

    LOG.tc_step('Check that ceph monitor failure alarm is raised')
    system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_DEGRADE, timeout=300)

    with host_helper.ssh_to_host(monitor) as host_ssh:
        with host_ssh.login_as_root() as root_ssh:
            LOG.tc_step('Get cluster fsid')
            cmd = 'ceph fsid'
            fsid = host_ssh.exec_cmd(cmd)[0]
            ceph_conf = '/etc/ceph/ceph.conf'

            LOG.tc_step('Remove old ceph monitor directory')
            cmd = 'rm -rf /var/lib/ceph/mon/ceph-{}'.format(monitor)
            root_ssh.exec_cmd(cmd)

            LOG.tc_step('Re-add the monitor')
            cmd = 'ceph-mon -i {} -c {} --mkfs --fsid {}'.format(monitor, ceph_conf, fsid)
            root_ssh.exec_cmd(cmd)

    LOG.tc_step('Check the ceph storage alarm condition clears')
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.STORAGE_DEGRADE, timeout=360)

    LOG.tc_step('Check the ceph-mon process is restarted with a different pid')
    mon_pid2 = None
    for i in range(0, PROC_RESTART_TIME):
        mon_pid2 = storage_helper.get_mon_pid(monitor, fail_ok=True)
        if mon_pid2 and mon_pid2 != mon_pid:
            break
        time.sleep(5)

    LOG.info('Old pid is {} and new pid is {}'.format(mon_pid, mon_pid2))
    msg = 'Process did not restart in time'
    assert mon_pid2 and mon_pid2 != mon_pid, msg
Example No. 13
def apply_app(app_name,
              check_first=False,
              fail_ok=False,
              applied_timeout=300,
              check_interval=10,
              wait_for_alarm_gone=True,
              con_ssh=None,
              auth_info=Tenant.get('admin_platform')):
    """
    Apply/Re-apply application via system application-apply. Check for status
    reaches 'applied'.
    Args:
        app_name (str):
        check_first:
        fail_ok:
        applied_timeout:
        check_interval:
        con_ssh:
        wait_for_alarm_gone (bool):
        auth_info:

    Returns (tuple):
        (-1, "<app_name> is already applied. Do nothing.")     # only returns
        if check_first=True.
        (0, "<app_name> (re)applied successfully")
        (1, <std_err>)  # cli rejected
        (2, "<app_name> failed to apply")   # did not reach applied status
        after apply.

    """
    if check_first:
        app_status = get_apps(application=app_name,
                              field='status',
                              con_ssh=con_ssh,
                              auth_info=auth_info)
        if app_status and app_status[0] == AppStatus.APPLIED:
            msg = '{} is already applied. Do nothing.'.format(app_name)
            LOG.info(msg)
            return -1, msg

    LOG.info("Apply application: {}".format(app_name))
    code, output = cli.system('application-apply',
                              app_name,
                              ssh_client=con_ssh,
                              fail_ok=fail_ok,
                              auth_info=auth_info)
    if code > 0:
        return 1, output

    res = wait_for_apps_status(apps=app_name,
                               status=AppStatus.APPLIED,
                               timeout=applied_timeout,
                               check_interval=check_interval,
                               con_ssh=con_ssh,
                               auth_info=auth_info,
                               fail_ok=fail_ok)[0]
    if not res:
        return 2, "{} failed to apply".format(app_name)

    if wait_for_alarm_gone:
        alarm_id = EventLogID.CONFIG_OUT_OF_DATE
        if system_helper.wait_for_alarm(alarm_id=alarm_id,
                                        entity_id='controller',
                                        timeout=15,
                                        fail_ok=True,
                                        auth_info=auth_info,
                                        con_ssh=con_ssh)[0]:
            system_helper.wait_for_alarm_gone(alarm_id=alarm_id,
                                              entity_id='controller',
                                              timeout=120,
                                              check_interval=10,
                                              con_ssh=con_ssh,
                                              auth_info=auth_info)

    msg = '{} (re)applied successfully'.format(app_name)
    LOG.info(msg)
    return 0, msg
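A hedged usage sketch for apply_app(); the application name mirrors Example No. 1, and the timeout values are illustrative only:

# Hypothetical usage; -1 means the app was already applied (check_first=True).
code, msg = apply_app(app_name='stx-openstack',
                      check_first=True,
                      applied_timeout=5400,
                      check_interval=30)
assert code in (-1, 0), msg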
Example No. 14
def test_increase_extensionfs_with_alarm():
    """
    This test increases the size of the extension controllerfs filesystem while
    there is an alarm condition for the fs.

    Arguments:
    - None

    Test Steps:
    - Query the filesystem for their current size
    - cause an alarm condition by filling the space on that fs
    - verify controller-0 is degraded
    - Increase the size of extension filesystem.
    - Verify alarm is gone

    Assumptions:
    - There is sufficient free space to allow for an increase, otherwise skip
      test.
    """
    file_loc = "/opt/extension"
    cmd = "cd " + file_loc
    file_path = file_loc + "/" + "testFile"
    drbdfs_val = {}
    fs = "extension"

    active_controller = system_helper.get_active_controller_name()

    LOG.tc_step("Determine the space available for extension filesystem")
    drbdfs_val[fs] = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, drbdfs_val[fs]))

    # Fill roughly 91% of the current size to trigger the fs usage alarm
    LOG.info(
        "Will attempt to fill up the space to 91% of fs {} of value of {}".
        format(fs, drbdfs_val[fs]))
    file_size = int((drbdfs_val[fs] * 0.91) * 1000)
    file_size = str(file_size) + "M"
    cmd1 = "fallocate -l {} testFile".format(file_size)
    con_ssh = ControllerClient.get_active_controller()
    con_ssh.exec_cmd(cmd)
    con_ssh.exec_sudo_cmd(cmd1)
    if not con_ssh.file_exists(file_path=file_path):
        LOG.info("File {} is not created".format(file_path))
        return 0

    # fill_in_fs(size=file_size)
    LOG.tc_step(
        "Verifying that the alarm is created after filling the fs space in {}".
        format(fs))
    system_helper.wait_for_alarm(alarm_id="100.104",
                                 entity_id=active_controller,
                                 timeout=600,
                                 strict=False)

    # verify the controller is in degraded state
    LOG.tc_step(
        "Verifying controller is degraded after filling the fs space in {}".
        format(fs))
    system_helper.wait_for_host_values(active_controller,
                                       availability='degraded')

    drbdfs_val[fs] = drbdfs_val[fs] + 2

    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))

    LOG.tc_step("Increase the size of extension filesystem")
    storage_helper.modify_controllerfs(**drbdfs_val)

    # Need to wait until the change takes effect before checking the
    # filesystems
    hosts = system_helper.get_controllers()
    for host in hosts:
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host),
            timeout=600)
        LOG.tc_step(
            "Verifying that the alarm is cleared after increasing the fs space in {}"
            .format(fs))
        system_helper.wait_for_alarm_gone(alarm_id="100.104",
                                          entity_id="host={}".format(host),
                                          timeout=600,
                                          strict=False)

    LOG.tc_step(
        "Confirm the underlying filesystem size matches what is expected")
    storage_helper.check_controllerfs(**drbdfs_val)

    # verify the controller is in available state
    LOG.tc_step(
        "Verifying that the controller is in available state after increasing the fs space in {}"
        .format(fs))
    system_helper.wait_for_host_values(active_controller,
                                       availability='available')
Example No. 15
def _test_increase_ceph_mon():
    """
    Increase the size of ceph-mon.  Only applicable to a storage system.

    Fails until CGTS-8216

    Test steps:
    1.  Determine the current size of ceph-mon
    2.  Attempt to modify ceph-mon to invalid values
    3.  Check if there is free space to increase ceph-mon
    4.  Attempt to increase ceph-mon
    5.  Wait for config out-of-date alarms to raise
    6.  Lock/unlock all affected nodes (controllers and storage)
    7.  Wait for alarms to clear
    8.  Check that ceph-mon has the correct updated value

    Enhancement:
    1.  Possibly check there is enough disk space for ceph-mon to increase.  Not sure if
    this is required since there always seems to be some space on the rootfs.

    """
    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    LOG.info("ceph_mon_gib is currently: {}".format(ceph_mon_gib))

    LOG.tc_step("Attempt to modify ceph-mon to invalid values")
    invalid_cmg = ['19', '41', 'fds']
    for value in invalid_cmg:
        host = "controller-0"
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(host, value),
                   fail_ok=True)

    if int(ceph_mon_gib) >= 30:
        skip("Insufficient disk space to execute test")

    ceph_mon_gib_avail = 40 - int(ceph_mon_gib)
    new_ceph_mon_gib = math.trunc(ceph_mon_gib_avail / 10) + int(ceph_mon_gib)

    LOG.tc_step("Increase ceph_mon_gib to {}".format(new_ceph_mon_gib))
    hosts = system_helper.get_controllers()
    for host in hosts:
        cli.system("ceph-mon-modify {} ceph_mon_gib={}".format(
            host, new_ceph_mon_gib))
        # We only need to do this for one controller now and it applies to both
        break

    LOG.info("Wait for expected alarms to appear")
    storage_hosts = system_helper.get_storage_nodes()
    total_hosts = hosts + storage_hosts
    for host in total_hosts:
        system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                     entity_id="host={}".format(host))

    LOG.tc_step("Lock/unlock all affected nodes")
    for host in storage_hosts:
        HostsToRecover.add(host)
        host_helper.lock_host(host)
        host_helper.unlock_host(host)
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host))
        time.sleep(10)

    standby = system_helper.get_standby_controller_name()
    active = system_helper.get_active_controller_name()
    HostsToRecover.add(standby)
    host_helper.lock_host(standby)
    host_helper.unlock_host(standby)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(standby))
    time.sleep(10)
    host_helper.swact_host(active)
    HostsToRecover.add(active)
    host_helper.lock_host(active)
    host_helper.unlock_host(active)
    system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                      entity_id="host={}".format(active))

    table_ = table_parser.table(cli.system("ceph-mon-list")[1])
    ceph_mon_gib = table_parser.get_values(table_, "ceph_mon_gib",
                                           **{"hostname": "controller-0"})[0]
    assert int(ceph_mon_gib) == new_ceph_mon_gib, "ceph-mon did not change"
Example No. 16
def test_modify_drdb_swact_then_reboot():
    """
    This test modifies the size of the drbd-based filesystems, does an
    immediate swact and then reboots the active controller.

    Arguments:
    - None

    Test Steps:
    - Determine how much free space we have available
    - Increase database
    - Increase extension
    - Initiate a controller swact
    - Initiate a controller reboot

    Assumptions:
    - None

    """

    drbdfs = DRBDFS
    con_ssh = ControllerClient.get_active_controller()

    LOG.tc_step("Determine the available free space on the system")
    cmd = "vgdisplay -C --noheadings --nosuffix -o vg_free --units g cgts-vg"
    rc, out = con_ssh.exec_sudo_cmd(cmd)
    free_space = out.lstrip()
    LOG.info("Available free space on the system is: {}".format(free_space))
    if float(free_space) <= 10:
        skip("Not enough free space to complete test.")

    drbdfs_val = {}
    LOG.tc_step("Determine the space available for each drbd fs")
    for fs in drbdfs:
        table_ = table_parser.table(
            cli.system('controllerfs-show {}'.format(fs))[1])
        drbdfs_val[fs] = table_parser.get_value_two_col_table(table_, 'size')

    LOG.info("Current fs values are: {}".format(drbdfs_val))

    LOG.tc_step("Increase the size of the extension and database filesystem")
    partition_name = "database"
    partition_value = drbdfs_val[partition_name]
    backup_freespace = math.trunc(float(free_space) / 10)
    new_partition_value = backup_freespace + int(partition_value)
    cmd = "controllerfs-modify {}={}".format(partition_name,
                                             new_partition_value)
    cli.system(cmd)

    partition_name = "extension"
    partition_value = drbdfs_val[partition_name]
    cgcs_freespace = math.trunc(backup_freespace / 2)
    new_partition_value = cgcs_freespace + int(partition_value)
    cmd = "controllerfs-modify {}={}".format(partition_name,
                                             new_partition_value)
    cli.system(cmd)

    hosts = system_helper.get_controllers()
    for host in hosts:
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host),
            timeout=600)
    standby_cont = system_helper.get_standby_controller_name()
    system_helper.wait_for_host_values(standby_cont,
                                       availability=HostAvailState.AVAILABLE)
    host_helper.swact_host()

    act_cont = system_helper.get_active_controller_name()
    host_helper.reboot_hosts(act_cont)

    time.sleep(5)

    system_helper.wait_for_alarm_gone(
        alarm_id=EventLogID.HOST_RECOVERY_IN_PROGRESS,
        entity_id="host={}".format(act_cont),
        timeout=600)
Example No. 17
def test_resize_drbd_filesystem_while_resize_inprogress():
    """
    This test attempts to resize a drbd filesystem while an existing drbd
    resize is in progress.  This should be rejected.

    Arguments:
    - None

    Test steps:
    1.  Increase the size of the extension filesystem to allow the test to proceed.
    2.  Wait for alarms to clear and then check the underlying filesystem is
        updated.
    3.  Attempt to resize the database filesystem.  This should be successful.
    4.  Attempt to resize it again immediately.  This should be rejected.

    Assumptions:
    - None

    """

    start_time = common.get_date_in_format()
    drbdfs_val = {}
    fs = "extension"
    LOG.tc_step(
        "Increase the {} size before proceeding with rest of test".format(fs))
    drbdfs_val[fs] = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, drbdfs_val[fs]))
    drbdfs_val[fs] = int(drbdfs_val[fs]) + 5
    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))
    LOG.tc_step("Increase the size of filesystems")
    storage_helper.modify_controllerfs(**drbdfs_val)

    hosts = system_helper.get_controllers()
    for host in hosts:
        system_helper.wait_for_events(
            event_log_id=EventLogID.CONFIG_OUT_OF_DATE,
            start=start_time,
            entity_instance_id="host={}".format(host),
            strict=False,
            **{'state': 'set'})

    for host in hosts:
        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
            entity_id="host={}".format(host),
            timeout=600)

    LOG.tc_step(
        "Confirm the underlying filesystem size matches what is expected")
    storage_helper.check_controllerfs(**drbdfs_val)

    drbdfs_val = {}
    fs = "database"
    LOG.tc_step("Determine the current filesystem size")
    value = storage_helper.get_controllerfs_values(fs)[0]
    LOG.info("Current value of {} is {}".format(fs, value))
    drbdfs_val[fs] = int(value) + 1
    LOG.info("Will attempt to increase the value of {} to {}".format(
        fs, drbdfs_val[fs]))

    LOG.tc_step("Increase the size of filesystems")
    storage_helper.modify_controllerfs(**drbdfs_val)

    LOG.tc_step("Attempt to increase the size of the filesystem again")
    drbdfs_val[fs] = int(drbdfs_val[fs]) + 1
    code = storage_helper.modify_controllerfs(fail_ok=True, **drbdfs_val)[0]
    assert 1 == code, "Filesystem modify succeeded while failure is expected: {}".format(
        drbdfs_val)

    # Appearance of sync alarm is delayed so wait for it to appear and then
    # clear
    if not system_helper.is_aio_simplex():
        system_helper.wait_for_alarm(alarm_id=EventLogID.CON_DRBD_SYNC,
                                     timeout=300)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CON_DRBD_SYNC,
                                          timeout=300)
Example No. 18
def modify_https(enable_https=True,
                 check_first=True,
                 con_ssh=None,
                 auth_info=Tenant.get('admin_platform'),
                 fail_ok=False):
    """
    Modify platform https via 'system modify https_enable=<bool>'

    Args:
        enable_https (bool): True/False to enable or disable https
        check_first (bool): whether to check if the lab is already in the
            requested state before modifying
        con_ssh (SSHClient):
        auth_info (dict):
        fail_ok (bool):

    Returns (tuple):
        (-1, msg)
        (0, msg)
        (1, <std_err>)

    """
    if check_first:
        is_https = keystone_helper.is_https_enabled(source_openrc=False,
                                                    auth_info=auth_info,
                                                    con_ssh=con_ssh)
        if (is_https and enable_https) or (not is_https and not enable_https):
            msg = "Https is already {}. Do nothing.".format(
                'enabled' if enable_https else 'disabled')
            LOG.info(msg)
            return -1, msg

    LOG.info("Modify system to {} https".format(
        'enable' if enable_https else 'disable'))
    res, output = system_helper.modify_system(fail_ok=fail_ok,
                                              con_ssh=con_ssh,
                                              auth_info=auth_info,
                                              https_enabled='{}'.format(
                                                  str(enable_https).lower()))
    if res == 1:
        return 1, output

    LOG.info("Wait up to 60s for config out-of-date alarm with best effort.")
    system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                 entity_id='controller-',
                                 strict=False,
                                 con_ssh=con_ssh,
                                 timeout=60,
                                 fail_ok=True,
                                 auth_info=auth_info)

    LOG.info("Wait up to 600s for config out-of-date alarm to clear.")
    system_helper.wait_for_alarm_gone(EventLogID.CONFIG_OUT_OF_DATE,
                                      con_ssh=con_ssh,
                                      timeout=600,
                                      check_interval=20,
                                      fail_ok=False,
                                      auth_info=auth_info)

    LOG.info("Wait up to 300s for public endpoints to be updated")
    expt_status = 'enabled' if enable_https else 'disabled'
    end_time = time.time() + 300
    while time.time() < end_time:
        if keystone_helper.is_https_enabled(con_ssh=con_ssh,
                                            source_openrc=False,
                                            auth_info=auth_info) == \
                enable_https:
            break
        time.sleep(10)
    else:
        raise exceptions.KeystoneError(
            "Https is not {} in 'openstack endpoint list'".format(expt_status))

    msg = 'Https is {} successfully'.format(expt_status)
    LOG.info(msg)
    # TODO: install certificate for https. There will be a warning msg if
    #  self-signed certificate is used

    if not ProjVar.get_var('IS_DC') or \
            (auth_info and auth_info.get('region', None) in (
            'RegionOne', 'SystemController')):
        # If DC, use the central region https as system https, since that is
        # the one used for external access
        CliAuth.set_vars(HTTPS=enable_https)

    return 0, msg
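A hedged usage sketch for modify_https(); a return code of -1 means https was already in the requested state:

# Hypothetical usage: enable https and accept "already enabled" as success.
code, msg = modify_https(enable_https=True, check_first=True, fail_ok=False)
assert code in (-1, 0), msg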
Example No. 19
def test_storgroup_semantic_checks():
    """
    This test validates CEPH semantic checks as it applies to storage nodes in
    a replication group.

    Args:
        - None

    Setup:
        - Requires a system with storage nodes (minimum of 2)
        - Requires TiS Release 3 and up

    Test Steps:
        1.  Lock one storage node in a storage node pair
        2.  Check the appropriate alarms are raised
        3.  Check OSDs are down on the storage node
        4.  Check that CEPH is no longer healthy
        5.  Attempt to lock the other node and ensure it is rejected
        6.  Attempt to force lock the other node and ensure it is rejected
        7.  If the storage node is a storage monitor, attempt to lock and force
            lock the controllers
        8.  Unlock the storage node in the storage node pair
        9.  Check that the alarms are cleared
        10.  Check that OSDs are up
        11.  Check that CEPH is healthy

    Defects this addresses:
        1.  CGTS-4286 Unexpected allowing lock action on storage node peergroup
            when redundancy lost
        2.  CGTS-3494 Some OSDs observed to be up on locked storage node
        3.  CGTS-3643 Able to lock standby controller despite only two CEPH
            monitors being available
        4.  CGTS-2690 Storage: Force locking a controller should be rejected when storage
            is locked.
    """

    con_ssh = ControllerClient.get_active_controller()

    table_ = table_parser.table(cli.system('storage-backend-show ceph-store')[1])
    capabilities = table_parser.get_value_two_col_table(table_, 'capabilities')
    replication_factor = capabilities[1]
    LOG.info("The replication factor is: {}".format(replication_factor))

    # We want to test storage-0 since it is a ceph monitor
    # Then we want to test another storage host in another group.  The choice
    # depends on the replication factor.
    storage_nodes = ["storage-0"]
    if replication_factor == "3":
        storage_nodes.append("storage-3")

    if replication_factor == "2" and len(storage_nodes) > 2:
        storage_nodes.append("storage-2")

    LOG.info("Storage hosts under test are: {}".format(storage_nodes))

    for host in storage_nodes:
        LOG.tc_step('Lock {}:'.format(host))
        HostsToRecover.add(host, scope='function')
        rtn_code, out = host_helper.lock_host(host)
        assert rtn_code == 0, out

        LOG.tc_step("Verify CEPH cluster health reflects the OSD being down")
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check that alarms are raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that the ceph health warning alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        hosts = []
        if host == 'storage-0':
            hosts.append('controller-0')
            hosts.append('controller-1')

        for node in hosts:
            LOG.tc_step('Attempt to lock the {}'.format(node))
            HostsToRecover.add(node)
            rtn_code, out = host_helper.lock_host(node, fail_ok=True)
            assert 1 == rtn_code, out

            LOG.tc_step('Attempt to force lock {}'.format(node))
            rtn_code, out = host_helper.lock_host(node, force=True, fail_ok=True)
            assert 1 == rtn_code, out

        LOG.tc_step('Unlock storage host {}'.format(host))
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        LOG.info("Check if alarms have cleared")
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert ceph_healthy, "ceph is not healthy"

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg
Example No. 20
def upgrade_host(host, timeout=InstallTimeout.UPGRADE, fail_ok=False, con_ssh=None,
                 auth_info=Tenant.get('admin_platform'), lock=False, unlock=False):
    """
    Upgrade given host
    Args:
        host (str):
        timeout (int): MAX seconds to wait for host to become online after unlocking
        fail_ok (bool):
        con_ssh (SSHClient):
        auth_info (str):
        unlock (bool):
        lock


    Returns (tuple):
        (0, "Host is upgraded and in online state.")
        (1, "Cli host upgrade rejected. Applicable only if ail_ok")
        (2, "Host failed data migration. Applicable only if fail_ok")
        (3, "Host did not come online after upgrade. Applicable if fail_ok ")
        (4, "Host fail lock before starting upgrade". Applicable if lock arg is True and fail_ok")
        (5, "Host fail to unlock after host upgrade.  Applicable if unlock arg is True and fail_ok")
        (6, "Host unlocked after upgrade, but alarms are not cleared after 120 seconds.
        Applicable if unlock arg is True and fail_ok")

    """
    LOG.info("Upgrading host {}...".format(host))

    if lock:
        if system_helper.get_host_values(host, 'administrative', con_ssh=con_ssh)[0] == HostAdminState.UNLOCKED:
            message = "Host is not locked. Locking host  before starting upgrade"
            LOG.info(message)
            rc, output = host_helper.lock_host(host, con_ssh=con_ssh, fail_ok=True)
            if rc != 0 and rc != -1:
                err_msg = "Host {} fail on lock before starting upgrade: {}".format(host, output)
                if fail_ok:
                    return 4, err_msg
                else:
                    raise exceptions.HostError(err_msg)
    if system_helper.is_aio_simplex():
        exitcode, output = simplex_host_upgrade(con_ssh=con_ssh)
        return exitcode, output

    exitcode, output = cli.system('host-upgrade', host, ssh_client=con_ssh, fail_ok=True, auth_info=auth_info,
                                  timeout=timeout)
    if exitcode == 1:
        err_msg = "Host {} cli upgrade host failed: {}".format(host, output)
        if fail_ok:
            return 1, err_msg
        else:
            raise exceptions.HostError(err_msg)

    # sleep for 180 seconds to let host be re-installed with upgrade release
    time.sleep(180)

    if not system_helper.wait_for_host_values(host, timeout=timeout, check_interval=60,
                                              availability=HostAvailState.ONLINE, con_ssh=con_ssh,
                                              fail_ok=fail_ok):
        err_msg = "Host {} did not become online  after upgrade".format(host)
        if fail_ok:
            return 3, err_msg
        else:
            raise exceptions.HostError(err_msg)

    if host.strip() == "controller-1":
        rc, output = _wait_for_upgrade_data_migration_complete(timeout=timeout,
                                                               auth_info=auth_info, fail_ok=fail_ok, con_ssh=con_ssh)
        if rc != 0:
            err_msg = "Host {} upgrade data migration failure: {}".format(host, output)
            if fail_ok:
                return 2, err_msg
            else:
                raise exceptions.HostError(err_msg)

    if unlock:
        rc, output = host_helper.unlock_host(host, fail_ok=True, available_only=True)
        if rc != 0:
            err_msg = "Host {} fail to unlock after host upgrade: ".format(host, output)
            if fail_ok:
                return 5, err_msg
            else:
                raise exceptions.HostError(err_msg)

        # wait until  400.001  alarms get cleared
        if not system_helper.wait_for_alarm_gone("400.001", fail_ok=True):
            err_msg = "Alarms did not clear after host {} upgrade and unlock: ".format(host)
            if fail_ok:
                return 6, err_msg
            else:
                raise exceptions.HostError(err_msg)

    LOG.info("Upgrading host {} complete ...".format(host))
    return 0, None
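A hedged usage sketch for upgrade_host(); 'controller-1' mirrors the host that is special-cased for data migration in the helper, and lock/unlock are enabled so the 400.001 alarm check runs:

# Hypothetical usage; non-zero codes map to the failure cases listed in the docstring.
code, msg = upgrade_host('controller-1', lock=True, unlock=True, fail_ok=True)
assert code == 0, msg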
Example No. 21
def test_transition_sensorgroup_actions(host,
                                        event_type,
                                        action_level,
                                        action,
                                        suppression,
                                        expt_alarm,
                                        expt_host_avail,
                                        new_action,
                                        new_suppression,
                                        new_expt_alarm,
                                        new_expt_host_avail,
                                        sensor_data_fit):
    """
    Verify the sensorgroup can properly transition from one action to another when
    an event remains unchanged.

    Test Steps:
        - Get a sensorgroup to test
        - Set the event level and expected action
        - trigger an out-of-scope event for that sensorgroup
        - verify that the expected action is taken
        - transition the sensorgroup action
        - verify the new action is taken
    """
    bmc_hosts = sensor_data_fit
    if host not in bmc_hosts:
        skip("{} is not configured with BMC sensor".format(host))

    global HOST
    HOST = host
    # Get a sensor to validate
    expt_severity = action_level.split('_')[-1] if 'yes' in expt_alarm else None
    new_expt_severity = action_level.split('_')[-1] if 'yes' in new_expt_alarm else None

    if suppression is not None:
        suppression = True if suppression == 'suppressed' else False
    if new_suppression is not None:
        new_suppression = True if new_suppression == 'suppressed' else False

    for sensorgroup_name in bmc_helper.get_sensorgroup_name(host):
        LOG.tc_step("Validating that sensorgroup: {} can be set to sensor action: {} for event level: {}".
                    format(sensorgroup_name, action, action_level))

        # Set the sensorgroup action, suppress state, and audit interval
        bmc_helper.modify_sensorgroup(host, sensorgroup_name, value='name', audit_interval=10, suppress=suppression,
                                      **{action_level: action})

        # Get a sensor that is part of the sensorgroup
        sensor_name = bmc_helper.get_first_sensor_from_sensorgroup(sensorgroup_name, host)
        entity_id = 'host={}.sensor={}'.format(host, sensor_name)

        LOG.tc_step("Trigger event for sensorgroup: {} and sensor name: {}".format(sensorgroup_name, sensor_name))
        bmc_helper.trigger_event(host, sensor_name, event_type)

        LOG.tc_step("Check the alarm status for sensor: {}".format(sensor_name))
        res = system_helper.wait_for_alarm(alarm_id=EventLogID.BMC_SENSOR_ACTION, timeout=60, entity_id=entity_id,
                                           severity=expt_severity, strict=False, fail_ok=True)[0]

        if expt_alarm == 'yes_alarm':
            assert res, "FAIL: Alarm expected but no alarms found for sensor on {}".format(host)
        else:
            assert not res, "FAIL: Alarm raised but no alarms were expected for sensor on {}".format(host)

        LOG.tc_step("Check the host status for sensor: {}".format(sensor_name))
        system_helper.wait_for_host_values(host, timeout=90, availability=expt_host_avail, fail_ok=False)

        start_time = common.get_date_in_format()
        # modify sensorgroup with new action/suppression level
        LOG.tc_step("Transition sensorgroup: {} from current sensor action: {} to new sensor action: {} "
                    "for event level: {}".format(sensorgroup_name, action, new_action, action_level))

        bmc_helper.modify_sensorgroup(host, sensorgroup_name, value='name', suppress=new_suppression,
                                      **{action_level: new_action})

        # Verify the new action is taken
        LOG.tc_step("Check alarm status after transition from {} to {} for {}".format(action, new_action, sensor_name))

        if new_expt_alarm == 'yes_alarm':
            system_helper.wait_for_alarm(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id,
                                         severity=new_expt_severity, timeout=60, strict=False, fail_ok=False)
        else:
            events = system_helper.wait_for_events(timeout=60, num=10, event_log_id=EventLogID.BMC_SENSOR_ACTION,
                                                   entity_instance_id=entity_id, start=start_time, state='log',
                                                   fail_ok=True, strict=False, severity=new_expt_severity)
            if new_expt_alarm == 'yes_log':
                assert events, "No event log found for {} {} {} event".format(host, sensorgroup_name, action_level)
            else:
                assert not events, "Event logged unexpectedly for sensor on {}".format(host)
                system_helper.wait_for_alarm_gone(EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id, strict=False,
                                                  timeout=5, fail_ok=False)

        LOG.tc_step("Check the host status for sensor: {}".format(sensor_name))
        system_helper.wait_for_host_values(host, timeout=90, availability=new_expt_host_avail, fail_ok=False)

        LOG.tc_step("Check the alarm clears and host in available state after clearing events")
        bmc_helper.clear_events(host)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=host, strict=False,
                                          timeout=60)
        system_helper.wait_for_host_values(host, fail_ok=False, availability='available')

    HOST = ''
Example No. 22
def _test_basic_swift_provisioning(pool_size, pre_swift_check):
    """
    Verifies basic swift provisioning works as expected
    Args:
        pool_size:
        pre_swift_check:

    Returns:

    """
    ceph_backend_info = get_ceph_backend_info()

    if pool_size == 'default' and pre_swift_check[0]:
        skip("Swift is already provisioned")

    if pool_size == 'fixed_size' and pre_swift_check[0]:
        skip("Swift is already provisioned and set to non-default pool value")

    object_pool_gib = None
    cinder_pool_gib = ceph_backend_info['cinder_pool_gib']

    if pool_size == 'default':
        if not ceph_backend_info['object_gateway']:
            LOG.tc_step("Enabling SWIFT object store .....")

    else:
        if not ceph_backend_info['object_gateway']:
            skip("Swift is not provisioned")

        total_gib = ceph_backend_info['ceph_total_space_gib']
        unallocated_gib = (total_gib - cinder_pool_gib -
                           ceph_backend_info['glance_pool_gib'] -
                           ceph_backend_info['ephemeral_pool_gib'])
        if unallocated_gib == 0:
            unallocated_gib = int(int(cinder_pool_gib) / 4)
            cinder_pool_gib = str(int(cinder_pool_gib) - unallocated_gib)
        elif unallocated_gib < 0:
            skip("Unallocated gib < 0. System is in unknown state.")

        object_pool_gib = str(unallocated_gib)
        LOG.tc_step(
            "Enabling SWIFT object store and setting object pool size to {}....."
            .format(object_pool_gib))

    rc, updated_backend_info = storage_helper.modify_storage_backend(
        'ceph',
        object_gateway=False,
        cinder=cinder_pool_gib,
        object_gib=object_pool_gib,
        services='cinder,glance,nova,swift')

    LOG.info("Verifying if swift object gateway is enabled...")
    assert str(updated_backend_info['object_gateway']).lower() == 'true', "Fail to enable Swift object gateway: {}"\
        .format(updated_backend_info)
    LOG.info("Swift object gateway is enabled.")

    LOG.info("Verifying ceph task ...")
    state = storage_helper.get_storage_backends(backend='ceph',
                                                field='state')[0]
    if system_helper.wait_for_alarm(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                    timeout=10,
                                    fail_ok=True,
                                    entity_id='controller-')[0]:
        LOG.info("Verifying ceph task is set to 'add-object-gateway'...")
        assert BackendState.CONFIGURING == state, \
            "Unexpected ceph state '{}' after swift object gateway update ".format(state)

        LOG.info("Lock/Unlock controllers...")
        active_controller, standby_controller = system_helper.get_active_standby_controllers(
        )
        LOG.info("Active Controller is {}; Standby Controller is {}...".format(
            active_controller, standby_controller))

        for controller in [standby_controller, active_controller]:
            if not controller:
                continue
            HostsToRecover.add(controller)
            host_helper.lock_host(controller, swact=True)
            storage_helper.wait_for_storage_backend_vals(
                backend='ceph-store',
                **{
                    'task': BackendTask.RECONFIG_CONTROLLER,
                    'state': BackendState.CONFIGURING
                })
            host_helper.unlock_host(controller)

        system_helper.wait_for_alarm_gone(
            alarm_id=EventLogID.CONFIG_OUT_OF_DATE, fail_ok=False)
    else:
        assert BackendState.CONFIGURED == state, \
            "Unexpected ceph state '{}' after swift object gateway update ".format(state)

    LOG.info("Verifying Swift provisioning setups...")
    assert verify_swift_object_setup(), "Failure in swift setups"

    for i in range(3):
        vm_name = 'vm_swift_api_{}'.format(i)
        LOG.tc_step(
            "Boot vm {} and perform nova actions on it".format(vm_name))
        vm_id = vm_helper.boot_vm(name=vm_name, cleanup='function')[1]
        vm_helper.wait_for_vm_pingable_from_natbox(
            vm_id, timeout=VMTimeout.DHCP_RETRY)

        LOG.info("Cold migrate VM {} ....".format(vm_name))
        rc = vm_helper.cold_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to cold migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Live migrate VM {} ....".format(vm_name))
        rc = vm_helper.live_migrate_vm(vm_id=vm_id)[0]
        assert rc == 0, "VM {} failed to live migrate".format(vm_name)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

        LOG.info("Suspend/Resume VM {} ....".format(vm_name))
        vm_helper.suspend_vm(vm_id)
        vm_helper.resume_vm(vm_id)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    LOG.info("Checking overall system health...")
    assert system_helper.get_system_health_query(), \
        "System health not OK after VMs"

    LOG.tc_step("Create Swift container using swift post cli command ...")
    container_names = [
        "test_container_1", "test_container_2", "test_container_3"
    ]

    for container in container_names:
        LOG.info("Creating swift object container {}".format(container))
        rc, out = swift_helper.create_swift_container(container)
        assert rc == 0, "Fail to create swift container {}".format(container)
        LOG.info(
            "Create swift object container {} successfully".format(container))

    LOG.tc_step("Verify swift list to list containers ...")
    container_list = swift_helper.get_swift_containers()[1]
    assert set(container_names) <= set(container_list), "Swift containers {} not listed in {}"\
        .format(container_names, container_list)

    LOG.tc_step("Verify swift delete a container...")
    container_to_delete = container_names[2]
    rc, out = swift_helper.delete_swift_container(container_to_delete)
    assert rc == 0, "Swift delete container rejected: {}".format(out)
    assert container_to_delete not in swift_helper.get_swift_containers()[1], "Unable to delete swift container {}"\
        .format(container_to_delete)

    LOG.tc_step("Verify swift stat to show info of a single container...")
    container_to_stat = container_names[0]
    out = swift_helper.get_swift_container_stat_info(container_to_stat)
    assert out["Container"] == container_to_stat, "Unable to stat swift container {}"\
        .format(container_to_stat)
    assert out["Objects"] == '0', "Incorrect number of objects container {}. Expected O objects, but has {} objects"\
        .format(container_to_stat, out["Objects"])
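
For reference, a minimal sketch of the same container operations done directly with python-swiftclient instead of the swift_helper wrappers above; the Connection arguments are placeholders, and the assumption that the helper shells out to the swift CLI is just that, an assumption:

from swiftclient import client as swift_client


def swift_container_ops_sketch(auth_url, user, key, tenant):
    # Placeholder credentials; a real run would use the lab's keystone values.
    conn = swift_client.Connection(authurl=auth_url, user=user, key=key,
                                   tenant_name=tenant, auth_version='2')
    conn.put_container('test_container_1')                   # swift post
    _, containers = conn.get_account()                       # swift list
    stat_headers = conn.head_container('test_container_1')   # swift stat
    conn.delete_container('test_container_1')                 # swift delete
    return containers, stat_headers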
Exemplo n.º 23
0
def test_migrate_vm(check_system, guest_os, mig_type, cpu_pol):
    """
    Test migrate vms for given guest type
    Args:
        check_system:
        guest_os:
        mig_type:
        cpu_pol:

    Test Steps:
        - Create a glance image from given guest type
        - Create a vm from cinder volume using above image
        - Live/cold migrate the vm
        - Ensure vm moved to other host and in good state (active and
            reachable from NatBox)

    """
    LOG.tc_step("Create a flavor with 1 vcpu")
    flavor_id = \
        nova_helper.create_flavor(name='{}-mig'.format(mig_type), vcpus=1,
                                  root_disk=9, cleanup='function')[1]

    if cpu_pol is not None:
        specs = {FlavorSpec.CPU_POLICY: cpu_pol}
        LOG.tc_step("Add following extra specs: {}".format(specs))
        nova_helper.set_flavor(flavor=flavor_id, **specs)

    LOG.tc_step("Create a volume from {} image".format(guest_os))
    image_id = glance_helper.get_guest_image(guest_os=guest_os)

    vol_id = cinder_helper.create_volume(source_id=image_id, size=9,
                                         guest_image=guest_os)[1]
    ResourceCleanup.add('volume', vol_id)

    LOG.tc_step("Boot a vm from above flavor and volume")
    vm_id = vm_helper.boot_vm(guest_os, flavor=flavor_id, source='volume',
                              source_id=vol_id, cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    if guest_os == 'ubuntu_14':
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.CINDER_IO_CONGEST,
                                          entity_id='cinder_io_monitor',
                                          strict=False, timeout=300,
                                          fail_ok=False)

    LOG.tc_step("{} migrate vm and check vm is moved to different host".format(
        mig_type))
    prev_vm_host = vm_helper.get_vm_host(vm_id)

    if mig_type == 'live':
        code, output = vm_helper.live_migrate_vm(vm_id)
        if code == 1:
            assert False, "No host to live migrate to. System may not be in " \
                          "good state."
    else:
        vm_helper.cold_migrate_vm(vm_id)

    vm_host = vm_helper.get_vm_host(vm_id)
    assert prev_vm_host != vm_host, "vm host did not change after {} " \
                                    "migration".format(mig_type)

    LOG.tc_step("Ping vm from NatBox after {} migration".format(mig_type))
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id)
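
A hedged sketch of how test_migrate_vm could be driven with standard pytest parametrization; the guest/migration/CPU-policy values are illustrative only, and the real suite supplies them through its own fixtures and marks:

import pytest


@pytest.mark.parametrize(('guest_os', 'mig_type', 'cpu_pol'), [
    ('ubuntu_14', 'live', 'dedicated'),
    ('ubuntu_14', 'cold', None),
    ('centos_7', 'live', None),
])
def test_migrate_vm_example(check_system, guest_os, mig_type, cpu_pol):
    # Skeleton only: the body would follow the same steps as test_migrate_vm above.
    ...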
Exemplo n.º 24
0
def test_sensorgroup_power_cycle(host,
                                 eventlevel,
                                 action,
                                 expected_host_state,
                                 expected_alarm_state,
                                 event_type,
                                 suppressionlevel, sensor_data_fit):
    """
    Verify that the sensorgroup action taken for an event is valid.

    Test Steps:
        - Get a sensorgroup to test
        - Set the event level and expected action
        - trigger an out-of-scope event for that sensorgroup
        - verify that the expected action is taken

    """
    bmc_hosts = sensor_data_fit
    if host not in bmc_hosts:
        skip("{} is not configured with BMC sensor".format(host))

    global HOST
    HOST = host

    if suppressionlevel == 'suppressed':
        # global SUPPRESSED
        # SUPPRESSED = host
        suppress = True
    else:
        suppress = False

    expt_severity = eventlevel.split('_')[-1] if 'yes' in expected_alarm_state else None
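    # Note (assumed parameter format): an eventlevel such as 'actions_major'
    # yields expt_severity 'major' whenever an alarm or log is expected
    # ('yes_alarm'/'yes_log'); otherwise severity stays None and is not checked.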

    # Get a sensor to validate
    sensorgroup_name = random.choice(bmc_helper.get_sensor_names(host, sensor_group=True))
    for i in range(4):
        LOG.info("################## iter {} #########################".format(i+1))
        LOG.tc_step("Validating that sensorgroup: {} "
                    "can be set to sensor action: {} "
                    "for event level: {}".format(sensorgroup_name, action,
                                                 eventlevel))

        # Set the event level and action
        bmc_helper.modify_sensorgroup(host, sensorgroup_name, value='name', suppress=suppress, audit_interval=10,
                                      **{eventlevel: action})

        # Get a sensor that is part of the sensorgroup
        sensor_name = bmc_helper.get_first_sensor_from_sensorgroup(sensorgroup_name, host)
        entity_id = 'host={}.sensor={}'.format(host, sensor_name)

        LOG.tc_step("Trigger event for sensorgroup: {} and sensor name: {}".
                    format(sensorgroup_name, sensor_name))
        if action in ['power-cycle', 'reset']:
            HostsToRecover.add(host)

        start_time = common.get_date_in_format()
        bmc_helper.trigger_event(host, sensor_name, event_type)

        LOG.tc_step("Check sensor status and alarm for {}".format(sensor_name))
        if expected_alarm_state == 'yes_alarm':
            system_helper.wait_for_alarm(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id,
                                         severity=expt_severity, timeout=60, strict=False, fail_ok=False)
        else:
            events = system_helper.wait_for_events(timeout=60, num=10, event_log_id=EventLogID.BMC_SENSOR_ACTION,
                                                   entity_instance_id=entity_id, start=start_time, state='log',
                                                   severity=expt_severity, fail_ok=True, strict=False)
            if expected_alarm_state == 'yes_log':
                assert events, "No event log found for {} {} {} event".format(host, sensorgroup_name, eventlevel)
            else:
                assert not events, "Event logged unexpectedly for sensor on {}".format(host)
                system_helper.wait_for_alarm_gone(EventLogID.BMC_SENSOR_ACTION, entity_id=entity_id, strict=False,
                                                  timeout=5, fail_ok=False)

        LOG.tc_step("Check the host status for sensor: {}".format(sensor_name))
        host_state_timeout = 120
        if action == 'reset':
            host_state_timeout = 1080  # 15 min reset interval in between two reset triggers
        system_helper.wait_for_host_values(host, timeout=host_state_timeout, fail_ok=False,
                                           availability=expected_host_state)
        if action == 'power-cycle':
            system_helper.wait_for_host_values(host, timeout=20, task=HostTask.POWER_CYCLE, strict=False)

        LOG.tc_step("Check the alarm clears and host in available state after clearing events")
        bmc_helper.clear_events(host)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=host, strict=False,
                                          timeout=60)
        wait_time = 3000 if action == 'power-cycle' else HostTimeout.REBOOT
        expt_states = {'availability': 'available'}
        strict = True
        if action == 'power-cycle' and i == 3:
            wait_time = 1200
            strict = False
            expt_states = {'availability': HostAvailState.POWER_OFF,
                           'operational': HostOperState.DISABLED,
                           'administrative': HostAdminState.UNLOCKED,
                           'task': HostTask.POWER_DOWN}

        system_helper.wait_for_host_values(host, fail_ok=False, timeout=wait_time, strict=strict, **expt_states)

    LOG.tc_step("Power on {} after test ends".format(host))
    host_helper.lock_host(host=host)
    host_helper.power_on_host(host=host)
    HOST = ''
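
The wait_for_alarm / wait_for_alarm_gone / wait_for_host_values helpers used throughout these examples follow a poll-until-timeout pattern; the sketch below shows that pattern with only the standard library (the name, intervals, and TimeoutError behaviour are assumptions, not the helpers' actual implementation):

import time


def wait_for_condition(check_func, timeout=60, check_interval=5, fail_ok=False):
    """Poll check_func until it returns a truthy value or the timeout expires."""
    end_time = time.time() + timeout
    while time.time() < end_time:
        result = check_func()
        if result:
            return True, result
        time.sleep(check_interval)
    if fail_ok:
        return False, None
    raise TimeoutError('Condition not met within {}s'.format(timeout))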
Exemplo n.º 25
0
def test_lock_stor_check_osds_down(stx_openstack_required, host):
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that all OSDs go down on a locked storage
    node.  There are two variants:

    1.  Lock 'storage-0' which is a ceph monitor
    2.  Lock a storage node that is not 'storage-0', i.e. not a ceph monitor

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock storage node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs on the locked storage node are down
            - Check that the appropriate alarms are raised:
        3.  Unlock storage node
            - ensure CEPH is HEALTH_OK
            - ensure all OSDs on unlocked node are up
            - Check that alarms are cleared

    Note: If the storage node to be locked is monitor, we also expect to see
    the mon down alarm.

    What defects this addresses:
        1.  CGTS-2609 - Ceph processes fail to start after storage node reboot

    Notes:
        - Updated test to write to disk to add I/O load on system

    """

    con_ssh = ControllerClient.get_active_controller()

    if host == 'any':
        storage_nodes = system_helper.get_hosts(personality='storage')
        LOG.info('System has storage nodes: {}'.format(storage_nodes))
        storage_nodes.remove('storage-0')
        node_id = random.randint(0, len(storage_nodes) - 1)
        host = storage_nodes[node_id]

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()

    LOG.tc_step("Boot various VMs")
    vms = vm_helper.boot_vms_various_types(cleanup="function")

    vm_threads = []
    LOG.tc_step("SSH to VMs and write to disk")
    end_event = Events("End dd in vms")
    try:
        for vm in vms:
            vm_thread = vm_helper.write_in_vm(vm, end_event=end_event, expect_timeout=40)
            vm_threads.append(vm_thread)

        LOG.tc_step('Lock storage node {}'.format(host))
        HostsToRecover.add(host)
        host_helper.lock_host(host, check_first=False)

        LOG.tc_step('Determine the storage group for host {}'.format(host))
        storage_group, msg = storage_helper.get_storage_group(host)
        LOG.info(msg)

        LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host, strict=False)[0], \
            "Alarm {} not raised".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check health of CEPH cluster')
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        assert not ceph_healthy

        LOG.tc_step('Check that OSDs are down')
        osd_list = storage_helper.get_osds(host, con_ssh)
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} is up but should be down'.format(osd_id)
            assert not osd_up, msg
            msg = 'OSD ID {} is down as expected'.format(osd_id)
            LOG.info(msg)

        LOG.tc_step('Check that loss of replication alarm is raised')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_LOR)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_LOR)

        LOG.tc_step('Check that ceph is in health warn')
        assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
            "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

        # We're waiting 5 minutes for ceph rebalancing to be performed
        # DO NOT REMOVE.  This is part of the test.
        time.sleep(300)

        LOG.tc_step('Unlock storage node')
        rtn_code, out = host_helper.unlock_host(host)
        assert rtn_code == 0, out

        health = False
        end_time = time.time() + 40
        while time.time() < end_time:
            health = storage_helper.is_ceph_healthy(con_ssh)
            if health is True:
                break
        assert health, "Ceph did not become healthy"

        LOG.tc_step('Check that host lock alarm is cleared when {} is unlocked'.format(host))
        assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host, strict=False), \
            "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

        LOG.tc_step('Check that the replication group alarm is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_LOR), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_LOR)
        LOG.tc_step('Check that the Storage Alarm Condition is cleared')
        assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
            "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

        LOG.tc_step('Check OSDs are up after unlock')
        for osd_id in osd_list:
            osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
            msg = 'OSD ID {} should be up but is not'.format(osd_id)
            assert osd_up, msg

        LOG.tc_step('Check health of CEPH cluster')
        end_time = time.time() + 40
        while time.time() < end_time:
            ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
            if ceph_healthy is True:
                break

        for vm_thread in vm_threads:
            assert vm_thread.res is True, "Writing in vm stopped unexpectedly"
    finally:
        # wait_for_thread_end needs to be called even if test failed in the middle, otherwise thread will not end
        end_event.set()
        for vm_thread in vm_threads:
            vm_thread.wait_for_thread_end(timeout=20)

    LOG.tc_step("Delete existing VMs")
    vm_helper.delete_vms()
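
The end_event / vm_thread coordination above can be illustrated with a standard-library analogue; Events and write_in_vm are the suite's own abstractions, so the sketch below only mirrors the pattern (start workers, run the test steps, signal the event, then join with a timeout):

import threading
import time


def run_writer_threads(num_workers=3, run_for=1.0):
    end_event = threading.Event()
    results = [0] * num_workers

    def writer(idx):
        # Stand-in for write_in_vm: keep "writing" until the event is set.
        written = 0
        while not end_event.is_set():
            written += 1
            time.sleep(0.1)
        results[idx] = written

    threads = [threading.Thread(target=writer, args=(i,)) for i in range(num_workers)]
    for t in threads:
        t.start()
    time.sleep(run_for)        # the test body (lock/unlock, checks) would run here
    end_event.set()            # mirrors end_event.set() in the finally block above
    for t in threads:
        t.join(timeout=20)     # mirrors vm_thread.wait_for_thread_end(timeout=20)
    return results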
Exemplo n.º 26
0
def cleanup():
    if HOST:
        bmc_helper.clear_events(HOST)
        system_helper.wait_for_alarm_gone(alarm_id=EventLogID.BMC_SENSOR_ACTION, entity_id=HOST, strict=False,
                                          timeout=60)
Exemplo n.º 27
0
def test_lock_cont_check_mon_down():
    """
    This test is adapted from
    us69932_tc3_ceph_mon_maintenance_operations from us69932_ceph_monitoring.odt

    The goal of this test is to check that we alarm when a CEPH monitor goes
    down.  This test is specifically for controller hosts.

    Args:
        - None

    Setup:
        - Requires system with storage nodes

    Test Steps:
        1.  Lock controller node
        2.  Check
            - CEPH cluster is in HEALTH_WARN
            - Ensure all OSDs stay up
            - Check that the appropriate alarms are raised:
              - controller-X is locked
              - ceph mon down
        3.  Unlock controller node
            - ensure CEPH is HEALTH_OK
            - Check that alarms are cleared

    Enhancements:
       1.  Should we do both controllers?  This will require a swact.
    """

    con_ssh = ControllerClient.get_active_controller()

    host = system_helper.get_standby_controller_name()
    LOG.tc_step('Lock standby controller node {}'.format(host))
    HostsToRecover.add(host, scope='function')
    rtn_code, out = host_helper.lock_host(host)
    assert rtn_code == 0, out

    LOG.tc_step('Check that storage degrade alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.STORAGE_ALARM_COND)[0], \
        "Alarm {} not raised".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check that host lock alarm is raised when {} is locked'.format(host))
    assert system_helper.wait_for_alarm(alarm_id=EventLogID.HOST_LOCK, entity_id=host)[0], \
        "Alarm {} not raised".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check OSDs are still up after lock')
    osd_list = storage_helper.get_osds(con_ssh=con_ssh)
    for osd_id in osd_list:
        osd_up = storage_helper.is_osd_up(osd_id, con_ssh)
        msg = 'OSD ID {} should be up but is not'.format(osd_id)
        assert osd_up, msg
        msg = 'OSD ID {} is up'.format(osd_id)
        LOG.info(msg)

    LOG.tc_step('Unlock standby controller node {}'.format(host))
    rtn_code, out = host_helper.unlock_host(host, available_only=True)
    assert rtn_code == 0, out

    LOG.tc_step('Check that the host locked alarm is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.HOST_LOCK, entity_id=host), \
        "Alarm {} not cleared".format(EventLogID.HOST_LOCK)

    LOG.tc_step('Check that the Storage Alarm Condition is cleared')
    assert system_helper.wait_for_alarm_gone(EventLogID.STORAGE_ALARM_COND), \
        "Alarm {} not cleared".format(EventLogID.STORAGE_ALARM_COND)

    LOG.tc_step('Check health of CEPH cluster')
    msg = ''
    end_time = time.time() + 40
    while time.time() < end_time:
        ceph_healthy = storage_helper.is_ceph_healthy(con_ssh)
        if ceph_healthy:
            break
    else:
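        # while/else: this branch runs only if the loop above expired without
        # ever breaking, i.e. ceph never reported healthy within the timeout.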
        assert 0, "ceph is not healthy"