Example #1
    def _recover_hosts(hostnames, scope):
        if system_helper.is_aio_simplex():
            LOG.fixture_step('{} Recover simplex host'.format(scope))
            host_helper.recover_simplex(fail_ok=False)
            return

        # Recover hosts for non-simplex system
        hostnames = sorted(set(hostnames))
        table_ = table_parser.table(cli.system('host-list')[1])
        table_ = table_parser.filter_table(table_, hostname=hostnames)

        # unlocked_hosts = table_parser.get_values(table_, 'hostname',
        # administrative='unlocked')
        locked_hosts = table_parser.get_values(table_,
                                               'hostname',
                                               administrative='locked')

        err_msg = []
        if locked_hosts:
            LOG.fixture_step("({}) Unlock hosts: {}".format(
                scope, locked_hosts))
            # Hypervisor state will be checked later in wait_for_hosts_ready
            # which handles platform only deployment
            res1 = host_helper.unlock_hosts(hosts=locked_hosts,
                                            fail_ok=True,
                                            check_hypervisor_up=False)
            for host in res1:
                if res1[host][0] not in [0, 4]:
                    err_msg.append(
                        "Not all host(s) unlocked successfully. Detail: "
                        "{}".format(res1))

        host_helper.wait_for_hosts_ready(hostnames)
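
All of the examples on this page converge on host_helper.wait_for_hosts_ready. A minimal standalone sketch of what such a helper typically does, polling until every host reports a ready state or a timeout expires; the status_fn callable and the state names are illustrative assumptions, not the framework's actual implementation:

import time

def wait_for_ready(hosts, status_fn, timeout=1800, check_interval=10):
    """Poll status_fn(host) until all hosts report a ready state.

    status_fn is a hypothetical callable returning a host's availability,
    e.g. 'available' or 'degraded'; it stands in for a 'system host-list'
    query in the real framework.
    """
    end_time = time.time() + timeout
    pending = set(hosts)
    while time.time() < end_time:
        pending = {h for h in pending
                   if status_fn(h) not in ('available', 'degraded')}
        if not pending:
            return True
        time.sleep(check_interval)
    return False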
Example #2
def test_force_reboot_host(host_type):
    """
    Verify force reboot host

    Test Steps:
        - Select a host of the given type from the system
        - Force reboot the selected host
        - Wait for the host to recover and become ready

    """

    LOG.tc_step("Select a {} node from system if any".format(host_type))
    hosts = system_helper.get_hosts(availability=(HostAvailState.AVAILABLE,
                                                  HostAvailState.DEGRADED),
                                    personality=host_type)
    if not hosts:
        skip("No available or degraded {} host found on system".format(
            host_type))

    host = hosts[0]
    LOG.tc_step("Force reboot {} host: {}".format(host_type, host))
    HostsToRecover.add(host)
    host_helper.reboot_hosts(hostnames=host)
    host_helper.wait_for_hosts_ready(host)
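
HostsToRecover.add registers the host so a teardown fixture can recover it even if the test fails mid-reboot. A hypothetical stand-in showing the shape of that registry (the real class lives in the test framework):

class HostsToRecoverSketch:
    """Illustrative registry: tests add hosts, teardown pops and recovers them."""
    _hosts = {'function': set(), 'module': set()}

    @classmethod
    def add(cls, host, scope='function'):
        cls._hosts[scope].add(host)

    @classmethod
    def pop_all(cls, scope='function'):
        hosts, cls._hosts[scope] = cls._hosts[scope], set()
        return sorted(hosts)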
Example #3
def reboot_hosts(hosts,
                 lab=None,
                 reserve=True,
                 post_check=True,
                 reconnect=True,
                 reconnect_timeout=HostTimeout.REBOOT,
                 hosts_to_check=None,
                 con_ssh=None):
    if isinstance(hosts, str):
        hosts = [hosts]

    _perform_vlm_action_on_hosts(hosts,
                                 action=VlmAction.VLM_REBOOT,
                                 lab=lab,
                                 reserve=reserve)

    if post_check:
        if con_ssh is None:
            con_ssh = ControllerClient.get_active_controller(
                name=lab['short_name'] if lab else None)

        if reconnect:
            con_ssh.connect(retry=True, retry_timeout=reconnect_timeout)
            host_helper._wait_for_openstack_cli_enable(con_ssh=con_ssh)

        if not hosts_to_check:
            hosts_to_check = hosts
        elif isinstance(hosts_to_check, str):
            hosts_to_check = [hosts_to_check]

        host_helper.wait_for_hosts_ready(hosts_to_check, con_ssh=con_ssh)
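
A hedged usage sketch for the helper above, using only the parameters in its signature (host names are placeholders, and the surrounding framework is assumed to be configured):

# Reboot a single compute via VLM and post-check only that host
reboot_hosts('compute-1')

# Reboot both controllers, reconnect ssh, then verify extra hosts recovered
reboot_hosts(['controller-0', 'controller-1'],
             reconnect=True,
             hosts_to_check=['controller-0', 'controller-1', 'compute-0'])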
Example #4
def test_system_persist_over_host_reboot(host_type, stx_openstack_required):
    """
    Validate that the inventory summary persists over a reboot of one of the hosts

    Test Steps:
        - Capture the inventory summary for the list of hosts via system service-list and neutron agent-list
        - Reboot the host under test (active controller, compute, or storage)
        - Wait for the reboot to complete
        - Validate that key items from the inventory persist over the reboot

    """
    if host_type == 'controller':
        host = system_helper.get_active_controller_name()
    elif host_type == 'compute':
        if system_helper.is_aio_system():
            skip("No compute host for AIO system")

        host = None
    else:
        hosts = system_helper.get_hosts(personality='storage')
        if not hosts:
            skip(msg="Lab has no storage nodes. Skip rebooting storage node.")

        host = hosts[0]

    LOG.tc_step("Pre-check for system status")
    system_helper.wait_for_services_enable()
    up_hypervisors = host_helper.get_up_hypervisors()
    network_helper.wait_for_agents_healthy(hosts=up_hypervisors)

    LOG.tc_step("Launch a vm")
    vm_id = vm_helper.boot_vm(cleanup='function')[1]
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id)

    if host is None:
        host = vm_helper.get_vm_host(vm_id)

    LOG.tc_step("Reboot a {} node and wait for reboot completes: {}".format(host_type, host))
    HostsToRecover.add(host)
    host_helper.reboot_hosts(host)
    host_helper.wait_for_hosts_ready(host)

    LOG.tc_step("Check vm is still active and pingable after {} reboot".format(host))
    vm_helper.wait_for_vm_status(vm_id, status=VMStatus.ACTIVE, fail_ok=False)
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_id, timeout=VMTimeout.DHCP_RETRY)

    LOG.tc_step("Check neutron agents and system services are in good state after {} reboot".format(host))
    network_helper.wait_for_agents_healthy(up_hypervisors)
    system_helper.wait_for_services_enable()

    if host in up_hypervisors:
        LOG.tc_step("Check {} can still host vm after reboot".format(host))
        if vm_helper.get_vm_host(vm_id) != host:
            time.sleep(30)
            vm_helper.live_migrate_vm(vm_id, destination_host=host)
Example #5
def check_kern_log():
    cmd = ('cat /var/log/kern.log | grep -i --color=never "(i40e): '
           'transmit queue" | awk \'$0 > "{}"\'').format(start_time)
    i40e_errs = []
    host_helper.wait_for_hosts_ready(hosts=hosts)
    for host in hosts:
        with host_helper.ssh_to_host(hostname=host) as host_ssh:
            output = host_ssh.exec_cmd(cmd)[1]
            if output:
                i40e_errs.append("{}: {}".format(host, output))
    assert not i40e_errs, "i40e errors: {}".format(i40e_errs)
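
The awk filter '$0 > "{}"' keeps only log lines that compare lexicographically greater than start_time, which works because kern.log lines begin with a sortable timestamp. A pure-Python equivalent of that filtering step (the timestamp format is an assumption for the demo):

def lines_after(log_text, start_time):
    # same idea as awk '$0 > "start_time"': lexicographic comparison on
    # lines that start with a sortable timestamp
    return [line for line in log_text.splitlines() if line > start_time]

sample = ("2023-01-01T10:00:00 kernel: boot ok\n"
          "2023-01-01T12:00:00 kernel: i40e: transmit queue hung")
print(lines_after(sample, "2023-01-01T11:00:00"))  # only the 12:00 line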
Example #6
def test_snat_evacuate_vm(snat_setups, snat):
    """
    Test VM external access after evacuation.

    Args:
        snat_setups (tuple): returns vm id and fip. Enable snat, create vm and attach floating ip.
        snat (bool): whether or not to enable SNAT on router

    Test Setups (module):
        - Find a tenant router that is dvr or non-dvr based on the parameter
        - Enable SNAT on tenant router
        - boot a vm and attach a floating ip
        - Ping vm from NatBox

    Test Steps:
        - Ping VM from NatBox
        - Reboot vm host
        - Verify vm is evacuated to other host
        - Verify vm can still ping outside

    Test Teardown:
        - Delete the created vm     (module)
        - Disable snat  (module)

    """
    vm_ = snat_setups[0]

    snat = (snat == 'snat_enabled')
    LOG.tc_step("Update tenant router external gateway to set SNAT to {}".format(snat))
    network_helper.set_router_gateway(enable_snat=snat)

    time.sleep(30)
    vm_helper.wait_for_vm_pingable_from_natbox(vm_, timeout=60, use_fip=True)

    host = vm_helper.get_vm_host(vm_)

    LOG.tc_step("Ping VM from NatBox".format(vm_))
    vm_helper.ping_vms_from_natbox(vm_, use_fip=False)
    # vm_helper.ping_vms_from_natbox(vm_, use_fip=True)

    LOG.tc_step("Evacuate vm")
    vm_helper.evacuate_vms(host=host, vms_to_check=vm_)

    LOG.tc_step("Verify vm can still ping outside")
    vm_helper.wait_for_vm_pingable_from_natbox(vm_, use_fip=snat, timeout=VMTimeout.DHCP_RETRY)
    vm_helper.ping_ext_from_vm(vm_, use_fip=True)

    host_helper.wait_for_hosts_ready(hosts=host)
    vm_helper.wait_for_vm_pingable_from_natbox(vm_, timeout=60, use_fip=False)
    if snat:
        vm_helper.wait_for_vm_pingable_from_natbox(vm_, timeout=60, use_fip=True)
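
The use_fip toggling above mirrors the SNAT setting: the fixed-ip path is always checked, and the floating-ip path is additionally checked when SNAT is enabled. A small wrapper expressing that pattern, reusing only calls shown in this example (the wrapper function itself is this sketch's own):

def check_external_access(vm, snat_enabled):
    # fixed-ip reachability is expected regardless of the SNAT setting
    vm_helper.wait_for_vm_pingable_from_natbox(vm, timeout=60, use_fip=False)
    if snat_enabled:
        # the floating-ip path is only verified when SNAT is enabled
        vm_helper.wait_for_vm_pingable_from_natbox(vm, timeout=60, use_fip=True)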
Example #7
def test_lock_unlock_standby_controller(no_simplex):
    """
    Lock - Unlock standby controller
    """
    standby_controller_host = system_helper.get_standby_controller_name()
    LOG.info("Standby Controller Host: {}".format(standby_controller_host))
    assert standby_controller_host, "Standby controller not found"

    # Lock
    host_helper.lock_host(host=standby_controller_host, fail_ok=False)

    container_helper.wait_for_apps_status(apps="stx-openstack",
                                          status=AppStatus.APPLIED,
                                          timeout=600,
                                          check_interval=60)
    # Unlock
    host_helper.unlock_host(host=standby_controller_host, fail_ok=False)
    host_helper.wait_for_hosts_ready(hosts=standby_controller_host)
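
This example and the two that follow repeat the same lock, wait for stx-openstack to re-apply, unlock, wait-ready cycle. A sketch bundling that cycle into one function; every call appears verbatim in these examples, only the wrapper is hypothetical:

def lock_unlock_cycle(host):
    host_helper.lock_host(host=host, fail_ok=False)
    host_helper.wait_for_hosts_ready(hosts=host)

    # stx-openstack gets re-applied after a host lock; wait for it to settle
    container_helper.wait_for_apps_status(apps="stx-openstack",
                                          status=AppStatus.APPLIED,
                                          timeout=600,
                                          check_interval=60)

    host_helper.unlock_host(host=host, fail_ok=False)
    host_helper.wait_for_hosts_ready(hosts=host)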
Example #8
def test_lock_unlock_compute_hosts(no_simplex, no_duplex):
    """
    Lock - Unlock Compute Hosts
    """
    compute_hosts = system_helper.get_computes()
    LOG.info(" Compute nodes found: {}".format(len(compute_hosts)))

    for host in compute_hosts:
        LOG.info("Compute Host: {}".format(host))

        # Lock
        host_helper.lock_host(host=host, fail_ok=False)
        host_helper.wait_for_hosts_ready(hosts=host)

        container_helper.wait_for_apps_status(apps="stx-openstack",
                                              status=AppStatus.APPLIED,
                                              timeout=600,
                                              check_interval=60)
        # Unlock
        host_helper.unlock_host(host=host, fail_ok=False)
        host_helper.wait_for_hosts_ready(hosts=host)
Example #9
    def test_kpi_evacuate(self, vm_type, get_hosts, collect_kpi):
        if not collect_kpi:
            skip("KPI only test. Skip due to kpi collection is not enabled.")
        if not system_helper.is_avs() and vm_type in ('dpdk', 'avp'):
            skip('avp vif unsupported by OVS')

        def operation(vm_id_, host_):
            vm_helper.evacuate_vms(host=host_,
                                   vms_to_check=vm_id_,
                                   ping_vms=True)

        vm_test, vm_observer = vm_helper.launch_vm_pair(
            vm_type=vm_type, storage_backing='local_image')

        host_src_evacuation, host_observer = self._prepare_test(
            vm_test, vm_observer, get_hosts.copy(), with_router=True)
        time.sleep(60)
        with_router_kpi = vm_helper.get_traffic_loss_duration_on_operation(
            vm_test, vm_observer, operation, vm_test, host_src_evacuation)
        assert with_router_kpi > 0, "Traffic loss duration is not properly detected"
        kpi_log_parser.record_kpi(local_kpi_file=collect_kpi,
                                  kpi_name=Evacuate.NAME.format(
                                      vm_type, 'with'),
                                  kpi_val=with_router_kpi / 1000,
                                  uptime=5)

        host_helper.wait_for_hosts_ready(hosts=host_src_evacuation)

        if len(get_hosts) > 2:
            host_src_evacuation, host_observer = self._prepare_test(
                vm_test, vm_observer, get_hosts.copy(), with_router=False)
            time.sleep(60)
            without_router_kpi = vm_helper.get_traffic_loss_duration_on_operation(
                vm_test, vm_observer, operation, vm_test, host_src_evacuation)
            assert without_router_kpi > 0, "Traffic loss duration is not properly detected"
            kpi_log_parser.record_kpi(local_kpi_file=collect_kpi,
                                      kpi_name=Evacuate.NAME.format(
                                          vm_type, 'no'),
                                      kpi_val=without_router_kpi / 1000,
                                      uptime=5)
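
with_router_kpi is a loss duration in milliseconds, hence the division by 1000 when recording seconds. A standalone illustration of how such a KPI can be derived, timestamping probes and reporting the longest gap between successes; this is a hypothetical stand-in for vm_helper.get_traffic_loss_duration_on_operation, not its actual code:

import time

def measure_loss_ms(probe_ok, duration=30.0, interval=0.1):
    """probe_ok() -> bool; return the longest unreachable gap in milliseconds."""
    last_ok = time.time()
    worst_gap = 0.0
    end = last_ok + duration
    while time.time() < end:
        if probe_ok():
            now = time.time()
            worst_gap = max(worst_gap, now - last_ok)
            last_ok = now
        time.sleep(interval)
    return worst_gap * 1000.0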
Example #10
    def _recover_hosts(hostnames, scope):
        if system_helper.is_aio_simplex():
            LOG.fixture_step('{} Recover simplex host'.format(scope))
            host_helper.recover_simplex(fail_ok=False)
            return

        # Recover hosts for non-simplex system
        hostnames = sorted(set(hostnames))
        table_ = table_parser.table(cli.system('host-list')[1])
        table_ = table_parser.filter_table(table_, hostname=hostnames)

        # unlocked_hosts = table_parser.get_values(table_, 'hostname', administrative='unlocked')
        locked_hosts = table_parser.get_values(table_,
                                               'hostname',
                                               administrative='locked')

        err_msg = []
        if locked_hosts:
            LOG.fixture_step("({}) Unlock hosts: {}".format(
                scope, locked_hosts))
            # Hypervisor state will be checked later in wait_for_hosts_ready which handles platform only deployment
            res1 = host_helper.unlock_hosts(hosts=locked_hosts,
                                            fail_ok=True,
                                            check_hypervisor_up=False)
            for host in res1:
                if res1[host][0] not in [0, 4]:
                    err_msg.append(
                        "Not all host(s) unlocked successfully. Detail: {}".
                        format(res1))
        #
        # if unlocked_hosts:
        #     LOG.fixture_step("({}) Wait for hosts to become available or degraded: {}".format(scope, unlocked_hosts))
        #     res2 = host_helper.wait_for_hosts_states(unlocked_hosts, timeout=HostTimeout.REBOOT, check_interval=10,
        #                                              fail_ok=True, availability=['available'])
        #     if not res2:
        #         err_msg.append("Some host(s) from {} are not available.".format(unlocked_hosts))

        host_helper.wait_for_hosts_ready(hostnames)
Example #11
def test_lock_unlock_storage_hosts(no_simplex, no_duplex):
    """
    Lock - Unlock Storage Hosts
    """
    if ProjVar.get_var('SYS_TYPE') != SysType.STORAGE:
        skip('Only applicable to Standard-external system')

    storage_hosts = system_helper.get_storage_nodes()
    LOG.info(" Storage nodes found: {}".format(len(storage_hosts)))

    for host in storage_hosts:
        LOG.info("Storage Host: {}".format(host))

        # Lock
        host_helper.lock_host(host=host, fail_ok=False)
        host_helper.wait_for_hosts_ready(hosts=host)

        container_helper.wait_for_apps_status(apps="stx-openstack",
                                              status=AppStatus.APPLIED,
                                              timeout=600,
                                              check_interval=60)
        # Unlock
        host_helper.unlock_host(host=host, fail_ok=False)
        host_helper.wait_for_hosts_ready(hosts=host)
Example #12
def test_patch_orch_with_ignored_alarms(patch_orchestration_setup, patch_function_check, ignored_alarm_texts):
    """
    This test verifies the patch orchestration operation in the presence of alarms that are normally ignored by the
    orchestration. These alarms are '200.001', '700.004', '900.001', '900.005', '900.101'. This test generates the
    host lock (200.001) and VM stopped (700.004) alarms before executing the patch orchestration.
    Args:
        patch_orchestration_setup:
        patch_function_check:
        ignored_alarm_texts:

    Returns:

    """
    vms = patch_function_check
    patches, controllers, computes, storages = patch_orchestration_setup
    hosts = controllers + computes + storages
    patch_id = patching_helper.parse_test_patches(patch_ids=patches, search_str='INSVC_ALLNODES')[0]

    if 'HOST_LOCK' in ignored_alarm_texts and len(hosts) < 2:
        skip("Not enough hosts present in the system")

    if 'HOST_LOCK' in ignored_alarm_texts:
        host = hosts[-1]
        HostsToRecover.add(host)
        LOG.info("Lock host {} to generate 200.001 alarm".format(host))
        host_helper.lock_host(host)
        system_helper.wait_for_alarm(alarm_id='200.001', fail_ok=False)
        LOG.info("Host {} is locked and 200.001 alarm is generated".format(host))

    vm_id_to_stop = None
    if 'VM_STOP' in ignored_alarm_texts:
        vm_id_to_stop = vms[0]
        LOG.info("Stop VM {} to generate 700.004 alarm".format(vm_id_to_stop))
        vm_helper.stop_vms(vm_id_to_stop)
        system_helper.wait_for_alarm(alarm_id='700.004')

    patch_file = patches[patch_id]

    LOG.tc_step("Upload patch file {}".format(patch_file))
    uploaded_id = patching_helper.upload_patches(patch_files=patch_file)[1][0]
    assert patch_id == uploaded_id, "Expected patch {} and uploaded patch {} mismatch".format(patch_id, uploaded_id)
    LOG.info("Patch {} uploaded".format(uploaded_id))

    LOG.tc_step("Apply patch {}".format(uploaded_id))
    applied = patching_helper.apply_patches(patch_ids=[uploaded_id])[1]
    LOG.info("Patch {} applied".format(applied))

    LOG.tc_step("Install patch {} through orchestration".format(uploaded_id))
    patching_helper.wait_for_affecting_alarms_gone()
    run_patch_orchestration_strategy()
    LOG.info("Install patch through orchestration completed for patch {}".format(applied))
    host_helper.wait_for_hosts_ready(hosts=hosts)

    LOG.tc_step("Check vms after patch is installed.")
    if vm_id_to_stop:
        vm_helper.start_vms(vm_id_to_stop)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id_to_stop)
    check_vms(vms)

    LOG.tc_step("Remove test patch {}".format(applied))
    if vm_id_to_stop:
        vm_helper.stop_vms(vm_id_to_stop)

    patching_helper.remove_patches(patch_ids=applied)

    LOG.tc_step("Remove patch through orchestration: {}".format(applied))
    run_patch_orchestration_strategy(alarm_restrictions='relaxed')
    LOG.info("Apply/Remove through patch orchestration completed for patch {}".format(applied))

    LOG.tc_step("Check vms after patch removed: {}.".format(applied))
    if vm_id_to_stop:
        vm_helper.start_vms(vm_id_to_stop)
        vm_helper.wait_for_vm_pingable_from_natbox(vm_id_to_stop)
    check_vms(vms)
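
system_helper.wait_for_alarm and wait_for_affecting_alarms_gone used above are polling helpers over the alarm list. A minimal sketch of that idea; the list_alarms callable is an assumption standing in for an 'fm alarm-list' query:

import time

def wait_for_alarm_id(list_alarms, alarm_id, timeout=300, interval=10):
    # poll until the wanted alarm id appears, e.g. '200.001' after host lock
    end = time.time() + timeout
    while time.time() < end:
        if alarm_id in list_alarms():
            return True
        time.sleep(interval)
    return False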
Example #13
def test_restore(restore_setup):
    controller1 = 'controller-1'
    controller0 = 'controller-0'

    lab = restore_setup["lab"]
    is_aio_lab = lab.get('system_type', 'Standard') == 'CPE'
    is_sx = is_aio_lab and (len(lab['controller_nodes']) < 2)

    tis_backup_files = restore_setup['tis_backup_files']
    backup_src = RestoreVars.get_restore_var('backup_src'.upper())
    backup_src_path = RestoreVars.get_restore_var('backup_src_path'.upper())

    controller_node = lab[controller0]
    con_ssh = ControllerClient.get_active_controller(name=lab['short_name'],
                                                     fail_ok=True)
    sys_prompt = Prompt.TIS_NODE_PROMPT_BASE.format('.*' +
                                                    lab['name'].split('_')[0])
    controller_prompt = '{}|{}'.format(sys_prompt, Prompt.CONTROLLER_0)
    controller_node.telnet_conn.set_prompt(controller_prompt)

    if not con_ssh:
        LOG.info("Establish ssh connection with {}".format(controller0))
        controller_node.ssh_conn = install_helper.ssh_to_controller(
            controller_node.host_ip, initial_prompt=controller_prompt)
        controller_node.ssh_conn.deploy_ssh_key()
        con_ssh = controller_node.ssh_conn
        ControllerClient.set_active_controller(con_ssh)

    LOG.info("Restore system from backup....")
    system_backup_file = [
        file for file in tis_backup_files if "system.tgz" in file
    ].pop()
    images_backup_file = [
        file for file in tis_backup_files if "images.tgz" in file
    ].pop()

    LOG.tc_step("Restoring {}".format(controller0))

    LOG.info("System config restore from backup file {} ...".format(
        system_backup_file))

    if backup_src.lower() == 'usb':
        system_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            system_backup_file)
    else:
        system_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           system_backup_file)

    compute_configured = install_helper.restore_controller_system_config(
        system_backup=system_backup_path, is_aio=is_aio_lab)[2]

    # return

    LOG.info('re-connect to the active controller using ssh')
    con_ssh.close()
    controller_node.ssh_conn = install_helper.ssh_to_controller(
        controller_node.host_ip, initial_prompt=controller_prompt)
    LOG.info("Source Keystone user admin environment ...")
    LOG.info("set prompt to:{}, telnet_conn:{}".format(
        controller_prompt, controller_node.telnet_conn))

    controller_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")
    con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
    controller_node.ssh_conn = con_ssh
    ControllerClient.set_active_controller(con_ssh)

    make_sure_all_hosts_locked(con_ssh)

    if backup_src.lower() == 'local':
        images_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           images_backup_file)
        common.scp_from_test_server_to_active_controller(
            "{}/{}".format(backup_src_path, images_backup_file),
            HostLinuxUser.get_home())
    else:
        images_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            images_backup_file)

    LOG.info(
        "Images restore from backup file {} ...".format(images_backup_file))

    new_prompt = r'{}.*~.*\$ |controller\-0.*~.*\$ '.format(
        lab['name'].split('_')[0])
    LOG.info('set prompt to:{}'.format(new_prompt))
    con_ssh.set_prompt(new_prompt)

    install_helper.restore_controller_system_images(
        images_backup=images_backup_path,
        tel_net_session=controller_node.telnet_conn)
    # this is a workaround for CGTS-8190
    install_helper.update_auth_url(con_ssh)

    LOG.tc_step(
        "Verifying restore of controller-0 is complete and the host is in available state ..."
    )
    LOG.debug('Wait for system ready in 60 seconds')
    time.sleep(60)

    timeout = HostTimeout.REBOOT + 60
    availability = HostAvailState.AVAILABLE
    is_available = system_helper.wait_for_hosts_states(
        controller0,
        availability=HostAvailState.AVAILABLE,
        fail_ok=True,
        timeout=timeout)
    if not is_available:
        LOG.warn(
            'After {} seconds, the first node:{} does NOT reach {}'.format(
                timeout, controller0, availability))
        LOG.info('Check if drbd is still synchronizing data')
        con_ssh.exec_sudo_cmd('drbd-overview')
        is_degraded = system_helper.wait_for_hosts_states(
            controller0,
            availability=HostAvailState.DEGRADED,
            fail_ok=True,
            timeout=300)
        if is_degraded:
            LOG.warn('Node: {} is degraded: {}'.format(
                controller0, HostAvailState.DEGRADED))
            con_ssh.exec_sudo_cmd('drbd-overview')
        else:
            LOG.fatal('Node:{} is NOT in Available nor Degraded status'.format(controller0))
            # the customer doc does have wording regarding this situation, continue
            # assert False, 'Node:{} is NOT in Available nor Degraded status'

    # delete the system backup files from sysadmin home
    LOG.tc_step("Copying backup files to /opt/backups ... ")
    if backup_src.lower() == 'local':
        con_ssh.exec_cmd("rm -f {} {}".format(system_backup_path,
                                              images_backup_path))

        cmd_rm_known_host = r'sed -i "s/^[^#]\(.*\)"/#\1/g /etc/ssh/ssh_known_hosts; \sync'
        con_ssh.exec_sudo_cmd(cmd_rm_known_host)

        # transfer all backup files to /opt/backups from test server
        with con_ssh.login_as_root():
            con_ssh.scp_on_dest(source_user=TestFileServer.get_user(),
                                source_ip=TestFileServer.get_server(),
                                source_pswd=TestFileServer.get_password(),
                                source_path=backup_src_path + "/*",
                                dest_path=StxPath.BACKUPS + '/',
                                timeout=1200)

    else:
        # copy all backupfiles from USB to /opt/backups
        cmd = " cp  {}/* {}".format(BackupRestore.USB_BACKUP_PATH,
                                    StxPath.BACKUPS)
        con_ssh.exec_sudo_cmd(cmd, expect_timeout=600)

    LOG.tc_step("Checking if backup files are copied to /opt/backups ... ")
    assert int(con_ssh.exec_cmd("ls {} | wc -l".format(StxPath.BACKUPS))[1]) >= 2, \
        "Missing backup files in {}".format(StxPath.BACKUPS)

    if is_aio_lab:
        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756)')
        cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format(
            HostLinuxUser.get_password())
        controller_node.telnet_conn.login()
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects=[' will reboot on completion'])

        LOG.info('- wait until reboot completes')
        time.sleep(120)
        LOG.info('- confirm the active controller is actually back online')
        controller_node.telnet_conn.login()

        LOG.tc_step(
            "reconnecting to the active controller after restore-complete")
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)

        if not compute_configured:
            LOG.tc_step(
                'Latest 18.07 EAR1 or Old-load on AIO/CPE lab: config its '
                'compute functionalities')
            # install_helper.run_cpe_compute_config_complete(controller_node, controller0)

            # LOG.info('closing current ssh connection')
            # con_ssh.close()

            LOG.tc_step('Run restore-complete (CGTS-9756)')
            controller_node.telnet_conn.login()

            cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.\
                format(HostLinuxUser.get_password())
            controller_node.telnet_conn.exec_cmd(cmd,
                                                 extra_expects=' will reboot ')
            controller_node.telnet_conn.close()

            LOG.info(
                'Wait until "config_controller" reboot the active controller')
            time.sleep(180)

            controller_node.telnet_conn = install_helper.open_telnet_session(
                controller_node)
            controller_node.telnet_conn.login()
            time.sleep(120)

            con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
            controller_node.ssh_conn = con_ssh

            ControllerClient.set_active_controller(con_ssh)

            host_helper.wait_for_hosts_ready(controller0)

        LOG.tc_step('Install the standby controller: {}'.format(controller1))
        if not is_sx:
            install_non_active_node(controller1, lab)

    elif len(lab['controller_nodes']) >= 2:
        LOG.tc_step('Install the standby controller: {}'.format(controller1))
        install_non_active_node(controller1, lab)

        boot_interfaces = lab['boot_device_dict']

        hostnames = system_helper.get_hosts()
        storage_hosts = [host for host in hostnames if 'storage' in host]
        compute_hosts = [
            host for host in hostnames
            if 'storage' not in host and 'controller' not in host
        ]

        if len(storage_hosts) > 0:
            # con_ssh.exec_sudo_cmd('touch /etc/ceph/ceph.client.None.keyring')
            for storage_host in storage_hosts:
                LOG.tc_step("Restoring {}".format(storage_host))
                install_helper.open_vlm_console_thread(
                    storage_host,
                    boot_interface=boot_interfaces,
                    vlm_power_on=True)

                LOG.info(
                    "Verifying {} is Locked, Diabled and Online ...".format(
                        storage_host))
                system_helper.wait_for_hosts_states(
                    storage_host,
                    administrative=HostAdminState.LOCKED,
                    operational=HostOperState.DISABLED,
                    availability=HostAvailState.ONLINE)

                LOG.info("Unlocking {} ...".format(storage_host))
                rc, output = host_helper.unlock_host(storage_host,
                                                     available_only=True)
                assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format(
                    storage_host, rc, output)

            LOG.info("Veryifying the Ceph cluster is healthy ...")
            storage_helper.wait_for_ceph_health_ok(timeout=600)

            LOG.info("Importing images ...")
            image_backup_files = install_helper.get_backup_files(
                IMAGE_BACKUP_FILE_PATTERN, StxPath.BACKUPS, con_ssh)
            LOG.info("Image backup found: {}".format(image_backup_files))
            imported = install_helper.import_image_from_backup(
                image_backup_files)
            LOG.info("Images successfully imported: {}".format(imported))

        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756), regular lab')
        controller_node.telnet_conn.login()
        cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format(
            HostLinuxUser.get_password())
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects='controller-0 login:')
        LOG.info('rebuild ssh connection')
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
        controller_node.ssh_conn = con_ssh

        LOG.tc_step("Restoring Compute Nodes ...")
        if len(compute_hosts) > 0:
            for compute_host in compute_hosts:
                LOG.tc_step("Restoring {}".format(compute_host))
                install_helper.open_vlm_console_thread(
                    compute_host,
                    boot_interface=boot_interfaces,
                    vlm_power_on=True)

                LOG.info(
                    "Verifying {} is Locked, Diabled and Online ...".format(
                        compute_host))
                system_helper.wait_for_hosts_states(
                    compute_host,
                    administrative=HostAdminState.LOCKED,
                    operational=HostOperState.DISABLED,
                    availability=HostAvailState.ONLINE)
                LOG.info("Unlocking {} ...".format(compute_host))
                rc, output = host_helper.unlock_host(compute_host,
                                                     available_only=True)
                assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format(
                    compute_host, rc, output)

        LOG.info("All nodes {} are restored ...".format(hostnames))
    else:
        LOG.warn('Only 1 controller, but not an AIO lab!?')

    LOG.tc_step("Delete backup files from {} ....".format(StxPath.BACKUPS))
    con_ssh.exec_sudo_cmd("rm -rf {}/*".format(StxPath.BACKUPS))

    LOG.tc_step('Perform post-restore testing/checking')
    post_restore_test(con_ssh)

    LOG.tc_step("Waiting until all alarms are cleared ....")
    timeout = 300
    healthy, alarms = system_helper.wait_for_all_alarms_gone(timeout=timeout,
                                                             fail_ok=True)
    if not healthy:
        LOG.warn('Alarms exist: {}, after waiting {} seconds'.format(
            alarms, timeout))
        rc, message = con_ssh.exec_sudo_cmd('drbd-overview')

        if rc != 0 or (r'[===>' not in message
                       and "] sync'ed: " not in message):
            LOG.warn('Failed to get drbd-overview information')

        LOG.info('Wait for the system to be ready in {} seconds'.format(
            HostTimeout.REBOOT))
        system_helper.wait_for_all_alarms_gone(timeout=HostTimeout.REBOOT,
                                               fail_ok=False)

    LOG.tc_step("Verifying system health after restore ...")
    rc, failed = system_helper.get_system_health_query(con_ssh=con_ssh)
    assert rc == 0, "System health not OK: {}".format(failed)

    collect_logs()
Example #14
def power_on_hosts(hosts,
                   reserve=True,
                   post_check=True,
                   reconnect=True,
                   reconnect_timeout=HostTimeout.REBOOT,
                   hosts_to_check=None,
                   con_ssh=None,
                   region=None,
                   count=1,
                   check_interval=10):
    """

    Args:
        hosts (str|list): hostname(s)
        reserve (bool): whether to reserve first
        post_check (bool): whether to wait for hosts to be ready after power on
        reconnect (bool): whether to reconnect to the lab via ssh after power on. Useful when powering on controllers
        reconnect_timeout (int): max seconds to wait before reconnect succeeds
        hosts_to_check (list|str|None): host(s) to perform post check after power-on. when None, hosts_to_check=hosts
        check_interval (int):
        con_ssh (SSHClient):
        region:
        count (int): how many times to perform the action

    Returns:

    """

    if isinstance(hosts, str):
        hosts = [hosts]

    lab = None
    if region and ProjVar.get_var('IS_DC'):
        lab = ProjVar.get_var('LAB')[region]
    _perform_vlm_action_on_hosts(hosts,
                                 action=VlmAction.VLM_TURNON,
                                 lab=lab,
                                 reserve=reserve,
                                 count=count)

    if post_check:
        if con_ssh is None:
            con_ssh = ControllerClient.get_active_controller(name=region)
        auth_info = Tenant.get('admin_platform',
                               dc_region='RegionOne' if region
                               and region == 'central_region' else region)
        if reconnect:
            con_ssh.connect(retry=True, retry_timeout=reconnect_timeout)
            host_helper._wait_for_openstack_cli_enable(
                con_ssh=con_ssh,
                auth_info=auth_info,
                timeout=300,
                reconnect=True,
                check_interval=check_interval)

        if not hosts_to_check:
            hosts_to_check = hosts
        elif isinstance(hosts_to_check, str):
            hosts_to_check = [hosts_to_check]

        host_helper.wait_for_hosts_ready(hosts_to_check,
                                         auth_info=auth_info,
                                         con_ssh=con_ssh,
                                         check_interval=check_interval)
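
A hedged usage sketch built only from the documented parameters above (host names and the region value are placeholders):

# Power on one compute and post-check it with the defaults
power_on_hosts('compute-0')

# Power on both controllers of a DC central region, post-check extra hosts
power_on_hosts(['controller-0', 'controller-1'],
               region='central_region',
               hosts_to_check=['controller-0', 'controller-1', 'compute-0'],
               check_interval=20)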
Example #15
def test_evacuate_pci_vm(vif_model_check):
    """
    Test evacuate vm with multiple ports on same network

    Args:

    Setups:
        - create a flavor with dedicated cpu policy (module)
        - choose one tenant network and one internal network to be used by test (module)
        - boot a base vm - vm1 with above flavor and networks, and ping it from NatBox (module)
        - Boot a vm under test - vm2 with above flavor and with multiple ports on same tenant network with base vm,
        and ping it from NatBox     (class)
        - Ping vm2's own data network ips       (class)
        - Ping vm2 from vm1 to verify management and data networks connection   (class)

    Test Steps:
        - Reboot vm2 host
        - Wait for vm2 to be evacuated to other host
        - Wait for vm2 pingable from NatBox
        - Verify ping from vm1 to vm2 over management and data networks still works

    Teardown:
        - Delete created vms and flavor
    """
    vif_model, base_vm, flavor_id, nics_to_test, seg_id, net_type, pnet_name, extra_pcipt_net = vif_model_check

    LOG.tc_step("Boot a vm with {} vif model on {} net".format(
        vif_model, net_type))
    res, vm_id, err = vm_helper.boot_vm(name=vif_model,
                                        flavor=flavor_id,
                                        cleanup='function',
                                        nics=nics_to_test)
    assert 0 == res, "VM is not booted successfully. Error: {}".format(err)

    vm_helper.wait_for_vm_pingable_from_natbox(vm_id, fail_ok=False)

    if 'pci-passthrough' == vif_model:
        LOG.tc_step("Add vlan to pci-passthrough interface for VM.")
        vm_helper.add_vlan_for_vm_pcipt_interfaces(vm_id=vm_id,
                                                   net_seg_id=seg_id,
                                                   init_conf=True)

    LOG.tc_step("Ping vm over mgmt and {} nets from base vm".format(net_type))
    vm_helper.ping_vms_from_vm(to_vms=vm_id,
                               from_vm=base_vm,
                               net_types=['mgmt', net_type])

    host = vm_helper.get_vm_host(vm_id)

    # Remove the following ssh VM to sync code once CGTS-9279 is fixed
    LOG.tc_step("Login in to VM & do sync command")
    with vm_helper.ssh_to_vm_from_natbox(vm_id) as vm_ssh:
        vm_ssh.exec_sudo_cmd('sync')

    LOG.tc_step("Reboot vm host {}".format(host))
    vm_helper.evacuate_vms(host=host,
                           vms_to_check=vm_id,
                           ping_vms=True,
                           wait_for_host_up=False)

    if 'pci-passthrough' == vif_model:
        LOG.tc_step(
            "Add vlan to pci-passthrough interface for VM again after evacuation due to interface change."
        )
        vm_helper.add_vlan_for_vm_pcipt_interfaces(vm_id=vm_id,
                                                   net_seg_id=seg_id)

    LOG.tc_step(
        "Check vm still pingable over mgmt, and {} nets after evacuation".
        format(net_type))
    vm_helper.ping_vms_from_vm(to_vms=vm_id,
                               from_vm=base_vm,
                               net_types=['mgmt', net_type])

    LOG.tc_step(
        "Wait for rebooted host {} to recover and ensure vm are still reachable"
        .format(host))
    host_helper.wait_for_hosts_ready(hosts=host)
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_id)
    vm_helper.ping_vms_from_vm(to_vms=vm_id,
                               from_vm=base_vm,
                               net_types=['mgmt', net_type])
Example #16
def test_distributed_cloud_install(install_setup):
    """
         Configure the active controller

         Prerequisites:
             - pxeboot has been setup.
         Test Setups:
             - Retrieve dictionary containing lab information
             - Retrieve required paths to directories, images, and licenses
             - Determine active controller
             - Initialize build server and boot server objects
             - Retrieve what steps to be skipped
         Test Steps:
             - Install controller-0
             - Download configuration files, heat templates, images, and licenses
             - Configure controller-0, run lab_setup, and unlock controller-0
             - Add the other hosts
             - Boot the other hosts
             - Run lab setup
             - Unlock the other hosts
             - Run lab setup
             - Setup heat resources and clear any install related alarms
         """
    dc_lab = install_setup["lab"]
    central_region_lab = dc_lab['central_region']

    hosts = dc_lab['central_region']["hosts"]
    boot_device = central_region_lab['boot_device_dict']
    controller0_node = central_region_lab["controller-0"]
    final_step = install_setup["control"]["stop"]
    patch_dir = install_setup["directories"]["patches"]
    patch_server = install_setup["servers"]["patches"]
    guest_server = install_setup["servers"]["guest"]

    if final_step == '0' or final_step == "setup":
        pytest.skip("stopping at install step: {}".format(LOG.test_step))

    fresh_install_helper.install_controller(
        lab=central_region_lab,
        sys_type=SysType.DISTRIBUTED_CLOUD,
        patch_dir=patch_dir,
        patch_server_conn=patch_server.ssh_conn,
        init_global_vars=True)
    # controller0_node.telnet_conn.login()
    # controller0_node.telnet_conn.flush()
    # fresh_install_helper.set_software_version_var(use_telnet=True, con_telnet=controller0_node.telnet_conn)

    lab_files_server = install_setup["servers"]["lab_files"]
    build_server = install_setup["servers"]["build"]
    load_path = InstallVars.get_install_var("TIS_BUILD_DIR")
    ipv6_install = InstallVars.get_install_var("DC_IPV6")

    fresh_install_helper.download_lab_files(
        lab=central_region_lab,
        lab_files_server=lab_files_server,
        build_server=build_server,
        guest_server=guest_server,
        load_path=load_path,
        license_path=InstallVars.get_install_var("LICENSE"),
        guest_path=InstallVars.get_install_var('GUEST_IMAGE'))

    # TODO Change config and lab setup files to common name
    fresh_install_helper.configure_controller_(controller0_node,
                                               lab=central_region_lab,
                                               banner=False,
                                               branding=False)

    fresh_install_helper.wait_for_deploy_mgr_controller_config(
        controller0_node, lab=central_region_lab)

    controller0_node.telnet_conn.hostname = r"controller\-[01]"
    controller0_node.telnet_conn.set_prompt(Prompt.CONTROLLER_PROMPT)
    if controller0_node.ssh_conn is None:
        controller0_node.ssh_conn = install_helper.ssh_to_controller(
            controller0_node.host_ip)

    fresh_install_helper.wait_for_deployment_mgr_to_bulk_add_hosts(
        controller0_node, lab=central_region_lab)

    LOG.info("Booting standby controller host...")

    # TODO: get controller-1 hostname
    fresh_install_helper.boot_hosts(boot_device,
                                    hostnames=['controller-1'],
                                    lab=central_region_lab,
                                    wait_for_online=False)
    host_helper.wait_for_hosts_ready(
        [host for host in hosts if controller0_node.name not in host],
        con_ssh=controller0_node.ssh_conn)

    fresh_install_helper.wait_for_deploy_mgr_lab_config(controller0_node,
                                                        lab=central_region_lab)

    fresh_install_helper.wait_for_hosts_ready(["controller-1"],
                                              lab=central_region_lab)
    container_helper.wait_for_apps_status(apps='platform-integ-apps',
                                          timeout=1800,
                                          con_ssh=controller0_node.ssh_conn,
                                          status='applied')
    LOG.info("Running lab setup script ...")
    fresh_install_helper.run_lab_setup(con_ssh=controller0_node.ssh_conn)

    if dc_lab.get("floating ip"):
        collect_sys_net_info(dc_lab)
        setup_tis_ssh(dc_lab)

    fresh_install_helper.wait_for_hosts_ready(controller0_node.name,
                                              lab=central_region_lab)

    fresh_install_helper.attempt_to_run_post_install_scripts(
        controller0_node=controller0_node)

    fresh_install_helper.reset_global_vars()
    fresh_install_helper.verify_install_uuid(lab=central_region_lab)
    fresh_install_helper.validate_deployment_mgr_install(
        controller0_node, central_region_lab)
Example #17
def test_snat_computes_lock_reboot(snat_setups):
    """
    Test VM external access after compute host reboot while all other computes are locked

    Args:
        snat_setups (tuple): returns vm id and fip. Enable snat, create vm and attach floating ip.

    Test Setups (module):
        - Find a tenant router that is dvr or non-dvr based on the parameter
        - Enable SNAT on tenant router
        - boot a vm and attach a floating ip
        - Ping vm from NatBox

    Steps:
        - Ping VM from NatBox
        - Lock all nova hosts except the vm host
        - Ping external from vm
        - Reboot VM host
        - Wait for vm host to complete reboot
        - Verify vm is recovered after host reboot complete and can still ping outside

    Test Teardown:
        - Unlock all hosts
        - Delete the created vm     (module)
        - Disable SNAT on router    (module)

    """
    hypervisors = host_helper.get_hypervisors(state='up')
    if len(hypervisors) > 3:
        skip("More than 3 hypervisors on system. Skip to reduce run time.")

    vm_ = snat_setups[0]
    LOG.tc_step("Ping VM {} from NatBox".format(vm_))
    vm_helper.wait_for_vm_pingable_from_natbox(vm_, timeout=60, use_fip=True)

    vm_host = vm_helper.get_vm_host(vm_)
    LOG.info("VM host is {}".format(vm_host))
    assert vm_host in hypervisors, "vm host is not in nova hypervisor-list"

    hosts_should_lock = set(hypervisors) - {vm_host}
    hosts_already_locked = set(system_helper.get_hosts(administrative='locked'))
    hosts_to_lock = list(hosts_should_lock - hosts_already_locked)
    LOG.tc_step("Lock all compute hosts {} except vm host {}".format(hosts_to_lock, vm_host))
    for host_ in hosts_to_lock:
        HostsToRecover.add(host_, scope='function')
        host_helper.lock_host(host_, swact=True)

    vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_, timeout=120)
    LOG.tc_step("Ping external from vm {}".format(vm_))
    vm_helper.ping_ext_from_vm(vm_, use_fip=True)

    LOG.tc_step("Evacuate vm and expect VM to stay on same host")
    code, output = vm_helper.evacuate_vms(host=vm_host, vms_to_check=vm_, fail_ok=True)
    assert code > 0, "Actual: {}".format(output)

    LOG.tc_step("Verify vm is recovered and can still ping outside")
    host_helper.wait_for_hosts_ready(hosts=vm_host)
    vm_helper.wait_for_vm_status(vm_id=vm_)
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_)
    vm_helper.wait_for_vm_pingable_from_natbox(vm_id=vm_, use_fip=True, timeout=60)
    vm_helper.ping_ext_from_vm(vm_, use_fip=True)
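
Many helpers in these examples share one convention: with fail_ok=True they return a (code, output) tuple instead of raising, so a test can assert an expected failure, as the evacuation step above does. A small hedged wrapper expressing that pattern (the wrapper itself is not part of the framework):

def expect_failure(helper, *args, **kwargs):
    # rely on the fail_ok convention: a non-zero code means the action failed
    code, output = helper(*args, fail_ok=True, **kwargs)
    assert code > 0, "Expected failure but got success: {}".format(output)
    return output

# e.g. evacuation should not succeed while all other computes are locked:
# expect_failure(vm_helper.evacuate_vms, host=vm_host, vms_to_check=vm_)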