Example #1
def install_non_active_node(node_name, lab):
    """
    Install the non-active controller node, usually controller-1: the second
        controller on a non-AIO-SX system.

    Args:
        node_name:
            - the name of the host/node, usually 'controller-1'
        lab:
            - the lab dictionary for the system under test
    """

    boot_interfaces = lab['boot_device_dict']
    LOG.tc_step("Restoring {}".format(node_name))
    install_helper.open_vlm_console_thread(node_name,
                                           boot_interface=boot_interfaces,
                                           vlm_power_on=True)

    LOG.info(
        "Verifying {} is Locked, Disabled and Online ...".format(node_name))
    system_helper.wait_for_hosts_states(node_name,
                                        administrative=HostAdminState.LOCKED,
                                        operational=HostOperState.DISABLED,
                                        availability=HostAvailState.ONLINE)

    LOG.info("Unlocking {} ...".format(node_name))
    rc, output = host_helper.unlock_host(node_name, available_only=False)

    assert rc in (0, 4), "Host {} failed to unlock: rc = {}, msg: {}".format(
        node_name, rc, output)

    if rc == 4:
        LOG.warn('{} is now in degraded status'.format(node_name))

    LOG.info('{} is installed'.format(node_name))
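
A minimal usage sketch for the helper above (the lab contents are illustrative assumptions; only 'boot_device_dict' is read by install_non_active_node, and the helper modules are assumed to be importable from the same test framework):

# Hypothetical usage; the lab layout shown is an assumption, not from the
# original example.
lab = {
    'name': 'example_lab',
    'boot_device_dict': {'controller-1': 'eth0'},
}
install_non_active_node('controller-1', lab)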
Example #2
def test_unlock_hosts():
    active = system_helper.get_active_controller_name()
    standby = 'controller-1' if active == 'controller-0' else 'controller-0'
    system_helper.wait_for_hosts_states([standby, 'compute-1'],
                                        availability='available')
    LOG.tc_step("Lock hosts.")
    host_helper.lock_host(standby)
    host_helper.lock_host('compute-1')
    LOG.tc_step("Unlock hosts")
    res = host_helper.unlock_hosts([standby, 'compute-1'])
    LOG.tc_step("Show results")
    LOG.info("Unlock hosts result: {}".format(res))
Example #3
def test_reboot_standby_controller(no_simplex):
    active, standby = system_helper.get_active_standby_controllers()
    LOG.tc_step("'sudo reboot -f' from {}".format(standby))
    host_helper.reboot_hosts(standby,
                             wait_for_offline=True,
                             wait_for_reboot_finish=True,
                             force_reboot=True)
    system_helper.wait_for_hosts_states(standby,
                                        timeout=360,
                                        check_interval=30,
                                        availability=['available'])
    kube_helper.wait_for_pods_healthy(check_interval=30, all_namespaces=True)
Example #4
def check_host_state(host, expected_host_state):
    """ Return the state that the host enters after the
    triggered event and configured sensor action."""

    con_ssh = ControllerClient.get_active_controller()

    return system_helper.wait_for_hosts_states(
        host,
        timeout=90,
        check_interval=10,
        con_ssh=con_ssh,
        availability=[expected_host_state])
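
A short hypothetical usage of check_host_state (host name and expected state are illustrative; wait_for_hosts_states is assumed to return a truthy value on success, consistent with its fail_ok usage in the other examples):

# Illustrative only: assert a compute reaches 'degraded' within the
# helper's 90-second window after a sensor-triggered event.
reached = check_host_state('compute-0', 'degraded')
assert reached, "compute-0 did not reach 'degraded' within 90s"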
Example #5
def test_detect_failed_compute(no_simplex, no_duplex):
    con_ssh = ssh.ControllerClient.get_active_controller()
    active_controller = system_helper.get_active_controller_name()
    compute_host = system_helper.get_computes(
        administrative=HostAdminState.UNLOCKED,
        operational=HostOperState.ENABLED,
        availability=HostAvailState.AVAILABLE)[0]

    compute_su_prompt = r'.*compute-[0-9]+:/home/sysadmin#'
    cmd_get_offset = ("ntpq -p | grep {} -A1 | "
                      "tail -1 | awk '{{print$8}}'".format(active_controller))
    cmd_magic_keys_enable = ("echo 1 > /proc/sys/kernel/sysrq")
    cmd_get_start_date = ("python -c \"import datetime; "
                          "print str(datetime.datetime.now())[:-3]\"")
    cmd_get_end_date = ("cat /var/log/mtcAgent.log | "
                        "grep \"{} MNFA new candidate\" | "
                        "tail -1 | awk '{{print$1}}'".format(compute_host))
    cmd_trigger_reboot = ("echo b > /proc/sysrq-trigger")

    res = list()

    for i in range(20):
        LOG.tc_step("Start of iter {}".format(i))
        st = str()
        offset = float()
        with host_helper.ssh_to_host(compute_host) as node_ssh:
            offset = float(
                node_ssh.exec_cmd(cmd=cmd_get_offset,
                                  get_exit_code=False)[1]) / 1000
            node_ssh.send_sudo(cmd="su")
            node_ssh.expect(compute_su_prompt)
            node_ssh.send_sudo(cmd=cmd_magic_keys_enable)
            node_ssh.expect(compute_su_prompt)
            st = node_ssh.exec_cmd(cmd=cmd_get_start_date,
                                   get_exit_code=False,
                                   blob=compute_su_prompt)[1]
            node_ssh.exec_sudo_cmd(cmd_trigger_reboot, get_exit_code=False)

        system_helper.wait_for_hosts_states(
            compute_host,
            check_interval=20,
            availability=HostAvailState.AVAILABLE)
        pods_health = kube_helper.wait_for_pods_healthy(
            check_interval=20, timeout=HostTimeout.REBOOT)
        assert pods_health is True, "Pod health check failed"

        st_date = datetime.datetime.fromtimestamp(
            datetime.datetime.strptime(st, '%Y-%m-%d %H:%M:%S.%f').timestamp()
            - offset)
        et = con_ssh.exec_cmd(cmd=cmd_get_end_date, get_exit_code=False)[1]
        et_date = datetime.datetime.strptime(et, '%Y-%m-%dT%H:%M:%S.%f')
        diff = et_date - st_date
        LOG.info("\noffset = {}\nstart time = {}\nend time = {}".format(
            offset, st, et))
        LOG.info("\ndiff = {}".format(diff))
        res.append(diff)

    def calc_avg(lst):
        rtrn_sum = datetime.timedelta()
        for idx, delta in enumerate(lst):
            LOG.info("Iter {}: {}".format(idx, delta))
            rtrn_sum += delta
        return rtrn_sum / len(lst)

    final_res = calc_avg(res)
    LOG.info("Avg time is : {}".format(final_res))
Example #6
def test_install_cloned_image(install_clone_setup):

    controller1 = 'controller-1'

    lab = InstallVars.get_install_var('LAB')
    install_output_dir = ProjVar.get_var('LOG_DIR')

    controller0_node = lab['controller-0']
    hostnames = install_clone_setup['hostnames']
    system_mode = install_clone_setup['system_mode']
    lab_name = lab['name']
    LOG.info("Starting install-clone on AIO lab {} .... ".format(lab_name))
    LOG.tc_step("Booting controller-0 ... ")

    if controller0_node.telnet_conn is None:
        controller0_node.telnet_conn = install_helper.open_telnet_session(
            controller0_node, install_output_dir)
        try:
            controller0_node.telnet_conn.login()
        except Exception:
            LOG.info("Telnet login failed. Attempting to reset password")
            try:
                controller0_node.telnet_conn.login(reset=True)
            except Exception:
                if controller0_node.telnet_conn:
                    controller0_node.telnet_conn.close()
                    controller0_node.telnet_conn = None

    if controller0_node.telnet_conn:
        install_helper.wipe_disk_hosts(hostnames)

    # power off hosts
    LOG.tc_step("Powring off system hosts ... ")
    install_helper.power_off_host(hostnames)

    install_helper.boot_controller(boot_usb=True,
                                   small_footprint=True,
                                   clone_install=True)

    # establish telnet connection with controller
    LOG.tc_step(
        "Establishing telnet connection with controller-0 after install-clone ..."
    )

    node_name_in_ini = r'{}.*~\$ '.format(controller0_node.host_name)
    normalized_name = re.sub(r'([^\d])0*(\d+)', r'\1\2', node_name_in_ini)

    # controller_prompt = Prompt.TIS_NODE_PROMPT_BASE.format(lab['name'].split('_')[0]) \
    #                     + '|' + Prompt.CONTROLLER_0 \
    #                     + '|{}'.format(node_name_in_ini) \
    #                     + '|{}'.format(normalized_name)

    if controller0_node.telnet_conn:
        controller0_node.telnet_conn.close()

    output_dir = ProjVar.get_var('LOG_DIR')
    controller0_node.telnet_conn = install_helper.open_telnet_session(
        controller0_node, output_dir)
    controller0_node.telnet_conn.login()
    controller0_node.telnet_conn.exec_cmd("xterm")

    LOG.tc_step("Verify install-clone status ....")
    install_helper.check_clone_status(
        tel_net_session=controller0_node.telnet_conn)

    LOG.info("Source Keystone user admin environment ...")

    #controller0_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")

    LOG.tc_step("Checking controller-0 hardware ....")
    install_helper.check_cloned_hardware_status('controller-0')

    if system_mode == 'duplex':
        LOG.tc_step("Booting controller-1 ... ")
        boot_interfaces = lab['boot_device_dict']
        install_helper.open_vlm_console_thread('controller-1',
                                               boot_interface=boot_interfaces,
                                               vlm_power_on=True,
                                               wait_for_thread=True)

        LOG.info("waiting for {} to boot ...".format(controller1))

        LOG.info("Verifying {} is Locked, Disabled and Online ...".format(
            controller1))
        system_helper.wait_for_hosts_states(
            controller1,
            check_interval=20,
            use_telnet=True,
            con_telnet=controller0_node.telnet_conn,
            administrative=HostAdminState.LOCKED,
            operational=HostOperState.DISABLED,
            availability=HostAvailState.ONLINE)

        LOG.info("Unlocking {} ...".format(controller1))

        rc, output = host_helper.unlock_host(
            controller1,
            use_telnet=True,
            con_telnet=controller0_node.telnet_conn)
        assert rc == 0, "Host {} unlock failed: {}".format(controller1, output)

        LOG.info("Host {} unlocked successfully ...".format(controller1))

        LOG.info("Host controller-1  booted successfully... ")

        LOG.tc_step("Checking controller-1 hardware ....")
        install_helper.check_cloned_hardware_status(controller1)
    LOG.tc_step("Customizing the cloned system ....")
    LOG.info("Changing the OAM IP configuration ... ")
    install_helper.update_oam_for_cloned_system(system_mode=system_mode)

    LOG.tc_step("Downloading lab specific license, config and scripts ....")
    software_version = system_helper.get_sw_version()
    load_path = BuildServerPath.LATEST_HOST_BUILD_PATHS[software_version]
    install_helper.download_lab_config_files(
        lab, install_clone_setup['build_server'], load_path)

    LOG.tc_step("Running lab cleanup to removed source attributes ....")
    install_helper.run_setup_script(script='lab_cleanup')

    LOG.tc_step(
        "Running lab setup script to update cloned system attributes ....")
    rc, output = install_helper.run_lab_setup()
    assert rc == 0, "Lab setup run failed: {}".format(output)

    time.sleep(30)
    LOG.tc_step(
        "Checking config status of controller-0 and perform lock/unlock if necessary..."
    )
    if system_helper.get_host_values(
            'controller-0', 'config_status')[0] == 'Config out-of-date':
        host_helper.lock_unlock_controllers()

    LOG.tc_step("Verifying system health after restore ...")
    system_helper.wait_for_all_alarms_gone(timeout=300)
    rc, failed = system_helper.get_system_health_query()
    assert rc == 0, "System health not OK: {}".format(failed)
Example #7
def test_modify_mtu_oam_interface(mtu_range):
    """

    Test 20) of the 2016-04-04 sysinv_test_plan.pdf:
    Change the MTU value of the OAM interface using CLI.

    Verify that the MTU of the OAM interfaces on both the standby and active
    controllers can be modified via CLI.

    Args:
        mtu_range (str): A string specifying the MTU value(s) to be tested

    Setup:
        - Nothing

    Test Steps:
        - lock standby controller
        - modify the imtu value of the controller
        - unlock the controller
        - revert the oam mtu of the controller and check system is still healthy
        - swact the controller
        - lock the controller
        - modify the imtu value of the controller
        - unlock the controller
        - check the controllers have expected mtu
        - revert the oam mtu of the controller and check system is still healthy

    Teardown:
        - Nothing

    """
    is_sx = system_helper.is_aio_simplex()
    origin_active, origin_standby = system_helper.get_active_standby_controllers()
    if not origin_standby and not is_sx:
        skip("Standby controller unavailable. Cannot lock controller.")

    mtu = __get_mtu_to_mod(providernet_name='-ext', mtu_range=mtu_range)
    first_host = origin_active if is_sx else origin_standby
    max_mtu, cur_mtu, nic_name = get_max_allowed_mtus(host=first_host, network_type='oam')
    LOG.info('OK, the max MTU for {} is {}'.format(nic_name, max_mtu))

    expecting_pass = not max_mtu or mtu <= max_mtu
    if not expecting_pass:
        LOG.warn('Expecting to fail in changing MTU: changing to:{}, max-mtu:{}'.format(mtu, max_mtu))

    oam_attributes = host_helper.get_host_interfaces(host=first_host, field='attributes', name='oam', strict=False)

    # sample attributes: [MTU=9216,AE_MODE=802.3ad]
    pre_oam_mtu = int(oam_attributes[0].split(',')[0].split('=')[1])
    is_stx_openstack_applied = container_helper.is_stx_openstack_deployed(applied_only=True)

    if not is_sx:
        HostsToRecover.add(origin_standby)
        prev_bad_pods = kube_helper.get_unhealthy_pods(all_namespaces=True)

        LOG.tc_step("Modify {} oam interface MTU from {} to {} on standby controller, and "
                    "ensure it's applied successfully after unlock".format(origin_standby, pre_oam_mtu, mtu))
        if mtu == cur_mtu:
            LOG.info('Setting to same MTU: from:{} to:{}'.format(mtu, cur_mtu))

        code, res = host_helper.modify_mtu_on_interfaces(origin_standby, mtu_val=mtu, network_type='oam',
                                                         lock_unlock=True, fail_ok=True)

        LOG.tc_step("Revert OAM MTU to original value: {}".format(pre_oam_mtu))
        code_revert, res_revert = host_helper.modify_mtu_on_interfaces(origin_standby, mtu_val=pre_oam_mtu,
                                                                       network_type='oam',
                                                                       lock_unlock=True, fail_ok=True)
        if code == 0:
            assert expecting_pass, "OAM MTU is not modified successfully. Result: {}".format(res)
        else:
            assert not expecting_pass, "OAM MTU WAS modified unexpectedly. Result: {}".format(res)

        assert code_revert == 0, "OAM MTU is not reverted successfully. Result: {}".format(res_revert)

        LOG.tc_step("Check openstack cli, application and pods status after modify and revert {} oam mtu".
                    format(origin_standby))
        check_containers(prev_bad_pods, check_app=is_stx_openstack_applied)

        LOG.tc_step("Ensure standby controller is in available state and attempt to swact active controller to {}".
                    format(origin_standby))
        system_helper.wait_for_hosts_states(origin_active, availability=['available'])
        host_helper.swact_host(fail_ok=False)
        host_helper.wait_for_webservice_up(origin_standby)

    prev_bad_pods = kube_helper.get_unhealthy_pods(all_namespaces=True)
    HostsToRecover.add(origin_active)
    LOG.tc_step("Modify {} oam interface MTU to: {}, and "
                "ensure it's applied successfully after unlock".format(origin_active, mtu))
    code, res = host_helper.modify_mtu_on_interfaces(origin_active,
                                                     mtu_val=mtu, network_type='oam', lock_unlock=True,
                                                     fail_ok=True)
    LOG.tc_step("Revert OAM MTU to original value: {}".format(pre_oam_mtu))
    code_revert, res_revert = host_helper.modify_mtu_on_interfaces(origin_active, mtu_val=pre_oam_mtu,
                                                                   network_type='oam',
                                                                   lock_unlock=True, fail_ok=True)
    if code == 0:
        assert expecting_pass, "OAM MTU is not modified successfully. Result: {}".format(res)
    else:
        assert not expecting_pass, "OAM MTU WAS modified unexpectedly. Result: {}".format(res)

    assert code_revert == 0, "OAM MTU is not reverted successfully. Result: {}".format(res_revert)

    LOG.tc_step("Check openstack cli, application and pods after modify and revert {} oam mtu".format(origin_active))
    check_containers(prev_bad_pods, check_app=is_stx_openstack_applied)
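
The one-liner that derives pre_oam_mtu assumes MTU is always the first field of the attribute string. A slightly more defensive parse of the same sample format, [MTU=9216,AE_MODE=802.3ad], is sketched below; parse_if_attributes is a hypothetical helper, not part of the framework:

# Parse "[MTU=9216,AE_MODE=802.3ad]" into a dict instead of relying on
# field order (hypothetical helper).
def parse_if_attributes(attr_str):
    pairs = (item.split('=', 1) for item in attr_str.strip('[]').split(','))
    return {key: val for key, val in pairs}

attrs = parse_if_attributes('[MTU=9216,AE_MODE=802.3ad]')
pre_oam_mtu = int(attrs['MTU'])  # 9216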
Example #8
def test_restore(restore_setup):
    controller1 = 'controller-1'
    controller0 = 'controller-0'

    lab = restore_setup["lab"]
    is_aio_lab = lab.get('system_type', 'Standard') == 'CPE'
    is_sx = is_aio_lab and (len(lab['controller_nodes']) < 2)

    tis_backup_files = restore_setup['tis_backup_files']
    backup_src = RestoreVars.get_restore_var('BACKUP_SRC')
    backup_src_path = RestoreVars.get_restore_var('BACKUP_SRC_PATH')

    controller_node = lab[controller0]
    con_ssh = ControllerClient.get_active_controller(name=lab['short_name'],
                                                     fail_ok=True)
    sys_prompt = Prompt.TIS_NODE_PROMPT_BASE.format('.*' +
                                                    lab['name'].split('_')[0])
    controller_prompt = '{}|{}'.format(sys_prompt, Prompt.CONTROLLER_0)
    controller_node.telnet_conn.set_prompt(controller_prompt)

    if not con_ssh:
        LOG.info("Establish ssh connection with {}".format(controller0))
        controller_node.ssh_conn = install_helper.ssh_to_controller(
            controller_node.host_ip, initial_prompt=controller_prompt)
        controller_node.ssh_conn.deploy_ssh_key()
        con_ssh = controller_node.ssh_conn
        ControllerClient.set_active_controller(con_ssh)

    LOG.info("Restore system from backup....")
    system_backup_file = [
        file for file in tis_backup_files if "system.tgz" in file
    ].pop()
    images_backup_file = [
        file for file in tis_backup_files if "images.tgz" in file
    ].pop()

    LOG.tc_step("Restoring {}".format(controller0))

    LOG.info("System config restore from backup file {} ...".format(
        system_backup_file))

    if backup_src.lower() == 'usb':
        system_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            system_backup_file)
    else:
        system_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           system_backup_file)

    compute_configured = install_helper.restore_controller_system_config(
        system_backup=system_backup_path, is_aio=is_aio_lab)[2]

    # return

    LOG.info('re-connect to the active controller using ssh')
    con_ssh.close()
    controller_node.ssh_conn = install_helper.ssh_to_controller(
        controller_node.host_ip, initial_prompt=controller_prompt)
    LOG.info("Source Keystone user admin environment ...")
    LOG.info("set prompt to:{}, telnet_conn:{}".format(
        controller_prompt, controller_node.telnet_conn))

    controller_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")
    con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
    controller_node.ssh_conn = con_ssh
    ControllerClient.set_active_controller(con_ssh)

    make_sure_all_hosts_locked(con_ssh)

    if backup_src.lower() == 'local':
        images_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           images_backup_file)
        common.scp_from_test_server_to_active_controller(
            "{}/{}".format(backup_src_path, images_backup_file),
            HostLinuxUser.get_home())
    else:
        images_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            images_backup_file)

    LOG.info(
        "Images restore from backup file {} ...".format(images_backup_file))

    new_prompt = r'{}.*~.*\$ |controller\-0.*~.*\$ '.format(
        lab['name'].split('_')[0])
    LOG.info('set prompt to:{}'.format(new_prompt))
    con_ssh.set_prompt(new_prompt)

    install_helper.restore_controller_system_images(
        images_backup=images_backup_path,
        tel_net_session=controller_node.telnet_conn)
    # this is a workaround for CGTS-8190
    install_helper.update_auth_url(con_ssh)

    LOG.tc_step(
        "Verifying restore of controller-0 is complete and the host is in "
        "available state ...")
    LOG.debug('Wait for system ready in 60 seconds')
    time.sleep(60)

    timeout = HostTimeout.REBOOT + 60
    availability = HostAvailState.AVAILABLE
    is_available = system_helper.wait_for_hosts_states(
        controller0,
        availability=HostAvailState.AVAILABLE,
        fail_ok=True,
        timeout=timeout)
    if not is_available:
        LOG.warn(
            'After {} seconds, the first node:{} does NOT reach {}'.format(
                timeout, controller0, availability))
        LOG.info('Check if drbd is still synchronizing data')
        con_ssh.exec_sudo_cmd('drbd-overview')
        is_degraded = system_helper.wait_for_hosts_states(
            controller0,
            availability=HostAvailState.DEGRADED,
            fail_ok=True,
            timeout=300)
        if is_degraded:
            LOG.warn('Node: {} is degraded: {}'.format(
                controller0, HostAvailState.DEGRADED))
            con_ssh.exec_sudo_cmd('drbd-overview')
        else:
            LOG.fatal('Node:{} is NOT in Available nor Degraded status'.format(
                controller0))
            # the customer doc does have wording regarding this situation, continue
            # assert False, 'Node:{} is NOT in Available nor Degraded status'

    # delete the system backup files from sysadmin home
    LOG.tc_step("Copying backup files to /opt/backups ... ")
    if backup_src.lower() == 'local':
        con_ssh.exec_cmd("rm -f {} {}".format(system_backup_path,
                                              images_backup_path))

        cmd_rm_known_host = r'sed -i "s/^[^#]\(.*\)/#\1/g" /etc/ssh/ssh_known_hosts; sync'
        con_ssh.exec_sudo_cmd(cmd_rm_known_host)

        # transfer all backup files to /opt/backups from test server
        with con_ssh.login_as_root():
            con_ssh.scp_on_dest(source_user=TestFileServer.get_user(),
                                source_ip=TestFileServer.get_server(),
                                source_pswd=TestFileServer.get_password(),
                                source_path=backup_src_path + "/*",
                                dest_path=StxPath.BACKUPS + '/',
                                timeout=1200)

    else:
        # copy all backupfiles from USB to /opt/backups
        cmd = " cp  {}/* {}".format(BackupRestore.USB_BACKUP_PATH,
                                    StxPath.BACKUPS)
        con_ssh.exec_sudo_cmd(cmd, expect_timeout=600)

    LOG.tc_step("Checking if backup files are copied to /opt/backups ... ")
    assert int(con_ssh.exec_cmd("ls {} | wc -l".format(StxPath.BACKUPS))[1]) >= 2, \
        "Missing backup files in {}".format(StxPath.BACKUPS)

    if is_aio_lab:
        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756)')
        cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format(
            HostLinuxUser.get_password())
        controller_node.telnet_conn.login()
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects=[' will reboot on completion'])

        LOG.info('- wait until reboot completes')
        time.sleep(120)
        LOG.info('- confirm the active controller is actually back online')
        controller_node.telnet_conn.login()

        LOG.tc_step(
            "reconnecting to the active controller after restore-complete")
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)

        if not compute_configured:
            LOG.tc_step(
                'Latest 18.07 EAR1 or old load on AIO/CPE lab: configure its '
                'compute functionality')
            # install_helper.run_cpe_compute_config_complete(controller_node, controller0)

            # LOG.info('closing current ssh connection')
            # con_ssh.close()

            LOG.tc_step('Run restore-complete (CGTS-9756)')
            controller_node.telnet_conn.login()

            cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.\
                format(HostLinuxUser.get_password())
            controller_node.telnet_conn.exec_cmd(cmd,
                                                 extra_expects=' will reboot ')
            controller_node.telnet_conn.close()

            LOG.info(
                'Wait until "config_controller" reboots the active controller')
            time.sleep(180)

            controller_node.telnet_conn = install_helper.open_telnet_session(
                controller_node)
            controller_node.telnet_conn.login()
            time.sleep(120)

            con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
            controller_node.ssh_conn = con_ssh

            ControllerClient.set_active_controller(con_ssh)

            host_helper.wait_for_hosts_ready(controller0)

        if not is_sx:
            LOG.tc_step('Install the standby controller: {}'.format(
                controller1))
            install_non_active_node(controller1, lab)

    elif len(lab['controller_nodes']) >= 2:
        LOG.tc_step('Install the standby controller: {}'.format(controller1))
        install_non_active_node(controller1, lab)

        boot_interfaces = lab['boot_device_dict']

        hostnames = system_helper.get_hosts()
        storage_hosts = [host for host in hostnames if 'storage' in host]
        compute_hosts = [
            host for host in hostnames
            if 'storage' not in host and 'controller' not in host
        ]

        if storage_hosts:
            # con_ssh.exec_sudo_cmd('touch /etc/ceph/ceph.client.None.keyring')
            for storage_host in storage_hosts:
                LOG.tc_step("Restoring {}".format(storage_host))
                install_helper.open_vlm_console_thread(
                    storage_host,
                    boot_interface=boot_interfaces,
                    vlm_power_on=True)

                LOG.info(
                    "Verifying {} is Locked, Disabled and Online ...".format(
                        storage_host))
                system_helper.wait_for_hosts_states(
                    storage_host,
                    administrative=HostAdminState.LOCKED,
                    operational=HostOperState.DISABLED,
                    availability=HostAvailState.ONLINE)

                LOG.info("Unlocking {} ...".format(storage_host))
                rc, output = host_helper.unlock_host(storage_host,
                                                     available_only=True)
                assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format(
                    storage_host, rc, output)

            LOG.info("Veryifying the Ceph cluster is healthy ...")
            storage_helper.wait_for_ceph_health_ok(timeout=600)

            LOG.info("Importing images ...")
            image_backup_files = install_helper.get_backup_files(
                IMAGE_BACKUP_FILE_PATTERN, StxPath.BACKUPS, con_ssh)
            LOG.info("Image backup found: {}".format(image_backup_files))
            imported = install_helper.import_image_from_backup(
                image_backup_files)
            LOG.info("Images successfully imported: {}".format(imported))

        LOG.tc_step("Restoring Cinder Volumes ...")
        restore_volumes()

        LOG.tc_step('Run restore-complete (CGTS-9756), regular lab')
        controller_node.telnet_conn.login()
        cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format(
            HostLinuxUser.get_password())
        controller_node.telnet_conn.exec_cmd(
            cmd, extra_expects='controller-0 login:')
        controller_node.telnet_conn.login()

        LOG.info('rebuild ssh connection')
        con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
        controller_node.ssh_conn = con_ssh

        LOG.tc_step("Restoring Compute Nodes ...")
        if compute_hosts:
            for compute_host in compute_hosts:
                LOG.tc_step("Restoring {}".format(compute_host))
                install_helper.open_vlm_console_thread(
                    compute_host,
                    boot_interface=boot_interfaces,
                    vlm_power_on=True)

                LOG.info(
                    "Verifying {} is Locked, Disabled and Online ...".format(
                        compute_host))
                system_helper.wait_for_hosts_states(
                    compute_host,
                    administrative=HostAdminState.LOCKED,
                    operational=HostOperState.DISABLED,
                    availability=HostAvailState.ONLINE)
                LOG.info("Unlocking {} ...".format(compute_host))
                rc, output = host_helper.unlock_host(compute_host,
                                                     available_only=True)
                assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format(
                    compute_host, rc, output)

        LOG.info("All nodes {} are restored ...".format(hostnames))
    else:
        LOG.warn('Only 1 controller found, but this is not an AIO lab')

    LOG.tc_step("Delete backup files from {} ....".format(StxPath.BACKUPS))
    con_ssh.exec_sudo_cmd("rm -rf {}/*".format(StxPath.BACKUPS))

    LOG.tc_step('Perform post-restore testing/checking')
    post_restore_test(con_ssh)

    LOG.tc_step("Waiting until all alarms are cleared ....")
    timeout = 300
    healthy, alarms = system_helper.wait_for_all_alarms_gone(timeout=timeout,
                                                             fail_ok=True)
    if not healthy:
        LOG.warn('Alarms exist: {}, after waiting {} seconds'.format(
            alarms, timeout))
        rc, message = con_ssh.exec_sudo_cmd('drbd-overview')

        if rc != 0 or (r'[===>' not in message
                       and r'] sync\'ed: ' not in message):
            LOG.warn('Failed to get drbd-overview information')

        LOG.info('Wait for the system to be ready in {} seconds'.format(
            HostTimeout.REBOOT))
        system_helper.wait_for_all_alarms_gone(timeout=HostTimeout.REBOOT,
                                               fail_ok=False)

    LOG.tc_step("Verifying system health after restore ...")
    rc, failed = system_helper.get_system_health_query(con_ssh=con_ssh)
    assert rc == 0, "System health not OK: {}".format(failed)

    collect_logs()
Example #9
def make_sure_all_hosts_locked(con_ssh, max_tries=5):
    """
    Make sure all the hosts are locked before doing system restore.

    Args:
        con_ssh:
            - ssh connection to the target lab

        max_tries:
            - number of times to try before failing the entire test case when
            any hosts repeatedly fail to lock.

    Return:
        None

    """

    LOG.info('System restore procedure requires locking all nodes except the '
             'active controller/controller-0')

    LOG.info('Current host list before trying to lock the hosts')
    cli.system('host-list')

    base_cmd = 'host-lock'
    locked_offline = {
        'administrative': HostAdminState.LOCKED,
        'availability': HostAvailState.OFFLINE
    }

    for tried in range(1, max_tries + 1):
        hosts = [
            h for h in system_helper.get_hosts(administrative='unlocked',
                                               con_ssh=con_ssh)
            if h != 'controller-0'
        ]
        if not hosts:
            LOG.info('all hosts except controller-0 are locked after '
                     'try:{}'.format(tried))
            break

        cmd = base_cmd
        if tried > 1:
            cmd = base_cmd + ' -f'

        locking = []
        already_locked = 0
        for host in hosts:
            LOG.info('try:{} locking:{}'.format(tried, host))
            admin_state = system_helper.get_host_values(host,
                                                        'administrative',
                                                        con_ssh=con_ssh)[0]
            if admin_state != 'locked':
                code, output = cli.system(cmd + ' ' + host,
                                          ssh_client=con_ssh,
                                          fail_ok=True)
                if code != 0:
                    LOG.warn('Failed to lock host:{} using CLI:{}'.format(
                        host, cmd))
                else:
                    locking.append(host)
            else:
                already_locked += 1

        if locking:
            LOG.info('Waiting for hosts that accepted the lock command to '
                     'reach locked state, try:{}'.format(tried))
            system_helper.wait_for_hosts_states(locking,
                                                con_ssh=con_ssh,
                                                timeout=600,
                                                **locked_offline)

        elif already_locked == len(hosts):
            LOG.info('all hosts except controller-0 are already locked after '
                     'try:{}'.format(tried))
            break

        else:
            LOG.info('All lock attempts were rejected on try:{}'.format(
                tried))
    else:
        cli.system('host-list', ssh_client=con_ssh)
        LOG.info('Failed to lock or force-lock some of the hosts')
        assert False, 'Failed to lock or force-lock some of the hosts ' \
                      'after {} tries'.format(max_tries)

    code, output = cli.system('host-list', ssh_client=con_ssh, fail_ok=True)
    LOG.debug('code:{}, output:{}'.format(code, output))
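
make_sure_all_hosts_locked leans on Python's for ... else construct: the else branch runs only when the loop exhausts max_tries without hitting break. A stripped-down, self-contained illustration of the same control flow (all names are illustrative):

# Minimal retry-until-done skeleton mirroring the loop above.
def retry(action, done, max_tries=5):
    for tried in range(1, max_tries + 1):
        action(force=(tried > 1))  # escalate to a forced attempt on retries
        if done():
            break                  # success: the else clause is skipped
    else:
        raise AssertionError('still not done after {} tries'.format(max_tries))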