def install_non_active_node(node_name, lab):
    """
    Install the non-active controller node, usually controller-1, the second
    controller on a non-AIO-SX system.

    Args:
        node_name: name of the host/node, usually 'controller-1'
        lab: lab to test

    """
    boot_interfaces = lab['boot_device_dict']
    LOG.tc_step("Restoring {}".format(node_name))
    install_helper.open_vlm_console_thread(node_name,
                                           boot_interface=boot_interfaces,
                                           vlm_power_on=True)

    LOG.info("Verifying {} is Locked, Disabled and Online ...".format(node_name))
    system_helper.wait_for_hosts_states(node_name,
                                        administrative=HostAdminState.LOCKED,
                                        operational=HostOperState.DISABLED,
                                        availability=HostAvailState.ONLINE)

    LOG.info("Unlocking {} ...".format(node_name))
    rc, output = host_helper.unlock_host(node_name, available_only=False)
    assert rc == 0 or rc == 4, "Host {} failed to unlock: rc = {}, msg: {}".format(
        node_name, rc, output)
    if rc == 4:
        LOG.warn('{} is now in degraded state'.format(node_name))

    LOG.info('{} is installed'.format(node_name))
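
# NOTE: illustrative usage only -- a minimal sketch of how install_non_active_node()
# is expected to be driven from a restore/install test (it mirrors the call made in
# test_restore below); the InstallVars lookup and node-dict helpers are assumed to be
# available in this module, as they are used elsewhere in this file.
def _example_install_standby_controller():
    lab = InstallVars.get_install_var("LAB")
    lab.update(create_node_dict(lab['controller_nodes'], 'controller'))
    lab['boot_device_dict'] = create_node_boot_dict(lab['name'])
    install_non_active_node('controller-1', lab)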
def upgrade_controller0():
    """
    Upgrades controller-0

    Returns:

    """
    # upgrade controller-0
    LOG.tc_step("Upgrading controller-0......")
    controller0 = 'controller-0'

    LOG.info("Ensure controller-0 is provisioned before upgrade.....")
    ensure_host_provisioned(controller0)
    LOG.info("Host {} is provisioned for upgrade.....".format(controller0))

    # open vlm console for controller-0 for boot through mgmt interface
    LOG.info("Opening a vlm console for controller-0 .....")
    install_helper.open_vlm_console_thread(controller0)

    LOG.info("Starting {} upgrade.....".format(controller0))
    upgrade_host(controller0, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0))
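
# NOTE: assumed behaviour only -- ensure_host_provisioned() is provided by the upgrade
# helpers; conceptually it follows the provisioning check used in test_system_upgrade
# further below (lock/unlock the host if it has not been provisioned yet), roughly:
def _example_ensure_provisioned(host_name):
    if not upgrade_helper.is_host_provisioned(host_name):
        rc, output = upgrade_helper.upgrade_host_lock_unlock(host_name)
        assert rc == 0, "Failed to lock/unlock host {}: {}".format(host_name, output)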
def _test_storage_profile(personality, from_backing, to_backing):
    """
    This test creates a storage profile and then applies it to a node with
    identical hardware, assuming one exists.

    Storage profiles do not apply on controller nodes.  Storage profiles can
    be applied on controller+compute nodes, compute nodes and storage nodes.

    Arguments:
        - personality (string) - controller, compute or storage
        - from_backing (string) - image, remote or None
        - to_backing (string) - image, remote or None

    Test Steps:
        1.  Query system and determine which nodes have compatible hardware.
        2.  Create a storage profile on one of those nodes
        3.  Apply the created storage profile on a compatible node*
        4.  Ensure the storage profiles have been successfully applied.

        * If the node is a compute node or a controller+compute, we will also
          change the backend if required for additional coverage.

    Returns:
        - Nothing
    """

    global PROFILES_TO_DELETE
    PROFILES_TO_DELETE = []

    # Skip if test is not applicable to hardware under test
    if personality == 'controller' and not system_helper.is_aio_system():
        skip("Test does not apply to controller hosts without subtype compute")

    hosts = system_helper.get_hosts(personality=personality)
    if not hosts:
        skip("No hosts of type {} available".format(personality))

    if (from_backing == "remote" or to_backing == "remote") and not \
            system_helper.is_storage_system():
        skip("This test doesn't apply to systems without storage hosts")

    LOG.tc_step("Identify hardware compatible hosts")
    hash_to_hosts = get_hw_compatible_hosts(hosts)

    # Pick the hardware group that has the most compatible hosts
    current_size = 0
    candidate_hosts = []
    for value in hash_to_hosts:
        candidate_size = len(hash_to_hosts[value])
        if candidate_size > current_size:
            current_size = candidate_size
            candidate_hosts = hash_to_hosts[value]
    LOG.info("This is the total set of candidate hosts: {}".format(candidate_hosts))

    if len(candidate_hosts) < 2:
        skip("Insufficient hardware compatible hosts to run test")

    # Rsync lab setup dot files between controllers
    con_ssh = ControllerClient.get_active_controller()
    _rsync_files_to_con1(con_ssh=con_ssh, file_to_check="force.txt")

    # Take the hardware compatible hosts and check if any of them already have
    # the backend that we want.  This will save us test time.
    new_to_backing = None
    if personality == "compute":
        from_hosts = []
        to_hosts = []
        for host in candidate_hosts:
            host_backing = host_helper.get_host_instance_backing(host)
            if host_backing == from_backing:
                from_hosts.append(host)
            elif host_backing == to_backing:
                to_hosts.append(host)
            else:
                pass
        LOG.info("Candidate hosts that already have the right from backing {}: {}".format(
            from_backing, from_hosts))
        LOG.info("Candidate hosts that already have the right to backing {}: {}".format(
            to_backing, to_hosts))

        # Determine what hosts to use
        if not from_hosts and to_hosts:
            to_host = random.choice(to_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        elif not to_hosts and from_hosts:
            from_host = random.choice(from_hosts)
            candidate_hosts.remove(from_host)
            to_host = random.choice(candidate_hosts)
        elif not to_hosts and not from_hosts:
            to_host = random.choice(candidate_hosts)
            candidate_hosts.remove(to_host)
            from_host = random.choice(candidate_hosts)
        else:
            to_host = random.choice(to_hosts)
            from_host = random.choice(from_hosts)

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step("Check from host backing and convert to {} if necessary".format(
            from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)
        system_helper.wait_for_host_values(from_host,
                                           availability=HostAvailState.AVAILABLE,
                                           timeout=120, fail_ok=False)

        LOG.tc_step("Check to host backing and convert to {} if necessary".format(
            to_backing))
        new_to_backing = host_helper.set_host_storage_backing(to_host, to_backing)
    elif personality == "controller":
        # For now, we don't want to host reinstall controller-0 since it will
        # default to pxeboot, but this could be examined as a possible enhancement.
        from_host = "controller-0"
        to_host = "controller-1"

        LOG.info("From host is: {}".format(from_host))
        LOG.info("To host is: {}".format(to_host))

        LOG.tc_step("Check from host backing and convert to {} if necessary".format(
            from_backing))
        host_helper.set_host_storage_backing(from_host, from_backing)

        LOG.tc_step("Check to host backing and convert to {} if necessary".format(
            to_backing))
        new_to_backing = host_helper.set_host_storage_backing(to_host, to_backing)
    else:
        # Backing doesn't apply to storage nodes so just pick from compatible hardware
        from_host = random.choice(candidate_hosts)
        candidate_hosts.remove(from_host)
        to_host = random.choice(candidate_hosts)

    LOG.tc_step("Create storage and interface profiles on the from host {}".format(
        from_host))
    prof_name = 'storprof_{}_{}'.format(
        from_host, time.strftime('%Y%m%d_%H%M%S', time.localtime()))
    storage_helper.create_storage_profile(from_host, profile_name=prof_name)
    PROFILES_TO_DELETE.append(prof_name)

    # Deleting VMs in case the remaining host(s) cannot handle all VMs
    # migrating on lock, particularly important in the case of AIO-DX systems.
    LOG.tc_step("Delete all VMs and lock the host before applying the storage profile")
    vm_helper.delete_vms()
    HostsToRecover.add(to_host, scope='function')
    system_helper.wait_for_host_values(from_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120, fail_ok=False)
    system_helper.wait_for_host_values(to_host,
                                       availability=HostAvailState.AVAILABLE,
                                       timeout=120, fail_ok=False)

    # Negative test #1 - attempt to apply profile on unlocked host (should be rejected)
    LOG.tc_step('Apply the storage-profile {} onto unlocked host:{}'.format(
        prof_name, to_host))
    cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name)
    rc, msg = cli.system(cmd, fail_ok=True)
    assert rc != 0, msg

    host_helper.lock_host(to_host, swact=True)

    # 3 conditions to watch for: no partitions, ready partitions and in-use
    # partitions on the compute.  If in-use, delete and freshly install host.
    # If ready, delete all ready partitions to make room for potentially new
    # partitions.  If no partitions, just delete nova-local lvg.
if personality == "compute": # Negative test #2 - attempt to apply profile onto host with existing # nova-local (should be rejected) LOG.tc_step( 'Apply the storage-profile {} onto host with existing nova-local:{}' .format(prof_name, to_host)) cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name) rc, msg = cli.system(cmd, fail_ok=True) assert rc != 0, msg # If we were simply switching backing (without applying a storage # profile), the nova-local lvg deletion can be omitted according to design LOG.tc_step("Delete nova-local lvg on to host {}".format(to_host)) cli.system("host-lvg-delete {} nova-local".format(to_host)) in_use = storage_helper.get_host_partitions(to_host, "In-Use") if in_use: # Negative test #3 - attempt to apply profile onto host with existing # in-use partitions (should be rejected) LOG.tc_step('Apply the storage-profile {} onto host with existing \ in-use partitions:{}'.format(prof_name, to_host)) cmd = 'host-apply-storprofile {} {}'.format(to_host, prof_name) rc, msg = cli.system(cmd, fail_ok=True) assert rc != 0, msg LOG.tc_step( "In-use partitions found. Must delete the host and freshly install before proceeding." ) LOG.info("Host {} has in-use partitions {}".format( to_host, in_use)) lab = InstallVars.get_install_var("LAB") lab.update(create_node_dict(lab['compute_nodes'], 'compute')) lab['boot_device_dict'] = create_node_boot_dict(lab['name']) install_helper.open_vlm_console_thread(to_host) LOG.tc_step("Delete the host {}".format(to_host)) cli.system("host-bulk-export") cli.system("host-delete {}".format(to_host)) assert len( system_helper.get_controllers()) > 1, "Host deletion failed" cli.system("host-bulk-add hosts.xml") system_helper.wait_for_host_values( to_host, timeout=6000, availability=HostAvailState.ONLINE) wait_for_disks(to_host) ready = storage_helper.get_host_partitions(to_host, "Ready") if ready: LOG.tc_step( "Ready partitions have been found. Must delete them before profile application" ) LOG.info("Host {} has Ready partitions {}".format(to_host, ready)) for uuid in reversed(ready): storage_helper.delete_host_partition(to_host, uuid) # Don't bother restoring in this case since the system should be # functional after profile is applied. 
LOG.tc_step('Apply the storage-profile {} onto host:{}'.format( prof_name, to_host)) cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name)) LOG.tc_step("Unlock to host") host_helper.unlock_host(to_host) to_host_backing = host_helper.get_host_instance_backing(to_host) LOG.info("To host backing was {} and is now {}".format( new_to_backing, to_host_backing)) assert to_host_backing == from_backing, "Host backing was not changed on storage profile application" if personality == "storage": if not storage_helper.is_ceph_healthy(): skip("Cannot run test when ceph is not healthy") LOG.tc_step("Delete the host {}".format(to_host)) cli.system("host-bulk-export") cli.system("host-delete {}".format(to_host)) cli.system("host-bulk-add hosts.xml") system_helper.wait_for_host_values(to_host, timeout=6000, availability=HostAvailState.ONLINE) wait_for_disks(to_host) LOG.tc_step('Apply the storage-profile {} onto host:{}'.format( prof_name, to_host)) cli.system('host-apply-storprofile {} {}'.format(to_host, prof_name)) # Re-provision interfaces through lab_setup.sh LOG.tc_step("Reprovision the host as necessary") files = ['interfaces'] con_ssh = ControllerClient.get_active_controller() delete_lab_setup_files(con_ssh, to_host, files) rc, msg = install_helper.run_lab_setup() assert rc == 0, msg LOG.tc_step("Unlock to host") host_helper.unlock_host(to_host) if personality == "controller": # Note, install helper doesn't work on all labs. Some labs don't # display BIOS type which causes install helper to fail lab = InstallVars.get_install_var("LAB") lab.update(create_node_dict(lab['controller_nodes'], 'controller')) lab['boot_device_dict'] = create_node_boot_dict(lab['name']) install_helper.open_vlm_console_thread(to_host) LOG.tc_step("Delete the host {}".format(to_host)) cli.system("host-bulk-export") cli.system("host-delete {}".format(to_host)) assert len(system_helper.get_controllers()) > 1, "Host deletion failed" cli.system("host-bulk-add hosts.xml") system_helper.wait_for_host_values(to_host, timeout=6000, availability=HostAvailState.ONLINE) wait_for_disks(to_host) LOG.tc_step("Apply the storage-profile {} onto host:{}".format( prof_name, to_host)) cli.system("host-apply-storprofile {} {}".format(to_host, prof_name)) # Need to re-provision everything on node through lab_setup (except storage) LOG.tc_step("Reprovision the host as necessary") files = [ 'interfaces', 'cinder_device', 'vswitch_cpus', 'shared_cpus', 'extend_cgts_vg', 'addresses' ] con_ssh = ControllerClient.get_active_controller() delete_lab_setup_files(con_ssh, to_host, files) rc, msg = install_helper.run_lab_setup() assert rc == 0, msg LOG.tc_step("Unlock to host") host_helper.unlock_host(to_host) to_host_backing = host_helper.get_host_instance_backing(to_host) LOG.info("To host backing was {} and is now {}".format( new_to_backing, to_host_backing)) assert to_host_backing == from_backing, "Host backing was not changed on storage profile application"
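
# NOTE: illustrative sketch only -- get_hw_compatible_hosts() is defined elsewhere in
# this module; based on how its return value is consumed above, it is assumed to group
# hosts by a hardware fingerprint and return a dict of {fingerprint: [hosts...]}.
# The fingerprint below is hypothetical (any stable summary of host inventory would do):
def _example_get_hw_compatible_hosts(hosts):
    hash_to_hosts = {}
    for host in hosts:
        # hypothetical fingerprint based on the host's reported capabilities
        fingerprint = hash(str(system_helper.get_host_values(host, 'capabilities')))
        hash_to_hosts.setdefault(fingerprint, []).append(host)
    return hash_to_hosts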
def test_install_cloned_image(install_clone_setup):

    controller1 = 'controller-1'

    lab = InstallVars.get_install_var('LAB')
    install_output_dir = ProjVar.get_var('LOG_DIR')

    controller0_node = lab['controller-0']
    hostnames = install_clone_setup['hostnames']
    system_mode = install_clone_setup['system_mode']
    lab_name = lab['name']
    LOG.info("Starting install-clone on AIO lab {} .... ".format(lab_name))
    LOG.tc_step("Booting controller-0 ... ")

    if controller0_node.telnet_conn is None:
        controller0_node.telnet_conn = install_helper.open_telnet_session(
            controller0_node, install_output_dir)
        try:
            controller0_node.telnet_conn.login()
        except Exception:
            LOG.info("Telnet Login failed. Attempting to reset password")
            try:
                controller0_node.telnet_conn.login(reset=True)
            except Exception:
                if controller0_node.telnet_conn:
                    controller0_node.telnet_conn.close()
                    controller0_node.telnet_conn = None

    if controller0_node.telnet_conn:
        install_helper.wipe_disk_hosts(hostnames)

    # power off hosts
    LOG.tc_step("Powering off system hosts ... ")
    install_helper.power_off_host(hostnames)

    install_helper.boot_controller(boot_usb=True, small_footprint=True,
                                   clone_install=True)

    # establish telnet connection with controller
    LOG.tc_step("Establishing telnet connection with controller-0 after install-clone ...")

    node_name_in_ini = r'{}.*\~\$ '.format(controller0_node.host_name)
    normalized_name = re.sub(r'([^\d])0*(\d+)', r'\1\2', node_name_in_ini)

    # controller_prompt = Prompt.TIS_NODE_PROMPT_BASE.format(lab['name'].split('_')[0]) \
    #                     + '|' + Prompt.CONTROLLER_0 \
    #                     + '|{}'.format(node_name_in_ini) \
    #                     + '|{}'.format(normalized_name)

    if controller0_node.telnet_conn:
        controller0_node.telnet_conn.close()

    output_dir = ProjVar.get_var('LOG_DIR')
    controller0_node.telnet_conn = install_helper.open_telnet_session(
        controller0_node, output_dir)
    controller0_node.telnet_conn.login()
    controller0_node.telnet_conn.exec_cmd("xterm")

    LOG.tc_step("Verify install-clone status ....")
    install_helper.check_clone_status(tel_net_session=controller0_node.telnet_conn)

    LOG.info("Source Keystone user admin environment ...")
    # controller0_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")

    LOG.tc_step("Checking controller-0 hardware ....")
    install_helper.check_cloned_hardware_status('controller-0')

    if system_mode == 'duplex':
        LOG.tc_step("Booting controller-1 ... ")
        boot_interfaces = lab['boot_device_dict']
        install_helper.open_vlm_console_thread('controller-1',
                                               boot_interface=boot_interfaces,
                                               vlm_power_on=True,
                                               wait_for_thread=True)

        LOG.info("waiting for {} to boot ...".format(controller1))

        LOG.info("Verifying {} is Locked, Disabled and Online ...".format(controller1))
        system_helper.wait_for_hosts_states(controller1, check_interval=20,
                                            use_telnet=True,
                                            con_telnet=controller0_node.telnet_conn,
                                            administrative=HostAdminState.LOCKED,
                                            operational=HostOperState.DISABLED,
                                            availability=HostAvailState.ONLINE)

        LOG.info("Unlocking {} ...".format(controller1))
        rc, output = host_helper.unlock_host(controller1, use_telnet=True,
                                             con_telnet=controller0_node.telnet_conn)
        assert rc == 0, "Host {} unlock failed: {}".format(controller1, output)
        LOG.info("Host {} unlocked successfully ...".format(controller1))
        LOG.info("Host controller-1 booted successfully... ")

        LOG.tc_step("Checking controller-1 hardware ....")
        install_helper.check_cloned_hardware_status(controller1)

    # LOG.tc_step("Customizing the cloned system ....")
    LOG.info("Changing the OAM IP configuration ... ")
") install_helper.update_oam_for_cloned_system(system_mode=system_mode) LOG.tc_step("Downloading lab specific license, config and scripts ....") software_version = system_helper.get_sw_version() load_path = BuildServerPath.LATEST_HOST_BUILD_PATHS[software_version] install_helper.download_lab_config_files( lab, install_clone_setup['build_server'], load_path) LOG.tc_step("Running lab cleanup to removed source attributes ....") install_helper.run_setup_script(script='lab_cleanup') LOG.tc_step( "Running lab setup script to upadate cloned system attributes ....") rc, output = install_helper.run_lab_setup() assert rc == 0, "Lab setup run failed: {}".format(output) time.sleep(30) LOG.tc_step( "Checking config status of controller-0 and perform lock/unlock if necessary..." ) if system_helper.get_host_values( 'controller-0', 'config_status')[0] == 'Config out-of-date': host_helper.lock_unlock_controllers() LOG.tc_step("Verifying system health after restore ...") system_helper.wait_for_all_alarms_gone(timeout=300) rc, failed = system_helper.get_system_health_query() assert rc == 0, "System health not OK: {}".format(failed)
def test_system_upgrade(vms_with_upgrade, upgrade_setup,
                        check_system_health_query_upgrade):
    LOG.info("Boot VM before upgrade ")
    vms = vms_with_upgrade
    vm_helper.ping_vms_from_natbox(vms)
    lab = upgrade_setup['lab']
    current_version = upgrade_setup['current_version']
    upgrade_version = upgrade_setup['upgrade_version']
    controller0 = lab['controller-0']
    upgrade_helper.ensure_host_provisioned(controller0.name)

    force = False
    LOG.tc_step("Checking system health for upgrade .....")
    if check_system_health_query_upgrade[0] == 0:
        LOG.info("System health OK for upgrade......")
    elif check_system_health_query_upgrade[0] == 2:
        LOG.info("System health indicates minor alarms; using --force option to "
                 "start upgrade......")
        force = True
    else:
        assert False, "System health query upgrade failed: {}".format(
            check_system_health_query_upgrade[1])

    LOG.tc_step("Starting upgrade from release {} to target release {}".format(
        current_version, upgrade_version))
    upgrade_helper.system_upgrade_start(force=force)
    LOG.info("upgrade started successfully......")

    # upgrade standby controller
    LOG.tc_step("Upgrading controller-1")
    upgrade_helper.upgrade_host("controller-1", lock=True)
    LOG.info("Host controller-1 is upgraded successfully......")
    vm_helper.ping_vms_from_natbox(vms)

    # unlock upgraded controller-1
    LOG.tc_step("Unlocking controller-1 after upgrade......")
    host_helper.unlock_host("controller-1", available_only=True,
                            check_hypervisor_up=False)
    LOG.info("Host controller-1 unlocked after upgrade......")

    # Swact to standby controller-1
    LOG.tc_step("Swacting to controller-1 .....")
    rc, output = host_helper.swact_host(hostname="controller-0")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted and controller-1 has become active......")

    # upgrade controller-0
    LOG.tc_step("Upgrading controller-0......")
    LOG.info("Ensure controller-0 is provisioned before upgrade.....")
    upgrade_helper.ensure_host_provisioned(controller0.name)
    LOG.info("Host {} is provisioned for upgrade.....".format(controller0.name))

    # open vlm console for controller-0 for boot through mgmt interface
    LOG.info("Opening a vlm console for controller-0 .....")
    install_helper.open_vlm_console_thread("controller-0")

    LOG.info("Starting {} upgrade.....".format(controller0.name))
    upgrade_helper.upgrade_host(controller0.name, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0.name, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0.name))
    vm_helper.ping_vms_from_natbox(vms)

    upgrade_hosts = install_helper.get_non_controller_system_hosts()
    LOG.info("Starting upgrade of the other system hosts: {}".format(upgrade_hosts))

    for host in upgrade_hosts:
        LOG.tc_step("Starting {} upgrade.....".format(host))
        if "storage" in host:
            # wait for replication to be healthy
            storage_helper.wait_for_ceph_health_ok()

        upgrade_helper.upgrade_host(host, lock=True)
        LOG.info("{} is upgraded successfully.....".format(host))
        LOG.tc_step("Unlocking {} after upgrade......".format(host))
        host_helper.unlock_host(host, available_only=True)
        LOG.info("Host {} unlocked after upgrade......".format(host))
        LOG.info("Host {} upgrade complete.....".format(host))

    vm_helper.ping_vms_from_natbox(vms)

    # Activate the upgrade
    LOG.tc_step("Activating upgrade....")
    upgrade_helper.activate_upgrade()
    LOG.info("Upgrade activate complete.....")

    # Make controller-0 the active controller
    # Swact to standby controller-0
LOG.tc_step("Making controller-0 active.....") rc, output = host_helper.swact_host(hostname="controller-1") assert rc == 0, "Failed to swact: {}".format(output) LOG.info("Swacted to controller-0 ......") # Complete upgrade LOG.tc_step("Completing upgrade from {} to {}".format( current_version, upgrade_version)) upgrade_helper.complete_upgrade() LOG.info("Upgrade is complete......") LOG.info("Lab: {} upgraded successfully".format(lab['name'])) # Delete the previous load LOG.tc_step("Deleting {} load... ".format(current_version)) upgrade_helper.delete_imported_load() LOG.tc_step("Delete previous load version {}".format(current_version))
def test_system_upgrade_controllers(upgrade_setup,
                                    check_system_health_query_upgrade):
    lab = upgrade_setup['lab']
    current_version = upgrade_setup['current_version']
    upgrade_version = upgrade_setup['upgrade_version']

    # run system upgrade-start
    # must be run in controller-0
    active_controller = system_helper.get_active_controller_name()
    LOG.tc_step("Checking if active controller is controller-0......")
    assert "controller-0" in active_controller, "The active controller is not " \
                                                "controller-0. Make controller-0 " \
                                                "active before starting upgrade"

    force = False
    LOG.tc_step("Checking system health for upgrade .....")
    if check_system_health_query_upgrade[0] == 0:
        LOG.info("System health OK for upgrade......")
    elif check_system_health_query_upgrade[0] == 2:
        LOG.info("System health indicates minor alarms; using --force option to "
                 "start upgrade......")
        force = True
    else:
        assert False, "System health query upgrade failed: {}".format(
            check_system_health_query_upgrade[1])

    LOG.info("Starting upgrade from release {} to target release {}".format(
        current_version, upgrade_version))
    upgrade_helper.system_upgrade_start(force=force)
    LOG.tc_step("upgrade started successfully......")

    # upgrade standby controller
    LOG.tc_step("Upgrading controller-1")
    upgrade_helper.upgrade_host("controller-1", lock=True)
    LOG.tc_step("Host controller-1 is upgraded successfully......")

    # unlock upgraded controller-1
    LOG.tc_step("Unlocking controller-1 after upgrade......")
    host_helper.unlock_host("controller-1", available_only=True,
                            check_hypervisor_up=False)
    LOG.tc_step("Host controller-1 unlocked after upgrade......")

    time.sleep(60)

    # Before Swacting ensure the controller-1 is in available state
    if not system_helper.wait_for_host_values("controller-1", timeout=360,
                                              fail_ok=True,
                                              operational=HostOperState.ENABLED,
                                              availability=HostAvailState.AVAILABLE):
        err_msg = "Swacting to controller-1 is not possible because controller-1 is " \
                  "not in available state within the specified timeout"
        assert False, err_msg

    # Swact to standby controller-1
    LOG.tc_step("Swacting to controller-1 .....")
    rc, output = host_helper.swact_host(hostname="controller-0")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted and controller-1 has become active......")
    time.sleep(60)

    # upgrade controller-0
    LOG.tc_step("Upgrading controller-0......")
    controller0 = lab['controller-0']

    LOG.info("Ensure controller-0 is provisioned before upgrade.....")
    upgrade_helper.ensure_host_provisioned(controller0.name)
    LOG.info("Host {} is provisioned for upgrade.....".format(controller0.name))

    # open vlm console for controller-0 for boot through mgmt interface
    LOG.info("Opening a vlm console for controller-0 .....")
    install_helper.open_vlm_console_thread("controller-0")

    LOG.info("Starting {} upgrade.....".format(controller0.name))
    upgrade_helper.upgrade_host(controller0.name, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0.name, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0.name))
def test_system_upgrade(upgrade_setup, check_system_health_query_upgrade):
    lab = upgrade_setup['lab']
    current_version = upgrade_setup['current_version']
    upgrade_version = upgrade_setup['upgrade_version']
    bld_server = upgrade_setup['build_server']
    collect_kpi = upgrade_setup['col_kpi']
    missing_manifests = False
    cinder_configuration = False
    force = False

    controller0 = lab['controller-0']
    if not upgrade_helper.is_host_provisioned(controller0.name):
        rc, output = upgrade_helper.upgrade_host_lock_unlock(controller0.name)
        assert rc == 0, "Failed to lock/unlock host {}: {}".format(
            controller0.name, output)

    # update health query
    # system_upgrade_health = list(upgrade_helper.get_system_health_query_upgrade())
    system_upgrade_health = list(
        upgrade_helper.get_system_health_query_upgrade_2())

    LOG.tc_step("Checking system health for upgrade .....")
    if system_upgrade_health[0] == 0:
        LOG.info("System health OK for upgrade......")
    elif system_upgrade_health[0] == 2:
        if system_upgrade_health[2] and "lock_unlock" in system_upgrade_health[2].keys():
            controller_nodes = system_upgrade_health[2]["lock_unlock"][0]
            LOG.info("Locking/Unlocking required for {} ......".format(controller_nodes))
            if 'controller-1' in controller_nodes:
                rc, output = upgrade_helper.upgrade_host_lock_unlock('controller-1')
                assert rc == 0, "Failed to lock/unlock host {}: {}".format(
                    'controller-1', output)
            if 'controller-0' in controller_nodes:
                rc, output = upgrade_helper.upgrade_host_lock_unlock('controller-0')
                assert rc == 0, "Failed to lock/unlock host {}: {}".format(
                    'controller-0', output)
                time.sleep(60)

        # system_upgrade_health[2]["swact"][0] = False
        if system_upgrade_health[2]["swact"][0]:
            LOG.info("Swact Required: {}".format(system_upgrade_health[2]["swact"][1]))
            host_helper.swact_host('controller-0')
            time.sleep(60)
            host_helper.swact_host('controller-1')
            time.sleep(60)

        if system_upgrade_health[2]["force_upgrade"][0]:
            LOG.info("{}; using --force option to start upgrade......".format(
                system_upgrade_health[2]["force_upgrade"][1]))
            force = True
    else:
        assert False, "System health query upgrade failed: {}".format(
            system_upgrade_health[1])

    # if system_upgrade_health[0] == 0:
    #     LOG.info("System health OK for upgrade......")
    # if system_upgrade_health[0] == 1:
    #     assert False, "System health query upgrade failed: {}".format(system_upgrade_health[1])
    #
    # if system_upgrade_health[0] == 4 or system_upgrade_health[0] == 2:
    #     LOG.info("System health indicate missing manifests; lock/unlock controller-0 to resolve......")
    #     missing_manifests = True
    #     if any("Cinder configuration" in k for k in system_upgrade_health[1].keys()):
    #         cinder_configuration = True
    #
    # if system_upgrade_health[0] == 3 or system_upgrade_health[0] == 2:
    #     LOG.info("System health indicate minor alarms; using --force option to start upgrade......")
    #     force = True
    #
    # if missing_manifests:
    #     LOG.info("Locking/Unlocking to resolve missing manifests in controller......")
    #
    #     lock_unlock_hosts = []
    #     if any("controller-1" in k for k in system_upgrade_health[1].keys()):
    #         lock_unlock_hosts.append('controller-1')
    #     if any("controller-0" in k for k in system_upgrade_health[1].keys()):
    #         lock_unlock_hosts.append('controller-0')
    #         cinder_configuration = False
    #
    #     for host in lock_unlock_hosts:
    #         rc, output = upgrade_helper.upgrade_host_lock_unlock(host)
    #         assert rc == 0, "Failed to lock/unlock host {}: {}".format(host, output)
    #
    # if cinder_configuration:
    #     LOG.info("Invalid Cinder configuration: Swact to controller-1 and back to synchronize.......")
    #     host_helper.swact_host('controller-0')
    #     time.sleep(60)
    #     host_helper.swact_host('controller-1')

    LOG.tc_step("Starting upgrade from release {} to target release {}".format(
        current_version, upgrade_version))
    upgrade_helper.system_upgrade_start(force=force)
    upgrade_helper.wait_for_upgrade_states("started")
    LOG.info("upgrade started successfully......")
    if collect_kpi:
        upgrade_helper.collect_upgrade_start_kpi(lab, collect_kpi)

    # upgrade standby controller
    LOG.tc_step("Upgrading controller-1")
    upgrade_helper.upgrade_host("controller-1", lock=True)
    LOG.info("Host controller-1 is upgraded successfully......")

    # unlock upgraded controller-1
    LOG.tc_step("Unlocking controller-1 after upgrade......")
    host_helper.unlock_host("controller-1",
                            timeout=(HostTimeout.CONTROLLER_UNLOCK + 10),
                            available_only=True, check_hypervisor_up=False)
    LOG.info("Host controller-1 unlocked after upgrade......")

    time.sleep(60)

    # Before Swacting ensure the controller-1 is in available state
    if not system_helper.wait_for_host_values("controller-1", timeout=600,
                                              fail_ok=True,
                                              operational=HostOperState.ENABLED,
                                              availability=HostAvailState.AVAILABLE):
        err_msg = "Swacting to controller-1 is not possible because controller-1 is " \
                  "not in available state within the specified timeout"
        assert False, err_msg

    # Swact to standby controller-1
    time.sleep(60)
    LOG.tc_step("Swacting to controller-1 .....")
    rc, output = host_helper.swact_host(hostname="controller-0")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted and controller-1 has become active......")
    time.sleep(60)

    # upgrade controller-0
    LOG.tc_step("Upgrading controller-0......")
    controller0 = lab['controller-0']

    # open vlm console for controller-0 for boot through mgmt interface
    if 'vbox' not in lab['name']:
        LOG.info("Opening a vlm console for controller-0 .....")
        install_helper.open_vlm_console_thread("controller-0", upgrade=True)

    LOG.info("Starting {} upgrade.....".format(controller0.name))
    upgrade_helper.upgrade_host(controller0.name, lock=True)
    LOG.info("controller-0 is upgraded successfully.....")

    # unlock upgraded controller-0
    LOG.tc_step("Unlocking controller-0 after upgrade......")
    host_helper.unlock_host(controller0.name, available_only=True)
    LOG.info("Host {} unlocked after upgrade......".format(controller0.name))

    upgrade_hosts = install_helper.get_non_controller_system_hosts()
    LOG.info("Starting upgrade of the other system hosts: {}".format(upgrade_hosts))

    for host in upgrade_hosts:
        LOG.tc_step("Starting {} upgrade.....".format(host))
        if "storage" in host:
            # wait for replication to be healthy
            ceph_health_timeout = 300
            if 'vbox' in lab['name']:
                ceph_health_timeout = 3600
            storage_helper.wait_for_ceph_health_ok(timeout=ceph_health_timeout)

        upgrade_helper.upgrade_host(host, lock=True)
        LOG.info("{} is upgraded successfully.....".format(host))
        LOG.tc_step("Unlocking {} after upgrade......".format(host))
        host_helper.unlock_host(host, available_only=True)
        LOG.info("Host {} unlocked after upgrade......".format(host))
        LOG.info("Host {} upgrade complete.....".format(host))

    # Activate the upgrade
    LOG.tc_step("Activating upgrade....")
    upgrade_helper.activate_upgrade()
    LOG.info("Upgrade activate complete.....")

    # Make controller-0 the active controller
    # Swact to standby controller-0
    LOG.tc_step("Making controller-0 active.....")
    rc, output = host_helper.swact_host(hostname="controller-1")
    assert rc == 0, "Failed to swact: {}".format(output)
    LOG.info("Swacted to controller-0 ......")

    # Complete upgrade
    LOG.tc_step("Completing upgrade from {} to {}".format(current_version,
                                                          upgrade_version))
    upgrade_helper.complete_upgrade()
    LOG.info("Upgrade is complete......")

    LOG.info("Lab: {} upgraded successfully".format(lab['name']))

    # Delete the previous load
    LOG.tc_step("Deleting {} load... ".format(current_version))
    upgrade_helper.delete_imported_load()
    LOG.tc_step("Delete previous load version {}".format(current_version))

    LOG.tc_step("Downloading images to upgraded {} lab ".format(upgrade_version))
    install_helper.download_image(
        lab, bld_server, BuildServerPath.GUEST_IMAGE_PATHS[upgrade_version])

    load_path = upgrade_setup['load_path']

    LOG.tc_step("Downloading heat templates to upgraded {} lab ".format(upgrade_version))
    install_helper.download_heat_templates(lab, bld_server, load_path)

    LOG.tc_step("Downloading lab config scripts to upgraded {} lab ".format(
        upgrade_version))
    install_helper.download_lab_config_files(lab, bld_server, load_path)
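
# NOTE: assumed shape only -- based on how test_system_upgrade() consumes it above,
# get_system_health_query_upgrade_2() appears to return a (code, details, actions)
# tuple, where code 0 means healthy, 2 means recoverable issues, and actions maps
# remediation names ('lock_unlock', 'swact', 'force_upgrade') to (needed, reason)
# pairs. A minimal consumer sketch under that assumption:
def _example_handle_upgrade_health(health):
    code, details, actions = health
    if code == 0:
        return []  # healthy; nothing to do
    if code != 2:
        raise AssertionError("System health query upgrade failed: {}".format(details))
    # collect the remediations whose 'needed' element is truthy (a bool or a host list)
    return [name for name, (needed, _reason) in actions.items() if needed]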
def test_restore(restore_setup):
    controller1 = 'controller-1'
    controller0 = 'controller-0'

    lab = restore_setup["lab"]
    is_aio_lab = lab.get('system_type', 'Standard') == 'CPE'
    is_sx = is_aio_lab and (len(lab['controller_nodes']) < 2)

    tis_backup_files = restore_setup['tis_backup_files']
    backup_src = RestoreVars.get_restore_var('backup_src'.upper())
    backup_src_path = RestoreVars.get_restore_var('backup_src_path'.upper())

    controller_node = lab[controller0]
    con_ssh = ControllerClient.get_active_controller(name=lab['short_name'],
                                                     fail_ok=True)
    sys_prompt = Prompt.TIS_NODE_PROMPT_BASE.format('.*' + lab['name'].split('_')[0])
    controller_prompt = '{}|{}'.format(sys_prompt, Prompt.CONTROLLER_0)
    controller_node.telnet_conn.set_prompt(controller_prompt)

    if not con_ssh:
        LOG.info("Establish ssh connection with {}".format(controller0))
        controller_node.ssh_conn = install_helper.ssh_to_controller(
            controller_node.host_ip, initial_prompt=controller_prompt)
        controller_node.ssh_conn.deploy_ssh_key()
        con_ssh = controller_node.ssh_conn
        ControllerClient.set_active_controller(con_ssh)

    LOG.info("Restore system from backup....")
    system_backup_file = [file for file in tis_backup_files
                          if "system.tgz" in file].pop()
    images_backup_file = [file for file in tis_backup_files
                          if "images.tgz" in file].pop()

    LOG.tc_step("Restoring {}".format(controller0))

    LOG.info("System config restore from backup file {} ...".format(
        system_backup_file))

    if backup_src.lower() == 'usb':
        system_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            system_backup_file)
    else:
        system_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           system_backup_file)

    compute_configured = install_helper.restore_controller_system_config(
        system_backup=system_backup_path, is_aio=is_aio_lab)[2]

    # return

    LOG.info('re-connect to the active controller using ssh')
    con_ssh.close()
    controller_node.ssh_conn = install_helper.ssh_to_controller(
        controller_node.host_ip, initial_prompt=controller_prompt)

    LOG.info("Source Keystone user admin environment ...")
    LOG.info("set prompt to:{}, telnet_conn:{}".format(
        controller_prompt, controller_node.telnet_conn))
    controller_node.telnet_conn.exec_cmd("cd; source /etc/platform/openrc")

    con_ssh = install_helper.ssh_to_controller(controller_node.host_ip)
    controller_node.ssh_conn = con_ssh
    ControllerClient.set_active_controller(con_ssh)

    make_sure_all_hosts_locked(con_ssh)

    if backup_src.lower() == 'local':
        images_backup_path = "{}{}".format(HostLinuxUser.get_home(),
                                           images_backup_file)
        common.scp_from_test_server_to_active_controller(
            "{}/{}".format(backup_src_path, images_backup_file),
            HostLinuxUser.get_home())
    else:
        images_backup_path = "{}/{}".format(BackupRestore.USB_BACKUP_PATH,
                                            images_backup_file)

    LOG.info("Images restore from backup file {} ...".format(images_backup_file))

    new_prompt = r'{}.*~.*\$ |controller\-0.*~.*\$ '.format(
        lab['name'].split('_')[0])
    LOG.info('set prompt to:{}'.format(new_prompt))
    con_ssh.set_prompt(new_prompt)

    install_helper.restore_controller_system_images(
        images_backup=images_backup_path,
        tel_net_session=controller_node.telnet_conn)

    # this is a workaround for CGTS-8190
    install_helper.update_auth_url(con_ssh)

    LOG.tc_step("Verifying restoring controller-0 is complete and is in "
                "available state ...")
    LOG.debug('Wait for system ready in 60 seconds')
    time.sleep(60)

    timeout = HostTimeout.REBOOT + 60
    availability = HostAvailState.AVAILABLE
    is_available = system_helper.wait_for_hosts_states(
        controller0, availability=HostAvailState.AVAILABLE, fail_ok=True,
        timeout=timeout)
    if not is_available:
        LOG.warn('After {} seconds, the first node:{} does NOT reach {}'.format(
            timeout, controller0, availability))
        LOG.info('Check if drbd is still synchronizing data')
        con_ssh.exec_sudo_cmd('drbd-overview')
        is_degraded = system_helper.wait_for_hosts_states(
            controller0, availability=HostAvailState.DEGRADED, fail_ok=True,
            timeout=300)
        if is_degraded:
            LOG.warn('Node: {} is degraded: {}'.format(
                controller0, HostAvailState.DEGRADED))
            con_ssh.exec_sudo_cmd('drbd-overview')
        else:
            LOG.fatal('Node:{} is NOT in Available nor Degraded status'.format(
                controller0))
            # the customer doc does have wording regarding this situation, continue
            # assert False, 'Node:{} is NOT in Available nor Degraded status'

    # delete the system backup files from sysadmin home
    LOG.tc_step("Copying backup files to /opt/backups ... ")
    if backup_src.lower() == 'local':
        con_ssh.exec_cmd("rm -f {} {}".format(system_backup_path,
                                              images_backup_path))

        cmd_rm_known_host = r'sed -i "s/^[^#]\(.*\)"/#\1/g /etc/ssh/ssh_known_hosts; \sync'
        con_ssh.exec_sudo_cmd(cmd_rm_known_host)

        # transfer all backup files to /opt/backups from test server
        with con_ssh.login_as_root():
            con_ssh.scp_on_dest(source_user=TestFileServer.get_user(),
                                source_ip=TestFileServer.get_server(),
                                source_pswd=TestFileServer.get_password(),
                                source_path=backup_src_path + "/*",
                                dest_path=StxPath.BACKUPS + '/',
                                timeout=1200)
    else:
        # copy all backup files from USB to /opt/backups
        cmd = " cp {}/* {}".format(BackupRestore.USB_BACKUP_PATH, StxPath.BACKUPS)
        con_ssh.exec_sudo_cmd(cmd, expect_timeout=600)

    LOG.tc_step("Checking if backup files are copied to /opt/backups ... ")
") assert int(con_ssh.exec_cmd("ls {} | wc -l".format(StxPath.BACKUPS))[1]) >= 2, \ "Missing backup files in {}".format(StxPath.BACKUPS) if is_aio_lab: LOG.tc_step("Restoring Cinder Volumes ...") restore_volumes() LOG.tc_step('Run restore-complete (CGTS-9756)') cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format( HostLinuxUser.get_password()) controller_node.telnet_conn.login() controller_node.telnet_conn.exec_cmd( cmd, extra_expects=[' will reboot on completion']) LOG.info('- wait untill reboot completes, ') time.sleep(120) LOG.info('- confirm the active controller is actually back online') controller_node.telnet_conn.login() LOG.tc_step( "reconnecting to the active controller after restore-complete") con_ssh = install_helper.ssh_to_controller(controller_node.host_ip) if not compute_configured: LOG.tc_step( 'Latest 18.07 EAR1 or Old-load on AIO/CPE lab: config its ' 'compute functionalities') # install_helper.run_cpe_compute_config_complete(controller_node, controller0) # LOG.info('closing current ssh connection') # con_ssh.close() LOG.tc_step('Run restore-complete (CGTS-9756)') controller_node.telnet_conn.login() cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.\ format(HostLinuxUser.get_password()) controller_node.telnet_conn.exec_cmd(cmd, extra_expects=' will reboot ') controller_node.telnet_conn.close() LOG.info( 'Wait until "config_controller" reboot the active controller') time.sleep(180) controller_node.telnet_conn = install_helper.open_telnet_session( controller_node) controller_node.telnet_conn.login() time.sleep(120) con_ssh = install_helper.ssh_to_controller(controller_node.host_ip) controller_node.ssh_conn = con_ssh ControllerClient.set_active_controller(con_ssh) host_helper.wait_for_hosts_ready(controller0) LOG.tc_step('Install the standby controller: {}'.format(controller1)) if not is_sx: install_non_active_node(controller1, lab) elif len(lab['controller_nodes']) >= 2: LOG.tc_step('Install the standby controller: {}'.format(controller1)) install_non_active_node(controller1, lab) boot_interfaces = lab['boot_device_dict'] hostnames = system_helper.get_hosts() storage_hosts = [host for host in hostnames if 'storage' in host] compute_hosts = [ host for host in hostnames if 'storage' not in host and 'controller' not in host ] if len(storage_hosts) > 0: # con_ssh.exec_sudo_cmd('touch /etc/ceph/ceph.client.None.keyring') for storage_host in storage_hosts: LOG.tc_step("Restoring {}".format(storage_host)) install_helper.open_vlm_console_thread( storage_host, boot_interface=boot_interfaces, vlm_power_on=True) LOG.info( "Verifying {} is Locked, Diabled and Online ...".format( storage_host)) system_helper.wait_for_hosts_states( storage_host, administrative=HostAdminState.LOCKED, operational=HostOperState.DISABLED, availability=HostAvailState.ONLINE) LOG.info("Unlocking {} ...".format(storage_host)) rc, output = host_helper.unlock_host(storage_host, available_only=True) assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format( storage_host, rc, output) LOG.info("Veryifying the Ceph cluster is healthy ...") storage_helper.wait_for_ceph_health_ok(timeout=600) LOG.info("Importing images ...") image_backup_files = install_helper.get_backup_files( IMAGE_BACKUP_FILE_PATTERN, StxPath.BACKUPS, con_ssh) LOG.info("Image backup found: {}".format(image_backup_files)) imported = install_helper.import_image_from_backup( image_backup_files) LOG.info("Images successfully imported: {}".format(imported)) LOG.tc_step("Restoring Cinder Volumes ...") 
restore_volumes() LOG.tc_step('Run restore-complete (CGTS-9756), regular lab') controller_node.telnet_conn.login() cmd = 'echo "{}" | sudo -S config_controller --restore-complete'.format( HostLinuxUser.get_password()) controller_node.telnet_conn.exec_cmd( cmd, extra_expects='controller-0 login:'******'rebuild ssh connection') con_ssh = install_helper.ssh_to_controller(controller_node.host_ip) controller_node.ssh_conn = con_ssh LOG.tc_step("Restoring Compute Nodes ...") if len(compute_hosts) > 0: for compute_host in compute_hosts: LOG.tc_step("Restoring {}".format(compute_host)) install_helper.open_vlm_console_thread( compute_host, boot_interface=boot_interfaces, vlm_power_on=True) LOG.info( "Verifying {} is Locked, Diabled and Online ...".format( compute_host)) system_helper.wait_for_hosts_states( compute_host, administrative=HostAdminState.LOCKED, operational=HostOperState.DISABLED, availability=HostAvailState.ONLINE) LOG.info("Unlocking {} ...".format(compute_host)) rc, output = host_helper.unlock_host(compute_host, available_only=True) assert rc == 0, "Host {} failed to unlock: rc = {}, msg: {}".format( compute_host, rc, output) LOG.info("All nodes {} are restored ...".format(hostnames)) else: LOG.warn('Only 1 controller, but not AIO lab!!??') LOG.tc_step("Delete backup files from {} ....".format(StxPath.BACKUPS)) con_ssh.exec_sudo_cmd("rm -rf {}/*".format(StxPath.BACKUPS)) LOG.tc_step('Perform post-restore testing/checking') post_restore_test(con_ssh) LOG.tc_step("Waiting until all alarms are cleared ....") timeout = 300 healthy, alarms = system_helper.wait_for_all_alarms_gone(timeout=timeout, fail_ok=True) if not healthy: LOG.warn('Alarms exist: {}, after waiting {} seconds'.format( alarms, timeout)) rc, message = con_ssh.exec_sudo_cmd('drbd-overview') if rc != 0 or (r'[===>' not in message and r'] sync\'ed: ' not in message): LOG.warn('Failed to get drbd-overview information') LOG.info('Wait for the system to be ready in {} seconds'.format( HostTimeout.REBOOT)) system_helper.wait_for_all_alarms_gone(timeout=HostTimeout.REBOOT, fail_ok=False) LOG.tc_step("Verifying system health after restore ...") rc, failed = system_helper.get_system_health_query(con_ssh=con_ssh) assert rc == 0, "System health not OK: {}".format(failed) collect_logs()
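
# NOTE: illustrative sketch only -- the drbd-overview check in test_restore above looks
# for the progress bar ("[===>") or the "sync'ed:" percentage that drbd prints while
# replication is still catching up; a small helper capturing that intent, assuming the
# same exec_sudo_cmd() return convention used above, might look like this:
def _example_drbd_still_syncing(con_ssh):
    rc, output = con_ssh.exec_sudo_cmd('drbd-overview')
    if rc != 0:
        LOG.warn('Failed to get drbd-overview information')
        return False
    # either marker indicates a resync is still in progress
    return '[===>' in output or "sync'ed: " in output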