def del_alarms():
    LOG.fixture_step("Delete 300.005 alarms and ensure they are removed from alarm-list")
    alarms_tab = system_helper.get_alarms_table(uuid=True)
    alarm_uuids = table_parser.get_values(table_=alarms_tab, target_header='UUID',
                                          **{'Alarm ID': alarm_id})
    if alarm_uuids:
        system_helper.delete_alarms(alarms=alarm_uuids)

    post_del_alarms = system_helper.get_alarms(alarm_id=alarm_id)
    assert not post_del_alarms, "300.005 alarm still exists after deletion"
def test_alarms():
    output = """+------+----------+-------------+-----------+----------+------------+
| UUID | Alarm ID | Reason Text | Entity ID | Severity | Time Stamp |
+------+----------+-------------+-----------+----------+------------+
+------+----------+-------------+-----------+----------+------------+
Mon Apr 3 19:41:50 UTC 2017
controller-0:~$
"""
    table_ = table_parser.table(output)
    print("empty table: {}".format(table_))

    alarms = system_helper.get_alarms()
    # system_helper.delete_alarms()
    # system_helper.get_alarms()
    system_helper.get_alarms_table()
def check_alarms(before_alarms, timeout=300, auth_info=Tenant.get('admin_platform'),
                 con_ssh=None, fail_ok=False):
    after_alarms = system_helper.get_alarms(auth_info=auth_info, con_ssh=con_ssh)
    new_alarms = []
    check_interval = 5
    for item in after_alarms:
        if item not in before_alarms:
            alarm_id, entity_id = item.split('::::')
            if alarm_id == EventLogID.CPU_USAGE_HIGH:
                check_interval = 45
            elif alarm_id == EventLogID.NTP_ALARM:
                # NTP alarm handling
                LOG.info("NTP alarm found, checking ntpq stats")
                host = entity_id.split('host=')[1].split('.ntp')[0]
                system_helper.wait_for_ntp_sync(host=host, fail_ok=False,
                                                auth_info=auth_info, con_ssh=con_ssh)
                continue
            new_alarms.append((alarm_id, entity_id))

    res = True
    remaining_alarms = None
    if new_alarms:
        LOG.info("New alarms detected. Waiting for new alarms to clear.")
        res, remaining_alarms = system_helper.wait_for_alarms_gone(
            new_alarms, fail_ok=True, timeout=timeout, check_interval=check_interval,
            auth_info=auth_info, con_ssh=con_ssh)

    if not res:
        msg = "New alarm(s) found and did not clear within {} seconds. " \
              "Alarm IDs and Entity IDs: {}".format(timeout, remaining_alarms)
        LOG.warning(msg)
        if not fail_ok:
            assert res, msg

    return res, remaining_alarms
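# Usage sketch (illustrative, not from the source): how a test might pair a
# before-snapshot with check_alarms. `operation_under_test` is a hypothetical
# placeholder for whatever action the test performs; the helper names match the
# ones used above.
def _example_check_alarms_usage(operation_under_test):
    # Snapshot the active alarms before the operation.
    before_alarms = system_helper.get_alarms()
    # Hypothetical action whose alarm side effects we want to verify.
    operation_under_test()
    # Fail if any new alarm appears and does not clear within 10 minutes.
    check_alarms(before_alarms=before_alarms, timeout=600, fail_ok=False)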
def wait_for_con_drbd_sync_complete():
    if len(system_helper.get_controllers(administrative=HostAdminState.UNLOCKED)) < 2:
        LOG.info("Less than two unlocked controllers on system. Do not wait for drbd sync")
        return False

    host = 'controller-1'
    LOG.fixture_step("Waiting for controller-1 drbd sync alarm to clear if present")
    end_time = time.time() + 1200
    while time.time() < end_time:
        drbd_alarms = system_helper.get_alarms(alarm_id=EventLogID.CON_DRBD_SYNC,
                                               reason_text='drbd-', entity_id=host,
                                               strict=False)
        if not drbd_alarms:
            LOG.info("{} drbd sync alarm is cleared".format(host))
            break
        time.sleep(10)
    else:
        assert False, "drbd sync alarm {} is not cleared within timeout".format(
            EventLogID.CON_DRBD_SYNC)

    LOG.fixture_step("Wait for {} to become available in system host-list".format(host))
    system_helper.wait_for_host_values(host, availability=HostAvailState.AVAILABLE,
                                       timeout=120, fail_ok=False, check_interval=10)

    LOG.fixture_step("Wait for {} drbd-cinder in sm-dump to reach desired state".format(host))
    host_helper.wait_for_sm_dump_desired_states(host, 'drbd-', strict=False, timeout=30,
                                                fail_ok=False)
    return True
def __verify_central_alarms(request, scope):
    region = 'RegionOne'
    auth_info = Tenant.get('admin_platform', dc_region=region)
    con_ssh = ControllerClient.get_active_controller(name=region)
    LOG.fixture_step("({}) Gathering fm alarms in central region before test {} "
                     "begins.".format(scope, scope))
    before_alarms = system_helper.get_alarms(auth_info=auth_info, con_ssh=con_ssh)

    def verify_alarms():
        LOG.fixture_step("({}) Verifying system alarms in central region after test {} "
                         "ended...".format(scope, scope))
        check_helper.check_alarms(before_alarms=before_alarms, auth_info=auth_info,
                                  con_ssh=con_ssh)
        LOG.info("({}) fm alarms verified in central region.".format(scope))

    request.addfinalizer(verify_alarms)
def activate_upgrade(con_ssh=None, fail_ok=False):
    """
    Activate the upgrade.

    Args:
        con_ssh (SSHClient):
        fail_ok (bool):

    Returns (tuple):
        (0, None) - success
        (1, <err_msg>) - cli rejected, alarm 250.001 did not clear, or activation did
            not complete; only returned if fail_ok is True, otherwise an exception is
            raised

    """
    rc, output = cli.system('upgrade-activate', ssh_client=con_ssh, fail_ok=True)
    if rc != 0:
        err_msg = "CLI system upgrade-activate failed: {}".format(output)
        LOG.warning(err_msg)
        if fail_ok:
            return rc, output
        else:
            raise exceptions.CLIRejected(err_msg)

    if not system_helper.wait_for_alarm_gone("250.001", con_ssh=con_ssh, timeout=900,
                                             check_interval=60, fail_ok=True):
        alarms = system_helper.get_alarms(alarm_id="250.001")
        err_msg = "Alarms did not clear after activating upgrade: {}".format(alarms)
        LOG.warning(err_msg)
        if fail_ok:
            return 1, err_msg
        else:
            raise exceptions.HostError(err_msg)

    if not wait_for_upgrade_activate_complete(fail_ok=True):
        err_msg = "Upgrade activate failed"
        LOG.warning(err_msg)
        if fail_ok:
            return 1, err_msg
        else:
            raise exceptions.HostError(err_msg)

    LOG.info("Upgrade activation complete")
    return 0, None
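# Usage sketch (illustrative, not from the source): calling activate_upgrade with
# fail_ok=True and handling the (rc, output) tuple documented above.
def _example_activate_upgrade_usage(con_ssh=None):
    rc, out = activate_upgrade(con_ssh=con_ssh, fail_ok=True)
    if rc != 0:
        # rc is non-zero when the CLI is rejected, alarm 250.001 does not clear,
        # or activation does not complete; `out` carries the error details.
        LOG.warning("Upgrade activation unsuccessful: {}".format(out))
    return rc == 0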
def teardown():
    """ If DNS servers are not set, set them. Deprovision internal DNS. """
    global UNRESTORED_DNS_SERVERS
    global HOSTS_AFFECTED

    if UNRESTORED_DNS_SERVERS:
        LOG.fixture_step("Restoring DNS entries to: {}".format(UNRESTORED_DNS_SERVERS))
        subnet_list = network_helper.get_subnets(network=mgmt_net_id)
        set_dns_servers(subnet_list, UNRESTORED_DNS_SERVERS, fail_ok=True)
        UNRESTORED_DNS_SERVERS = []

    if system_helper.get_alarms(alarm_id=EventLogID.CONFIG_OUT_OF_DATE):
        LOG.fixture_step("Config out-of-date alarm(s) present, check {} and lock/unlock "
                         "hosts whose config is out-of-date".format(HOSTS_AFFECTED))
        # Iterate over a copy since handled hosts are removed from the list below.
        for host in list(HOSTS_AFFECTED):
            if system_helper.get_host_values(host, 'config_status')[0] == 'Config out-of-date':
                LOG.info("Lock/unlock {} to clear config out-of-date status".format(host))
                host_helper.lock_unlock_hosts(hosts=host)
                HOSTS_AFFECTED.remove(host)
def check_volumes_spaces(con_ssh):
    from keywords import cinder_helper

    LOG.info('Checking cinder volumes and space usage')
    usage_threshold = 0.70

    free_space, total_space, unit = cinder_helper.get_lvm_usage(con_ssh)
    if total_space and free_space < usage_threshold * total_space:
        LOG.info('cinder LVM over-used: free:{}, total:{}, ratio:{}%'.format(
            free_space, total_space, free_space / total_space * 100))

        LOG.info('Deleting known LVM alarms')
        expected_reason = r'Cinder LVM .* Usage threshold exceeded; ' \
                          r'threshold: (\d+(\.\d+)?)%, actual: (\d+(\.\d+)?)%'
        expected_entity = 'host=controller'
        value_titles = ('UUID', 'Alarm ID', 'Reason Text', 'Entity ID')
        lvm_pool_usage = system_helper.get_alarms(fields=value_titles, con_ssh=con_ssh)
        if not lvm_pool_usage:
            LOG.warning('Cinder LVM pool appears over-used, but no alarm exists for it')
        else:
            if len(lvm_pool_usage) > 1:
                LOG.warning('More than one alarm exists for Cinder LVM over-usage')
            elif len(lvm_pool_usage) < 1:
                LOG.warning('No LVM cinder over-used alarms, got:{}'.format(lvm_pool_usage))

            for lvm_alarm in lvm_pool_usage:
                alarm_uuid, alarm_id, reason_text, entity_id = lvm_alarm.split('::::')
                if re.match(expected_reason, reason_text) and re.search(expected_entity,
                                                                        entity_id):
                    LOG.info('Expected alarm:{}, reason:{}'.format(alarm_uuid, reason_text))
                    LOG.info('Deleting it')
                    system_helper.delete_alarms(alarms=alarm_uuid)
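# Sanity-check sketch for the reason-text pattern used above. The sample alarm string
# is illustrative only; it is not captured from a live system.
def _example_lvm_reason_match():
    import re

    expected_reason = (r'Cinder LVM .* Usage threshold exceeded; '
                       r'threshold: (\d+(\.\d+)?)%, actual: (\d+(\.\d+)?)%')
    sample = 'Cinder LVM pool Usage threshold exceeded; threshold: 75.0%, actual: 82.3%'
    match = re.match(expected_reason, sample)
    assert match, "sample reason text should match the expected pattern"
    # Groups 1 and 3 hold the configured threshold and the actual usage.
    return match.group(1), match.group(3)   # ('75.0', '82.3')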
def test_alarm_suppression(alarm_test_prep):
    """
    Verify suppression and unsuppression of an active alarm and query alarms.

    Test Setup:
        - Unsuppress all alarms
        - Generate alarms

    Test Steps:
        - Suppress alarms
        - Verify alarm is suppressed
        - Generate alarm again
        - Verify suppressed alarm is not in the active list
        - Unsuppress alarm
        - Verify unsuppressed alarm is in the active alarm list
        - Delete last active alarm

    Test Teardown:
        - Unsuppress all alarms
    """
    LOG.tc_step('Suppress generated alarm and verify it is suppressed')
    alarm_uuid = alarm_test_prep
    query_active_alarm = system_helper.get_alarms_table(query_key='uuid',
                                                        query_value=alarm_uuid)
    alarm_id = table_parser.get_values(table_=query_active_alarm,
                                       target_header='Alarm ID',
                                       **{"UUID": alarm_uuid})[0]
    assert '300.005' == alarm_id
    # alarm_id = ''.join(alarm_id)
    system_helper.suppress_event(alarm_id=alarm_id)

    LOG.tc_step('Generate alarm again and verify it is not in the active list')
    system_helper.generate_event(event_id=alarm_id)
    alarms = system_helper.get_alarms(alarm_id=alarm_id)
    assert not alarms, "300.005 alarm appears in the active alarms table after regenerating"

    LOG.tc_step('Unsuppress alarm and verify it is unsuppressed')
    system_helper.unsuppress_event(alarm_id=alarm_id)
def restore_default_parameters():
    LOG.fixture_step('Check MNFA service parameter values and revert if needed')
    mnfa_threshold_current_val = system_helper.get_service_parameter_values(
        service='platform', section='maintenance', name='mnfa_threshold')
    mnfa_timeout_current_val = system_helper.get_service_parameter_values(
        service='platform', section='maintenance', name='mnfa_timeout')
    alarms = system_helper.get_alarms(alarm_id=EventLogID.CONFIG_OUT_OF_DATE)

    if alarms or mnfa_threshold_current_val != mnfa_threshold_default_val or \
            mnfa_timeout_current_val != mnfa_timeout_default_val:
        system_helper.modify_service_parameter(service='platform', section='maintenance',
                                               name='mnfa_threshold', apply=False,
                                               value=mnfa_threshold_default_val[0])
        system_helper.modify_service_parameter(service='platform', check_first=False,
                                               section='maintenance', name='mnfa_timeout',
                                               apply=True,
                                               value=mnfa_timeout_default_val[0])
def ntp_precheck(request, check_alarms):
    LOG.info("Gather NTP config and subcloud management info")
    central_auth = Tenant.get('admin_platform', dc_region='RegionOne')
    central_ntp = system_helper.get_ntp_servers(auth_info=central_auth)

    primary_subcloud = ProjVar.get_var('PRIMARY_SUBCLOUD')
    subcloud_auth = Tenant.get('admin_platform', dc_region=primary_subcloud)
    subcloud_ntp = system_helper.get_ntp_servers(auth_info=subcloud_auth)

    if not central_ntp == subcloud_ntp:
        dc_helper.wait_for_subcloud_ntp_config(subcloud=primary_subcloud)

    managed_subclouds = dc_helper.get_subclouds(mgmt='managed', avail='online')
    ssh_map = ControllerClient.get_active_controllers_map()
    managed_subclouds = [subcloud for subcloud in managed_subclouds if subcloud in ssh_map]
    if primary_subcloud in managed_subclouds:
        managed_subclouds.remove(primary_subcloud)

    managed_subcloud = None
    if managed_subclouds:
        managed_subcloud = managed_subclouds.pop()
        LOG.fixture_step("Leave only one subcloud besides primary subcloud to be managed: "
                         "{}".format(managed_subcloud))

    subclouds_to_revert = []
    if managed_subclouds:
        LOG.info("Unmanage: {}".format(managed_subclouds))
        for subcloud in managed_subclouds:
            if not system_helper.get_alarms(alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                                            auth_info=Tenant.get('admin_platform',
                                                                 dc_region=subcloud)):
                subclouds_to_revert.append(subcloud)
                dc_helper.unmanage_subcloud(subcloud)

    def revert():
        reverted = False
        try:
            LOG.fixture_step("Manage primary subcloud {} if unmanaged".format(primary_subcloud))
            dc_helper.manage_subcloud(primary_subcloud)

            LOG.fixture_step("Revert NTP config if changed")
            res = system_helper.modify_ntp(ntp_servers=central_ntp, auth_info=central_auth,
                                           check_first=True, clear_alarm=False)[0]
            if res != -1:
                LOG.fixture_step("Lock/unlock config out-of-date hosts in central region")
                system_helper.wait_and_clear_config_out_of_date_alarms(
                    auth_info=central_auth, wait_with_best_effort=True)

                LOG.fixture_step("Lock/unlock config out-of-date hosts in {}".format(
                    primary_subcloud))
                dc_helper.wait_for_subcloud_ntp_config(subcloud=primary_subcloud,
                                                       expected_ntp=central_ntp,
                                                       clear_alarm=True)

                if managed_subcloud:
                    LOG.fixture_step("Lock/unlock config out-of-date hosts in {}".format(
                        managed_subcloud))
                    dc_helper.wait_for_subcloud_ntp_config(subcloud=managed_subcloud,
                                                           expected_ntp=central_ntp,
                                                           clear_alarm=True)

            if subclouds_to_revert:
                LOG.fixture_step("Manage unmanaged subclouds and check they are unaffected")
                for subcloud in subclouds_to_revert:
                    dc_helper.manage_subcloud(subcloud)
                    assert not system_helper.get_alarms(
                        alarm_id=EventLogID.CONFIG_OUT_OF_DATE,
                        auth_info=Tenant.get('admin_platform', dc_region=subcloud))

            reverted = True
        finally:
            if not reverted:
                for subcloud in subclouds_to_revert:
                    dc_helper.manage_subcloud(subcloud)

    request.addfinalizer(revert)

    return primary_subcloud, managed_subcloud, central_ntp
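# Usage sketch (hypothetical consumer, not from the source): how a DC NTP test might
# unpack the tuple returned by the ntp_precheck fixture.
def test_example_dc_ntp_modify(ntp_precheck):
    primary_subcloud, managed_subcloud, central_ntp = ntp_precheck
    # A real test would modify NTP servers on the central region here and verify the
    # change propagates to the subcloud(s); the revert finalizer registered by
    # ntp_precheck restores the original configuration afterwards.
    LOG.info("Primary subcloud: {}, managed subcloud: {}, central NTP: {}".format(
        primary_subcloud, managed_subcloud, central_ntp))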
def test_system_patch_orchestration(patch_orchestration_setup):
    """
    This test verifies the patch orchestration procedure for release patches. Patch
    orchestration automatically patches all hosts on a system in the following order:
    controllers, storage hosts, then computes.

    The test creates a patch orchestration strategy (plan) for the automated patching
    operation, with the following options to customize the test:

    --controller-apply-type : specifies how controllers are patched, serially or in
        parallel. Controllers are always patched in serial regardless of the selection.
    --storage-apply-type : specifies how storage hosts are patched. Possible values are:
        serial, parallel or ignore. The default value is serial.
    --compute-apply-type : specifies how computes are patched. Possible values are:
        serial, parallel or ignore. The default value is serial.
    --max-parallel-compute-hosts : specifies the maximum number of computes to patch in
        parallel. Possible values: [2 - 100]. The default is 2.
    --instance-action : for reboot-required patches, specifies how VM instances are moved
        off compute hosts being patched. Possible choices are:
            start-stop - VMs are stopped before the compute host is patched.
            migrate - VMs are either live migrated or cold migrated off the compute
                before applying the patches.

    Args:
        patch_orchestration_setup:

    Returns:

    """
    lab = patch_orchestration_setup['lab']
    patching_helper.check_system_health(check_patch_ignored_alarms=False)

    LOG.info("Starting patch orchestration for lab {} .....".format(lab))
    patches = patch_orchestration_setup['patches']
    patch_ids = ' '.join(patches.keys())

    LOG.tc_step("Uploading patches {} ... ".format(patch_ids))
    patch_dest_dir = HostLinuxUser.get_home() + '/patches'
    rc = patching_helper.run_patch_cmd('upload-dir', args=patch_dest_dir)[0]
    assert rc in [0, 1], "Failed to upload patches in dir {}".format(patch_dest_dir)

    uploaded = patching_helper.get_available_patches()
    if rc == 0:
        LOG.info("Patches uploaded: {}".format(uploaded))
    else:
        LOG.info("Patches are already in the repo")

    if len(uploaded) > 0:
        LOG.tc_step("Applying patches ...")
        uploaded_patch_ids = ' '.join(uploaded)
        applied = patching_helper.apply_patches(patch_ids=uploaded_patch_ids)[1]
        LOG.info("Patches applied: {}".format(applied))
    else:
        LOG.info("No patches applied; patches may already be applied")

    partial_patches_ids = patching_helper.get_patches_in_state(
        (PatchState.PARTIAL_APPLY, PatchState.PARTIAL_REMOVE))
    if len(partial_patches_ids) > 0:
        current_alarms_ids = system_helper.get_alarms(mgmt_affecting=True,
                                                      combine_entries=False)
        affecting_alarms = [id_ for id_ in current_alarms_ids
                            if id_[0] not in orchestration_helper.IGNORED_ALARM_IDS]
        if len(affecting_alarms) > 0:
            assert system_helper.wait_for_alarms_gone(alarms=affecting_alarms, timeout=240,
                                                      fail_ok=True)[0], \
                "Alarms present: {}".format(affecting_alarms)

        LOG.tc_step("Installing patches through orchestration .....")
        patching_helper.orchestration_patch_hosts(
            controller_apply_type=patch_orchestration_setup['controller_apply_strategy'],
            storage_apply_type=patch_orchestration_setup['storage_apply_strategy'],
            compute_apply_type=patch_orchestration_setup['compute_apply_strategy'],
            max_parallel_computes=patch_orchestration_setup['max_parallel_computes'],
            instance_action=patch_orchestration_setup['instance_action'],
            alarm_restrictions=patch_orchestration_setup['alarm_restrictions'])
        LOG.info("Applying patch orchestration strategy completed for {} ....".format(
            partial_patches_ids))

        LOG.tc_step("Deleting patch orchestration strategy .....")
        delete_patch_strategy()
        LOG.info("Deleted patch orchestration strategy .....")
    else:
        pytest.skip("All patches in patch-dir are already in system.")
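# Illustrative shape of the patch_orchestration_setup fixture consumed above. The keys
# follow the lookups in the test; all values below are placeholder examples, not values
# taken from the framework.
example_patch_orchestration_setup = {
    'lab': 'example-lab',                    # hypothetical lab identifier
    'patches': {'EXAMPLE_PATCH_0001': '/home/sysadmin/patches/EXAMPLE_PATCH_0001.patch'},
    'controller_apply_strategy': 'serial',   # controllers are always patched serially
    'storage_apply_strategy': 'serial',      # serial | parallel | ignore
    'compute_apply_strategy': 'parallel',    # serial | parallel | ignore
    'max_parallel_computes': 2,              # 2 - 100
    'instance_action': 'migrate',            # start-stop | migrate
    'alarm_restrictions': 'relaxed',         # hypothetical value, passed through as-is
}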
def __get_alarms(scope):
    LOG.fixture_step("({}) Gathering system health info before test {} "
                     "begins.".format(scope, scope))
    alarms = system_helper.get_alarms()
    return alarms