def Trigger(tc):
    i = 0
    workloads = api.GetWorkloads(tc.node)
    while i < tc.iterations:
        api.Logger.info(f"Starting iteration {i}")
        api.Logger.info("Clearing learn-db")
        if not learn_utils.ClearLearnData(tc.node):
            api.Logger.error("Failed to clear learn info at iteration %d" % i)
            return api.types.status.FAILURE
        api.Logger.info("Re-learning all endpoints")
        if not arp_utils.SendGratArp(workloads):
            api.Logger.error("Failed to send arp probes at iteration %d" % i)
            return api.types.status.FAILURE
        api.Logger.info("Sending some ARP replies")
        for _ in range(2):
            arp_utils.SendArpReply(workloads)
            misc_utils.Sleep(2)
        learn_utils.DumpLearnData()
        if not learn_utils.ValidateLearnInfo(tc.node):
            api.Logger.error("Failed to validate learn data at iteration %d" % i)
            return api.types.status.FAILURE
        i += 1
        misc_utils.Sleep(tc.interval)
    return api.types.status.SUCCESS
def Trigger(tc):
    # Read Age and State. Sleep for TTL of the entry.
    max_retry = 5
    interval = 2
    retry = 0
    while retry < max_retry:
        ret, data = learn_utils.ReadLearnIPOperData(tc.node, tc.learn_ip_obj)
        if not ret or data is None:
            return api.types.status.FAILURE
        if data['state'] == learn_pb2.EP_STATE_CREATED and data['ttl'] > 0:
            api.Logger.debug("IP endpoint is in Created state with ttl %d" % data['ttl'])
            break
        misc_utils.Sleep(interval)
        retry += 1
    learn_utils.DumpLearnIP(tc.node, tc.learn_ip_obj)
    if retry == max_retry:
        api.Logger.error(
            "IP endpoint not seen in Learning state even after %d retries" % max_retry)
        return api.types.status.FAILURE
    misc_utils.Sleep(data['ttl'])
    return api.types.status.SUCCESS
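# A note on the contract assumed above: ReadLearnIPOperData is taken to return
# a (bool, dict) pair, where the dict carries at least the endpoint state and
# the remaining TTL in seconds. The shape sketched here is inferred from usage
# in this section, not from the utility's documentation.
#
#   ret, data = learn_utils.ReadLearnIPOperData(node, ip_obj)
#   # ret  -> True on a successful read
#   # data -> e.g. {'state': learn_pb2.EP_STATE_CREATED, 'ttl': 27}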
def Verify(tc):
    max_retry = 2
    interval = 2
    deviceLearnAgeTimeout = EzAccessStoreClient[
        tc.node].GetDevice().GetLearnAgeTimeout()

    # All IP endpoints must first transition to Probing state.
    for learn_ip_obj in tc.learn_ip_obj_list:
        retry = 0
        while retry < max_retry:
            ret, data = learn_utils.ReadLearnIPOperData(tc.node, learn_ip_obj)
            if not ret or data is None:
                return api.types.status.FAILURE
            if data['state'] == learn_pb2.EP_STATE_PROBING:
                break
            retry += 1
            misc_utils.Sleep(interval)
        if retry == max_retry:
            api.Logger.error("One of the IP endpoints not in Probing state")
            learn_utils.DumpLearnIP(tc.node)
            return api.types.status.FAILURE
    api.Logger.verbose("All IP endpoints are in Probing state now")
    learn_utils.DumpLearnIP(tc.node)

    # Sleep for 30s thrice to let 3 probes be sent, plus a few seconds of delay.
    misc_utils.Sleep(93)

    for learn_ip_obj in tc.learn_ip_obj_list:
        ret, data = learn_utils.ReadLearnIPOperData(tc.node, learn_ip_obj)
        if not ret or data is not None:
            api.Logger.error("One of IP endpoints still not deleted")
            return api.types.status.FAILURE
    api.Logger.verbose("All IP endpoints are aged out")

    # The MAC entry must still be alive with a non-zero age ...
    ret, data = learn_utils.ReadLearnMACOperData(tc.node, tc.learn_mac_obj)
    if not ret or data is None or data['ttl'] == 0:
        api.Logger.error("MAC expected to have a non-zero age")
        return api.types.status.FAILURE

    # ... and must get flushed once the device learn-age timeout elapses.
    misc_utils.Sleep(deviceLearnAgeTimeout)
    ret, data = learn_utils.ReadLearnMACOperData(tc.node, tc.learn_mac_obj)
    if not ret or data is not None:
        api.Logger.error("MAC expected to be deleted by now but still hanging around")
        learn_utils.DumpLearnMAC(tc.node, tc.learn_mac_obj)
        return api.types.status.FAILURE
    api.Logger.verbose("MAC got flushed after age out")

    stats = learn_utils.GetLearnStatistics([tc.node])
    if stats[tc.node]['macageouts'] != 1 or \
       stats[tc.node]['ipageouts'] != len(tc.learn_ip_obj_list):
        api.Logger.error("Ageout statistics not seen as expected")
        return api.types.status.FAILURE
    return api.types.status.SUCCESS
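# The 93s sleep above is 3 probe intervals of 30s plus ~3s of slack. If the
# probe cadence ever changes, deriving the wait from constants keeps the test
# honest; PROBE_INTERVAL_SECS / NUM_PROBES / SLACK_SECS below are illustrative
# names assumed for this sketch, not framework-provided values.
PROBE_INTERVAL_SECS = 30   # assumed ARP-probe period
NUM_PROBES = 3             # probes sent before an entry is declared dead
SLACK_SECS = 3             # cushion for scheduling jitter

def __probe_ageout_wait():
    # 3 * 30 + 3 == 93, matching the hard-coded sleep in Verify() above
    return NUM_PROBES * PROBE_INTERVAL_SECS + SLACK_SECS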
def setDataPortStatePerUplink(naples_nodes, oper, id):
    uplink_list = []
    if id == 'Uplink0':
        uplink_list.append(UPLINK_PREFIX1)
    elif id == 'Uplink1':
        uplink_list.append(UPLINK_PREFIX2)
    else:
        uplink_list.append(UPLINK_PREFIX1)
        uplink_list.append(UPLINK_PREFIX2)
    if GlobalOptions.dryrun:
        return api.types.status.SUCCESS
    for node in naples_nodes:
        node_uuid = EzAccessStoreClient[node].GetNodeUuid(node)
        for uplink in uplink_list:
            intf_uuid = uplink % node_uuid
            cmd = "debug update port --admin-state %s --port %s" % (oper, intf_uuid)
            ret, resp = pdsctl.ExecutePdsctlCommand(node, cmd, yaml=False)
            if not ret:
                api.Logger.error("oper:%s uplink failed at node %s : %s" %
                                 (oper, node, resp))
                return api.types.status.FAILURE
        misc_utils.Sleep(1)  # give a short gap before printing status
        pdsctl.ExecutePdsctlShowCommand(node, "port status", yaml=False)
    return api.types.status.SUCCESS
def FlapSwitchPort(tc, num_ports=1, down_time=0, port='any'):
    naples_nodes = api.GetNaplesHostnames()
    api.Logger.info("Flapping switch port on %s ..." % naples_nodes)
    port_num = 1
    if num_ports == 1 and port == 'any':
        port_num = random.choice([1, 2])
    ret = api.ShutDataPorts(naples_nodes, num_ports, start_port=port_num)
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to Shut the switch port:%s" % port_num)
        return api.types.status.FAILURE
    ret = DetectUpLinkState(naples_nodes, PORT_OPER_STATUS_DOWN, any)
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to detect any uplink(%s) in DOWN state." % port_num)
        return api.types.status.FAILURE
    misc_utils.Sleep(down_time)
    ret = api.UnShutDataPorts(naples_nodes, num_ports, start_port=port_num)
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to UnShut the switch port:%s" % port_num)
        return api.types.status.FAILURE
    ret = DetectUpLinkState(naples_nodes, PORT_OPER_STATUS_UP, all)
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to detect all uplinks(%s) in UP state." % port_num)
        return api.types.status.FAILURE
    return api.types.status.SUCCESS
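# Minimal usage sketch: flap one randomly chosen uplink and hold it down for
# 5 seconds. The any/all arguments passed to DetectUpLinkState are the Python
# builtins, used as aggregation callbacks over per-port link state (see
# __detectUpLinkState further below).
#
#   FlapSwitchPort(tc, num_ports=1, down_time=5, port='any')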
def HitlessTriggerUpdateRequest(tc):
    result = api.types.status.SUCCESS
    if api.IsDryrun():
        return result

    background_req = api.Trigger_CreateExecuteCommandsRequest(serial=False)
    # start upgrade manager process
    for node in tc.nodes:
        cmd = "/nic/tools/start-upgmgr.sh -n "
        api.Logger.info("Starting Upgrade Manager %s" % (cmd))
        api.Trigger_AddNaplesCommand(background_req, node, cmd, background=True)
    api.Trigger(background_req)

    # wait for upgrade manager to come up
    misc_utils.Sleep(10)

    for node in tc.nodes:
        # initiate upgrade client objects
        UpgradeClient.GenerateUpgradeObjects(node, api.GetNicMgmtIP(node))
        upg_obj = UpgradeClient.GetUpgradeObject(node)
        upg_obj.SetPkgName(tc.pkg_name)
        upg_obj.SetUpgMode(upgrade_pb2.UPGRADE_MODE_HITLESS)
        upg_status = upg_obj.UpgradeReq()
        api.Logger.info(
            f"Hitless Upgrade request for {node} returned status {upg_status}")
        if upg_status != upgrade_pb2.UPGRADE_STATUS_OK:
            api.Logger.error(f"Failed to start upgrade manager on {node}")
            result = api.types.status.FAILURE
            continue
    return result
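# Rather than a fixed 10s sleep, the readiness wait could poll until the
# upgrade manager process responds. A sketch using the same request APIs as
# above; "pgrep upgmgr" is an assumed probe command, not something the
# framework mandates.
def __wait_for_upgmgr(nodes, max_retry=10):
    for _ in range(max_retry):
        req = api.Trigger_CreateExecuteCommandsRequest(serial=False)
        for node in nodes:
            api.Trigger_AddNaplesCommand(req, node, "pgrep upgmgr", timeout=2)
        resp = api.Trigger(req)
        # ready once every node reports the process as running
        if all(cmd.exit_code == 0 for cmd in resp.commands):
            return api.types.status.SUCCESS
        misc_utils.Sleep(1)
    return api.types.status.FAILURE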
def Verify(tc):
    if tc.skip:
        return api.types.status.SUCCESS

    misc_utils.Sleep(5)  # let metaswitch carry this to other side
    learn_utils.DumpLearnData()
    ret = __validate_move_stats(tc.mv_ctx['src_wl'].node_name,
                                tc.mv_ctx['dst_wl'].node_name)
    if ret != api.types.status.SUCCESS:
        return api.types.status.FAILURE
    api.Logger.verbose("Move statistics are matching expectation on both nodes")

    # Validate flow moves on source and destination workloads
    ret = __validate_flow_move(tc, tc.mv_ctx['src_wl'].node_name,
                               tc.mv_ctx['ip_prefix'], 'L2R')
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to validate flows on node %s" %
                         tc.mv_ctx['src_wl'].node_name)
        return api.types.status.FAILURE
    ret = __validate_flow_move(tc, tc.mv_ctx['dst_wl'].node_name,
                               tc.mv_ctx['ip_prefix'], 'R2L')
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to validate flows on node %s" %
                         tc.mv_ctx['dst_wl'].node_name)
        return api.types.status.FAILURE

    # Validate with traffic after moving
    return move_utils.ValidateEPMove()
def Verify(tc):
    # Read Age and State. Check the age if it is still in Learning state;
    # retry if it is in Probing state.
    max_retry = 2
    interval = 2
    retry = 0
    deviceLearnAgeTimeout = EzAccessStoreClient[
        tc.node].GetDevice().GetLearnAgeTimeout()
    while retry < max_retry:
        ret, data = learn_utils.ReadLearnIPOperData(tc.node, tc.learn_ip_obj)
        if not ret or data is None:
            return api.types.status.FAILURE
        if data['state'] == learn_pb2.EP_STATE_CREATED and \
           data['ttl'] > (deviceLearnAgeTimeout - 5):
            api.Logger.debug(
                "IP endpoint is in Created state with ttl %d after refresh" % data['ttl'])
            return api.types.status.SUCCESS
        misc_utils.Sleep(interval)
        retry += 1
    learn_utils.DumpLearnIP(tc.node, tc.learn_ip_obj)
    api.Logger.error(
        "IP endpoint did not have the expected age even after %d retries" % max_retry)
    return api.types.status.FAILURE
def __trim_memory():
    res = pdsctlUtils.TrimMemory()
    if not res:
        api.Logger.error("Failed to trim memory")
        return api.types.status.FAILURE
    # wait a few seconds for memory to be reclaimed
    miscUtils.Sleep(10)
    return api.types.status.SUCCESS
def checkUpgradeStatusViaConsole(tc):
    result = api.types.status.SUCCESS
    status_in_progress = True
    retry_count = 0
    while status_in_progress:
        misc_utils.Sleep(1)
        retry_count += 1
        if retry_count == 300:
            # break if status is still in-progress after max retries
            result = api.types.status.FAILURE
            break
        status_in_progress = False
        for node in tc.nodes:
            (resp, exit_code) = api.RunNaplesConsoleCmd(
                node, "grep -vi in-progress /update/pds_upg_status.txt", True)
            api.Logger.verbose("checking upgrade for node: %s, exit_code:%s " %
                               (node, exit_code))
            if exit_code != 0:
                status_in_progress = True
                break
            else:
                api.Logger.info(
                    "Status other than in-progress found in %s, /update/pds_upg_status.txt" % node)
                lines = resp.split('\r\n')
                for line in lines:
                    api.Logger.info(line.strip())
        if retry_count % 10 == 0:
            api.Logger.info(
                "Checking for status not in-progress in file /update/pds_upg_status.txt, retries: %s" % retry_count)
        if status_in_progress:
            continue
        for node in tc.nodes:
            (resp, exit_code) = api.RunNaplesConsoleCmd(
                node, "grep -i success /update/pds_upg_status.txt", True)
            api.Logger.info(
                "Checking for success status in file /update/pds_upg_status.txt")
            if exit_code != 0:
                result = api.types.status.FAILURE
            else:
                api.Logger.info("Success Status found in /update/pds_upg_status.txt")
    if status_in_progress:
        api.Logger.error("Upgrade Failed: Status is still IN-PROGRESS")
    return result
def getFirstOperDownPort(node):
    misc_utils.Sleep(3)
    if GlobalOptions.dryrun:
        return api.types.status.SUCCESS
    node_uuid = EzAccessStoreClient[node].GetNodeUuid(node)
    for uplink in [UPLINK_PREFIX1, UPLINK_PREFIX2]:
        intf_uuid = uplink % node_uuid
        cmd = "port status -p " + intf_uuid
        ret, resp = pdsctl.ExecutePdsctlShowCommand(node, cmd, yaml=False)
        if ret and "UP DOWN" in resp:
            return uplinkDict[uplink]
    return None  # no oper-down uplink found
def Teardown(tc):
    if tc.skip:
        return api.types.status.SUCCESS

    stats_utils.Clear()
    ip_prefix = tc.mv_ctx['ip_prefix']
    src_wl = tc.mv_ctx['src_wl']
    dst_wl = tc.mv_ctx['dst_wl']

    # Move the endpoint back to its original home
    move_utils.MoveEpIPEntry(dst_wl.node_name, src_wl.node_name, ip_prefix)
    misc_utils.Sleep(5)  # let metaswitch carry it to the other side
    learn_utils.DumpLearnData()

    ret = __validate_move_stats(dst_wl.node_name, src_wl.node_name)
    if ret != api.types.status.SUCCESS:
        return api.types.status.FAILURE
    api.Logger.verbose("Move statistics are matching expectation on both nodes")

    # Validate flow move on src and dst.
    ret = __validate_flow_move(tc, src_wl.node_name, ip_prefix, 'R2L')
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to validate flows on node %s" % src_wl.node_name)
        return api.types.status.FAILURE
    ret = __validate_flow_move(tc, dst_wl.node_name, ip_prefix, 'L2R')
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to validate flows on node %s" % dst_wl.node_name)
        return api.types.status.FAILURE

    # Also validate new flows on src
    ret = __validate_flows(tc, src_wl.node_name)
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to validate flows on node %s" % src_wl.node_name)
        return api.types.status.FAILURE

    # Terminate background ping and check for loss
    ret = __verify_background_ping(tc)
    if ret != api.types.status.SUCCESS:
        return ret

    # Validate with traffic after moving back
    if move_utils.ValidateEPMove() != api.types.status.SUCCESS:
        return api.types.status.FAILURE
    return flow_utils.clearFlowTable(None)
def verifyDataPortState(naples_nodes, admin, oper):
    ret = api.types.status.SUCCESS
    if GlobalOptions.dryrun:
        return ret
    retry_remaining = verifyRetry
    ret = verifyDataPortStateHelper(naples_nodes, admin, oper)
    while ret == api.types.status.FAILURE and retry_remaining > 0:
        misc_utils.Sleep(1)
        retry_remaining -= 1
        ret = verifyDataPortStateHelper(naples_nodes, admin, oper)
    return ret
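# Several functions in this section hand-roll the same retry-with-sleep loop.
# A sketch of a shared helper under the same conventions; the name __retry and
# its signature are illustrative, not part of the harness.
def __retry(fn, max_retry, interval=1):
    # Call fn() up to max_retry + 1 times until it returns SUCCESS.
    ret = fn()
    while ret == api.types.status.FAILURE and max_retry > 0:
        misc_utils.Sleep(interval)
        max_retry -= 1
        ret = fn()
    return ret

# e.g. verifyDataPortState above could reduce to:
#   __retry(lambda: verifyDataPortStateHelper(naples_nodes, admin, oper), verifyRetry)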
def __poll_upgrade_status(tc, status, **kwargs):
    # Polls indefinitely until every node reports the desired status.
    not_found = True
    retry = 0
    while not_found:
        misc_utils.Sleep(1)
        not_found = False
        for node in tc.nodes:
            api.Logger.info(
                f"retry {retry}: Checking upgrade status {status.name} on {node}")
            if not CheckUpgradeStatus(node, status):
                # keep polling if any node has not reached the desired status
                not_found = True
        retry += 1
def __detectUpLinkState(node, state, cb, tries=6):
    PORT_TYPE_MGMT = 2
    while tries:
        uplink = GetUplinkStatus(node)
        arr = [
            port['status']['linkstatus']["operstate"] == state
            for port in uplink
            if port['spec']['porttype'] != PORT_TYPE_MGMT
        ]
        if cb(arr):
            return api.types.status.SUCCESS
        misc_utils.Sleep(1)
        tries -= 1
    return api.types.status.FAILURE
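# The cb argument is an aggregation callback over the per-uplink booleans;
# callers pass the Python builtins any/all (see FlapSwitchPort above). For
# example:
#
#   __detectUpLinkState(node, PORT_OPER_STATUS_DOWN, any)           # at least one down
#   __detectUpLinkState(node, PORT_OPER_STATUS_UP, all, tries=10)   # every uplink up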
def Teardown(tc):
    ret = learn_utils.SetWorkloadIntfOperState(tc.workload, 'up')
    if not ret:
        api.Logger.error("Failed to bring up interface for workload %s on node %s" %
                         (tc.learn_mac_obj.GID(), tc.node))
        return api.types.status.FAILURE
    arp_utils.SendGratArp([tc.workload])
    learn_utils.DumpLearnMAC(tc.node, tc.learn_mac_obj)
    # let remote mappings for this VNIC sync to other nodes
    misc_utils.Sleep(10)
    if not learn_utils.ValidateLearnInfo():
        api.Logger.error("Learn validation failed")
        return api.types.status.FAILURE
    api.Logger.verbose("Aged out Endpoints are learnt again")
    return api.types.status.SUCCESS
def switchPortFlap(tc):
    flap_count = 1
    num_ports = 2
    interval = 2
    down_time = 2
    naples_nodes = api.GetNaplesHostnames()
    api.Logger.info("Flapping switch port on %s ..." % naples_nodes)
    ret = api.FlapDataPorts(naples_nodes, num_ports, down_time, flap_count,
                            interval)
    if ret != api.types.status.SUCCESS:
        api.Logger.error("Failed to flap the switch port")
        return ret
    misc_utils.Sleep(60)  # give a gap for BGP to reconcile
    return api.types.status.SUCCESS
def __verify_learning():
    api.Logger.verbose("Verifying if all VNIC and Mappings are learnt")
    for node in api.GetNaplesHostnames():
        if not bgp_utils.ValidateBGPOverlayNeighborship(node):
            api.Logger.error("Failed in BGP Neighborship validation for node: %s" % node)
            return api.types.status.FAILURE
    learn_utils.DumpLearnData()
    # Sleep for some time to let metaswitch advertise these local mappings to
    # other naples. TODO: find out if there is any event to wait on.
    api.Logger.debug("Sleeping for some time to let remote mappings get programmed")
    misc_utils.Sleep(40)
    if not learn_utils.ValidateLearnInfo():
        return api.types.status.FAILURE
    return api.types.status.SUCCESS
def Verify(tc):
    misc_utils.Sleep(40)  # let remote mappings get programmed
    if not learn_utils.ValidateLearnInfo():
        api.Logger.error("Learn validation failed")
        return api.types.status.FAILURE
    workload_pairs = config_api.GetWorkloadPairs(
        config_api.WORKLOAD_PAIR_TYPE_LOCAL_ONLY,
        config_api.WORKLOAD_PAIR_SCOPE_INTER_SUBNET)
    workload_pairs.extend(
        config_api.GetWorkloadPairs(
            config_api.WORKLOAD_PAIR_TYPE_REMOTE_ONLY,
            config_api.WORKLOAD_PAIR_SCOPE_INTRA_SUBNET))
    workload_pairs.extend(
        config_api.GetWorkloadPairs(
            config_api.WORKLOAD_PAIR_TYPE_REMOTE_ONLY,
            config_api.WORKLOAD_PAIR_SCOPE_INTER_SUBNET))
    return conn_utils.ConnectivityTest(workload_pairs, ['icmp'], ['ipv4'], [64],
                                       0, 'all')
def check_underlay_bgp_peer_connectivity(sleep_time=0, timeout_val=0):
    # timeout_val == 0 means a single attempt with no retries.
    api.Logger.info("Starting BGP underlay validation ...")
    timeout_start = time.time()
    retry_count = 1
    while True:
        if ValidateBGPUnderlayNeighborshipInfo():
            return api.types.status.SUCCESS
        if timeout_val == 0 or time.time() >= timeout_start + timeout_val:
            break
        retry_count += 1
        api.Logger.verbose("BGP underlay is still not up, will do retry({0}) "
                           "after {1} sec...".format(retry_count, sleep_time))
        if sleep_time > 0:
            misc_utils.Sleep(sleep_time)
    api.Logger.error("BGP underlay validation failed ...")
    return api.types.status.FAILURE
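# Usage as seen later in this section: poll the BGP underlay for up to two
# minutes, sleeping 15s between attempts.
#
#   check_underlay_bgp_peer_connectivity(sleep_time=15, timeout_val=120)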
def __verify_move_stats(home, new_home, max_retry=4, interval=5):
    retry = 0
    while retry < max_retry:
        misc_utils.Sleep(interval)  # let metaswitch carry this to other side
        learn_utils.DumpLearnData()
        ret = __validate_move_stats(home, new_home)
        if ret == api.types.status.SUCCESS:
            break
        retry += 1
        api.Logger.verbose(f"Retrying Move statistics validation - retry {retry}")
    if retry == max_retry:
        api.Logger.error("Failed to validate move statistics even after retries")
        return api.types.status.FAILURE
    return api.types.status.SUCCESS
def switchPortOp(naples_nodes, oper, id):
    num_ports = 1
    start_port_id = 1
    if id == 'Switchport1':
        start_port_id = 2
    elif id == "Switchports":
        num_ports = 2
    api.Logger.info(f"Oper: {oper} for {id} on {naples_nodes} ...")
    if oper == 'down':
        ret = api.ShutDataPorts(naples_nodes, num_ports, start_port_id)
    else:
        ret = api.UnShutDataPorts(naples_nodes, num_ports, start_port_id)
    if ret != api.types.status.SUCCESS:
        api.Logger.error(f"Failed to bring {oper} : {id}")
        return ret
    misc_utils.Sleep(60)  # TBD: temporary fix to wait 60 seconds for BGP sessions
    return api.types.status.SUCCESS
def ProcessObjectsByOperation(oper, select_objs, spec=None):
    supported_ops = ['Create', 'Read', 'Delete', 'Update']
    res = api.types.status.SUCCESS
    if oper is None or oper not in supported_ops:
        return res
    for obj in select_objs:
        if getattr(obj, oper)(spec):
            if not getattr(obj, 'Read')():
                api.Logger.error(f"read after {oper} failed for object: {obj}")
                res = api.types.status.FAILURE
        else:
            api.Logger.error(f"{oper} failed for object: {obj}")
            res = api.types.status.FAILURE
        if oper == 'Delete':
            if hasattr(obj, 'VerifyDepsOperSt'):
                # needed until delay_delete is enabled, since a read can happen
                # while the object is being deleted
                misc_utils.Sleep(3)
                if not obj.VerifyDepsOperSt(oper):
                    api.Logger.error(
                        f"Dependent object oper state not as expected after {oper} on {obj}")
                    res = api.types.status.FAILURE
    return res
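# Usage sketch, assuming a list of config objects implementing
# Create/Read/Update/Delete; `objs` and `update_spec` here are hypothetical
# names for illustration only.
#
#   res = ProcessObjectsByOperation('Update', objs, update_spec)
#   if res != api.types.status.SUCCESS:
#       api.Logger.error("bulk update failed")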
def Trigger(tc):
    # Read Age and State. Select the IP entry in Learning state with the highest age.
    age_max = 0
    ret = learn_utils.SetWorkloadIntfOperState(tc.workload, 'down')
    if not ret:
        api.Logger.error("Failed to bring down interface for workload %s on node %s" %
                         (tc.learn_mac_obj.GID(), tc.node))
        return api.types.status.FAILURE
    api.Logger.verbose("Brought interface down for workload %s" % tc.learn_mac_obj)
    for learn_ip_obj in tc.learn_ip_obj_list:
        ret, data = learn_utils.ReadLearnIPOperData(tc.node, learn_ip_obj)
        if not ret or data is None:
            api.Logger.error("Failed to read IP endpoint %s from node %s" %
                             (learn_ip_obj.IP, tc.node))
            return api.types.status.FAILURE
        if data['state'] == learn_pb2.EP_STATE_CREATED and data['ttl'] > age_max:
            age_max = data['ttl']
    if age_max == 0:
        # All entries are in Probing state already
        return api.types.status.SUCCESS
    learn_utils.DumpLearnIP(tc.node)
    # Wait for the oldest entry to age out of Learning state
    misc_utils.Sleep(age_max)
    return api.types.status.SUCCESS
def Trigger(tc):
    ret = tc.subnet.ModifyHostInterface()
    if not ret:
        api.Logger.error("Failed to modify host interface association for subnet")
        return api.types.status.FAILURE
    if api.IsDryrun():
        old_intf = new_intf = 'dryrun'
    else:
        old_intf = intf_client.FindHostInterface(
            tc.subnet.Node, tc.hostifidx).GetInterfaceName()
        new_intf = intf_client.FindHostInterface(
            tc.subnet.Node, tc.subnet.HostIfIdx[0]).GetInterfaceName()
    api.Logger.debug(f"Subnet moved from HostInterface {old_intf} to {new_intf}")
    misc_utils.Sleep(3)  # needed until delay_delete is enabled
    ret = tc.subnet.VerifyDepsOperSt('Delete')
    if not ret:
        api.Logger.error("Oper state is not as expected after host interface is modified")
        return api.types.status.FAILURE
    __modify_workload_interface(tc)
    return api.types.status.SUCCESS
def CheckRolloutStatus(tc):
    retries = 0
    start_ts = datetime.now()
    result = api.types.status.FAILURE
    status_found = False
    while retries < 100:
        api.Logger.info("------Issuing Rollout get %s retry------" % retries)
        misc_utils.Sleep(2)
        retries += 1
        # get rollout status
        req = api.Trigger_CreateExecuteCommandsRequest(serial=False)
        for n in tc.Nodes:
            cmd = 'curl -k https://' + api.GetNicIntMgmtIP(n) + ':' + \
                  utils.GetNaplesMgmtPort() + '/api/v1/naples/rollout/'
            api.Trigger_AddHostCommand(req, n, cmd)
            api.Logger.info("Sending rollout status get request: %s" % (cmd))
        tc.resp = api.Trigger(req)
        try:
            for cmd in tc.resp.commands:
                api.PrintCommandResults(cmd)
        except Exception as e:
            api.Logger.error(f"Exception occurred in sending rollout status get. {e}")
            continue
        for cmd in tc.resp.commands:
            if cmd.exit_code != 0:
                api.Logger.info("Rollout status get request returned failure")
                continue
            resp = json.loads(cmd.stdout)
            try:
                for item in resp['Status']['status']:
                    status_found = True
                    if not item['Op'] == 4:
                        api.Logger.info("opcode is bad for %s" % cmd.node_name)
                        result = api.types.status.FAILURE
                    if "fail" in tc.iterators.option:
                        if not item['opstatus'] == 'failure':
                            api.Logger.info("opstatus is bad for %s" % cmd.node_name)
                            result = api.types.status.FAILURE
                        if tc.iterators.option not in item['Message']:
                            api.Logger.info("message is bad")
                            result = api.types.status.FAILURE
                    else:
                        if not item['opstatus'] == 'success':
                            api.Logger.info("opstatus(%s) is bad for %s" %
                                            (item['opstatus'], cmd.node_name))
                            result = api.types.status.FAILURE
                        else:
                            api.Logger.info("Rollout status is SUCCESS for %s" % cmd.node_name)
                            result = api.types.status.SUCCESS
            except Exception as e:
                api.Logger.error("Exception occurred in parsing response: %s" % e)
                api.Logger.error("resp: %s" % json.dumps(resp, indent=1))
                result = api.types.status.FAILURE
                continue
        if status_found:
            break
    end_ts = datetime.now()
    # Find time elapsed in retrieving rollout status and adjust the wait time
    # for the traffic test.
    timedelta = end_ts - start_ts
    time_elapsed = timedelta.days * 24 * 3600 + timedelta.seconds
    if time_elapsed < 100:
        time_elapsed = 100
    misc_utils.Sleep(time_elapsed)
    tc.sleep = (tc.sleep - time_elapsed) if (tc.sleep > time_elapsed) else 10
    return result
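# Worked example of the wait-time adjustment above: if polling the rollout
# status took 40s, time_elapsed is bumped to the 100s floor and the test
# sleeps 100s; tc.sleep of, say, 300s then becomes 300 - 100 = 200s of
# remaining traffic wait, while any tc.sleep <= 100 is clamped to 10s.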
def __validate_trigger():
    misc_utils.Sleep(40)  # let metaswitch sync data
    learn_utils.DumpLearnData()
    if move_utils.ValidateEPMove() != api.types.status.SUCCESS:
        return api.types.status.FAILURE
    return api.types.status.SUCCESS
def Verify(tc):
    result = api.types.status.SUCCESS
    if api.IsDryrun():
        # no upgrade done in case of dryrun
        return result

    upg_switchover_time = 70
    # wait for upgrade to complete. status can be found from the presence of
    # /update/pds_upg_status.txt
    api.Logger.info(
        f"Sleep for {upg_switchover_time} secs before checking for Upgrade status")
    misc_utils.Sleep(upg_switchover_time)

    if checkUpgradeStatusViaConsole(tc) != api.types.status.SUCCESS:
        api.Logger.error(
            "Failed in validation of Upgrade Manager completion status via Console")
        result = api.types.status.FAILURE

    if not naples_utils.EnableReachability(tc.nodes):
        api.Logger.error(
            f"Failed to reach naples {tc.nodes} post upgrade switchover")
        result = api.types.status.FAILURE

    # verify mgmt connectivity
    if VerifyMgmtConnectivity(tc) != api.types.status.SUCCESS:
        api.Logger.error("Failed in Mgmt Connectivity Check after Upgrade.")
        result = api.types.status.FAILURE

    if result != api.types.status.SUCCESS:
        api.Logger.info("DUMP Upgrade Manager Logs")
        # Failure could be due to upgrade failure before/after switchover or
        # management connectivity failure. Hence dump the upgrade_mgr.log
        # via console for debug purpose.
        api.Logger.SetSkipLogPrefix(True)
        for node in tc.nodes:
            (resp, exit_code) = api.RunNaplesConsoleCmd(node,
                                                        "cat /obfl/upgrademgr.log", True)
            if exit_code != 0:
                api.Logger.info("Failed to dump /obfl/upgrademgr.log from "
                                "node: %s, exit_code:%s " % (node, exit_code))
            else:
                api.Logger.info("Dump /obfl/upgrademgr.log from "
                                "node: %s, exit_code:%s " % (node, exit_code))
                lines = resp.split('\r\n')
                for line in lines:
                    api.Logger.info(line.strip())
        api.Logger.SetSkipLogPrefix(False)
        return api.types.status.FAILURE

    # push configs after upgrade
    UpdateConfigAfterUpgrade(tc)

    # verify PDS instances
    if check_pds_instance(tc) != api.types.status.SUCCESS:
        api.Logger.error("Failed in check_pds_instances")
        result = api.types.status.FAILURE
    if check_pds_agent_debug_data(tc) != api.types.status.SUCCESS:
        api.Logger.error("Failed in check_pds_agent_debug_data")
        result = api.types.status.FAILURE

    # TODO: verify BGP Underlay (REMOVE WHEN PING API IS UPDATED)
    if bgp_utils.check_underlay_bgp_peer_connectivity(
            sleep_time=15, timeout_val=120) != api.types.status.SUCCESS:
        api.Logger.error("Failed in underlay connectivity check")
        #return api.types.status.FAILURE

    # verify connectivity
    if VerifyConnectivity(tc) != api.types.status.SUCCESS:
        api.Logger.error("Failed in Connectivity Check after Upgrade.")
        if not skip_connectivity_failure:
            result = api.types.status.FAILURE

    if tc.upgrade_mode:
        tc.sleep = 100

    # If rollout status is failure, then no need to wait for traffic test
    if result == api.types.status.SUCCESS:
        api.Logger.info("Sleep for %s secs for traffic test to complete" % tc.sleep)
        misc_utils.Sleep(tc.sleep)

    pkt_loss_duration = 0
    # terminate background traffic and calculate packet loss duration
    if tc.background:
        if ping.TestTerminateBackgroundPing(tc, tc.pktsize,
                pktlossverif=tc.pktlossverif) != api.types.status.SUCCESS:
            api.Logger.error("Failed in Ping background command termination.")
            result = api.types.status.FAILURE
        # calculate max packet loss duration for background ping
        pkt_loss_duration = ping.GetMaxPktLossDuration(tc, interval=tc.interval)
        if pkt_loss_duration != 0:
            indent = "-" * 10
            if tc.pktlossverif:
                result = api.types.status.FAILURE
            api.Logger.error(
                f"{indent} Packet Loss duration during UPGRADE of {tc.nodes} is "
                f"{pkt_loss_duration} secs {indent}")
            if tc.allowed_down_time and (pkt_loss_duration > tc.allowed_down_time):
                api.Logger.error(
                    f"{indent} Exceeded allowed Loss Duration "
                    f"{tc.allowed_down_time} secs {indent}")
                # Failing the test based on longer traffic loss duration is
                # commented for now. Enable the line below when needed.
                #result = api.types.status.FAILURE
        else:
            api.Logger.info("No Packet Loss Found during UPGRADE Test")

    if upgrade_utils.VerifyUpgLog(tc.nodes, tc.GetLogsDir()):
        api.Logger.error("Failed to verify the upgrademgr logs...")

    if result == api.types.status.SUCCESS:
        api.Logger.info(f"Upgrade: Completed Successfully for {tc.nodes}")
    else:
        api.Logger.info(f"Upgrade: Failed for {tc.nodes}")
    return result
def Verify(tc):
    result = api.types.status.SUCCESS
    if api.IsDryrun():
        return result

    # Stop Trex traffic
    if tc.trex:
        traffic_gen.stop_trex_traffic(tc.trex_peers)

    # Check upgrade status
    if tc.failure_stage is not None:
        # TODO: detailed check on stage etc.
        status = UpgStatus.UPG_STATUS_FAILED
    else:
        status = UpgStatus.UPG_STATUS_SUCCESS
    for node in tc.nodes:
        if not upgrade_utils.CheckUpgradeStatus(node, status):
            result = api.types.status.FAILURE

    # validate the configuration
    result = upgrade_utils.HitlessUpgradeValidateConfig(tc)
    if result != api.types.status.SUCCESS:
        api.Logger.info("Ignoring the configuration validation failure")
        result = api.types.status.SUCCESS

    # verify mgmt connectivity
    if traffic.VerifyMgmtConnectivity(tc.nodes) != api.types.status.SUCCESS:
        api.Logger.error("Failed in Mgmt Connectivity Check after Upgrade.")
        result = api.types.status.FAILURE

    if result != api.types.status.SUCCESS:
        api.Logger.info("DUMP Upgrade Manager Logs")
        # Failure could be due to upgrade failure before/after switchover or
        # management connectivity failure. Hence dump the upgrade_mgr.log
        # via console for debug purpose.
        api.Logger.SetSkipLogPrefix(True)
        for node in tc.nodes:
            (resp, exit_code) = api.RunNaplesConsoleCmd(node,
                                                        "cat /obfl/upgrademgr.log", True)
            if exit_code != 0:
                api.Logger.info("Failed to dump /obfl/upgrademgr.log from "
                                "node: %s, exit_code:%s " % (node, exit_code))
            else:
                api.Logger.info("Dump /obfl/upgrademgr.log from "
                                "node: %s, exit_code:%s " % (node, exit_code))
                lines = resp.split('\r\n')
                for line in lines:
                    api.Logger.verbose(line.strip())
        api.Logger.SetSkipLogPrefix(False)
        return api.types.status.FAILURE

    check_pds_agent_debug_data(tc)

    # verify workload connectivity
    if VerifyConnectivity(tc) != api.types.status.SUCCESS:
        api.Logger.error("Failed in Connectivity Check after Upgrade.")
        if not SKIP_CONNECTIVITY_FAILURE:
            result = api.types.status.FAILURE

    tc.sleep = 100
    # If rollout status is failure, then no need to wait for traffic test
    if result == api.types.status.SUCCESS:
        api.Logger.info("Sleep for %s secs for traffic test to complete" % tc.sleep)
        misc_utils.Sleep(tc.sleep)

    # terminate background traffic and calculate packet loss duration
    result = ping_traffic_stop_and_verify(tc)
    if result == api.types.status.SUCCESS and tc.iperf:
        result = iperf_traffic_stop_and_verify(tc)

    if upgrade_utils.VerifyUpgLog(tc.nodes, tc.GetLogsDir()):
        api.Logger.error("Failed to verify the upgrademgr logs...")

    nodes = ",".join(tc.nodes)
    if result == api.types.status.SUCCESS:
        api.Logger.info(f"Upgrade: Completed Successfully for {nodes}")
    else:
        api.Logger.error(f"Upgrade: Failed for {nodes}")
    return result
def Verify(tc):
    if tc.skip:
        return api.types.status.SUCCESS

    if tc.pktloss_verify:
        if tc.background and tc.bg_cmd_resp is None:
            api.Logger.error("Failed in background Ping cmd trigger")
            return api.types.status.FAILURE

    if tc.resp is None:
        api.Logger.error("Received empty response for config request")
        return api.types.status.FAILURE
    else:
        for cmd in tc.resp.commands:
            api.PrintCommandResults(cmd)
            if cmd.exit_code != 0:
                api.Logger.error("Rollout request failed")
                return api.types.status.FAILURE

    # wait for upgrade to complete. status can be found from the presence of
    # /update/pds_upg_status.txt
    api.Logger.info("Sleep for 70 secs before checking for /update/pds_upg_status.txt")
    misc_utils.Sleep(70)

    status_in_progress = True
    while status_in_progress:
        misc_utils.Sleep(1)
        req = api.Trigger_CreateExecuteCommandsRequest(serial=False)
        for node in tc.Nodes:
            api.Trigger_AddNaplesCommand(
                req, node, "grep -v in-progress /update/pds_upg_status.txt", timeout=2)
        api.Logger.info(
            "Checking for status not in-progress in file /update/pds_upg_status.txt")
        resp = api.Trigger(req)

        status_in_progress = False
        for cmd_resp in resp.commands:
            if cmd_resp.exit_code != 0:
                status_in_progress = True
            else:
                api.Logger.info(
                    "Status other than in-progress found in /update/pds_upg_status.txt")

    #
    # push interface config updates after upgrade completes
    #
    UpdateConfigAfterUpgrade(tc)

    for i in range(10):
        api.Logger.info("Sending ARPing, retry count %s" % i)
        # Send Grat Arp for learning
        arping.SendGratArp(tc.wloads)
        misc_utils.Sleep(1)

    result = CheckRolloutStatus(tc)

    # ensure connectivity after upgrade
    if VerifyConnectivity(tc) != api.types.status.SUCCESS:
        api.Logger.error("Failed in Connectivity Check Post Upgrade.")
        result = api.types.status.FAILURE

    if tc.pktloss_verify:
        # If rollout status is failure, then no need to wait for traffic test
        if result == api.types.status.SUCCESS:
            api.Logger.info("Sleep for %s secs for traffic test to complete" % tc.sleep)
            misc_utils.Sleep(tc.sleep)
        pkt_loss_duration = 0
        # terminate background traffic and calculate packet loss duration
        if tc.background:
            if ping.TestTerminateBackgroundPing(tc, tc.pktsize,
                    pktlossverif=tc.pktlossverif) != api.types.status.SUCCESS:
                api.Logger.error("Failed in Ping background command termination.")
                result = api.types.status.FAILURE
            # calculate max packet loss duration for background ping
            pkt_loss_duration = ping.GetMaxPktLossDuration(tc, interval=tc.interval)
            if pkt_loss_duration != 0:
                indent = "-" * 10
                if tc.pktlossverif:
                    result = api.types.status.FAILURE
                api.Logger.error(
                    f"{indent} Packet Loss duration during UPGRADE of {tc.Nodes} is "
                    f"{pkt_loss_duration} secs {indent}")
                if tc.allowed_down_time and (pkt_loss_duration > tc.allowed_down_time):
                    api.Logger.error(
                        f"{indent} Exceeded allowed Loss Duration "
                        f"{tc.allowed_down_time} secs {indent}")
                    result = api.types.status.FAILURE
            else:
                api.Logger.info("No Packet Loss Found during UPGRADE Test")

    if upgrade_utils.VerifyUpgLog(tc.Nodes, tc.GetLogsDir()):
        api.Logger.error("Failed to verify the upgrade logs")
        result = api.types.status.FAILURE

    return result