def start_ib_acme(nodes):
    """Launch NUM_OF_QUERIES background ib_stress.sh instances on *nodes*.

    The log directory is created on every node first; each stress run is
    detached with nohup and its output discarded. Always returns 0.
    """
    ssa_tools_utils.pdsh_run(nodes, 'mkdir -p %s' % LOG_DIR)
    # The launch command is identical on every iteration, so build it once.
    stress_cmd = 'nohup %s/ib_stress.sh %d > /dev/null &' % (SSA_SCRIPTS,
                                                            IB_ACME_DELAY)
    for _ in xrange(NUM_OF_QUERIES):
        ssa_tools_utils.pdsh_run(nodes, stress_cmd)
    return 0
def test_0_0_0(ibmsnet): test_header = inspect.getframeinfo(inspect.currentframe()).function phase = 'Subnet UP ' rch_global_dict['test_description'][test_header] = phase status = 0 log_dir = '%s/%s' % (rch_global_dict['log_dir'], test_header) os.mkdir(log_dir) print commands.getoutput('chmod 777 %s' % log_dir) for i in xrange(0, rch_global_dict['subnet_up_num']): print '[%s] Subnet UP #%d out of %d' % (time.strftime( "%b %d %H:%M:%S"), i, rch_global_dict['subnet_up_num']) core_node = random.choice(rch_global_dict['core_nodes']) ssa_tools_utils.pdsh_run(core_node, 'sudo kill -s HUP `pidof opensm valgrind`') time.sleep( random.randint(rch_global_dict['min_delay'], rch_global_dict['max_delay'])) for cmd in [ '%s/maintain.py -t %s --setup status > %s/ssa_status.log' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'], rch_global_dict['log_dir']), '%s/maintain.py -t %s -e > %s/ssa_errors.log' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'], rch_global_dict['log_dir']), '%s/logcollector.py -t %s -o %s' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'], rch_global_dict['log_dir']) ]: print cmd o = commands.getoutput(cmd) o = commands.getoutput("cat %s/ssa_errors.log" % rch_global_dict['log_dir']) o_status = commands.getoutput("cat %s/ssa_status.log" % rch_global_dict['log_dir']) if o.find('Found errors found on') > 0 or o_status.find('STOP') >= 0: print 'There are errors in %s/ssa_errors.log' % rch_global_dict[ 'log_dir'] status = 1 test_report(test_header, phase, status) return status
def start_stress(ibmsnet): test_header = inspect.getframeinfo(inspect.currentframe()).function phase = 'Stress Test' rch_global_dict['test_description'][test_header] = phase status = 0 counter_delay = 30 number_of_queries = 10 acm_nodes = rch_global_dict['acm_nodes'] #acm_nodes = random.sample(rch_global_dict['acm_nodes'],len(rch_global_dict['acm_nodes'])/2) print 'Start counters on %s' % ','.join(acm_nodes) counters_log = '%s/`hostname`_counters.log' % (rch_global_dict['scretch_folder']) ssa_tools_utils.pdsh_run(acm_nodes, 'mkdir -p %s' % rch_global_dict['scretch_folder']) ssa_tools_utils.pdsh_run(acm_nodes, 'nohup %s/server_counters.sh `/usr/bin/pgrep ibacm` %d > %s &' % ( ssa_tools_utils.SSA_SCRIPTS, counter_delay, counters_log)) print 'Start ACM stress on %s' % ','.join(acm_nodes) for i in xrange(0, number_of_queries): ib_acme_log = '%s/`hostname`_ibstress_%d.log' % (rch_global_dict['scretch_folder'], i) cmd = 'nohup %s/ib_stress.sh %d > %s &' % ( ssa_tools_utils.SSA_SCRIPTS, i, ib_acme_log) ssa_tools_utils.pdsh_run(acm_nodes,cmd) test_report(test_header, phase, status) return status
def stop_stress(ibmsnet): test_header = inspect.getframeinfo(inspect.currentframe()).function phase = 'Stress Test' rch_global_dict['test_description'][test_header] = phase status = 0 acm_nodes = rch_global_dict['acm_nodes'] o = ssa_tools_utils.pdsh_run(acm_nodes, 'killall ib_stress.sh server_counters.sh 2>/dev/null') time.sleep(5) o = ssa_tools_utils.pdsh_run(acm_nodes, 'sudo mv %s/* %s/ 2>/dev/null' % (rch_global_dict['scretch_folder'], rch_global_dict['test_folder'])) time.sleep(10) o = commands.getoutput('%s/maintain.py -t %s -e' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'])) o = commands.getoutput('%s/logcollector.py -t %s -o %s' % (ssa_tools_utils.SSA_HOME, rch_global_dict['topology'], rch_global_dict['test_folder'])) for cmd in ['grep -rni "failed|ERR" %s' % rch_global_dict['scretch_folder'],]: o = commands.getoutput(cmd) print "%s\n%s" % (cmd, o) if o != '': status = 1 test_report(test_header, phase, status) return status
def sanity_test_1(cores, als, acms, data):
    """Sanity test 1: verify ACM cache behaviour across a fabric disconnect.

    First checks that every ACM node can resolve the reference ACM node
    (acms[0]) by LID and by GID. Then it stops the services, physically
    disables the reference node's switch port, and verifies the other nodes
    can no longer resolve it from cache. Finally re-enables the port and
    restarts the services.

    Args:
        cores: candidate core nodes; the one whose LID matches the active SM
               becomes the node used to run fabric commands.
        als:   access-layer nodes; als[0] is the access service node.
        acms:  ACM nodes; acms[0] is the node that gets disconnected.
        data:  per-node info dict, indexed by node then LID/GID.

    Returns 0 on success, 1 on failure.
    """
    status = 0
    # LID of the currently active subnet manager, taken from local ibstat.
    osmlid = commands.getoutput(
        "/usr/sbin/ibstat |grep -a5 Act|grep SM|awk '{print $NF}'").rstrip(
        '\n')
    # NOTE(review): if no core's LID matches osmlid, core_master stays
    # unbound and the stop_services() call below raises NameError — confirm
    # the caller guarantees a match.
    for core in cores:
        if data[core][LID] == osmlid:
            core_master = core
            break
    access_svc = als[0]
    acm_svc = acms[0]
    print '==================================================================='
    print '========================= SANITY TEST 1 ==========================='
    print '==================================================================='
    # Baseline: every other ACM must resolve acm_svc by LID and GID.
    for acm in acms:
        if acm == acm_svc:
            continue
        status = test_acm_by_lid_query(acm, data[acm][LID],
                                       data[acm_svc][LID])
        if status != 0:
            return status
        status = test_acm_by_gid_query(acm, data[acm][GID],
                                       data[acm_svc][GID])
        if status != 0:
            return status
    stop_services(core_master, access_svc, acm_svc)
    # Disconect ACM from fabric: disable the switch port acm_svc hangs off.
    (remote_lid, remote_port) = get_node_remote(acm_svc)
    cmd = 'ibportstate %s %s disable' % (remote_lid, remote_port)
    print cmd
    ssa_tools_utils.pdsh_run(core_master, cmd)
    print 'Wait'
    time.sleep(120)  # let the disconnect propagate through the fabric
    # Now the queries are EXPECTED to fail (status != 0); a success means the
    # stale entry is still cached, so re-enable the port and report failure.
    for acm in acms:
        if acm == acm_svc:
            continue
        status = test_acm_by_lid_query(acm, data[acm][LID],
                                       data[acm_svc][LID], initial_query=1,
                                       print_err=0)
        if status == 0:
            print 'ERROR. ACM %s LID %s still exists in %s LID %s cache' % \
                (acm_svc, str(data[acm_svc][LID]), acm, str(data[acm][LID]))
            cmd = 'ibportstate %s %s enable' % (remote_lid, remote_port)
            print cmd
            ssa_tools_utils.pdsh_run(core_master, cmd)
            return 1
        status = test_acm_by_gid_query(acm, data[acm][GID],
                                       data[acm_svc][GID], print_err=0)
        if status == 0:
            print 'ERROR. ACM %s GID %s still exists in %s GID %s cache' % \
                (acm_svc, str(data[acm_svc][GID]), acm, str(data[acm][GID]))
            cmd = 'ibportstate %s %s enable' % (remote_lid, remote_port)
            print cmd
            ssa_tools_utils.pdsh_run(core_master, cmd)
            return 1
    # Reconnect ACM back to fabric
    cmd = 'ibportstate %s %s enable' % (remote_lid, remote_port)
    print cmd
    # command can be run on any node except for the disconected ACM
    ssa_tools_utils.pdsh_run(core_master, cmd)
    time.sleep(60)
    start_services(core_master, access_svc, acm_svc)
    # At this point status is nonzero (queries failed as expected);
    # `not status` converts that into the 0 = success convention.
    status = not status
    print '==================================================================='
    print '==================== SANITY TEST 1 COMPLETE ======================='
    print '==================================================================='
    return status
def stop_ib_acme(nodes):
    """Terminate any running ib_stress.sh processes on *nodes*.

    Errors from killall (e.g. no such process) are discarded.
    Always returns 0.
    """
    kill_cmd = 'killall ib_stress.sh 2>/dev/null'
    ssa_tools_utils.pdsh_run(nodes, kill_cmd)
    return 0
def stop_counters(nodes):
    """Terminate any running server_counters.sh processes on *nodes*.

    Errors from killall (e.g. no such process) are discarded.
    Always returns 0.
    """
    kill_cmd = 'killall server_counters.sh 2>/dev/null'
    ssa_tools_utils.pdsh_run(nodes, kill_cmd)
    return 0
def start_counters(nodes):
    """Start background server_counters.sh sampling on every node in *nodes*.

    The sampled daemon PIDs (ibacm/ibssa/opensm) and `hostname` are expanded
    remotely by the shell, so each node logs its own counters into LOG_DIR.
    Always returns 0.
    """
    log_path = '%s/`hostname`_counters.log' % LOG_DIR
    ssa_tools_utils.pdsh_run(nodes, 'mkdir -p %s' % LOG_DIR)
    launch_cmd = ('nohup %s/server_counters.sh '
                  '`pgrep "ibacm|ibssa|opensm"` %d > %s &' % (
                      SSA_SCRIPTS, COUNTER_DELAY, log_path))
    ssa_tools_utils.pdsh_run(nodes, launch_cmd)
    return 0
rch_global_dict['exclude_tests'].append(options.exclude.split(',')) else: tests = options.include.split(',') if options.reinstall: sources = " " if options.ssa_sources: sources = "%s %s" % (options.ssa_sources, rch_global_dict['scretch_folder']) coverage = " " if options.coverage: coverage = "export COVERAGE=1" else: coverage = "export COVERAGE=0" print "%s; sudo -E %s/ssa_install.sh %s" % (coverage, ssa_tools_utils.SSA_HOME, sources) ssa_tools_utils.pdsh_run(rch_global_dict['nodes'], '%s; sudo -E %s/ssa_install.sh %s' % (coverage, ssa_tools_utils.SSA_HOME, sources)) setup(rch_global_dict['topology'], 'start') gDictParams = {} #Check and save setup ssa_setup = open(setup(rch_global_dict['topology'], 'status'), 'r').readlines() #get everything from check_setup rch_global_dict['ssa_setup'] = ''.join(ssa_setup[ssa_setup.index('************* check_setup ********************\n'):]) if not options.restart: for i in ssa_setup: if i.find('STOPPED') >= 0: print i.rstrip('\n') status = 1 if status: print 'Not all nodes are running'