def get_vals_for_attach(): global nprocs, pgm, pgmArgs, mship, rship, argsFilename, delArgsFile, \ try0Locally, lineLabels, jobAlias, mergingOutput, conSocket global stdinGoesToWho, myExitStatus, manSocket, jobid, username, cwd, totalview global outXmlDoc, outXmlEC, outXmlFile, linesPerRank, gdb, gdbAttachJobid global execs, users, cwds, paths, args, envvars, limits, hosts, hostList global singinitPID, singinitPORT, doingBNR, myHost, myIP sjobid = gdbAttachJobid.split('@') # jobnum and originating host msgToSend = {'cmd': 'mpdlistjobs'} mpd_send_one_msg(conSocket, msgToSend) msg = recv_one_msg_with_timeout(conSocket, 5) if not msg: mpd_raise('no msg recvd from mpd before timeout') if msg['cmd'] != 'local_mpdid': # get full id of local mpd for filters later mpd_raise( 'did not recv local_mpdid msg from local mpd; instead, recvd: %s' % msg) else: if len(sjobid) == 1: sjobid.append(msg['id']) got_info = 0 while 1: msg = mpd_recv_one_msg(conSocket) if not msg.has_key('cmd'): print 'mpdlistjobs: INVALID msg=:%s:' % (msg) exit(-1) if msg['cmd'] == 'mpdlistjobs_info': got_info = 1 smjobid = msg['jobid'].split( ' ') # jobnum, mpdid, and alias (if present) if sjobid[0] == smjobid[0] and sjobid[1] == smjobid[ 1]: # jobnum and mpdid rank = int(msg['rank']) users[(rank, rank)] = msg['username'] hosts[(rank, rank)] = msg['host'] execs[(rank, rank)] = msg['pgm'] cwds[(rank, rank)] = cwd paths[(rank, rank)] = environ['PATH'] args[(rank, rank)] = [msg['clipid']] envvars[(rank, rank)] = {} limits[(rank, rank)] = {} elif msg['cmd'] == 'mpdlistjobs_trailer': if not got_info: print 'no info on this jobid; probably invalid' exit(-1) break else: print 'invaild msg from mpd :%s:' % (msg) exit(-1) nprocs = len(execs.keys()) # all dicts are the same len here
def recv_one_msg_with_timeout(sock, timeout):
    """Receive one mpd message from sock, bounded by an alarm of `timeout` seconds.

    The previously pending alarm value (as returned by alarm()) is re-armed
    afterwards, including when the receive raises, so a caller's own alarm
    is not lost and this function's alarm is not left pending.

    Returns whatever mpd_recv_one_msg returns; per the original note, the
    receive fails WITHOUT a msg if SIGALRM occurs.
    """
    oldTimeout = alarm(timeout)
    try:
        msg = mpd_recv_one_msg(sock)  # fails WITHOUT a msg if sigalrm occurs
    finally:
        alarm(oldTimeout)
    return msg
conSocket = socket(AF_UNIX, SOCK_STREAM) # note: UNIX socket try: conSocket.connect(consoleName) except Exception, errmsg: print 'mpdringtest: cannot connect to local mpd (%s); possible causes:' % consoleName print ' 1. no mpd running on this host' print ' 2. mpd is running but was started without a "console" (-n option)' print 'you can start an mpd with the "mpd" command; to get help, run:' print ' mpd -h' exit(-1) msgToSend = 'realusername=%s\n' % username mpd_send_one_line(conSocket, msgToSend) msgToSend = {'cmd': 'mpdringtest', 'numloops': numLoops} starttime = time() mpd_send_one_msg(conSocket, msgToSend) msg = mpd_recv_one_msg(conSocket) etime = time() - starttime if not msg: print 'mpdringtest terminated early' elif msg['cmd'] != 'mpdringtest_done': if msg['cmd'] == 'already_have_a_console': print 'mpd already has a console (e.g. for long ringtest); try later' else: print 'unexpected message from mpd: %s' % (msg) else: print 'time for %d loops =' % numLoops, etime, 'seconds' def sigint_handler(signum, frame): exit(-1)
def mpdboot(): global myHost, fullDirName, topMPDBoot, user mpd_set_my_id('mpdboot_rank_notset') fullDirName = path.abspath(path.split(argv[0])[0]) rshCmd = 'ssh' user = mpd_get_my_username() mpdCmd = path.join(fullDirName, 'mpd.py') mpdbootCmd = path.join(fullDirName, 'mpdboot.py') hostsFilename = 'mpd.hosts' totalNum = 1 # may get chgd below debug = 0 verbosity = 0 localConsoleArg = '' remoteConsoleArg = '' myConsoleVal = '' oneMPDPerHost = 1 entryHost = '' entryPort = '' topMPDBoot = 1 myHost = gethostname() myNcpus = 1 myIfhn = '' try: shell = path.split(environ['SHELL'])[-1] except: shell = 'csh' argidx = 1 # skip arg 0 while argidx < len(argv): if argv[argidx] == '-h' or argv[argidx] == '--help': usage() elif argv[argidx] == '-zentry': # entry host and port if ':' not in argv[argidx + 1]: print 'invalid pair of entry host and entry port for -zentry option' usage() (entryHost, entryPort) = argv[argidx + 1].split(':') try: ip = gethostbyname_ex(entryHost)[2] # may fail if invalid host except: print 'invalid entry host ', entryHost stdout.flush() usage() if not entryPort.isdigit(): print 'invalid (nonumeric) entry port ', entryPort stdout.flush() usage() entryHost = entryHost entryPort = entryPort argidx += 2 elif argv[argidx] == '-zrank': topMPDBoot = 0 myBootRank = int(argv[argidx + 1]) argidx += 2 elif argv[argidx] == '-zhosts': zhosts = argv[argidx + 1] zhosts = zhosts.split(',') hostsAndInfo = [] for zhost in zhosts: (host, ncpus, ifhn) = zhost.split(':') hostsAndInfo.append({ 'host': host, 'ncpus': ncpus, 'ifhn': ifhn }) argidx += 2 elif argv[argidx] == '-r': # or --rsh= rshCmd = argv[argidx + 1] argidx += 2 elif argv[argidx].startswith('--rsh'): splitArg = argv[argidx].split('=') try: rshCmd = splitArg[1] except: print 'mpdboot: invalid argument:', argv[argidx] usage() argidx += 1 elif argv[argidx] == '-u': # or --user= user = argv[argidx + 1] argidx += 2 elif argv[argidx].startswith('--user'): splitArg = argv[argidx].split('=') try: user = splitArg[1] 
except: print 'mpdboot: invalid argument:', argv[argidx] usage() argidx += 1 elif argv[argidx] == '-m': # or --mpd= mpdCmd = argv[argidx + 1] argidx += 2 elif argv[argidx].startswith('--mpd'): splitArg = argv[argidx].split('=') try: mpdCmd = splitArg[1] except: print 'mpdboot: invalid argument:', argv[argidx] usage() argidx += 1 elif argv[argidx] == '-f': # or --file= hostsFilename = argv[argidx + 1] argidx += 2 elif argv[argidx].startswith('--file'): splitArg = argv[argidx].split('=') try: hostsFilename = splitArg[1] except: print 'mpdboot: invalid argument:', argv[argidx] usage() argidx += 1 elif argv[argidx].startswith('--ncpus'): splitArg = argv[argidx].split('=') try: myNcpus = splitArg[1] except: print 'mpdboot: invalid argument:', argv[argidx] usage() argidx += 1 elif argv[argidx].startswith('--ifhn'): splitArg = argv[argidx].split('=') myIfhn = splitArg[1] myHost = splitArg[1] argidx += 1 elif argv[argidx] == '-n': # or --totalnum= totalNum = int(argv[argidx + 1]) argidx += 2 elif argv[argidx].startswith('--totalnum'): splitArg = argv[argidx].split('=') try: totalNum = int(splitArg[1]) except: print 'mpdboot: invalid argument:', argv[argidx] usage() argidx += 1 elif argv[argidx] == '-d' or argv[argidx] == '--debug': debug = 1 argidx += 1 elif argv[argidx] == '-s' or argv[argidx] == '--shell': shell = 'bourne' argidx += 1 elif argv[argidx] == '-v' or argv[argidx] == '--verbose': verbosity = 1 argidx += 1 elif argv[argidx] == '-1': oneMPDPerHost = 0 argidx += 1 elif argv[argidx] == '--loccons': localConsoleArg = '--loccons' argidx += 1 elif argv[argidx] == '--remcons': remoteConsoleArg = '--remcons' argidx += 1 else: print 'mpdboot: unrecognized argument:', argv[argidx] usage() if topMPDBoot: lines = [] if totalNum > 1: try: f = open(hostsFilename, 'r') for line in f: lines.append(line) except: print 'unable to open (or read) hostsfile %s' % (hostsFilename) exit(-1) hostsAndInfo = [{'host': myHost, 'ncpus': myNcpus, 'ifhn': myIfhn}] for line in lines: line = 
line.strip() if not line or line[0] == '#': continue splitLine = re.split(r'\s+', line) host = splitLine[0] ncpus = 1 # default if ':' in host: (host, ncpus) = host.split(':', 1) ncpus = int(ncpus) ifhn = '' # default for kv in splitLine[1:]: (k, v) = kv.split('=', 1) if k == 'ifhn': ifhn = v hostsAndInfo.append({'host': host, 'ncpus': ncpus, 'ifhn': ifhn}) if oneMPDPerHost and totalNum > 1: oldHosts = hostsAndInfo[:] hostsAndInfo = [] for x in oldHosts: keep = 1 for y in hostsAndInfo: if mpd_same_ips(x['host'], y['host']): keep = 0 break if keep: hostsAndInfo.append(x) if len(hostsAndInfo) < totalNum: # one is local print 'totalNum=%d num hosts=%d' % (totalNum, len(hostsAndInfo)) print 'there are not enough hosts on which to start all processes' exit(-1) myBootRank = 0 if localConsoleArg: myConsoleVal = '-n' else: if remoteConsoleArg: myConsoleVal = '-n' anMPDalreadyHere = 0 for i in range(myBootRank): if mpd_same_ips(hostsAndInfo[i]['host'], myHost): # if one before me on this host myConsoleVal = '-n' anMPDalreadyHere = 1 break if not anMPDalreadyHere: try: system('%s/mpdallexit.py > /dev/null' % (fullDirName)) # stop any current mpds except: pass mpd_set_my_id('mpdboot_%s_%d' % (myHost, myBootRank)) if debug: mpd_print(1, 'starting') (parent, lchild, rchild) = mpd_get_ranks_in_binary_tree(myBootRank, totalNum) if debug: mpd_print(1, 'p=%d l=%d r=%d' % (parent, lchild, rchild)) if myIfhn: ifhnVal = '--if %s' % (myIfhn) elif hostsAndInfo[myBootRank]['ifhn']: ifhnVal = '--if %s' % (hostsAndInfo[myBootRank]['ifhn']) else: ifhnVal = '' if entryHost: cmd = '%s %s -h %s -p %s -d -e --ncpus %s %s' % \ (mpdCmd,myConsoleVal,entryHost,entryPort,myNcpus,ifhnVal) else: cmd = '%s %s -d -e --ncpus %s %s' % \ (mpdCmd,myConsoleVal,myNcpus,ifhnVal) if verbosity: mpd_print(1, 'starting local mpd on %s' % (myHost)) if debug: mpd_print(1, 'cmd to run local mpd = :%s:' % (cmd)) if not access(mpdCmd, X_OK): err_exit('cannot access mpd cmd :%s:' % (mpdCmd)) locMPD = Popen4(cmd, 0) 
locMPDFD = locMPD.fromchild locMPDPort = locMPDFD.readline().strip() if locMPDPort.isdigit(): # can't do this until he's already in his ring locMPDSocket = mpd_get_inet_socket_and_connect(myHost, int(locMPDPort)) if locMPDSocket: msgToSend = { 'cmd': 'ping', 'host': 'ping', 'port': 0 } # dummy host & port mpd_send_one_msg(locMPDSocket, { 'cmd': 'ping', 'host': myHost, 'port': 0 }) msg = mpd_recv_one_msg(locMPDSocket) # RMB: WITH TIMEOUT ?? if not msg or not msg.has_key('cmd') or msg['cmd'] != 'ping_ack': err_exit( '%d: unable to ping local mpd; invalid msg from mpd :%s:' % (myBootRank, msg)) locMPDSocket.close() else: err_exit('failed to connect to mpd') else: err_exit('%d: invalid port from mpd %s' % (myBootRank, str(locMPDPort))) if not entryHost: entryHost = myHost entryPort = locMPDPort if rshCmd == 'ssh': xOpt = '-x' else: xOpt = '' lfd = 0 rfd = 0 fdsToSelect = [] if debug: debugArg = '-d' else: debugArg = '' if verbosity: verboseArg = '-v' else: verboseArg = '' if lchild >= 0: zhosts = [ "%s:%s:%s" % (h['host'], h['ncpus'], h['ifhn']) for h in hostsAndInfo ] if hostsAndInfo[lchild]['ifhn']: ifhnVal = '--ifhn=%s' % (hostsAndInfo[lchild]['ifhn']) else: ifhnVal = '' cmd = "%s %s %s -n '%s --ncpus=%s %s -r %s -m %s -n %d %s %s %s -zentry %s:%s -zrank %s -zhosts %s </dev/null ' " % \ (rshCmd, xOpt, hostsAndInfo[lchild]['host'], mpdbootCmd, hostsAndInfo[lchild]['ncpus'],ifhnVal, rshCmd, mpdCmd, totalNum, debugArg, verboseArg, remoteConsoleArg, entryHost, entryPort, lchild, ','.join(zhosts) ) if verbosity: mpd_print(1, 'starting remote mpd on %s' % (hostsAndInfo[lchild])) if debug: mpd_print(1, 'cmd to run lchild boot = :%s:' % (cmd)) lchildMPDBoot = Popen4(cmd, 0) lfd = lchildMPDBoot.fromchild fdsToSelect.append(lfd) if rchild >= 0: zhosts = [ "%s:%s:%s" % (h['host'], h['ncpus'], h['ifhn']) for h in hostsAndInfo ] if hostsAndInfo[rchild]['ifhn']: ifhnVal = '--ifhn=%s' % (hostsAndInfo[rchild]['ifhn']) else: ifhnVal = '' cmd = "%s %s %s -n '%s --ncpus=%s %s -r %s -m 
%s -n %d %s %s %s -zentry %s:%s -zrank %s -zhosts %s </dev/null ' " % \ (rshCmd, xOpt, hostsAndInfo[rchild]['host'], mpdbootCmd, hostsAndInfo[rchild]['ncpus'],ifhnVal, rshCmd, mpdCmd, totalNum, debugArg, verboseArg, remoteConsoleArg, entryHost, entryPort, rchild, ','.join(zhosts) ) if verbosity: mpd_print(1, 'starting remote mpd on %s' % (hostsAndInfo[rchild])) if debug: mpd_print(1, 'cmd to run rchild boot = :%s:' % (cmd)) rchildMPDBoot = Popen4(cmd, 0) rfd = rchildMPDBoot.fromchild fdsToSelect.append(rfd) lfd_first_line = 1 rfd_first_line = 1 while fdsToSelect: try: (readyFDs, unused1, unused2) = select(fdsToSelect, [], [], 0.1) except error, errmsg: mpd_raise('mpdboot: select failed: errmsg=:%s:' % (errmsg)) if lfd and lfd in readyFDs: line = lfd.readline() if line: if line.find('RC=MPDBOOT_ERREXIT') >= 0: err_exit('RC=MPDBOOT_ERREXIT') else: if not verbosity and lfd_first_line: lfd_first_line = 0 mpd_print( 1, "error trying to start mpd(boot) at %d %s; output:" % (lchild, hostsAndInfo[lchild])) print ' ', line, stdout.flush() else: lfd.close() fdsToSelect.remove(lfd) if rfd and rfd in readyFDs: line = rfd.readline() if line: if line.find('RC=MPDBOOT_ERREXIT') >= 0: err_exit('RC=MPDBOOT_ERREXIT') else: if not verbosity and rfd_first_line: rfd_first_line = 0 mpd_print( 1, "error trying to start mpd(boot) at %d %s; output:" % (rchild, hostsAndInfo[rchild])) print ' ', line, stdout.flush() else: rfd.close() fdsToSelect.remove(rfd)
print ' %s' % (host) elif msg['reason'] == 'invalid_username': print 'mpdrun: invalid username %s at host %s' % \ (msg['username'],msg['host']) else: print 'mpdrun: job failed; reason=:%s:' % (msg['reason']) myExitStatus = -1 # used in main exit(myExitStatus) # really forces jump back into main else: mpd_raise('unexpected message from mpd: %s' % (msg)) conSocket.close() if jobTimeout: alarm(jobTimeout) (manSocket, addr) = listenSocket.accept() msg = mpd_recv_one_msg(manSocket) if (not msg or not msg.has_key('cmd') or msg['cmd'] != 'man_checking_in'): mpd_raise('mpdrun: from man, invalid msg=:%s:' % (msg)) msgToSend = { 'cmd': 'ring_ncpus', 'ring_ncpus': currRingNCPUs, 'ringsize': currRingSize } mpd_send_one_msg(manSocket, msgToSend) msg = mpd_recv_one_msg(manSocket) if (not msg or not msg.has_key('cmd')): mpd_raise('mpdrun: from man, invalid msg=:%s:' % (msg)) if (msg['cmd'] == 'job_started'): jobid = msg['jobid'] if outXmlEC: outXmlEC.setAttribute('jobid', jobid.strip())