def work_get(getter, uid):

    data[uid] = list()

    done = False
    while not done:
        msgs = getter.get()
        for msg in msgs:
            msg = ru.as_string(msg)
            if msg['idx'] is None:
                done = True
                break
            data[uid].append(msg['src'])
def work_get(getter, uid):

    data[uid] = list()

    done = False
    n    = 0                       # count received messages (for debugging)
    while not done:
        msgs = getter.get()
        for msg in msgs:
            msg = ru.as_string(msg)
            if msg['idx'] is None:
                # sentinel received -- unlike the variant above, still drain
                # the remainder of the current batch before stopping
                done = True
            else:
                data[uid].append(msg['src'])
                n += 1

    getter.stop()
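# ------------------------------------------------------------------------------
# A minimal sketch (not part of the original code) of the message protocol the
# two `work_get` variants above rely on: a `getter` yields batches of dicts
# carrying 'idx' and 'src' keys, and a message with 'idx' set to None acts as
# the end-of-stream sentinel.  `_ListGetter` and the module-level `data` dict
# are hypothetical stand-ins for illustration; in the original code both `ru`
# and `data` are assumed to exist at module level.

import radical.utils as ru

data = dict()


class _ListGetter(object):
    '''replay a fixed message list, one message per get(), then the sentinel'''

    def __init__(self, msgs):
        self._msgs = list(msgs) + [{'idx': None, 'src': None}]

    def get(self):
        return [self._msgs.pop(0)]

    def stop(self):
        pass


# work_get(_ListGetter([{'idx': 0, 'src': 'm0'},
#                       {'idx': 1, 'src': 'm1'}]), uid='test.0000')
# afterwards: data['test.0000'] == ['m0', 'm1']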
def rm_config_hook(cls, name, cfg, rm, logger, profiler):

    profiler.prof('flux_start')

    flux_exe = ru.which('flux')
    if not flux_exe:
        raise Exception("Couldn't find flux")

    try:
        import sys
        logger.debug('sys.path: %s', sys.path)
        import flux
    except ImportError as e:
        raise Exception("Couldn't import flux: %s" % e)

  # cmd  = 'flux start -o,-v,-S,log-filename=out'.split()
  # proc = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.STDOUT)
  # proc.stdin.write(ru.as_bytes('flux getattr local-uri\necho "OK"\n'))

    check = 'flux env; echo "OK"; while true; do echo "ok"; sleep 1; done'
    start = 'flux start -o,-v,-S,log-filename=out'
    cmd   = '/bin/bash -c "echo \\\"%s\\\" | %s"' % (check, start)

    proc  = sp.Popen(cmd, shell=True,
                     stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.STDOUT)

    # scan the broker's stdout for the exported flux environment; the 'OK'
    # marker signals that `flux env` completed
    flux_env = dict()
    while True:

        line = ru.as_string(proc.stdout.readline().strip())
        logger.debug('=== %s', line)

        if line.startswith('export '):
            k, v = line.split(' ', 1)[1].strip().split('=', 1)
            flux_env[k] = v.strip('"')
            logger.debug('%s = %s', k, v.strip('"'))

        elif line == 'OK':
            break

    assert 'FLUX_URI' in flux_env

    # TODO: check perf implications
    # rewrite the local flux URI into an ssh-reachable one
    flux_url             = ru.Url(flux_env['FLUX_URI'])
    flux_url.host        = ru.get_hostname()
    flux_url.scheme      = 'ssh'
    flux_env['FLUX_URI'] = str(flux_url)

    profiler.prof('flux_started')

    # ----------------------------------------------------------------------
    def _watch_flux(flux_env):

        logger.info('=== starting flux watcher')

        for k, v in flux_env.items():
            os.environ[k] = v

        ret = None
        while not ret:

            out, err, ret = ru.sh_callout('flux ping -c 1 all')
            logger.debug('=== flux watcher out: %s', out)

            if ret:
                logger.error('=== flux watcher err: %s', err)
                break

            time.sleep(0.1)

        logger.info('flux stopped?')
        # FIXME: trigger termination
    # ----------------------------------------------------------------------

    flux_watcher = mt.Thread(target=_watch_flux, args=[flux_env])
    flux_watcher.daemon = True
    flux_watcher.start()

    logger.info("flux startup successful: [%s]", flux_env['FLUX_URI'])

    lm_info = {'flux_env': flux_env,
               'flux_pid': proc.pid}

    return lm_info
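# ------------------------------------------------------------------------------
# Standalone sketch (an assumption, mirroring the hook above) of the URI
# rewrite: the broker's node-local URI is turned into an ssh URI so that
# processes on other nodes can reach it.  `_make_remote_uri` and the sample
# URI are made up for illustration.

import radical.utils as ru

def _make_remote_uri(local_uri):
    '''rewrite a node-local flux URI into an ssh-reachable one'''
    url        = ru.Url(local_uri)
    url.host   = ru.get_hostname()
    url.scheme = 'ssh'
    return str(url)

# _make_remote_uri('local:///tmp/flux-abc123/local')
# -> 'ssh://<this-host>/tmp/flux-abc123/local'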
def _parse_pbspro_vnodes(self):

    # PBS Job ID
    val = os.environ.get('PBS_JOBID')
    if val:
        pbspro_jobid = val
    else:
        msg = "$PBS_JOBID not set!"
        self._log.error(msg)
        raise RuntimeError(msg)

    # Get the output of qstat -f for this job
    output = subprocess.check_output(["qstat", "-f", pbspro_jobid])

    # Get the (multiline) 'exec_vnode' entry
    vnodes_str = ''
    for line in output.splitlines():
        line = ru.as_string(line)
        # Detect start of entry
        if 'exec_vnode = ' in line:
            vnodes_str += line.strip()
        elif vnodes_str:
            # Find continuation lines: they carry no ' = ' of their own
            if " = " not in line:
                vnodes_str += line.strip()
            else:
                break

    # Get the RHS of the entry
    rhs = vnodes_str.split('=', 1)[1].strip()
    self._log.debug("input: %s", rhs)

    nodes_list = []
    # Break up the individual node partitions into vnode slices;
    # rhs looks like '(v1:ncpus=8)+(v2:ncpus=8+v3:ncpus=4)'
    while True:
        idx = rhs.find(')+(')
        # note: on the last partition idx == -1, so rhs[1:idx] strips both
        # the leading '(' and the trailing ')'
        node_str = rhs[1:idx]
        nodes_list.append(node_str)
        rhs = rhs[idx + 2:]
        if idx < 0:
            break

    vnodes_list = []
    cpus_list   = []
    # Split out the slices into vnode name and cpu count
    for node_str in nodes_list:
        slices = node_str.split('+')
        for _slice in slices:
            vnode, cpus = _slice.split(':')
            cpus = int(cpus.split('=')[1])
            self._log.debug("vnode: %s cpus: %s", vnode, cpus)
            vnodes_list.append(vnode)
            cpus_list.append(cpus)

    self._log.debug("vnodes: %s", vnodes_list)
    self._log.debug("cpus:   %s", cpus_list)

    cpus_list = list(set(cpus_list))
    min_cpus  = int(min(cpus_list))

    if len(cpus_list) > 1:
        self._log.debug("Detected vnodes of different sizes: %s, "
                        "the minimal is: %d.", cpus_list, min_cpus)

    node_list = []
    for vnode in vnodes_list:
        node_list.append(vnode)

    # only unique node names
    node_list = list(set(node_list))
    self._log.debug("Node list: %s", node_list)

    # Return the list of node names
    return node_list
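# ------------------------------------------------------------------------------
# Standalone sketch of the `exec_vnode` parsing above, runnable without PBS.
# `_parse_exec_vnode` and the sample string are hypothetical, but the string
# follows the PBS Pro format handled by `_parse_pbspro_vnodes`: parenthesized
# node partitions joined by '+', each holding one or more 'vnode:ncpus=N'
# slices.

def _parse_exec_vnode(rhs):

    vnode_cpus = []
    # '(a:ncpus=4)+(b:ncpus=4+c:ncpus=2)' -> ['a:ncpus=4', 'b:ncpus=4+c:ncpus=2']
    for node_str in rhs.strip('()').split(')+('):
        for _slice in node_str.split('+'):
            vnode, cpus = _slice.split(':')
            vnode_cpus.append((vnode, int(cpus.split('=')[1])))
    return vnode_cpus

# _parse_exec_vnode('(a:ncpus=4)+(b:ncpus=4+c:ncpus=2)')
# -> [('a', 4), ('b', 4), ('c', 2)]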
def _handle_unit_stdio(self, unit):

    sandbox = unit['unit_sandbox_path']
    uid     = unit['uid']

    self._prof.prof('staging_stdout_start', uid=uid)

    # TODO: disable this at scale?
    if unit.get('stdout_file') and os.path.isfile(unit['stdout_file']):
        with open(unit['stdout_file'], 'r') as stdout_f:
            try:
                txt = ru.as_string(stdout_f.read())
            except UnicodeDecodeError:
                txt = "unit stdout is binary -- use file staging"

            unit['stdout'] += rpu.tail(txt)

    self._prof.prof('staging_stdout_stop', uid=uid)
    self._prof.prof('staging_stderr_start', uid=uid)

    # TODO: disable this at scale?
    if unit.get('stderr_file') and os.path.isfile(unit['stderr_file']):
        with open(unit['stderr_file'], 'r') as stderr_f:
            try:
                txt = ru.as_string(stderr_f.read())
            except UnicodeDecodeError:
                txt = "unit stderr is binary -- use file staging"

            unit['stderr'] += rpu.tail(txt)

        # to help with ID mapping, also parse for PRTE output:
        # [batch3:122527] JOB [3673,4] EXECUTING
        with open(unit['stderr_file'], 'r') as stderr_f:
            for line in stderr_f.readlines():
                line = line.strip()
                if not line:
                    continue
                if line[0] == '[' and line.endswith('EXECUTING'):
                    elems = line.replace('[', '').replace(']', '').split()
                    tid   = elems[2]
                    self._log.info('PRTE IDMAP: %s:%s' % (tid, uid))

    self._prof.prof('staging_stderr_stop', uid=uid)
    self._prof.prof('staging_uprof_start', uid=uid)

    # merge any profile events the unit recorded into the agent profile
    unit_prof = "%s/%s.prof" % (sandbox, uid)
    if os.path.isfile(unit_prof):
        try:
            with open(unit_prof, 'r') as prof_f:
                txt = ru.as_string(prof_f.read())
                for line in txt.split("\n"):
                    if line:
                        ts, event, comp, tid, _uid, state, msg = \
                                line.split(',')
                        self._prof.prof(ts=float(ts), event=event, comp=comp,
                                        tid=tid, uid=_uid, state=state,
                                        msg=msg)
        except Exception as e:
            self._log.error("Pre/Post profile read failed: `%s`" % e)

    self._prof.prof('staging_uprof_stop', uid=uid)
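# ------------------------------------------------------------------------------
# Sketch (an assumption) of the profile record format consumed above: one CSV
# line per event, 'ts,event,comp,tid,uid,state,msg', matching the seven fields
# unpacked in `_handle_unit_stdio`.  `_parse_prof_line` and the sample line
# are made up for illustration.

def _parse_prof_line(line):
    ts, event, comp, tid, uid, state, msg = line.split(',')
    return {'ts'  : float(ts), 'event': event, 'comp' : comp,
            'tid' : tid,       'uid'  : uid,   'state': state,
            'msg' : msg}

# _parse_prof_line('1568983740.60,exec_start,executor.0,140225,'
#                  'unit.0001,AGENT_EXECUTING,')
# -> {'ts': 1568983740.6, 'event': 'exec_start', ...}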