def _configure(self):
    '''
    Locate an `mpirun` executable and derive a version tag from its output.

    Sets `self.launch_command` to the result of `self._find_executable()`
    for the known `mpirun` names, and `self.launch_version` to the
    lower-cased version string found in the command output (prefixed with
    `hydra-` if a Hydra build line precedes it), or to `unknown` if no
    version information was found.
    '''

    candidates = [
        'mpirun',             # General case
        'mpirun_rsh',         # Gordon @ SDSC
        'mpirun-mpich-mp',    # Mac OSX MacPorts
        'mpirun-openmpi-mp',  # Mac OSX MacPorts
    ]
    self.launch_command = self._find_executable(candidates)

    # alas, the way to transplant env variables to the target node differs
    # per mpi(run) version, so we need to find out what we are dealing with
    out, _, ret = ru.sh_callout('%s -v' % self.launch_command)
    if ret != 0:
        # `-v` failed - retry with `-info`
        out, _, ret = ru.sh_callout('%s -info' % self.launch_command)

    tag = ''
    for line in out.splitlines():

        if 'HYDRA build details:' in line:
            tag += 'hydra-'

        # the first line carrying a version ends the search
        if 'version:' in line.lower():
            tag += line.split(':')[1].strip().lower()
            break

    self.launch_version = tag or 'unknown'
def test_upload_and_download(config):
    '''
    Round-trip a generated file through the replica backend: create it
    locally, upload it as a logical file, delete the local copy, download
    it again and clean up.

    `config` is called without arguments and must return an object with
    a `replica_url` attribute.
    '''

    cfg         = config()
    replica_url = cfg.replica_url

    # use one single local path for creation, upload, download and cleanup,
    # so that the test does not depend on the current working directory
    tmp_dir    = '/tmp/'
    local_path = tmp_dir + TEMP_FILENAME

    # the file is opened in binary mode, so the payload has to be bytes
    with open(local_path, "wb") as fout:
        fout.write(b"x" * (FILE_SIZE * pow(2, 20)))

    # clear old file - the result is ignored on purpose (presumably `irm`
    # fails when no old file exists)
    ru.sh_callout(["irm", TEMP_FILENAME])

    _ = rs.replica.LogicalDirectory(replica_url)

    lfile = rs.replica.LogicalFile(replica_url + TEMP_FILENAME)
    lfile.upload(local_path,
                 "irods:///path/is/ignored/?resource=" + IRODS_RESOURCE)

    # remove the local copy, so that the download below has to recreate it
    os.remove(local_path)

    myfile = rs.replica.LogicalFile(replica_url + TEMP_FILENAME)
    myfile.download(local_path)

    os.remove(local_path)
def test_sh_callout():
    '''
    Check stdout, stderr and return code of `ru.sh_callout` for a succeeding
    command, a failing command, and a shell snippet run with `shell=True`.
    '''

    # (command, extra keyword args, expected out, expected err, expected ret)
    cases = [
        ('echo TRUE',               {},              'TRUE\n', '',        0),
        ('false',                   {},              '',       '',        1),
        ('echo FALSE 1>&2; exit 2', {'shell': True}, '',       'FALSE\n', 2),
    ]

    for cmd, kwargs, exp_out, exp_err, exp_ret in cases:
        out, err, ret = ru.sh_callout(cmd, **kwargs)
        assert out == exp_out, out
        assert err == exp_err, err
        assert ret == exp_ret, ret
def test_upload_and_download(config):
    '''
    Round-trip a generated file through the replica backend: create it
    locally, upload it as a logical file, delete the local copy, download
    it again and clean up.

    `config` is called without arguments and must return an object with
    a `replica_url` attribute.  `home_dir`, `TEMP_FILENAME`, `FILE_SIZE`
    and `IRODS_RESOURCE` are expected to be defined at module level.
    '''

    cfg         = config()
    replica_url = cfg.replica_url
    local_path  = home_dir + TEMP_FILENAME

    # NOTE(review): the file is created as `TEMP_FILENAME` (relative to the
    # current directory) but uploaded from `home_dir + TEMP_FILENAME` -
    # this presumably relies on `home_dir` being the current directory
    with open(TEMP_FILENAME, "wb") as fout:
        # binary mode: write bytes (works on Python 2 and Python 3)
        fout.write(b"x" * (FILE_SIZE * pow(2, 20)))

    # clear old file - the result of the callout is ignored on purpose
    ru.sh_callout(["irm", TEMP_FILENAME])

    rs.replica.LogicalDirectory(replica_url)

    # one logical file handle is used for both upload and download
    myfile = rs.replica.LogicalFile(replica_url + TEMP_FILENAME)
    myfile.upload(local_path,
                  "irods:///this/path/is/ignored/?resource=" + IRODS_RESOURCE)

    print("Deleting file locally : %s" % local_path)
    os.remove(local_path)

    print("Downloading logical file %s to current/default directory"
          % (replica_url + TEMP_FILENAME))
    myfile.download(TEMP_FILENAME)

    print("Deleting downloaded file locally : %s" % local_path)
    os.remove(local_path)
def _get_mpi_info(self, exe):
    '''
    returns version and flavor of MPI version.

    The executable `exe` is called with `-v`, `--version` and `-info` (in
    that order) until one call returns with exit code zero.  The output of
    that call is scanned line by line, and the first line matching one of
    the known markers determines the returned `(version, flavor)` tuple.
    Without any match, `(None, self.MPI_FLAVOR_UNKNOWN)` is returned.

    Raises `ValueError` if `exe` is empty, and `RuntimeError` if the
    resulting flavor is falsy.
    '''

    if not exe:
        raise ValueError('no executable found')

    version = None
    flavor  = self.MPI_FLAVOR_UNKNOWN

    # probe the known version flags, stop at the first one which succeeds
    for flag in ['-v', '--version', '-info']:
        out, _, ret = ru.sh_callout('%s %s' % (exe, flag))
        if not ret:
            break

    if not ret:

        for line in out.splitlines():

            lowered = line.lower()

            if 'hydra build details:' in lowered:
                version = line.split(':', 1)[1].strip()
                flavor  = self.MPI_FLAVOR_HYDRA

            elif 'mvapich2' in lowered:
                version = line
                flavor  = self.MPI_FLAVOR_HYDRA

            elif 'version:' in lowered:
                version = line.split(':', 1)[1].strip()
                flavor  = self.MPI_FLAVOR_OMPI

            elif '(open mpi)' in lowered:
                version = line.split(')', 1)[1].strip()
                flavor  = self.MPI_FLAVOR_OMPI

            else:
                # no marker on this line - inspect the next one
                continue

            # first matching line wins
            break

    # NOTE(review): `flavor` starts out as `MPI_FLAVOR_UNKNOWN`, so this
    # only triggers if that constant is falsy - confirm that is intended
    if not flavor:
        raise RuntimeError('cannot identify MPI flavor [%s]' % exe)

    self._log.debug('mpi version: %s [%s]', version, flavor)

    return version, flavor
def start_components(self, cfg=None):
    '''
    check if any components are defined under `cfg['components']` and start
    them

    For every component entry, `count` instances (default: 1) are started:
    the component config is completed, written to `<cfg.path>/<uid>.json`,
    and `radical-pilot-component` is called on that file.  After all
    instances are started, we wait for their heartbeats to appear.

    cfg: config to use - defaults to `self._cfg`

    Raises `RuntimeError` if a component fails to start, or if not all
    heartbeats appear in time.
    '''

    self._prof.prof('start_components_start', uid=self._uid)

    timeout = self._cfg.heartbeat.timeout

    if cfg is None:
        cfg = self._cfg

    # we pass a copy of the complete session config to all components, but
    # merge it into the component specific config settings (no overwrite),
    # and then remove the `bridges` and `components` sections
    scfg = ru.Config(cfg=cfg)

    if 'bridges' in scfg:
        del scfg['bridges']

    if 'components' in scfg:
        del scfg['components']

    for cname, ccfg in cfg.get('components', {}).items():

        for _ in range(ccfg.get('count', 1)):

            ccfg.uid       = ru.generate_id(cname, ns=self._sid)
            ccfg.cmgr      = self.uid
            ccfg.kind      = cname
            ccfg.sid       = cfg.sid
            ccfg.base      = cfg.base
            ccfg.path      = cfg.path
            ccfg.heartbeat = cfg.heartbeat

            ccfg.merge(scfg, policy=ru.PRESERVE, log=self._log)

            fname = '%s/%s.json' % (cfg.path, ccfg.uid)
            ccfg.write(fname)

            self._log.info('create  component %s [%s]', cname, ccfg.uid)

            out, err, ret = ru.sh_callout('radical-pilot-component %s'
                                          % fname)
            self._log.debug('out: %s', out)
            self._log.debug('err: %s', err)

            if ret:
                # this is the component startup path - don't blame bridges
                raise RuntimeError('component startup failed')

            self._uids.append(ccfg.uid)
            self._log.info('created component %s [%s]', cname, ccfg.uid)

    # all components should start now, for their heartbeats
    # to appear.
    failed = self._hb.wait_startup(self._uids, timeout=timeout * 10)
    if failed:
        raise RuntimeError('could not start all components %s' % failed)

    self._prof.prof('start_components_stop', uid=self._uid)
def _get_mpi_info(self, exe):
    '''
    returns version and flavor of MPI version.

    The executable `exe` is called with `-v`, `--version` and `-info` (in
    that order) until one call returns with exit code zero.  The output of
    that call is scanned line by line, and the first line matching one of
    the known markers determines the returned `(version, flavor)` tuple.
    Without any match, `(None, self.MPI_FLAVOR_UNKNOWN)` is returned.

    Raises `RuntimeError` if the resulting flavor is falsy.
    '''

    version = None
    flavor  = self.MPI_FLAVOR_UNKNOWN

    out, _, ret = ru.sh_callout('%s -v' % exe)

    if ret:
        out, _, ret = ru.sh_callout('%s --version' % exe)

    if ret:
        out, _, ret = ru.sh_callout('%s -info' % exe)

    if not ret:

        for line in out.splitlines():

            lowered = line.lower()

            if 'hydra build details:' in lowered:
                version = line.split(':', 1)[1].strip()
                flavor  = self.MPI_FLAVOR_HYDRA
                break

            if 'mvapich2' in lowered:
                version = line
                flavor  = self.MPI_FLAVOR_HYDRA
                break

            if 'version:' in lowered:
                version = line.split(':', 1)[1].strip()
                flavor  = self.MPI_FLAVOR_OMPI
                break

            # Open MPI reports itself like `mpirun (Open MPI) 4.0.1`: there
            # is no colon after the closing parenthesis, so we must not
            # require one in the marker
            if '(open mpi)' in lowered:
                version = line.split(')', 1)[1].strip()
                flavor  = self.MPI_FLAVOR_OMPI
                break

    if not flavor:
        raise RuntimeError('cannot identify MPI flavor [%s]' % exe)

    self._log.debug('mpi version: %s [%s]', version, flavor)

    return version, flavor
def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger, profiler):
    """
    This hook is symmetric to the config hook above, and is called during
    shutdown sequence, for the sake of freeing allocated resources.

    If `lm_info` carries a `dvm_uri`, `orterun` is asked to terminate that
    DVM.  Any failure is logged and recorded as profile event, but is not
    propagated to the caller.
    """

    # nothing to free if no DVM was registered
    if 'dvm_uri' not in lm_info:
        return

    try:
        logger.info('terminating dvm')

        orterun = ru.which('orterun')
        if not orterun:
            raise Exception("Couldn't find orterun")

        terminate_cmd = '%s --hnp %s --terminate' \
                      % (orterun, lm_info['dvm_uri'])
        ru.sh_callout(terminate_cmd)

        profiler.prof(event='orte_dvm_stop', uid=cfg['pilot_id'])

    except Exception:
        # use the same event name as for runtime failures - those are
        # not distinguishable at the moment from termination failures
        profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])
        logger.exception('dvm termination failed')
def _configure(self):
    '''
    Find `srun` and remember its version.

    Sets `self.launch_command` to the result of `ru.which('srun')` and
    `self._version` to the last whitespace separated token of the
    `srun -V` output.  Raises `RuntimeError` if that call returns a
    non-zero exit code.
    '''

    srun = ru.which('srun')
    self.launch_command = srun

    out, err, ret = ru.sh_callout('%s -V' % srun)
    if ret:
        raise RuntimeError('cannot use srun [%s] [%s]' % (out, err))

    tokens = out.split()
    self._version = tokens[-1]

    self._log.debug('using srun from %s [%s]',
                    self.launch_command, self._version)
def _cmd(cmd):
    '''
    Run `cmd` via `ru.sh_callout` and report success: returns `True` if the
    return code equals zero, `False` otherwise.  Output and error streams
    of the command are discarded.
    '''

    _out, _err, ret = ru.sh_callout(cmd)

    return ret == 0
def test_gtod():
    '''
    test: both the `radical-gtod` executable and `rg.gtod()` must report
    a time within 0.1 seconds of `time.time()`.
    '''

    out, _, _ = ru.sh_callout('radical-gtod')

    t_exe = float(out)
    t_lib = rg.gtod()
    now   = time.time()

    lower = now - 0.1
    upper = now + 0.1

    for stamp in (t_exe, t_lib):
        assert (lower < stamp < upper)
def lrms_shutdown_hook(cls, name, cfg, lrms, lm_info, logger, profiler):
    """
    This hook is symmetric to the config hook above, and is called during
    shutdown sequence, for the sake of freeing allocated resources.

    If `lm_info` carries a `dvm_uri`, `orterun` is asked to terminate that
    DVM.  Any failure is logged and recorded as profile event (with the
    exception as `msg`), but is not propagated to the caller.
    """

    # without a registered DVM there is nothing to shut down
    if 'dvm_uri' not in lm_info:
        return

    try:
        logger.info('terminating dvm')

        orterun_exe = ru.which('orterun')
        if not orterun_exe:
            raise Exception("Couldn't find orterun")

        dvm_uri = lm_info['dvm_uri']
        ru.sh_callout('%s --hnp %s --terminate' % (orterun_exe, dvm_uri))

        profiler.prof(event='orte_dvm_stop', uid=cfg['pilot_id'])

    except Exception as exc:
        # use the same event name as for runtime failures - those are
        # not distinguishable at the moment from termination failures
        profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'], msg=exc)
        logger.exception('dvm termination failed')
def stage_output(self):
    '''
    Pack all files listed in `./staging_output.txt` into the tarball
    `./staging_output.tgz`.

    Nothing is done if the list file does not exist, or if the tarball
    already exists.  A failing `tar` call is logged, but not raised.
    '''

    have_list    = os.path.isfile('./staging_output.txt')
    if not have_list:
        return

    have_tarball = os.path.isfile('./staging_output.tgz')
    if have_tarball:
        return

    # the `$(cat ...)` expansion requires a shell
    cmd = 'tar zcvf staging_output.tgz $(cat staging_output.txt)'
    out, err, ret = ru.sh_callout(cmd, shell=True)

    if not ret:
        return

    self._log.debug('out: %s', out)
    self._log.debug('err: %s', err)
    self._log.error('output tarring failed: %s', cmd)
def _configure(self):
    '''
    Determine node list and per-node resources of a Cobalt allocation.

    Sets `self.node_list`, `self.cores_per_node`, `self.gpus_per_node`,
    `self.lfs_per_node` and `self.mem_per_node`.  The node count is read
    from `$COBALT_PARTSIZE`, node names and core counts are obtained via
    `aprun`, all other values come from `self._cfg`.

    Raises `KeyError` if `$COBALT_PARTSIZE` is not set, and
    `AssertionError` if the number of reported nodes does not match, or
    if nodes report different core counts.
    '''

    # we only support Cobalt on Theta right now, and since we know that
    # Theta is a Cray, we know that aprun is available.  Alas, aprun
    # provides the only way (we could find so far) to determine the list of
    # nodes we have available (`COBALT_NODELIST` seems broken).  So we run
    # `aprun` with the rank of nodes we *think* we have, and with `-N 1` to
    # place one rank per node, and run `hostname` - that gives us the list
    # of hostnames.  The number of nodes we receive from `$COBALT_PARTSIZE`.
    n_nodes = int(os.environ['COBALT_PARTSIZE'])

    out, _, _ = ru.sh_callout('aprun -n %d -N 1 hostname' % n_nodes)
    node_list = out.split()
    assert (len(node_list) == n_nodes), node_list

    # we also want to learn the core count per node.  The callout is not
    # run through a shell, so we cannot use a pipeline here (the pipe
    # symbols would be handed to the command as plain arguments).
    # `grep -c` counts the matching lines on each node without any pipe.
    cmd = 'grep -c processor /proc/cpuinfo'
    out, _, _ = ru.sh_callout('aprun -n %d -N 1 %s' % (n_nodes, cmd))

    # one count per node - all nodes are expected to report the same value
    core_counts = list(set([int(x) for x in out.split()]))
    assert (len(core_counts) == 1), core_counts

    cores_per_node = core_counts[0]
    gpus_per_node  = self._cfg.get('gpus_per_node', 0)
    lfs_per_node   = {'path': ru.expand_env(
                                  self._cfg.get('lfs_path_per_node')),
                      'size': self._cfg.get('lfs_size_per_node', 0)}
    mem_per_node   = self._cfg.get('mem_per_node', 0)

    self._log.info("Found unique core counts: %s", cores_per_node)

    # node names are unique, so can serve as node uids
    self.node_list      = [[node, node] for node in node_list]
    self.cores_per_node = cores_per_node
    self.gpus_per_node  = gpus_per_node
    self.lfs_per_node   = lfs_per_node
    self.mem_per_node   = mem_per_node
def _shell(self, data):
    '''
    We expect data to have a single entry: 'cmd', containing the command
    line to be called as string.

    Returns the `(out, err, ret)` tuple of the callout.  If the callout
    raises, the exception is logged and `(None, <error message>, 1)` is
    returned instead.
    '''

    try:
        out, err, ret = ru.sh_callout(data['cmd'])
        return out, err, ret

    except Exception as exc:
        self._log.exception('_shell failed: %s' % (data))
        return None, 'shell failed: %s' % exc, 1
def start_bridges(self, cfg=None):
    '''
    check if any bridges are defined under `cfg['bridges']` and start them

    For every bridge entry, the bridge config is completed, written to
    `<cfg.path>/<uid>.json`, and `radical-pilot-bridge` is called on that
    file.  After all bridges are started, we wait for their heartbeats.

    cfg: config to use - defaults to `self._cfg`

    Raises `RuntimeError` if a bridge fails to start, or if not all
    heartbeats appear in time.
    '''

    self._prof.prof('start_bridges_start', uid=self._uid)

    hb_timeout = self._cfg.heartbeat.timeout

    if cfg is None:
        cfg = self._cfg

    bridges = cfg.get('bridges', {})

    for bridge_name, bridge_cfg in bridges.items():

        # the bridge name doubles as uid and as channel name
        bridge_cfg.uid       = bridge_name
        bridge_cfg.channel   = bridge_name
        bridge_cfg.cmgr      = self.uid
        bridge_cfg.sid       = cfg.sid
        bridge_cfg.path      = cfg.path
        bridge_cfg.heartbeat = cfg.heartbeat

        cfg_file = '%s/%s.json' % (cfg.path, bridge_cfg.uid)
        bridge_cfg.write(cfg_file)

        self._log.info('create  bridge %s [%s]', bridge_name, bridge_cfg.uid)

        out, err, ret = ru.sh_callout('radical-pilot-bridge %s' % cfg_file)
        self._log.debug('bridge startup out: %s', out)
        self._log.debug('bridge startup err: %s', err)

        if ret:
            raise RuntimeError('bridge startup failed')

        self._uids.append(bridge_cfg.uid)
        self._log.info('created bridge %s [%s]', bridge_name, bridge_cfg.uid)

    # all bridges should start now, for their heartbeats
    # to appear.
    failed = self._hb.wait_startup(self._uids, timeout=hb_timeout)
    if failed:
        raise RuntimeError('could not start all bridges %s' % failed)

    self._prof.prof('start_bridges_stop', uid=self._uid)
def _watch_flux(flux_env):
    '''
    Export all settings from `flux_env` into `os.environ`, then ping flux
    every 0.1 seconds until the ping returns a non-zero exit code.
    '''

    logger.info('=== starting flux watcher')

    os.environ.update(flux_env)

    while True:

        out, err, ret = ru.sh_callout('flux ping -c 1 all')
        logger.debug('=== flux watcher out: %s', out)

        if ret:
            # ping failed - stop watching
            logger.error('=== flux watcher err: %s', err)
            break

        time.sleep(0.1)

    logger.info('flux stopped?')
def lrms_config_hook(cls, name, cfg, lrms, logger, profiler):
    """
    FIXME: this config hook will manipulate the LRMS nodelist.  Not a nice
           thing to do, but hey... :P
           What really should be happening is that the LRMS digs information
           on node reservation out of the config and configures the node
           list accordingly.  This config hook should be limited to starting
           the DVM.

    Starts an `orte-dvm` process, waits until it reports its URI and
    readiness, and starts a watcher thread which monitors the process.

    Returns an `lm_info` dict with the entries `dvm_uri` and
    `version_info`.

    Raises `Exception` if a required executable is missing, if the DVM
    output cannot be parsed, or if the DVM process disappears during
    startup.
    """

    import signal   # only needed to signal DVM failure, see `_watch_dvm()`

    dvm_command = ru.which('orte-dvm')
    if not dvm_command:
        raise Exception("Couldn't find orte-dvm")

    # Now that we found the orte-dvm, get ORTE version
    out, _, _ = ru.sh_callout('orte-info | grep "Open RTE"', shell=True)
    orte_info = dict()
    for line in out.split('\n'):

        line = line.strip()
        if not line:
            continue

        key, val = line.split(':', 1)
        if 'Open RTE' == key.strip():
            orte_info['version'] = val.strip()
        elif 'Open RTE repo revision' == key.strip():
            orte_info['version_detail'] = val.strip()

    assert (orte_info.get('version'))
    logger.info("Found Open RTE: %s / %s",
                orte_info['version'], orte_info.get('version_detail'))

    # Use (g)stdbuf to disable buffering.
    # We need this to get the "DVM ready",
    # without waiting for orte-dvm to complete.
    # The command seems to be generally available on our Cray's,
    # if not, we can code some home-cooked pty stuff.
    stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf'])
    if not stdbuf_cmd:
        raise Exception("Couldn't find (g)stdbuf")
    stdbuf_arg = "-oL"

    # Base command = (g)stdbuf <args> + orte-dvm + debug_args
    dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command]

    # Additional (debug) arguments to orte-dvm
    if os.environ.get('RADICAL_PILOT_ORTE_VERBOSE'):
        debug_strings = [
                         '--debug-devel',
                         '--mca odls_base_verbose 100',
                         '--mca rml_base_verbose 100'
                        ]
    else:
        debug_strings = []

    # Split up the debug strings into args and add them to the dvm_args
    for ds in debug_strings:
        dvm_args.extend(ds.split())

    vm_size = len(lrms.node_list)
    logger.info("Start DVM on %d nodes ['%s']", vm_size, ' '.join(dvm_args))
    profiler.prof(event='orte_dvm_start', uid=cfg['pilot_id'])

    dvm_uri     = None
    dvm_process = mp.Popen(dvm_args, stdout=mp.PIPE, stderr=mp.STDOUT)

    # NOTE(review): the comparisons below assume that `readline()` returns
    # `str`.  On Python 3, a pipe opened without text mode yields `bytes` -
    # confirm which interpreter this code runs on.
    while True:

        line = dvm_process.stdout.readline().strip()

        if line.startswith('VMURI:'):

            if len(line.split(' ')) != 2:
                raise Exception("Unknown VMURI format: %s" % line)

            label, dvm_uri = line.split(' ', 1)

            if label != 'VMURI:':
                raise Exception("Unknown VMURI format: %s" % line)

            logger.info("ORTE DVM URI: %s" % dvm_uri)

        elif line == 'DVM ready':

            if not dvm_uri:
                raise Exception("VMURI not found!")

            logger.info("ORTE DVM startup successful!")
            profiler.prof(event='orte_dvm_ok', uid=cfg['pilot_id'])
            break

        else:

            # Check if the process is still around,
            # and log output in debug mode.
            if dvm_process.poll() is None:
                logger.debug("ORTE: %s", line)
            else:
                # Process is gone: fatal!  Record the failure *before*
                # raising, otherwise the event is never written.
                profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])
                raise Exception("ORTE DVM process disappeared")

    # ----------------------------------------------------------------------
    def _watch_dvm():

        logger.info('starting DVM watcher')

        retval = dvm_process.poll()
        while retval is None:
            line = dvm_process.stdout.readline().strip()
            if line:
                logger.debug('dvm output: %s', line)
            else:
                time.sleep(1.0)

            # refresh the process state, otherwise this loop never ends
            retval = dvm_process.poll()

        if retval != 0:
            # send a kill signal to the main thread.
            # We know that Python and threading are likely not to play well
            # with signals - but this is an exceptional case, and not part
            # of the standard termination sequence.  If the signal is
            # swallowed, the next `orte-submit` call will trigger
            # termination anyway.
            os.kill(os.getpid(), signal.SIGTERM)

        logger.info('DVM stopped (%d)' % dvm_process.returncode)
    # ----------------------------------------------------------------------

    dvm_watcher = ru.Thread(target=_watch_dvm, name="DVMWatcher")
    dvm_watcher.start()

    lm_info = {'dvm_uri'     : dvm_uri,
               'version_info': {name: orte_info}}

    # we need to inform the actual LM instance about the DVM URI.  So we
    # pass it back to the LRMS which will keep it in an 'lm_info', which
    # will then be passed as part of the slots via the scheduler
    return lm_info
def get_backfill(partition=None, max_cores=None, max_walltime=None):
    '''
    Return a list of `[partition, cores, walltime]` entries which fit into
    the current backfill, as reported by `showbf --blocking`.

    By default we split the backfillable cores into chunks of at most 160
    cores, and limit walltimes to at most 60 min.

    partition   : only query this partition (default: all partitions)
    max_cores   : maximal number of cores per entry (default: 160)
    max_walltime: maximal walltime per entry in minutes (default: 60)

    Raises `ValueError` if `max_cores` is not positive, and `RuntimeError`
    if `showbf` writes to stderr.
    '''

    if max_cores    is None: max_cores    = 160
    if max_walltime is None: max_walltime = 60

    # a non-positive chunk size would never terminate the chunking below
    if max_cores <= 0:
        raise ValueError('max_cores must be positive (%s)' % max_cores)

    # --------------------------------------------------------------------------
    def _duration_to_walltime(timestr):
        '''
        convert a timestring of the forms:

          00:00:00:00    days:hours:min:sec
             00:00:00         hours:min:sec
                00:00               min:sec
                   00                   sec
                   INFINITY

        into a number of minutes. Any result larger than `max_walltime` is
        truncated to `max_walltime`.  `INFINITY` is also mapped to
        `max_walltime`.
        '''

        if timestr == 'INFINITY':
            return max_walltime

        # fields are counted from the right, so shorter forms work, too
        walltime = 0.0
        elems    = timestr.split(':')
        if len(elems) >= 4: walltime += 24 * 60 * int(elems[-4])
        if len(elems) >= 3: walltime +=      60 * int(elems[-3])
        if len(elems) >= 2: walltime +=           int(elems[-2])
        if len(elems) >= 1: walltime +=           int(elems[-1]) / 60

        return min(walltime, max_walltime)
    # --------------------------------------------------------------------------

    if partition: part_opt = '-p %s' % partition
    else        : part_opt = ''

    out, err, ret = ru.sh_callout('showbf --blocking %s' % part_opt)

    if err:
        raise RuntimeError('showbf failed [%s]: %s' % (ret, err))

    backfill = list()
    for line in out.splitlines():

        # blank lines carry no information (and would break the unpacking)
        if not line.strip():
            continue

        part, cores, nodes, duration, start_offset, start_date = line.split()

        # skip table header and separator line
        if part.startswith('-') or part == 'Partition':
            continue

        cores    = int(cores)
        walltime = int(_duration_to_walltime(duration))

        # split the available cores into chunks of at most `max_cores`
        while cores > max_cores:
            cores -= max_cores
            backfill.append([part, max_cores, walltime])

        if cores:
            backfill.append([part, cores, walltime])

    return backfill
def _lsfscript_generator(url, logger, jd, ppn, lsf_version, queue):
    """
    generates an LSF script from a SAGA job description

    url  : target URL - its `host` is used to select the node layout
    jd   : job description to translate
    queue: queue to submit to - takes precedence over `jd.queue`

    `logger`, `ppn` and `lsf_version` are not used here.

    Returns the script as string, with `$` escaped in the command line and
    `"` escaped in the complete script.

    Raises `RuntimeError` if the target host name cannot be determined, and
    `ValueError` if the target host is not supported.
    """

    lsf_bsubs  = ''
    command    = ''
    env_string = ''

    if jd.executable: command += "%s " % (jd.executable)
    if jd.arguments : command += ' '.join(jd.arguments)

    # an explicitly given queue takes precedence over the one from the
    # job description
    bsub_queue = queue or jd.queue
    if bsub_queue:
        lsf_bsubs += "#BSUB -q %s \n" % bsub_queue

    if jd.name:
        lsf_bsubs += "#BSUB -J %s \n" % jd.name

    if jd.job_contact:
        lsf_bsubs += "#BSUB -u %s \n" % jd.job_contact

    if jd.working_directory:
        lsf_bsubs += "#BSUB -cwd %s \n" % jd.working_directory

    if jd.wall_time_limit:
        # hours:minutes - convert to int, so that we never write floats
        # (true division) into the script
        lsf_bsubs += "#BSUB -W %s:%s \n" \
                   % (int(jd.wall_time_limit / 60),
                      int(jd.wall_time_limit % 60))

    # if working directory is set, we want stdout to end up in the
    # working directory as well, unless it contains a specific
    # path name - otherwise we pass `output` as is.
    if jd.output:
        if os.path.isabs(jd.output) : path = ''
        elif jd.working_directory   : path = '%s/' % jd.working_directory
        else                        : path = ''
        lsf_bsubs += "#BSUB -o %s%s \n" % (path, jd.output)

    # same holds for stderr
    if jd.error:
        if os.path.isabs(jd.error)  : path = ''
        elif jd.working_directory   : path = '%s/' % jd.working_directory
        else                        : path = ''
        lsf_bsubs += "#BSUB -e %s%s \n" % (path, jd.error)

    env_string += "export RADICAL_SAGA_SMT=%d" % SMT
    if jd.environment:
        # `items()` is available for dicts on all Python versions
        for k, v in jd.environment.items():
            env_string += " %s=%s" % (k, v)

    # a project of the form `account:reservation` sets both settings
    if jd.project and ':' in jd.project:
        account, reservation = jd.project.split(':', 1)
        lsf_bsubs += "#BSUB -P %s \n" % account
        lsf_bsubs += "#BSUB -U %s \n" % reservation

    elif jd.project:
        lsf_bsubs += "#BSUB -P %s \n" % jd.project

    # Request enough nodes to cater for the number of gpus and cores requested
    total_cpu_count = jd.total_cpu_count or 1
    total_gpu_count = jd.total_gpu_count or 1

    hostname = url.host

    if not hostname or 'localhost' in hostname:
        out, _, ret = ru.sh_callout('hostname -f')
        if ret: hostname = os.environ.get('HOSTNAME', '')
        else  : hostname = out.strip()

    if not hostname:
        raise RuntimeError('cannot determine target host f or %s' % url)

    # cores and gpus per node - `summitdev` needs to be checked first, as
    # `summit` matches both host names
    if 'summitdev' in hostname:
        cpn = 20 * SMT
        gpn = 4
    elif 'summit' in hostname:
        cpn = 42 * SMT
        gpn = 6
    else:
        raise ValueError('LSF host (%s) not yet supported' % hostname)

    # round up to full nodes
    cpu_nodes = int(total_cpu_count / cpn)
    if total_cpu_count > (cpu_nodes * cpn):
        cpu_nodes += 1

    gpu_nodes = int(total_gpu_count / gpn)
    if total_gpu_count > (gpu_nodes * gpn):
        gpu_nodes += 1

    nodes = max(cpu_nodes, gpu_nodes)

    lsf_bsubs += "#BSUB -nnodes %s \n" % str(nodes)
    lsf_bsubs += "#BSUB -alloc_flags 'gpumps smt%d' \n" % SMT

    # escape double quotes and dollar signs, otherwise 'echo |'
    # further down won't work
    # only escape '$' in args and exe. not in the bsubs
    command = command.replace('$', '\\$')

    lsfscript = "\n#!/bin/bash \n%s\n%s\n%s" % (lsf_bsubs, env_string, command)
    lsfscript = lsfscript.replace('"', '\\"')

    return lsfscript
def work(self, units):
    '''
    Handle input staging for a bulk of units.

    All units are advanced to `UMGR_STAGING_INPUT`.  Units without any
    `TRANSFER` or `TARBALL` input staging directive are then advanced in
    bulk to `AGENT_STAGING_INPUT_PENDING` (after an optional bulk creation
    of their sandboxes), all other units are handed to
    `self._handle_unit()` one by one.

    units: a unit dict, or a list of unit dicts

    Raises `RuntimeError` if the local `tar` call of the bulk mkdir fails.
    '''

    if not isinstance(units, list):
        units = [units]

    self.advance(units, rps.UMGR_STAGING_INPUT, publish=True, push=False)

    # we first filter out any units which don't need any input staging, and
    # advance them again as a bulk.  We work over the others one by one, and
    # advance them individually, to avoid stalling from slow staging ops.

    no_staging_units = list()
    staging_units    = list()

    for unit in units:

        # no matter if we perform any staging or not, we will push the full
        # unit info to the DB on the next advance, and will pass control to
        # the agent.
        unit['$all']    = True
        unit['control'] = 'agent_pending'

        # check if we have any staging directives to be enacted in this
        # component
        actionables = list()
        for sd in unit['description'].get('input_staging', []):
            if sd['action'] in [rpc.TRANSFER, rpc.TARBALL]:
                actionables.append(sd)

        if actionables:
            staging_units.append([unit, actionables])
        else:
            no_staging_units.append(unit)

    # Optimization: if we obtained a large bulk of units, we at this point
    # attempt a bulk mkdir for the unit sandboxes, to free the agent of
    # performing that operation.  That implies that the agent needs to check
    # sandbox existence before attempting to create them now.
    #
    # Note that this relies on the umgr scheduler to assigning the sandbox
    # to the unit.
    #
    # Note further that we need to make sure that all units are actually
    # pointing into the same target file system, so we need to cluster by
    # filesystem before checking the bulk size.  For simplicity we actually
    # cluster by pilot ID, which is sub-optimal for unit bulks which go to
    # different pilots on the same resource (think OSG).
    #
    # Note further that we skip the bulk-op for all units for which we
    # actually need to stage data, since the mkdir will then implicitly be
    # done anyways.
    #
    # Caveat: we can actually only (reasonably) do this if we know some
    # details about the pilot, because otherwise we'd have too much guessing
    # to do about the pilot configuration (sandbox, access schema, etc), so
    # we only attempt this optimization for units scheduled to pilots for
    # which we learned those details.
    units_by_pid = dict()
    for unit in no_staging_units:
        sbox = unit['unit_sandbox']
        pid  = unit['pilot']
        units_by_pid.setdefault(pid, list()).append(sbox)

    # now trigger the bulk mkdir for all filesystems which have more than
    # a certain number of units to handle in this bulk:
    for pid in units_by_pid:

        with self._pilots_lock:
            pilot = self._pilots.get(pid)

        if not pilot:
            # we don't feel inclined to optimize for unknown pilots
            self._log.debug('pid %s unknown - skip optimization', pid)
            continue

        session_sbox = self._session._get_session_sandbox(pilot)
        unit_sboxes  = units_by_pid[pid]

        if len(unit_sboxes) >= UNIT_BULK_MKDIR_THRESHOLD:

            # no matter the bulk mechanism, we need a SAGA handle to the
            # remote FS
            sbox_fs      = ru.Url(session_sbox)  # deep copy
            sbox_fs.path = '/'
            sbox_fs_str  = str(sbox_fs)
            if sbox_fs_str not in self._fs_cache:
                self._fs_cache[sbox_fs_str] = rs.filesystem.Directory(
                                              sbox_fs, session=self._session)
            saga_dir = self._fs_cache[sbox_fs_str]

            # we have two options for a bulk mkdir:
            # 1) ask SAGA to create the sandboxes in a bulk op
            # 2) create a tarball with all unit sandboxes, push it over, and
            #    untar it (one untar op then creates all dirs).  We implement
            #    both
            if UNIT_BULK_MKDIR_MECHANISM == 'saga':

                tc = rs.task.Container()
                for sbox in unit_sboxes:
                    tc.add(saga_dir.make_dir(sbox, ttype=rs.TASK))
                tc.run()
                tc.wait()

            elif UNIT_BULK_MKDIR_MECHANISM == 'tar':

                tmp_path = tempfile.mkdtemp(prefix='rp_agent_tar_dir')
                tmp_dir  = os.path.abspath(tmp_path)
                tar_name = '%s.%s.tgz' % (self._session.uid, self.uid)
                tar_tgt  = '%s/%s'     % (tmp_dir, tar_name)
                tar_url  = ru.Url('file://localhost/%s' % tar_tgt)

                # recreate the sandbox tree locally, so that it can be
                # packed into one tarball
                for sbox in unit_sboxes:
                    os.makedirs('%s/%s' % (tmp_dir, ru.Url(sbox).path))

                cmd = "cd %s && tar zchf %s *" % (tmp_dir, tar_tgt)
                out, err, ret = ru.sh_callout(cmd, shell=True)

                self._log.debug('tar : %s', cmd)
                self._log.debug('tar : %s\n---\n%s\n---\n%s', out, err, ret)

                if ret:
                    raise RuntimeError('failed callout %s: %s' % (cmd, err))

                tar_rem_path = "%s/%s" % (str(session_sbox), tar_name)

                self._log.debug('sbox: %s [%s]', session_sbox,
                                                 type(session_sbox))
                self._log.debug('copy: %s -> %s', tar_url, tar_rem_path)
                saga_dir.copy(tar_url, tar_rem_path,
                              flags=rs.filesystem.CREATE_PARENTS)

                # NOTE(review): the local staging dir `tmp_path` is never
                # removed - confirm if that is intentional

                # get a job service handle to the target resource and run
                # the untar command.  Use the hop to skip the batch system
                js_url = pilot['js_hop']
                self._log.debug('js  : %s', js_url)

                if js_url in self._js_cache:
                    js_tmp = self._js_cache[js_url]
                else:
                    js_tmp = rs.job.Service(js_url, session=self._session)
                    self._js_cache[js_url] = js_tmp

                cmd = "tar zmxvf %s/%s -C /" % (session_sbox.path, tar_name)
                j   = js_tmp.run_job(cmd)
                j.wait()

                self._log.debug('untar : %s', cmd)
                self._log.debug('untar : %s\n---\n%s\n---\n%s',
                                j.get_stdout_string(), j.get_stderr_string(),
                                j.exit_code)

    if no_staging_units:
        # nothing to stage, push to the agent
        self.advance(no_staging_units, rps.AGENT_STAGING_INPUT_PENDING,
                     publish=True, push=True)

    for unit, actionables in staging_units:
        self._handle_unit(unit, actionables)
def lrms_config_hook(cls, name, cfg, lrms, logger, profiler):
    """
    FIXME: this config hook will manipulate the LRMS nodelist.  Not a nice
           thing to do, but hey... :P
           What really should be happening is that the LRMS digs information
           on node reservation out of the config and configures the node
           list accordingly.  This config hook should be limited to starting
           the DVM.

    Starts an `orte-dvm` process, waits until it reports its URI and
    readiness, and starts a watcher thread which monitors the process.

    Returns an `lm_info` dict with the entries `dvm_uri` and
    `version_info`.

    Raises `Exception` if a required executable is missing, if the DVM
    output cannot be parsed, or if the DVM process disappears during
    startup.
    """

    import signal   # only needed to signal DVM failure, see `_watch_dvm()`

    dvm_command = ru.which('orte-dvm')
    if not dvm_command:
        raise Exception("Couldn't find orte-dvm")

    # Now that we found the orte-dvm, get ORTE version
    out, _, _ = ru.sh_callout('orte-info | grep "Open RTE"', shell=True)
    orte_info = dict()
    for raw_line in out.split('\n'):

        entry = raw_line.strip()
        if not entry:
            continue

        key, val = entry.split(':', 1)
        key = key.strip()

        if key == 'Open RTE':
            orte_info['version'] = val.strip()
        elif key == 'Open RTE repo revision':
            orte_info['version_detail'] = val.strip()

    assert(orte_info.get('version'))
    logger.info("Found Open RTE: %s / %s",
                orte_info['version'], orte_info.get('version_detail'))

    # Use (g)stdbuf to disable buffering.
    # We need this to get the "DVM ready",
    # without waiting for orte-dvm to complete.
    # The command seems to be generally available on our Cray's,
    # if not, we can code some home-cooked pty stuff.
    stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf'])
    if not stdbuf_cmd:
        raise Exception("Couldn't find (g)stdbuf")
    stdbuf_arg = "-oL"

    # Base command = (g)stdbuf <args> + orte-dvm + debug_args
    dvm_args = [stdbuf_cmd, stdbuf_arg, dvm_command]

    # Additional (debug) arguments to orte-dvm
    if os.environ.get('RADICAL_PILOT_ORTE_VERBOSE'):
        debug_strings = [
                         '--debug-devel',
                         '--mca odls_base_verbose 100',
                         '--mca rml_base_verbose 100'
                        ]
    else:
        debug_strings = []

    # Split up the debug strings into args and add them to the dvm_args
    for ds in debug_strings:
        dvm_args.extend(ds.split())

    vm_size = len(lrms.node_list)
    logger.info("Start DVM on %d nodes ['%s']", vm_size, ' '.join(dvm_args))
    profiler.prof(event='orte_dvm_start', uid=cfg['pilot_id'])

    dvm_uri     = None
    dvm_process = mp.Popen(dvm_args, stdout=mp.PIPE, stderr=mp.STDOUT)

    # NOTE(review): the comparisons below assume that `readline()` returns
    # `str`.  On Python 3, a pipe opened without text mode yields `bytes` -
    # confirm which interpreter this code runs on.
    while True:

        line = dvm_process.stdout.readline().strip()

        if line.startswith('VMURI:'):

            if len(line.split(' ')) != 2:
                raise Exception("Unknown VMURI format: %s" % line)

            label, dvm_uri = line.split(' ', 1)

            if label != 'VMURI:':
                raise Exception("Unknown VMURI format: %s" % line)

            logger.info("ORTE DVM URI: %s" % dvm_uri)

        elif line == 'DVM ready':

            if not dvm_uri:
                raise Exception("VMURI not found!")

            logger.info("ORTE DVM startup successful!")
            profiler.prof(event='orte_dvm_ok', uid=cfg['pilot_id'])
            break

        else:

            # Check if the process is still around,
            # and log output in debug mode.
            if dvm_process.poll() is None:
                logger.debug("ORTE: %s", line)
            else:
                # Process is gone: fatal!  Record the failure *before*
                # raising, otherwise the event is never written.
                profiler.prof(event='orte_dvm_fail', uid=cfg['pilot_id'])
                raise Exception("ORTE DVM process disappeared")

    # ----------------------------------------------------------------------
    def _watch_dvm():

        logger.info('starting DVM watcher')

        retval = dvm_process.poll()
        while retval is None:
            line = dvm_process.stdout.readline().strip()
            if line:
                logger.debug('dvm output: %s', line)
            else:
                time.sleep(1.0)

            # refresh the process state, otherwise this loop never ends
            retval = dvm_process.poll()

        if retval != 0:
            # send a kill signal to the main thread.
            # We know that Python and threading are likely not to play well
            # with signals - but this is an exceptional case, and not part
            # of the standard termination sequence.  If the signal is
            # swallowed, the next `orte-submit` call will trigger
            # termination anyway.
            os.kill(os.getpid(), signal.SIGTERM)

        logger.info('DVM stopped (%d)' % dvm_process.returncode)
    # ----------------------------------------------------------------------

    dvm_watcher = ru.Thread(target=_watch_dvm, name="DVMWatcher")
    dvm_watcher.start()

    lm_info = {'dvm_uri'     : dvm_uri,
               'version_info': {name: orte_info}}

    # we need to inform the actual LM instance about the DVM URI.  So we
    # pass it back to the LRMS which will keep it in an 'lm_info', which
    # will then be passed as part of the slots via the scheduler
    return lm_info
def _lsfscript_generator(url, logger, jd, ppn, lsf_version, queue):
    """
    generates an LSF script from a SAGA job description

    url  : target URL - its `host` is used to select the resource settings
    jd   : job description to translate
    queue: queue to submit to - takes precedence over `jd.queue`

    `logger`, `ppn` and `lsf_version` are not used here.

    Returns the script as string, with `$` escaped in the command line and
    `"` escaped in the complete script.

    Raises `RuntimeError` if the target host name cannot be determined, and
    `ValueError` if the target host matches no known resource.
    """

    lsf_bsubs  = ''
    command    = ''
    env_string = ''

    if jd.executable: command += "%s " % (jd.executable)
    if jd.arguments : command += ' '.join(jd.arguments)

    # an explicitly given queue takes precedence over the one from the
    # job description
    bsub_queue = queue or jd.queue
    if bsub_queue:
        lsf_bsubs += "#BSUB -q %s \n" % bsub_queue

    if jd.name:
        lsf_bsubs += "#BSUB -J %s \n" % jd.name

    if jd.job_contact:
        lsf_bsubs += "#BSUB -u %s \n" % jd.job_contact

    if jd.working_directory:
        lsf_bsubs += "#BSUB -cwd %s \n" % jd.working_directory

    if jd.wall_time_limit:
        # hours:minutes
        lsf_bsubs += "#BSUB -W %s:%s \n" \
                   % (int(jd.wall_time_limit / 60),
                      int(jd.wall_time_limit % 60))

    # if working directory is set, we want stdout to end up in the
    # working directory as well, unless it contains a specific
    # path name - otherwise we pass `output` as is.
    if jd.output:
        if os.path.isabs(jd.output) : path = ''
        elif jd.working_directory   : path = '%s/' % jd.working_directory
        else                        : path = ''
        lsf_bsubs += "#BSUB -o %s%s \n" % (path, jd.output)

    # same holds for stderr
    if jd.error:
        if os.path.isabs(jd.error)  : path = ''
        elif jd.working_directory   : path = '%s/' % jd.working_directory
        else                        : path = ''
        lsf_bsubs += "#BSUB -e %s%s \n" % (path, jd.error)

    # a project of the form `account:reservation` sets both settings
    if jd.project and ':' in jd.project:
        account, reservation = jd.project.split(':', 1)
        lsf_bsubs += "#BSUB -P %s \n" % account
        lsf_bsubs += "#BSUB -U %s \n" % reservation

    elif jd.project:
        lsf_bsubs += "#BSUB -P %s \n" % jd.project

    # Request enough nodes to cater for the number of gpus and cores requested
    total_cpu_count = jd.total_cpu_count or 1
    total_gpu_count = jd.total_gpu_count or 1

    hostname = url.host

    if not hostname or 'localhost' in hostname:
        out, _, ret = ru.sh_callout('hostname -f')
        if ret: hostname = os.environ.get('HOSTNAME', '')
        else  : hostname = out.strip()

    if not hostname:
        raise RuntimeError('cannot determine target host f or %s' % url)

    # tolerate job descriptions without any system architecture settings
    sys_arch = jd.system_architecture or {}

    cpn, gpn, smt, valid_alloc_flags = 0, 1, SMT_DEFAULT, []

    for resource_name in RESOURCES:

        if resource_name in hostname:

            # validate the requested SMT level *before* it is used to
            # compute the cores per node, so that the node count always
            # matches the `smt` flag written to the script
            smt = sys_arch.get('smt') or smt
            if smt not in SMT_VALID_VALUES:
                smt = SMT_DEFAULT

            cpn = RESOURCES[resource_name]['cpn'] * smt
            gpn = RESOURCES[resource_name]['gpn']
            valid_alloc_flags = RESOURCES[resource_name]['valid_alloc_flags']
            break

    if not cpn:
        raise ValueError('LSF host (%s) not yet supported' % hostname)

    # round up to full nodes
    cpu_nodes = int(total_cpu_count / cpn)
    if total_cpu_count > (cpu_nodes * cpn):
        cpu_nodes += 1

    gpu_nodes = int(total_gpu_count / gpn)
    if total_gpu_count > (gpu_nodes * gpn):
        gpu_nodes += 1

    nodes = max(cpu_nodes, gpu_nodes)

    lsf_bsubs += "#BSUB -nnodes %s \n" % str(nodes)

    # only pass on allocation flags which are valid for this resource
    alloc_flags = []
    for flag in sys_arch.get('options', []):
        if flag.lower() in valid_alloc_flags:
            alloc_flags.append(flag.lower())
    alloc_flags.append('smt%d' % smt)

    lsf_bsubs += "#BSUB -alloc_flags '%s' \n" % ' '.join(alloc_flags)

    env_string += "export RADICAL_SAGA_SMT=%d" % smt
    if jd.environment:
        for k, v in jd.environment.items():
            env_string += " %s=%s" % (k, v)

    # escape double quotes and dollar signs, otherwise 'echo |'
    # further down won't work
    # only escape '$' in args and exe. not in the bsubs
    command = command.replace('$', '\\$')

    lsfscript = "\n#!/bin/bash \n%s\n%s\n%s" % (lsf_bsubs, env_string, command)
    lsfscript = lsfscript.replace('"', '\\"')

    return lsfscript
def _cmd(cmd):
    """
    Run `cmd` via `ru.sh_callout` and report whether it succeeded.

    Returns `False` if the call reports a truthy (non-zero) return code,
    `True` otherwise.  Output on stdout and stderr is discarded.
    """

    _out, _err, exit_code = ru.sh_callout(cmd)

    if exit_code:
        return False

    return True
def rm_config_hook(cls, name, cfg, rm, log, profiler):
    """
    Start a PRRTE DVM (`prte`) on the nodes of `rm` and return the
    information needed to contact it.

    Arguments:
      cls, name : not used by this function
      cfg       : configuration; `cfg['pid']` is used as uid for the
                  profiler events `dvm_start` and `dvm_ok`
      rm        : resource manager; `node_list`, `cores_per_node` and `smt`
                  are used to write the hosts file and to size the DVM
      log       : logger
      profiler  : profiler; if `profiler.enabled`, prte state profiling is
                  switched on

    Returns:
      dict with the keys `dvm_uri`, `version_info` and `cvd_id_mode`

    Raises:
      Exception : `prte` or `(g)stdbuf` cannot be found, or the DVM did not
                  report its URI within 100 checks
      KeyError  : the environment variable `PRRTE_PREFIX` is not set
    """

    # needed by the watcher thread to signal this process; imported locally
    # so that this function does not depend on a module-level import
    import signal

    prte = ru.which('prte')
    if not prte:
        raise Exception("Couldn't find prte")

    # Now that we found the prte, get PRUN version
    out, _, _ = ru.sh_callout('prte_info | grep "Open RTE"', shell=True)
    prte_info = dict()
    for line in out.split('\n'):

        line = line.strip()

        if 'Open RTE:' in line:
            prte_info['version'] = line.split(':')[1].strip()

        elif 'Open RTE repo revision:' in line:
            prte_info['version_detail'] = line.split(':')[1].strip()

    log.info("Found Open RTE: %s [%s]",
             prte_info.get('version'), prte_info.get('version_detail'))

    # write hosts file
    furi    = '%s/prrte.uri'   % os.getcwd()
    fhosts  = '%s/prrte.hosts' % os.getcwd()
    vm_size = len(rm.node_list)

    with open(fhosts, 'w') as fout:
        for node in rm.node_list:
            fout.write('%s slots=%d\n' % (node[0],
                                          rm.cores_per_node * rm.smt))

    pre = os.environ['PRRTE_PREFIX']

    prte += ' --prefix     %s' % pre
    prte += ' --report-uri %s' % furi
    prte += ' --hostfile   %s' % fhosts

    if profiler.enabled:
        prte += ' --pmca orte_state_base_verbose 1'  # prte profiling

    # large tasks imply large message sizes, and we need to account for that
    # FIXME: we should derive the message size from DVM size - smaller DVMs
    #        will never need large messages, as they can't run large tasks)
    prte += ' --pmca ptl_base_max_msg_size %d' % (1024 * 1024 * 1024 * 1)
    # prte += ' --pmca rmaps_base_verbose 5'

    # debug mapper problems for large tasks
    if log.isEnabledFor(logging.DEBUG):
        prte += ' -pmca orte_rmaps_base_verbose 100'

    # we apply two temporary tweaks on Summit which should not be needed in
    # the long run:
    #
    # avoid 64 node limit (ssh connection limit)
    prte += ' --pmca plm_rsh_no_tree_spawn 1'

    # ensure 1 ssh per dvm
    prte += ' --pmca plm_rsh_num_concurrent %d' % vm_size

    # Use (g)stdbuf to disable buffering.  We need this to get the
    # "DVM ready" message to ensure DVM startup completion
    #
    # The command seems to be generally available on our Cray's,
    # if not, we can code some home-cooked pty stuff (TODO)
    stdbuf_cmd = ru.which(['stdbuf', 'gstdbuf'])
    if not stdbuf_cmd:
        raise Exception("Couldn't find (g)stdbuf")
    stdbuf_arg = "-oL"

    # Base command = (g)stdbuf <args> + prte + prte-args + debug_args
    cmdline = '%s %s %s ' % (stdbuf_cmd, stdbuf_arg, prte)
    # cmdline = prte

    # Additional (debug) arguments to prte
    verbose = bool(os.environ.get('RADICAL_PILOT_PRUN_VERBOSE'))
    if verbose:
        debug_strings = [
                         '--debug-devel',
                         '--pmca odls_base_verbose 100',
                         '--pmca rml_base_verbose 100',
                        ]
    else:
        debug_strings = []

    # Split up the debug strings into args and add them to the cmdline
    cmdline += ' '.join(debug_strings)
    cmdline  = cmdline.strip()

    log.info("Start prte on %d nodes [%s]", vm_size, cmdline)
    profiler.prof(event='dvm_start', uid=cfg['pid'])

    dvm_uri     = None
    dvm_process = mp.Popen(cmdline.split(), stdout=mp.PIPE,
                           stderr=mp.STDOUT)

    # ----------------------------------------------------------------------
    def _watch_dvm():
        # runs in a daemon thread: forwards prte output to the log until the
        # process ends, and signals this process if prte ended with an error

        log.info('starting prte watcher')

        retval = dvm_process.poll()
        while retval is None:
            line = dvm_process.stdout.readline().strip()
            if line:
                log.debug('prte output: %s', line)
            else:
                time.sleep(1.0)

            # refresh the process state - otherwise the loop never ends and
            # a dying DVM goes unnoticed
            retval = dvm_process.poll()

        if retval != 0:
            # send a kill signal to the main thread.
            # We know that Python and threading are likely not to play well
            # with signals - but this is an exceptional case, and not part
            # of the standard termination sequence.  If the signal is
            # swallowed, the next `prun` call will trigger
            # termination anyway.
            #
            # NOTE(review): `os.kill` requires a signal number.  SIGTERM is
            #               used as it can be handled (or swallowed), which
            #               matches the note above - confirm that this is
            #               the intended signal.
            os.kill(os.getpid(), signal.SIGTERM)
            raise RuntimeError('PRTE DVM died')

        log.info('prte stopped (%d)' % dvm_process.returncode)
    # ----------------------------------------------------------------------

    dvm_watcher = mt.Thread(target=_watch_dvm)
    dvm_watcher.daemon = True
    dvm_watcher.start()

    # wait for prte to write its URI into the report file
    for _ in range(100):

        time.sleep(0.5)

        try:
            with open(furi, 'r') as fin:
                for line in fin.readlines():
                    if '://' in line:
                        dvm_uri = line.strip()
                        break

        except Exception as e:
            log.debug('DVM check: uri file missing: %s...' % str(e)[:24])
            time.sleep(0.5)

        if dvm_uri:
            break

    if not dvm_uri:
        raise Exception("VMURI not found!")

    log.info("prte startup successful: [%s]", dvm_uri)

    # in some cases, the DVM seems to need some additional time to settle.
    # FIXME: this should not be needed, really
    time.sleep(10)
    profiler.prof(event='dvm_ok', uid=cfg['pid'])

    lm_info = {
               'dvm_uri'     : dvm_uri,
               'version_info': prte_info,
               'cvd_id_mode' : 'physical'
              }

    # we need to inform the actual LaunchMethod instance about the prte URI.
    # So we pass it back to the ResourceManager which will keep it in an
    # 'lm_info', which will then be passed as part of the slots via the
    # scheduler
    return lm_info
def _cmd(cmd):
    """
    Run `cmd` via `ru.sh_callout` and return `True` if its return code
    equals zero, `False` otherwise.  Output on stdout and stderr is
    discarded.
    """

    _out, _err, status = ru.sh_callout(cmd)

    return status == 0
def excuse():
    """
    Fetch an excuse: connect via `telnet` to `bofh.jeffballard.us` (port
    666), keep what follows `Your excuse is:` in the reply, and return it
    with surrounding whitespace removed.
    """

    fetch    = "telnet bofh.jeffballard.us 666 2>&1 "
    select   = "grep 'Your excuse is:' | cut -f 2- -d :"
    pipeline = "%s | %s" % (fetch, select)

    # the pipe requires a shell; only stdout is of interest
    stdout, _, _ = ru.sh_callout(pipeline, shell=True)

    return stdout.strip()