def peek(self, app, remote_filename, local_file, offset=0, size=None):
    job = app.execution
    assert 'ssh_remote_folder' in job, \
        "Missing attribute `ssh_remote_folder` on `Job` instance" \
        " passed to `PbsLrms.peek`."
    if size is None:
        size = sys.maxsize
    _filename_mapping = generic_filename_mapping(
        job.lrms_jobname, job.lrms_jobid, remote_filename)
    _remote_filename = os.path.join(
        job.ssh_remote_folder, _filename_mapping)
    try:
        self.transport.connect()
        remote_handler = self.transport.open(
            _remote_filename, mode='r', bufsize=-1)
        remote_handler.seek(offset)
        data = remote_handler.read(size)
    except Exception as ex:
        log.error("Could not read remote file '%s': %s: %s",
                  _remote_filename, ex.__class__.__name__, str(ex))
        # without `data`, the code below would fail with `NameError`
        raise
    try:
        local_file.write(data)
    except (TypeError, AttributeError):
        # `local_file` is a path name, not a file-like object
        output_file = open(local_file, 'w+b')
        output_file.write(data)
        output_file.close()
    log.debug('... Done.')
def _cleanup_terminating_task(self, app, pid, termstatus=None):
    app.execution.state = Run.State.TERMINATING
    if termstatus is not None:
        app.execution.returncode = termstatus
    if pid in self.job_infos:
        self.job_infos[pid]['terminated'] = True
        if app.requested_memory is not None:
            assert (app.requested_memory
                    == self.job_infos[pid]['requested_memory'])
            self.available_memory += app.requested_memory
    wrapper_filename = posixpath.join(
        app.execution.lrms_execdir,
        ShellcmdLrms.WRAPPER_DIR,
        ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
    try:
        log.debug("Reading resource utilization from wrapper file `%s`"
                  " for task %s ...", wrapper_filename, app)
        with self.transport.open(wrapper_filename, 'r') as wrapper_file:
            outcome = self._parse_wrapper_output(wrapper_file)
            app.execution.update(outcome)
            if termstatus is None:
                app.execution.returncode = outcome.returncode
    except Exception as err:
        msg = ("Could not open wrapper file `{0}` for task `{1}`: {2}"
               .format(wrapper_filename, app, err))
        log.warning("%s -- Termination status and resource utilization"
                    " fields will not be set.", msg)
        raise gc3libs.exceptions.InvalidValue(msg)
    finally:
        self._delete_job_resource_file(pid)
def get_results(self, app, download_dir, overwrite=False, changed_only=True):
    if app.output_base_url is not None:
        raise gc3libs.exceptions.DataStagingError(
            "Retrieval of output files to non-local destinations"
            " is not supported in the ShellCmd backend.")
    self.transport.connect()
    # Make list of files to copy, in the form of (remote_path,
    # local_path) pairs.  This entails walking the
    # `Application.outputs` list to expand wildcards and
    # directory references.
    stageout = list()
    for remote_relpath, local_url in app.outputs.iteritems():
        if local_url.scheme in ['swift', 'swt', 'swifts', 'swts']:
            continue
        local_relpath = local_url.path
        if remote_relpath == gc3libs.ANY_OUTPUT:
            remote_relpath = ''
            local_relpath = ''
        stageout += _make_remote_and_local_path_pair(
            self.transport, app, remote_relpath, download_dir, local_relpath)
    # copy back all files, renaming them to adhere to the
    # ArcLRMS convention
    log.debug("Downloading job output into '%s' ...", download_dir)
    for remote_path, local_path in stageout:
        # ignore missing files (this is what ARC does too)
        self.transport.get(remote_path, local_path,
                           ignore_nonexisting=True,
                           overwrite=overwrite,
                           changed_only=changed_only)
    return
def _parse_stat_output(self, stdout):
    """
    Receive the output of ``squeue --noheader -o %i^%T^%r`` and parse it.
    """
    jobstatus = dict()
    if stdout.strip() == '':
        # if stdout is empty and `squeue -j` exitcode is 0, then
        # the job has recently completed;
        #
        # if the job has been removed from the controller's
        # memory, then `squeue -j` exits with code 1
        jobstatus['state'] = Run.State.TERMINATING
    else:
        # parse stdout
        jobid, state, reason = stdout.split('^')
        log.debug("translating SLURM's state '%s' to gc3libs.Run.State",
                  state)
        if state in ['PENDING', 'CONFIGURING']:
            # XXX: see above for a discussion of whether 'CONFIGURING'
            # should be grouped with 'RUNNING' or not; here it's
            # likely the correct choice to group it with 'PENDING' as
            # the "configuring" phase may last a few minutes during
            # which the job is not yet really running.
            jobstatus['state'] = Run.State.SUBMITTED
        elif state in ['RUNNING', 'COMPLETING']:
            jobstatus['state'] = Run.State.RUNNING
        elif state in ['SUSPENDED']:
            jobstatus['state'] = Run.State.STOPPED
        elif state in ['COMPLETED', 'CANCELLED', 'FAILED',
                       'NODE_FAIL', 'PREEMPTED', 'TIMEOUT']:
            jobstatus['state'] = Run.State.TERMINATING
        else:
            jobstatus['state'] = Run.State.UNKNOWN
    return jobstatus
def execute_command(self, cmdline):
    """
    Scan the given command-line and return a predefined result if
    *any* word in command position matches one of the keys in the
    `expected_answer` argument to the class constructor.

    Note that the parsing of the command-line is based on regular
    expressions and is thus only an approximation at ``sh`` syntax.
    It will *certainly* fail on some command-lines, but there is no
    way around this short of writing a complete ``sh`` parser just
    for this function.  (And no, Python's module `shlex` will not do
    the job -- been there, done that.)
    """
    log.debug("scanning command-line <<<%s>>>", cmdline)
    for match in self._COMMAND_RE.finditer(cmdline):
        cmd = match.group("cmd")
        if cmd in self.expected_answer:
            reply = self.expected_answer[cmd]
            log.debug("returning programmed reply for '%s': %s",
                      cmd, reply)
            return reply
    # if everything else failed, do run the command-line ...
    return LocalTransport.execute_command(self, cmdline)
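# `_COMMAND_RE` is not shown in this excerpt.  A plausible sketch of a
# regex matching words in "command position" (at the start of the line,
# or right after `;`, `|`, `&&`, `||`) could look like the following --
# only an approximation of `sh` syntax, exactly as the docstring above
# warns.  The pattern and example command-line are illustrative only.
import re

_COMMAND_RE = re.compile(r'(?:^|;|\||&&)\s*(?P<cmd>[\w./-]+)')

for m in _COMMAND_RE.finditer("cd /tmp && qstat -u joe | grep RUN"):
    print(m.group('cmd'))   # prints: cd, qstat, grep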
def validate_data(self, data_file_list):
    """
    Supported protocols: file, gsiftp, lfc, srm, http, https
    """
    for url in data_file_list:
        log.debug("Resource %s: checking URL '%s' ...", self.name, url)
        if url.scheme not in ['srm', 'lfc', 'file',
                              'http', 'gsiftp', 'https']:
            return False
    return True
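# A self-contained illustration of the same scheme-based filtering, using
# only the standard library instead of gc3libs' URL objects (the sample
# URLs are made up for this example):
from urlparse import urlparse  # `urllib.parse` on Python 3

ALLOWED_SCHEMES = ['srm', 'lfc', 'file', 'http', 'gsiftp', 'https']

def schemes_supported(urls):
    return all(urlparse(url).scheme in ALLOWED_SCHEMES for url in urls)

print(schemes_supported(['file:///data/in.txt', 'gsiftp://host/path']))  # True
print(schemes_supported(['ftp://host/path']))                            # False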
def cancel_job(self, app):
    controller, job = self._get_job_and_controller(app.execution.lrms_jobid)
    try:
        log.debug("Calling arc.JobController.Cancel(job)")
        if not controller.CancelJob(job):
            raise gc3libs.exceptions.LRMSError(
                'arc.JobController.Cancel returned False')
    except Exception as ex:
        gc3libs.log.error('Failed while killing job. Error type %s,'
                          ' message %s' % (ex.__class__, str(ex)))
        raise gc3libs.exceptions.LRMSError(
            'Failed while killing job. Error type %s, message %s'
            % (ex.__class__, str(ex)))
def _update_job_resource_file(self, pid, resources):
    """
    Update file in `self.resource_dir/PID` with `resources`.
    """
    self.transport.connect()  # XXX: we should check for exceptions!
    log.debug("Updating resource file for pid %s", pid)
    with self.transport.open(
            posixpath.join(self.resource_dir, str(pid)), 'wb') as fp:
        pickle.dump(resources, fp, -1)
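# The `-1` passed to `pickle.dump` above selects the highest pickle
# protocol available, which gives a compact binary encoding.  A quick
# standalone round-trip showing the same call, using an in-memory buffer
# instead of a remote file:
import io
import pickle

buf = io.BytesIO()
pickle.dump({'requested_cores': 2, 'terminated': False}, buf, -1)
buf.seek(0)
print(pickle.load(buf))  # -> {'requested_cores': 2, 'terminated': False}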
def update_job_state(self, app):
    """
    Query the running status of the local process whose PID is
    stored into `app.execution.lrms_jobid`, and map the POSIX
    process status to GC3Libs `Run.State`.
    """
    self.transport.connect()
    pid = app.execution.lrms_jobid
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps ax | grep -E '^ *%d '" % pid)
    if exit_code == 0:
        log.debug("Process with PID %s found."
                  " Checking its running status ...", pid)
        # Process exists.  Check the status
        status = stdout.split()[2]
        if status[0] == 'T':
            # Job stopped
            app.execution.state = Run.State.STOPPED
        elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
            # Job is running.  Check manpage of ps both on Linux
            # and BSD to know the meaning of these statuses.
            app.execution.state = Run.State.RUNNING
            # if `requested_walltime` is set, enforce it as a
            # running time limit
            if app.requested_walltime is not None:
                exit_code2, stdout2, stderr2 = \
                    self.transport.execute_command(
                        "ps -p %d -o etimes=" % pid)
                if exit_code2 != 0:
                    # job terminated already, do cleanup and return
                    self._cleanup_terminating_task(app, pid)
                    return app.execution.state
                cancel = False
                elapsed = Duration(stdout2.strip() + 'seconds')
                if elapsed > self.max_walltime:
                    log.warning("Task %s ran for %s, exceeding"
                                " max_walltime %s of resource %s:"
                                " cancelling it.",
                                app, elapsed.to_timedelta(),
                                self.max_walltime, self.name)
                    cancel = True
                if elapsed > app.requested_walltime:
                    log.warning("Task %s ran for %s, exceeding own"
                                " `requested_walltime` %s: cancelling it.",
                                app, elapsed.to_timedelta(),
                                app.requested_walltime)
                    cancel = True
                if cancel:
                    self.cancel_job(app)
                    # set signal to SIGTERM in termination status
                    self._cleanup_terminating_task(
                        app, pid, termstatus=(15, -1))
                    return app.execution.state
    else:
        log.debug("Process with PID %d not found,"
                  " assuming task %s has finished running.",
                  pid, app)
        self._cleanup_terminating_task(app, pid)

    self._get_persisted_resource_state()
    return app.execution.state
def update_job_state(self, app):
    """
    Query the running status of the local process whose PID is
    stored into `app.execution.lrms_jobid`, and map the POSIX
    process status to GC3Libs `Run.State`.
    """
    self.transport.connect()
    pid = app.execution.lrms_jobid
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps ax | grep -E '^ *%d '" % pid)
    if exit_code == 0:
        log.debug("Process with PID %s found."
                  " Checking its running status", pid)
        # Process exists.  Check the status
        status = stdout.split()[2]
        if status[0] == 'T':
            # Job stopped
            app.execution.state = Run.State.STOPPED
        elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
            # Job is running.  Check manpage of ps both on Linux
            # and BSD to know the meaning of these statuses.
            app.execution.state = Run.State.RUNNING
    else:
        log.debug("Process with PID %d not found."
                  " Checking wrapper file ...", pid)
        app.execution.state = Run.State.TERMINATING
        if pid in self.job_infos:
            self.job_infos[pid]['terminated'] = True
            assert (app.requested_memory
                    == self.job_infos[pid]['requested_memory'])
            if app.requested_memory:
                self.available_memory += app.requested_memory
        wrapper_filename = posixpath.join(
            app.execution.lrms_execdir,
            ShellcmdLrms.WRAPPER_DIR,
            ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
        try:
            wrapper_file = self.transport.open(wrapper_filename, 'r')
        except Exception as err:
            self._delete_job_resource_file(pid)
            raise gc3libs.exceptions.InvalidValue(
                "Could not open wrapper file '%s' for task '%s': %s"
                % (wrapper_filename, app, err), do_log=True)
        try:
            outcome = self._parse_wrapper_output(wrapper_file)
            app.execution.returncode = \
                Run.shellexit_to_returncode(int(outcome.ReturnCode))
            self._delete_job_resource_file(pid)
        finally:
            wrapper_file.close()

    self._get_persisted_resource_state()
    return app.execution.state
def __run_command_and_parse_output(self, cmd, parser, kind='accounting'):
    log.debug("Checking remote job %s info with `%s` ...", kind, cmd)
    exit_code, stdout, stderr = self.transport.execute_command(cmd)
    if exit_code == 0:
        return parser(stdout, stderr)
    else:
        raise gc3libs.exceptions.AuxiliaryCommandError(
            "Failed running %s command `%s`:"
            " exit code: %d, stderr: '%s'"
            % (kind, cmd, exit_code, stderr),
            do_log=True)
def free(self, app):
    controller, job = self._get_job_and_controller(app.execution.lrms_jobid)
    log.debug("Calling JobController.CleanJob")
    if not controller.CleanJob(job):
        log.error("arc1.JobController.CleanJob returned False"
                  " for ARC job ID '%s'", app.execution.lrms_jobid)
    # XXX: this is necessary as the other components of the arc
    # library seem to refer to the job.xml file
    # remove job from the job.xml file
    log.debug("Removing job '%s' from jobfile '%s'",
              app, gc3libs.Default.ARC_JOBLIST_LOCATION)
    job.RemoveJobsFromFile(
        gc3libs.Default.ARC_JOBLIST_LOCATION, [job.IDFromEndpoint])
def _iterjobs(self):
    """
    Iterate over all jobs.
    """
    self._get_JobSupervisor_and_JobController()
    for c in self._controllers:
        log.debug("Calling JobController.GetJobInformation() ...")
        c.GetJobInformation()
        log.debug('... controller returned %d jobs', len(c.GetJobs()))
    return itertools.chain(*[c.GetJobs() for c in self._controllers])
def _delete_job_resource_file(self, pid):
    """
    Delete `self.resource_dir/PID` file.
    """
    self.transport.connect()
    log.debug("Deleting resource file for pid %s ...", pid)
    pidfile = posixpath.join(self.resource_dir, str(pid))
    try:
        self.transport.remove(pidfile)
    except Exception as err:
        log.debug("Ignored error deleting file `%s`: %s: %s",
                  pidfile, err.__class__.__name__, err)
def update_job_state(self, app):
    """
    Advance `app`'s status to the next one
    in the normal execution graph.
    """
    log.debug("No-Op backend updating state of Task %s ...", app)
    transitions = self.transition_graph[app.execution.state]
    log.debug("Task %s transitions: %s.", app,
              str.join(", ", [
                  ("with probability %g to state %s" % (prob, state))
                  for prob, state in transitions.items()
                  if prob > 0
              ]))
    dice = random()
    log.debug("Rolled dice, got %g result", dice)
    for prob, state in sorted(transitions.items()):
        if dice < prob:
            log.debug("Task %s transitions to state '%s'", app, state)
            # update resource state based on old and new app state
            if app.execution.state == Run.State.SUBMITTED:
                self.queued -= 1
                self.user_queued -= 1
            if app.execution.state == Run.State.RUNNING:
                self.user_run -= 1
            if state == Run.State.RUNNING:
                self.user_run += 1
            if state == Run.State.TERMINATING:
                self.free_slots += app.requested_cores
                if app.requested_memory:
                    self.available_memory += app.requested_memory
            # set the new app state
            app.execution.state = state
            break
        else:
            dice -= prob
    return app.execution.state
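# A minimal, hypothetical example of the `transition_graph` structure
# walked above: keys are the *current* `Run.State`, and each value maps
# the probability of a transition to the *next* state.  Probabilities
# within one entry must be distinct (they serve as `dict` keys) and
# should sum to at most 1; the numbers below are illustrative only.
from gc3libs import Run

example_transition_graph = {
    Run.State.SUBMITTED: {0.8: Run.State.RUNNING,
                          0.2: Run.State.SUBMITTED},
    Run.State.RUNNING: {0.6: Run.State.RUNNING,
                        0.4: Run.State.TERMINATING},
    Run.State.TERMINATING: {1.0: Run.State.TERMINATING},
}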
def _parse_stat_output(self, stdout, stderr):
    """
    Parse output of ``squeue --noheader -o 'GC3Pie^%i^%T^%r'``.
    """
    state = Run.State.UNKNOWN
    for line in stdout.split('\n'):
        line = line.strip()
        # sites might wrap basic SLURM commands like `squeue` or
        # `sacct` to provide additional information to users; we
        # need to tell the actual SLURM output from the sites' own
        # info; fortunately, SLURM's `--format` option allows
        # arbitrary string prefixes which we can leverage to tag
        # the interesting output lines.
        if line.startswith('GC3Pie^'):
            # parse the tagged line, not the whole stdout
            _, job_id, job_state_code, reason = line.split('^')
            log.debug("translating SLURM state `%s` to gc3libs.Run.State",
                      job_state_code)
            if job_state_code in ['PENDING', 'CONFIGURING']:
                # XXX: see comments in `count_jobs` for a discussion
                # of whether 'CONFIGURING' should be grouped with
                # 'RUNNING' or not; here it's likely the correct
                # choice to group it with 'PENDING' as the
                # "configuring" phase may last a few minutes during
                # which the job is not yet really running.
                state = Run.State.SUBMITTED
            elif job_state_code in ['RUNNING', 'COMPLETING']:
                state = Run.State.RUNNING
            elif job_state_code in ['SUSPENDED']:
                state = Run.State.STOPPED
            elif job_state_code in ['COMPLETED', 'CANCELLED', 'FAILED',
                                    'NODE_FAIL', 'PREEMPTED', 'TIMEOUT']:
                state = Run.State.TERMINATING
            else:
                state = Run.State.UNKNOWN
            break
    else:
        # No `GC3Pie^` line found in output:
        #
        # * If stdout is empty and `squeue -j` exitcode is 0, then
        #   the job has recently completed (but we still need to
        #   call `sacct` to reap the termination status).
        #
        # * If the job has been removed from the controller's
        #   memory, then `squeue -j` exits with code 1.
        state = Run.State.TERMINATING
    return self._stat_result(state, None)  # no term status info
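# Illustration of the tagged-output idea above, with made-up sample data:
# site wrapper scripts may print banner lines around `squeue` output, and
# only the line carrying the `GC3Pie^` prefix is trusted.
sample_squeue_output = """\
=== Welcome to cluster XYZ ===
GC3Pie^1234^RUNNING^None
"""
for line in sample_squeue_output.split('\n'):
    line = line.strip()
    if line.startswith('GC3Pie^'):
        _, job_id, job_state_code, reason = line.split('^')
        print("%s %s %s" % (job_id, job_state_code, reason))
        # prints: 1234 RUNNING None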
def _get_targets(self):
    """
    Wrapper around `arc.TargetGenerator.GetTargets()`.
    """
    # tg = arc.TargetGenerator(self._usercfg, 1)
    # return tg.FoundTargets()
    # This method should spawn the ldapsearch to update the
    # ExecutionTarget information
    log.debug('Calling arc.TargetGenerator.RetrieveExecutionTargets')
    self._get_JobSupervisor_and_JobController()
    self._target_generator.RetrieveExecutionTargets()
    log.debug('Calling arc.TargetGenerator.GetExecutionTargets()')
    return self._target_generator.GetExecutionTargets()
def _parse_stat_output(self, stdout, stderr):
    # parse `qstat` output
    pbs_status = stdout.split()[4]
    log.debug("translating PBS/Torque's `qstat` code"
              " '%s' to gc3libs.Run.State", pbs_status)
    if pbs_status in ['Q', 'W']:
        state = Run.State.SUBMITTED
    elif pbs_status in ['R']:
        state = Run.State.RUNNING
    elif pbs_status in ['S', 'H', 'T'] or 'qh' in pbs_status:
        state = Run.State.STOPPED
    elif pbs_status in ['C', 'E', 'F']:
        state = Run.State.TERMINATING
    else:
        state = Run.State.UNKNOWN
    return self._stat_result(state, None)  # no term status info
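# `stdout.split()[4]` above picks the 5th whitespace-separated field of a
# `qstat JOBID` output line, which is the single-letter job state code.
# A hypothetical sample line, purely for illustration:
sample_qstat_line = "123.pbsserver   myjob   joe   00:00:10   R   batch"
pbs_status = sample_qstat_line.split()[4]
print(pbs_status)  # -> 'R', mapped to Run.State.RUNNING above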
def _read_job_resource_file(self, pid):
    """
    Get resource information on job with pid `pid`, if it exists.
    Returns None if it does not exist.
    """
    self.transport.connect()
    log.debug("Reading resource file for pid %s", pid)
    jobinfo = None
    fname = posixpath.join(self.resource_dir, str(pid))
    with self.transport.open(fname, 'rb') as fp:
        try:
            jobinfo = pickle.load(fp)
        except Exception as ex:
            log.error("Unable to read remote resource file %s: %s",
                      fname, ex)
            raise
    return jobinfo
def _parse_stat_output(self, stdout, stderr):
    ge_status_code = stdout.split()[4]
    log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
              ge_status_code)
    if (ge_status_code in ['s', 'S', 'T']
            or ge_status_code.startswith('h')):
        state = Run.State.STOPPED
    elif 'qw' in ge_status_code:
        state = Run.State.SUBMITTED
    elif ('r' in ge_status_code
          or 'R' in ge_status_code
          or 't' in ge_status_code):
        state = Run.State.RUNNING
    elif ge_status_code == 'E':  # error condition
        state = Run.State.TERMINATING
    else:
        log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                    ge_status_code)
        state = Run.State.UNKNOWN
    # to get the exit status information we'll have to parse
    # `qacct` output, so put ``None`` here
    return self._stat_result(state, None)
def get_resource_status(self):
    self.updated = False
    try:
        self.running_kernel
    except AttributeError:
        self._gather_machine_specs()

    self.job_infos = self._get_persisted_resource_state()
    used_memory = self._compute_used_memory(self.job_infos)
    self.available_memory = self.total_memory - used_memory
    self.updated = True
    log.debug("Recovered resource information from files in %s:"
              " available memory: %s, memory used by jobs: %s",
              self.resource_dir,
              self.available_memory.to_str(
                  '%g%s', unit=Memory.MB, conv=float),
              used_memory.to_str('%g%s', unit=Memory.MB, conv=float))
    return self
def _parse_stat_output(self, stdout):
    job_status = stdout.split()[4]
    log.debug("translating SGE's `qstat` code '%s' to gc3libs.Run.State",
              job_status)
    jobstatus = dict()
    if job_status in ['s', 'S', 'T'] or job_status.startswith('h'):
        jobstatus['state'] = Run.State.STOPPED
    elif 'qw' in job_status:
        jobstatus['state'] = Run.State.SUBMITTED
    elif 'r' in job_status or 'R' in job_status or 't' in job_status:
        jobstatus['state'] = Run.State.RUNNING
    elif job_status == 'E':  # error condition
        jobstatus['state'] = Run.State.TERMINATING
    else:
        log.warning("unknown SGE job status '%s', returning `UNKNOWN`",
                    job_status)
        jobstatus['state'] = Run.State.UNKNOWN
    return jobstatus
def _parse_stat_output(self, stdout):
    # check that passed object obeys contract
    # parse `qstat` output
    job_status = stdout.split()[4]
    jobstatus = dict()
    log.debug("translating PBS/Torque's `qstat` code"
              " '%s' to gc3libs.Run.State", job_status)
    if job_status in ['Q', 'W']:
        jobstatus['state'] = Run.State.SUBMITTED
    elif job_status in ['R']:
        jobstatus['state'] = Run.State.RUNNING
    elif job_status in ['S', 'H', 'T'] or 'qh' in job_status:
        jobstatus['state'] = Run.State.STOPPED
    elif job_status in ['C', 'E', 'F']:
        jobstatus['state'] = Run.State.TERMINATING
    else:
        jobstatus['state'] = Run.State.UNKNOWN
    return jobstatus
def _get_persisted_resource_state(self):
    """
    Get information on total resources from the files stored in
    `self.resource_dir`.  Returns a dictionary `{PID: {key: value}}`
    with information on each job associated to a running process.
    """
    self.transport.connect()
    pidfiles = self.transport.listdir(self.resource_dir)
    log.debug("Checking status of the following PIDs: %s",
              str.join(", ", pidfiles))
    job_infos = {}
    for pid in pidfiles:
        job = self._read_job_resource_file(pid)
        if job:
            job_infos[pid] = job
        else:
            # process not found, ignore it
            continue
    return job_infos
def get_results(self, app, download_dir, overwrite=False, changed_only=True):
    if app.output_base_url is not None:
        raise gc3libs.exceptions.UnrecoverableDataStagingError(
            "Retrieval of output files to non-local destinations"
            " is not supported (yet).")
    job = app.execution
    self.transport.connect()
    # Make list of files to copy, in the form of (remote_path,
    # local_path) pairs.  This entails walking the
    # `Application.outputs` list to expand wildcards and
    # directory references.
    stageout = list()
    for remote_relpath, local_url in app.outputs.items():
        local_relpath = local_url.path
        if remote_relpath == gc3libs.ANY_OUTPUT:
            remote_relpath = ''
            local_relpath = ''
        stageout += _make_remote_and_local_path_pair(
            self.transport, job, remote_relpath, download_dir, local_relpath)
    # copy back all files, renaming them to adhere to the
    # ArcLRMS convention
    log.debug("Downloading job output into '%s' ...", download_dir)
    for remote_path, local_path in stageout:
        # ignore missing files (this is what ARC does too)
        self.transport.get(remote_path, local_path,
                           ignore_nonexisting=True,
                           overwrite=overwrite,
                           changed_only=changed_only)
    return
def _lsf_state_to_gc3pie_state(stat):
    log.debug("Translating LSF's `bjobs` status '%s'"
              " to gc3libs.Run.State ...", stat)
    try:
        return {
            # LSF 'stat' mapping:
            'PEND':  Run.State.SUBMITTED,
            'RUN':   Run.State.RUNNING,
            'PSUSP': Run.State.STOPPED,
            'USUSP': Run.State.STOPPED,
            'SSUSP': Run.State.STOPPED,
            # DONE = successful termination
            'DONE':  Run.State.TERMINATING,
            # EXIT = job was killed / exit forced
            'EXIT':  Run.State.TERMINATING,
            # ZOMBI = job "killed" and unreachable
            'ZOMBI': Run.State.TERMINATING,
            'UNKWN': Run.State.UNKNOWN,
        }[stat]
    except KeyError:
        log.warning("Unknown LSF job status '%s', returning `UNKNOWN`",
                    stat)
        return Run.State.UNKNOWN
def _get_job_and_controller(self, jobid):
    """
    Return a pair `(c, j)` where `j` is the `arc.Job` object
    corresponding to the given `jobid` and `c` is the
    corresponding `arc.JobController`.
    """
    # jobmaster = arc.JobSupervisor(usercfg, [])
    # jobcontrollers = jobmaster.GetJobControllers()
    self._iterjobs()
    for c in self._controllers:
        log.debug("Calling JobController.GetJobs"
                  " in get_job_and_controller")
        jl = c.GetJobs()
        for j in jl:
            if j.JobID.str() == jobid:
                # found, clean remote job sessiondir
                return (c, j)
    raise KeyError("No job found with job ID '%s'" % jobid)
def submit_job(self, app):
    """
    Transition `app`'s status to `Run.State.SUBMITTED` if possible.

    Note that this method still checks that `app`'s requirements
    are compatible with what this resource was instantiated with,
    and that conversely the resource still has enough free
    cores/memory/etc to host a new application.  So, submission to
    a No-Op resource may still fail!
    """
    free_slots = self.free_slots - app.requested_cores
    if free_slots <= 0:
        raise gc3libs.exceptions.LRMSSubmitError(
            "Resource %s already running maximum allowed number of jobs"
            " (%s). Increase 'max_cores' to raise."
            % (self.name, self.max_cores))
    if (app.requested_memory
            and self.available_memory < app.requested_memory):
        raise gc3libs.exceptions.LRMSSubmitError(
            "Resource %s does not have enough available memory:"
            " %s requested, but only %s available."
            % (self.name,
               app.requested_memory.to_str('%g%s', unit=Memory.MB),
               self.available_memory.to_str('%g%s', unit=Memory.MB)))
    log.debug("Faking execution of command '%s' ...",
              str.join(" ", app.arguments))
    # Update application and current resources
    app.execution.lrms_jobid = id(app)
    self.free_slots = free_slots
    if app.requested_memory:
        self.available_memory -= app.requested_memory
    self.queued += 1
    self.user_queued += 1
    return app
def cancel_job(self, app):
    job = app.execution
    try:
        self.transport.connect()
        cmd = self._cancel_command(job.lrms_jobid)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code != 0:
            # XXX: It is possible that 'qdel' fails because the job
            # has already completed, thus the cancel_job behaviour
            # should be tolerant to these errors.
            log.error("Failed executing remote command '%s';"
                      " exit status %d", cmd, exit_code)
            log.debug("  remote command returned STDOUT '%s'", stdout)
            log.debug("  remote command returned STDERR '%s'", stderr)
            if exit_code == 127:
                # command was not executed, time to signal an exception
                raise gc3libs.exceptions.LRMSError(
                    "Cannot execute remote command '%s'"
                    " -- See DEBUG level log for details" % (cmd,))
        return job
    except:
        log.critical("Failure checking status")
        raise
def _parse_stat_output(self, stdout, stderr):
    """
    Receive the output of ``squeue --noheader -o %i^%T^%r`` and parse it.
    """
    state = Run.State.UNKNOWN
    if stdout.strip() == '':
        # If stdout is empty and `squeue -j` exitcode is 0, then
        # the job has recently completed (but we still need to
        # call `sacct` to reap the termination status).
        #
        # If the job has been removed from the controller's
        # memory, then `squeue -j` exits with code 1.
        state = Run.State.TERMINATING
    else:
        # parse stdout
        job_id, job_state_code, reason = stdout.split('^')
        log.debug("translating SLURM's state '%s' to gc3libs.Run.State",
                  job_state_code)
        if job_state_code in ['PENDING', 'CONFIGURING']:
            # XXX: see comments in `count_jobs` for a discussion
            # of whether 'CONFIGURING' should be grouped with
            # 'RUNNING' or not; here it's likely the correct
            # choice to group it with 'PENDING' as the
            # "configuring" phase may last a few minutes during
            # which the job is not yet really running.
            state = Run.State.SUBMITTED
        elif job_state_code in ['RUNNING', 'COMPLETING']:
            state = Run.State.RUNNING
        elif job_state_code in ['SUSPENDED']:
            state = Run.State.STOPPED
        elif job_state_code in ['COMPLETED', 'CANCELLED', 'FAILED',
                                'NODE_FAIL', 'PREEMPTED', 'TIMEOUT']:
            state = Run.State.TERMINATING
        else:
            state = Run.State.UNKNOWN
    return self._stat_result(state, None)  # no term status info
def peek(self, app, remote_filename, local_file, offset=0, size=None):
    job = app.execution
    assert 'lrms_jobid' in job, \
        "Missing attribute `lrms_jobid` on `Job` instance" \
        " passed to `ArcLrms.peek`."
    controller, j = self._get_job_and_controller(job.lrms_jobid)
    if size is None:
        size = sys.maxint
    # `local_file` could be a file name (string) or a file-like
    # object, as per function docstring; ensure `local_file_name`
    # is the local path
    try:
        local_file_name = local_file.name
    except AttributeError:
        local_file_name = local_file
    source_url = arc.URL(job.lrms_jobid + '/' + remote_filename)
    destination_url = arc.URL(local_file_name)
    # download file
    log.debug("Arc1Lrms.peek(): Downloading remote file '%s'"
              " into local file '%s' ...",
              remote_filename, local_file_name)
    if not controller.ARCCopyFile(source_url, destination_url):
        log.warning("Failed downloading '%s' to '%s'",
                    source_url.str(), destination_url.str())
    log.debug("Arc1LRMS.peek(): arc.JobController.ARCCopyFile: completed")
def _parse_stat_output(stdout):
    # LSF `bjobs -l` uses LDIF-style continuation lines, wherein
    # a line is truncated at 79 characters and continues upon the
    # next one; continuation lines start with a fixed amount of
    # whitespace.  Join continuation lines, so that we can work on
    # a single block of text.
    lines = []
    for line in stdout.split('\n'):
        if len(line) == 0:
            continue
        if line.startswith(LsfLrms._CONTINUATION_LINE_START):
            lines[-1] += line[len(LsfLrms._CONTINUATION_LINE_START):]
        else:
            lines.append(line)
    # now rebuild stdout by joining the reconstructed lines
    stdout = str.join('\n', lines)

    jobstatus = gc3libs.utils.Struct()
    # XXX: this only works if the current status is the first one
    # reported in STDOUT ...
    match = LsfLrms._status_re.search(stdout)
    if match:
        stat = match.group('state')
        jobstatus.state = LsfLrms._lsf_state_to_gc3pie_state(stat)
        if stat == 'DONE':
            # DONE = success
            jobstatus.exit_status = 0
        elif stat == 'EXIT':
            # EXIT = job exited with exit code != 0
            match = LsfLrms._unsuccessful_exit_re.search(stdout)
            if match:
                log.debug("LSF says: '%s'", match.group(0))
                jobstatus.exit_status = int(match.group('exit_status'))
    assert 'state' in jobstatus
    return jobstatus
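# A minimal, self-contained sketch of the continuation-line joining done
# in `_parse_stat_output` above.  The 21-space prefix is an assumption
# modeled on LSF's `bjobs -l` layout; the real width is whatever
# `LsfLrms._CONTINUATION_LINE_START` matches.
CONTINUATION_PREFIX = ' ' * 21

def join_continuation_lines(text):
    """Merge LDIF-style wrapped lines into single logical lines."""
    lines = []
    for line in text.split('\n'):
        if not line:
            continue
        if line.startswith(CONTINUATION_PREFIX):
            lines[-1] += line[len(CONTINUATION_PREFIX):]
        else:
            lines.append(line)
    return '\n'.join(lines)

sample = ("Job <123>, User <joe>, Status <RUN>, Que\n"
          + CONTINUATION_PREFIX + "ue <normal>")
print(join_continuation_lines(sample))
# prints: Job <123>, User <joe>, Status <RUN>, Queue <normal>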
def get_resource_status(self):
    try:
        self.transport.connect()

        _command = ("%s -U %s" % (self._qstat, self._username))
        log.debug("Running `%s`...", _command)
        exit_code, qstat_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SGE backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, qstat_stdout, stderr))

        _command = ("%s -F -U %s" % (self._qstat, self._username))
        log.debug("Running `%s`...", _command)
        exit_code, qstat_F_stdout, stderr \
            = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SGE backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, qstat_F_stdout, stderr))

        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(qstat_stdout, self._username)
        slots = compute_nr_of_slots(qstat_F_stdout)
        self.free_slots = int(slots['global']['available'])
        self.used_quota = -1

        log.info("Updated resource '%s' status:"
                 " free slots: %d,"
                 " own running jobs: %d,"
                 " own queued jobs: %d,"
                 " total queued jobs: %d",
                 self.name,
                 self.free_slots,
                 self.user_run,
                 self.user_queued,
                 self.queued)
        return self
    except Exception as ex:
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex))
        raise
def get_resource_status(self):
    self.updated = False
    try:
        self.transport.connect()

        _command = ("%s --noheader -o '%%i^%%T^%%u^%%U^%%r^%%R'"
                    % self._squeue)
        log.debug("Running `%s`...", _command)
        exitcode, stdout, stderr = self.transport.execute_command(_command)
        if exitcode != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "SLURM backend failed executing '%s':"
                " exit code: %d; stdout: '%s', stderr: '%s'"
                % (_command, exitcode, stdout, stderr))

        log.debug("Computing updated values for total/available slots ...")
        (total_running, self.queued, self.user_run, self.user_queued) \
            = count_jobs(stdout, self._username)
        self.total_run = total_running
        self.free_slots = -1
        self.used_quota = -1

        log.info("Updated resource '%s' status:"
                 " free slots: %d,"
                 " total running: %d,"
                 " own running jobs: %d,"
                 " own queued jobs: %d,"
                 " total queued jobs: %d",
                 self.name,
                 self.free_slots,
                 self.total_run,
                 self.user_run,
                 self.user_queued,
                 self.queued)
        return self
    except Exception as ex:
        # self.transport.close()
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex), exc_info=True)
        raise
def update_job_state(self, app):
    try:
        job = app.execution
        job.lrms_jobid
    except AttributeError as ex:
        # `job` has no `lrms_jobid`: object is invalid
        raise gc3libs.exceptions.InvalidArgument(
            "Job object is invalid: %s" % str(ex))
    try:
        self.transport.connect()
        cmd = self._stat_command(job)
        log.debug("Checking remote job status with '%s' ...", cmd)
        exit_code, stdout, stderr = self.transport.execute_command(cmd)
        if exit_code == 0:
            jobstatus = self._parse_stat_output(stdout)
            job.update(jobstatus)
            job.state = jobstatus.get('state', Run.State.UNKNOWN)
            if job.state == Run.State.UNKNOWN:
                log.warning("Unknown batch job status,"
                            " setting GC3Pie job state to `UNKNOWN`")
            if 'exit_status' in jobstatus:
                job.returncode = Run.shellexit_to_returncode(
                    int(jobstatus['exit_status']))
            # SLURM's `squeue` command exits with code 0 if the
            # job ID exists in the database (i.e., a job with that
            # ID has been run) but prints no output.  In this
            # case, we need to continue and examine the accounting
            # command output to get the termination status etc.
            if job.state != Run.State.TERMINATING:
                return job.state
        else:
            log.error("Failed while running the `qstat`/`bjobs` command."
                      " exit code: %d, stderr: '%s'", exit_code, stderr)

        # In some batch systems, jobs disappear from qstat
        # output as soon as they are finished.  In these cases,
        # we have to check some *accounting* command to check
        # the exit status.
        cmd = self._acct_command(job)
        if cmd:
            log.debug("Retrieving accounting information using command"
                      " '%s' ...", cmd)
            try:
                return self.__do_acct(job, cmd, self._parse_acct_output)
            except gc3libs.exceptions.AuxiliaryCommandError:
                # This is used to distinguish between a standard
                # Torque installation and a PBSPro where `tracejob`
                # does not work but if `job_history_enable=True`,
                # then we can actually access information about
                # finished jobs with `qstat -x -f`.
                try:
                    cmd = self._secondary_acct_command(job)
                    if cmd:
                        log.debug("The primary job accounting command"
                                  " returned no information; trying"
                                  " with '%s' instead...", cmd)
                        return self.__do_acct(
                            job, cmd, self._parse_secondary_acct_output)
                except (gc3libs.exceptions.AuxiliaryCommandError,
                        NotImplementedError):
                    # ignore error -- there is nothing we can do
                    pass

        # No *stat command and no *acct command returned correctly.
        try:
            if (time.time() - job.stat_failed_at) > self.accounting_delay:
                # accounting info should be there; if it's not,
                # then the job is definitely lost
                log.critical("Failed executing remote command: '%s';"
                             " exit status %d", cmd, exit_code)
                log.debug("  remote command returned stdout: '%s'", stdout)
                log.debug("  remote command returned stderr: '%s'", stderr)
                raise gc3libs.exceptions.LRMSError(
                    "Failed executing remote command: '%s';"
                    " exit status %d" % (cmd, exit_code))
            else:
                # do nothing, let's try later...
                return job.state
        except AttributeError:
            # this is the first time `qstat` fails, record a
            # timestamp and retry later
            job.stat_failed_at = time.time()
    except Exception as ex:
        log.error("Error in querying Batch resource '%s': %s: %s",
                  self.name, ex.__class__.__name__, str(ex))
        raise
    # If we reach this point it means that we don't actually know
    # the current state of the job.
    job.state = Run.State.UNKNOWN
    return job.state
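# The `stat_failed_at` handling above implements a grace period: the
# first time the status command fails, a timestamp is recorded, and a
# hard error is raised only once `accounting_delay` seconds have passed
# with no usable answer.  A minimal standalone sketch of that pattern
# (class and method names are illustrative, not part of gc3libs):
import time

class GracePeriod(object):
    def __init__(self, delay):
        self.delay = delay
        self.first_failure = None

    def should_give_up(self):
        """Return True only after failures persisted for `delay` seconds."""
        now = time.time()
        if self.first_failure is None:
            # first failure: start the clock, keep retrying
            self.first_failure = now
            return False
        return (now - self.first_failure) > self.delay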
def get_resource_status(self):
    """
    Get dynamic information out of the LSF subsystem.

    return self

    dynamic information required (at least those):
    total_queued
    free_slots
    user_running
    user_queued
    """
    try:
        self.transport.connect()

        # Run lshosts to get the list of available nodes and their
        # related number of cores; used to compute `self.total_slots`.
        # lshosts output format:
        # ($nodeid,$OStype,$model,$cpuf,$ncpus,$maxmem,$maxswp)
        _command = ('%s -w' % self._lshosts)
        exit_code, stdout, stderr = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "LSF backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, stdout, stderr))

        if stdout:
            lhosts_output = stdout.strip().split('\n')
            # remove header
            lhosts_output.pop(0)
        else:
            lhosts_output = []

        # compute self.total_slots
        self.max_cores = 0
        for line in lhosts_output:
            # HOST_NAME type model cpuf ncpus maxmem maxswp server RESOURCES  # noqa
            (hostname, h_type, h_model, h_cpuf, h_ncpus) = \
                line.strip().split()[0:5]
            try:
                self.max_cores += int(h_ncpus)
            except ValueError:
                # h_ncpus == '-'
                pass

        # Run `bjobs -u all -w` to get information about the jobs
        # for a given user; used to compute `running_jobs`,
        # `self.queued`, `self.user_run` and `self.user_queued`.
        #
        # bjobs output format:
        # JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME  # noqa
        _command = ('%s -u all -w' % self._bjobs)
        log.debug("Running `%s`... ", _command)
        exit_code, stdout, stderr = self.transport.execute_command(_command)
        if exit_code != 0:
            # cannot continue
            raise gc3libs.exceptions.LRMSError(
                "LSF backend failed executing '%s':"
                " exit code: %d; stdout: '%s'; stderr: '%s'."
                % (_command, exit_code, stdout, stderr))

        if stdout:
            bjobs_output = stdout.strip().split('\n')
            # remove header
            bjobs_output.pop(0)
        else:
            bjobs_output = []

        # user running/queued
        used_cores = 0
        self.queued = 0
        self.user_queued = 0
        self.user_run = 0

        queued_statuses = ['PEND', 'PSUSP', 'USUSP',
                           'SSUSP', 'WAIT', 'ZOMBI']
        for line in bjobs_output:
            # JOBID USER STAT QUEUE FROM_HOST EXEC_HOST JOB_NAME SUBMIT_TIME  # noqa
            (jobid, user, stat, queue, from_h, exec_h) = \
                line.strip().split()[0:6]
            # to compute the number of cores allocated per each job
            # we use the output format of the EXEC_HOST field,
            # e.g.: 1*cpt178:2*cpt151
            for node in exec_h.split(':'):
                try:
                    # multi core
                    (cores, n_name) = node.split('*')
                except ValueError:
                    # single core
                    cores = 1
                try:
                    cores = int(cores)
                except ValueError:
                    # core == '-'
                    pass
                used_cores += cores
            if stat in queued_statuses:
                self.queued += 1
            if user == self._username:
                if stat in queued_statuses:
                    self.user_queued += 1
                else:
                    self.user_run += 1
        self.free_slots = self.max_cores - used_cores
        return self
    except Exception as ex:
        # self.transport.close()
        log.error("Error querying remote LRMS, see debug log for details.")
        log.debug("Error querying LRMS: %s: %s",
                  ex.__class__.__name__, str(ex))
        raise
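# Standalone illustration of the EXEC_HOST parsing above: each
# `:`-separated entry is either `N*host` (N cores on that host) or just
# `host` (a single core).  Sample value taken from the comment above.
def count_exec_host_cores(exec_host):
    used = 0
    for node in exec_host.split(':'):
        try:
            cores, _ = node.split('*')   # multi-core entry, e.g. '2*cpt151'
        except ValueError:
            cores = 1                    # single-core entry, e.g. 'cpt178'
        try:
            used += int(cores)
        except (ValueError, TypeError):
            pass                         # field was '-' or malformed
    return used

print(count_exec_host_cores('1*cpt178:2*cpt151'))  # -> 3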
def submit_job(self, app):
    """
    Run an `Application` instance as a local process.

    :see: `LRMS.submit_job`
    """
    # Update current resource usage to check how many jobs are
    # running in there.  Please note that, for consistency with
    # other backends, this updated information is not kept!
    try:
        self.transport.connect()
    except gc3libs.exceptions.TransportError as ex:
        raise gc3libs.exceptions.LRMSSubmitError(
            "Unable to access shellcmd resource at %s: %s"
            % (self.frontend, str(ex)))

    job_infos = self._get_persisted_resource_state()
    free_slots = self.max_cores - self._compute_used_cores(job_infos)
    available_memory = self.total_memory - \
        self._compute_used_memory(job_infos)
    if self.free_slots == 0 or free_slots == 0:
        # XXX: we shouldn't check for self.free_slots!
        raise gc3libs.exceptions.LRMSSubmitError(
            "Resource %s already running maximum allowed number of jobs"
            " (%s). Increase 'max_cores' to raise."
            % (self.name, self.max_cores))

    if app.requested_memory and \
            (available_memory < app.requested_memory
             or self.available_memory < app.requested_memory):
        raise gc3libs.exceptions.LRMSSubmitError(
            "Resource %s does not have enough available memory:"
            " %s requested, but only %s available."
            % (self.name,
               app.requested_memory.to_str('%g%s', unit=Memory.MB),
               available_memory.to_str('%g%s', unit=Memory.MB)))

    log.debug("Executing local command '%s' ...",
              str.join(" ", app.arguments))

    # Check if spooldir is a valid directory
    if not self.spooldir:
        ex, stdout, stderr = self.transport.execute_command(
            'cd "$TMPDIR" && pwd')
        if ex != 0 or stdout.strip() == '' or not stdout[0] == '/':
            log.debug("Unable to recover a valid absolute path for"
                      " spooldir.  Using `/var/tmp`.")
            self.spooldir = '/var/tmp'
        else:
            self.spooldir = stdout.strip()

    # determine execution directory
    exit_code, stdout, stderr = self.transport.execute_command(
        "mktemp -d %s " % posixpath.join(self.spooldir, 'gc3libs.XXXXXX'))
    if exit_code != 0:
        log.error("Error creating temporary directory on host %s: %s",
                  self.frontend, stderr)
        log.debug('Freeing resources used by failed application')
        self.free(app)
        raise gc3libs.exceptions.LRMSSubmitError(
            "Error creating temporary directory on host %s: %s",
            self.frontend, stderr)
    execdir = stdout.strip()
    app.execution.lrms_execdir = execdir

    # Copy input files to remote dir
    for local_path, remote_path in app.inputs.items():
        if local_path.scheme != 'file':
            continue
        remote_path = posixpath.join(execdir, remote_path)
        remote_parent = os.path.dirname(remote_path)
        try:
            if (remote_parent not in ['', '.']
                    and not self.transport.exists(remote_parent)):
                log.debug("Making remote directory '%s'", remote_parent)
                self.transport.makedirs(remote_parent)
            log.debug("Transferring file '%s' to '%s'",
                      local_path.path, remote_path)
            self.transport.put(local_path.path, remote_path)
            # preserve execute permission on input files
            if os.access(local_path.path, os.X_OK):
                self.transport.chmod(remote_path, 0o755)
        except:
            log.critical(
                "Copying input file '%s' to remote host '%s' failed",
                local_path.path, self.frontend)
            log.debug('Cleaning up failed application')
            self.free(app)
            raise

    # try to ensure that a local executable really has
    # execute permissions, but ignore failures (might be a
    # link to a file we do not own)
    if app.arguments[0].startswith('./'):
        try:
            self.transport.chmod(
                posixpath.join(execdir, app.arguments[0][2:]), 0o755)
            # os.chmod(app.arguments[0], 0755)
        except:
            log.error("Failed setting execution flag on remote file '%s'",
                      posixpath.join(execdir, app.arguments[0]))

    # set up redirection
    redirection_arguments = ''
    if app.stdin is not None:
        # stdin = open(app.stdin, 'r')
        redirection_arguments += " <%s" % app.stdin

    if app.stdout is not None:
        redirection_arguments += " >%s" % app.stdout
        stdout_dir = os.path.dirname(app.stdout)
        if stdout_dir:
            self.transport.makedirs(posixpath.join(execdir, stdout_dir))

    if app.join:
        redirection_arguments += " 2>&1"
    else:
        if app.stderr is not None:
            redirection_arguments += " 2>%s" % app.stderr
            stderr_dir = os.path.dirname(app.stderr)
            if stderr_dir:
                self.transport.makedirs(posixpath.join(execdir, stderr_dir))

    # set up environment
    env_commands = []
    for k, v in app.environment.iteritems():
        env_commands.append(
            "export {k}={v};".format(
                k=sh_quote_safe(k), v=sh_quote_unsafe(v)))

    # Create the directory in which pid, output and wrapper script
    # files will be stored
    wrapper_dir = posixpath.join(execdir, ShellcmdLrms.WRAPPER_DIR)
    if not self.transport.isdir(wrapper_dir):
        try:
            self.transport.makedirs(wrapper_dir)
        except:
            log.error("Failed creating remote folder '%s'" % wrapper_dir)
            self.free(app)
            raise

    # Set up scripts to download/upload the swift/http files
    downloadfiles = []
    uploadfiles = []
    wrapper_downloader_filename = posixpath.join(
        wrapper_dir, ShellcmdLrms.WRAPPER_DOWNLOADER)
    for url, outfile in app.inputs.items():
        if url.scheme in ['swift', 'swifts', 'swt', 'swts',
                          'http', 'https']:
            downloadfiles.append(
                "python '%s' download '%s' '%s'"
                % (wrapper_downloader_filename, str(url), outfile))
    for infile, url in app.outputs.items():
        if url.scheme in ['swift', 'swt', 'swifts', 'swts']:
            uploadfiles.append(
                "python '%s' upload '%s' '%s'"
                % (wrapper_downloader_filename, str(url), infile))
    if downloadfiles or uploadfiles:
        # also copy the downloader
        with open(resource_filename(
                Requirement.parse("gc3pie"),
                "gc3libs/etc/downloader.py")) as fd:
            wrapper_downloader = self.transport.open(
                wrapper_downloader_filename, 'w')
            wrapper_downloader.write(fd.read())
            wrapper_downloader.close()

    # Build
    pidfilename = posixpath.join(wrapper_dir, ShellcmdLrms.WRAPPER_PID)
    wrapper_output_filename = posixpath.join(
        wrapper_dir, ShellcmdLrms.WRAPPER_OUTPUT_FILENAME)
    wrapper_script_fname = posixpath.join(
        wrapper_dir, ShellcmdLrms.WRAPPER_SCRIPT)

    try:
        # Create the wrapper script
        wrapper_script = self.transport.open(wrapper_script_fname, 'w')
        commands = (r"""#!/bin/sh
echo $$ >{pidfilename}
cd {execdir}
exec {redirections}
{environment}

{downloadfiles}

'{time_cmd}' -o '{wrapper_out}' -f '{fmt}' {command}
rc=$?

{uploadfiles}
rc2=$?
if [ $rc -ne 0 ]; then exit $rc; else exit $rc2; fi
""".format(
            pidfilename=pidfilename,
            execdir=execdir,
            time_cmd=self.time_cmd,
            wrapper_out=wrapper_output_filename,
            fmt=ShellcmdLrms.TIMEFMT,
            redirections=redirection_arguments,
            environment=str.join('\n', env_commands),
            downloadfiles=str.join('\n', downloadfiles),
            uploadfiles=str.join('\n', uploadfiles),
            command=(str.join(' ', (sh_quote_unsafe(arg)
                                    for arg in app.arguments))),
        ))
        wrapper_script.write(commands)
        wrapper_script.close()
        # log.info("Wrapper script: <<<%s>>>", commands)
    except gc3libs.exceptions.TransportError:
        log.error("Freeing resources used by failed application")
        self.free(app)
        raise

    try:
        self.transport.chmod(wrapper_script_fname, 0o755)
        # Execute the script in background
        self.transport.execute_command(wrapper_script_fname, detach=True)
    except gc3libs.exceptions.TransportError:
        log.error("Freeing resources used by failed application")
        self.free(app)
        raise

    # Just after the script has been started, the pidfile should be
    # filled in with the correct pid.
    #
    # However, the script might not have been able to write the
    # pidfile yet, so we have to wait a little bit for it...
    pidfile = None
    for retry in gc3libs.utils.ExponentialBackoff():
        try:
            pidfile = self.transport.open(pidfilename, 'r')
            break
        except gc3libs.exceptions.TransportError as ex:
            if '[Errno 2]' in str(ex):
                # no such file or directory
                time.sleep(retry)
                continue
            else:
                raise
    if pidfile is None:
        # XXX: probably self.free(app) should go here as well
        raise gc3libs.exceptions.LRMSSubmitError(
            "Unable to get PID file of submitted process from"
            " execution directory `%s`: %s" % (execdir, pidfilename))
    pid = pidfile.read().strip()
    try:
        pid = int(pid)
    except ValueError:
        # XXX: probably self.free(app) should go here as well
        pidfile.close()
        raise gc3libs.exceptions.LRMSSubmitError(
            "Invalid pid `%s` in pidfile %s." % (pid, pidfilename))
    pidfile.close()

    # Update application and current resources
    app.execution.lrms_jobid = pid
    # We don't need to update free_slots since its value is
    # checked at runtime.
    if app.requested_memory:
        self.available_memory -= app.requested_memory
    self.job_infos[pid] = {
        'requested_cores': app.requested_cores,
        'requested_memory': app.requested_memory,
        'execution_dir': execdir,
        'terminated': False,
    }
    self._update_job_resource_file(pid, self.job_infos[pid])
    return app
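# A runnable sketch showing what the wrapper-script template above
# produces once filled in.  The template is simplified (download/upload
# hooks omitted) and every path and the time-format string are invented
# for this illustration; real values come from `ShellcmdLrms.WRAPPER_*`
# constants and the resource configuration.
wrapper_template = r"""#!/bin/sh
echo $$ >{pidfilename}
cd {execdir}
exec {redirections}
{environment}
'{time_cmd}' -o '{wrapper_out}' -f '{fmt}' {command}
rc=$?
if [ $rc -ne 0 ]; then exit $rc; fi
"""

print(wrapper_template.format(
    pidfilename='/var/tmp/gc3libs.a1b2c3/.gc3pie/wrapper.pid',
    execdir='/var/tmp/gc3libs.a1b2c3',
    redirections=' >stdout.txt 2>&1',
    environment="export LC_ALL='C';",
    time_cmd='/usr/bin/time',
    wrapper_out='/var/tmp/gc3libs.a1b2c3/.gc3pie/resources.log',
    fmt='%e %M',
    command="'./myapp' '--verbose'",
))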
def submit_job(self, app):
    """
    Create a remote directory to store the job's sandbox,
    and copy the sandbox into it.
    """
    job = app.execution

    # Create the remote directory.
    self.transport.connect()
    cmd = ("mkdir -p {0};"
           " mktemp -d {0}/batch_job.XXXXXXXXXX".format(self.spooldir))
    exit_code, stdout, stderr = self.transport.execute_command(cmd)
    if exit_code != 0:
        raise gc3libs.exceptions.SpoolDirError(
            "Cannot create temporary job working directory"
            " on resource '%s'; command '%s' exited"
            " with code: %d and stderr: '%s'."
            % (self.name, cmd, exit_code, stderr))
    ssh_remote_folder = stdout.split('\n')[0]

    # Copy the input file(s) to the remote directory.
    for local_path, remote_path in list(app.inputs.items()):
        remote_path = os.path.join(ssh_remote_folder, remote_path)
        remote_parent = os.path.dirname(remote_path)
        try:
            if remote_parent not in ['', '.']:
                log.debug("Making remote directory '%s'", remote_parent)
                self.transport.makedirs(remote_parent)
            log.debug("Transferring file '%s' to '%s'",
                      local_path.path, remote_path)
            self.transport.put(local_path.path, remote_path)
            # preserve execute permission on input files
            if os.access(local_path.path, os.X_OK):
                self.transport.chmod(remote_path, 0o755)
        except:
            log.critical(
                "Copying input file '%s' to remote cluster '%s' failed",
                local_path.path, self.frontend)
            raise

    if app.arguments[0].startswith('./'):
        gc3libs.log.debug("Making remote path '%s' executable.",
                          app.arguments[0])
        self.transport.chmod(
            os.path.join(ssh_remote_folder, app.arguments[0]), 0o755)

    # if STDOUT/STDERR should be saved in a directory, ensure it
    # exists (see Issue 495 for details)
    for dest in (app.stdout, app.stderr):
        if dest:
            destdir = os.path.dirname(dest)
            if destdir:
                self.transport.makedirs(
                    posixpath.join(ssh_remote_folder, destdir))

    try:
        sub_cmd, aux_script = self._submit_command(app)
        if aux_script != '':
            # create temporary script name
            script_filename = ('./script.%s.sh' % uuid.uuid4())
            # save script to a temporary file and submit that one instead
            local_script_file = tempfile.NamedTemporaryFile(mode='wt')
            local_script_file.write('#!/bin/sh\n')
            # Add preamble file
            prologue = self.get_prologue_script(app)
            if prologue:
                local_script_file.write(prologue)
            local_script_file.write(aux_script)
            # Add epilogue files
            epilogue = self.get_epilogue_script(app)
            if epilogue:
                local_script_file.write(epilogue)
            local_script_file.flush()
            # upload script to remote location
            self.transport.put(
                local_script_file.name,
                os.path.join(ssh_remote_folder, script_filename))
            # set execution mode on remote script
            self.transport.chmod(
                os.path.join(ssh_remote_folder, script_filename), 0o755)
            # cleanup
            local_script_file.close()
            if os.path.exists(local_script_file.name):
                os.unlink(local_script_file.name)
        else:
            # we still need a script name even if there is no
            # script to submit
            script_filename = ''

        # Submit it
        exit_code, stdout, stderr = self.transport.execute_command(
            "/bin/sh -c %s" % sh_quote_safe(
                'cd %s && %s %s'
                % (ssh_remote_folder, sub_cmd, script_filename)))
        if exit_code != 0:
            raise gc3libs.exceptions.LRMSError(
                "Failed executing command 'cd %s && %s %s' on resource"
                " '%s'; exit code: %d, stderr: '%s'."
                % (ssh_remote_folder, sub_cmd, script_filename,
                   self.name, exit_code, stderr))

        jobid = self._parse_submit_output(stdout)
        log.debug('Job submitted with jobid: %s', jobid)

        job.execution_target = self.frontend
        job.lrms_jobid = jobid
        job.lrms_jobname = jobid
        try:
            if app.jobname:
                job.lrms_jobname = app.jobname
        except:
            pass

        if 'stdout' in app:
            job.stdout_filename = app.stdout
        else:
            job.stdout_filename = '%s.o%s' % (job.lrms_jobname, jobid)
        if app.join:
            job.stderr_filename = job.stdout_filename
        else:
            if 'stderr' in app:
                job.stderr_filename = app.stderr
            else:
                job.stderr_filename = '%s.e%s' % (job.lrms_jobname, jobid)
        job.history.append('Submitted to %s @ %s, got jobid %s'
                           % (self._batchsys_name, self.name, jobid))
        job.history.append(
            "Submission command output:\n"
            " === stdout ===\n%s"
            " === stderr ===\n%s"
            " === end ===\n"
            % (stdout, stderr), 'pbs', 'qsub')
        job.ssh_remote_folder = ssh_remote_folder
        return job
    except:
        log.critical(
            "Failure submitting job to resource '%s' -"
            " see log file for errors", self.name)
        raise
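# For concreteness, here is a hypothetical minimal pair of the
# `_submit_command` / `_parse_submit_output` hooks that `submit_job`
# calls, modeled on a PBS/Torque-style `qsub`. The command string,
# script body, and regex below are illustrative assumptions only,
# not the actual GC3Pie implementation.
import re

def _submit_command_sketch(app):
    # first element: submission command; second: auxiliary script body
    # (uploaded and submitted as a file when non-empty)
    return ('qsub', '#!/bin/sh\n{0}\n'.format(' '.join(app.arguments)))

def _parse_submit_output_sketch(stdout):
    # `qsub` typically prints an id like `12345.server.example.org`;
    # this sketch keeps only the leading numeric part
    match = re.match(r'(\d+)', stdout.strip())
    if match is None:
        raise ValueError("Cannot parse job id from: %r" % stdout)
    return match.group(1)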
def update_job_state(self, app):
    """
    Query the running status of the local process whose PID is
    stored into `app.execution.lrms_jobid`, and map the POSIX
    process status to GC3Libs `Run.State`.
    """
    self.transport.connect()
    pid = app.execution.lrms_jobid
    exit_code, stdout, stderr = self.transport.execute_command(
        "ps ax | grep -E '^ *%d '" % pid)
    if exit_code == 0:
        log.debug(
            "Process with PID %s found."
            " Checking its running status ...", pid)
        # Process exists; check its status
        status = stdout.split()[2]
        if status[0] == 'T':
            # Job stopped
            app.execution.state = Run.State.STOPPED
        elif status[0] in ['R', 'I', 'U', 'S', 'D', 'W']:
            # Job is running; see the ps(1) manpage on both Linux
            # and BSD for the meaning of these status letters.
            app.execution.state = Run.State.RUNNING
            # if `requested_walltime` is set, enforce it as a
            # running time limit
            if app.requested_walltime is not None:
                exit_code2, stdout2, stderr2 = \
                    self.transport.execute_command(
                        "ps -p %d -o etime=" % pid)
                if exit_code2 != 0:
                    # job terminated already, do cleanup and return
                    self._cleanup_terminating_task(app, pid)
                    return app.execution.state
                cancel = False
                elapsed = _parse_time_duration(stdout2.strip())
                if elapsed > self.max_walltime:
                    log.warning(
                        "Task %s ran for %s, exceeding max_walltime %s"
                        " of resource %s: cancelling it.",
                        app, elapsed.to_timedelta(), self.max_walltime,
                        self.name)
                    cancel = True
                if elapsed > app.requested_walltime:
                    log.warning(
                        "Task %s ran for %s, exceeding own"
                        " `requested_walltime` %s: cancelling it.",
                        app, elapsed.to_timedelta(),
                        app.requested_walltime)
                    cancel = True
                if cancel:
                    self.cancel_job(app)
                    # set signal to SIGTERM in termination status
                    self._cleanup_terminating_task(
                        app, pid, termstatus=(15, -1))
                    return app.execution.state
    else:
        log.debug(
            "Process with PID %d not found,"
            " assuming task %s has finished running.", pid, app)
        self._cleanup_terminating_task(app, pid)
        self._get_persisted_resource_state()
    return app.execution.state
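# `update_job_state` above calls the helper `_parse_time_duration` on the
# output of `ps -o etime=`, which formats elapsed time as `[[dd-]hh:]mm:ss`.
# A plausible sketch of such a parser follows, returning a
# `datetime.timedelta` for simplicity (the comparison code above suggests
# the real helper returns a duration type with a `.to_timedelta()` method):
from datetime import timedelta

def _parse_etime_sketch(text):
    """Parse `ps -o etime=` output like '1-02:03:04', '02:03:04' or '03:04'."""
    days = 0
    if '-' in text:
        days_part, text = text.split('-', 1)
        days = int(days_part)
    parts = [int(p) for p in text.split(':')]
    while len(parts) < 3:
        # pad missing hours (and minutes) on the left
        parts.insert(0, 0)
    hours, minutes, seconds = parts
    return timedelta(days=days, hours=hours,
                     minutes=minutes, seconds=seconds)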
def update_job_state(self, app):
    job = app.execution
    try:
        job.lrms_jobid
    except AttributeError as ex:
        # `job` has no `lrms_jobid`: object is invalid
        raise gc3libs.exceptions.InvalidArgument(
            "Job object is invalid: {ex}".format(ex=ex))

    self.transport.connect()
    cmd = self._stat_command(job)
    try:
        state, termstatus = self.__run_command_and_parse_output(
            cmd, self._parse_stat_output, 'status')
        if state != Run.State.TERMINATING:
            # no need to go further and parse acct info; also,
            # exit status is not relevant in this case
            job.state = state
            log.debug("Task %s state set to %s", app, state)
            return state
    except gc3libs.exceptions.AuxiliaryCommandError:
        # use the special state value ``None`` to signal that
        # the "status" command failed; we might need this
        # after the "acct" command has run
        state, termstatus = None, None
    assert state is None or state == Run.State.TERMINATING
    log.debug(
        "Job status command gave state `%s`"
        " and termination status `%s` for task %s",
        state, termstatus, app)

    # In some batch systems, jobs disappear from qstat
    # output as soon as they are finished. In these cases,
    # we have to run some *accounting* command to get
    # the exit status.
    acctinfo = {}
    for cmd_fn, parse_fn in [
            # this is the regular sacct/qacct/bjobs command
            (self._acct_command, self._parse_acct_output),
            # This is used to distinguish between a standard
            # Torque installation and a PBSPro one, where
            # `tracejob` does not work but, if
            # `job_history_enable=True`, information about
            # finished jobs can be accessed with `qstat -x -f`.
            (self._secondary_acct_command,
             self._parse_secondary_acct_output),
    ]:
        cmd = cmd_fn(job)
        # `._secondary_acct_command` returns ``None`` if no
        # "secondary" accounting method is defined -- skip to
        # the next iteration, if any
        if cmd is None:
            continue
        try:
            acctinfo = self.__run_command_and_parse_output(
                cmd, parse_fn, 'accounting')
            # use info from the first acct command that succeeds
            if acctinfo:
                log.debug("Gathered accounting info %r for task %s",
                          acctinfo, app)
                break
        except gc3libs.exceptions.AuxiliaryCommandError:
            log.debug("Accounting command `%s` failed.", cmd)
            # try next one
            pass
        except gc3libs.exceptions.UnexpectedJobState as ex:
            log.debug(
                "Unexpected output from accounting command `%s`: %s.",
                cmd, ex)
            # try next one
            pass

    # if no termination status is known yet and the acct
    # command provided one, use it
    if 'termstatus' in acctinfo:
        # if we have a termination status, then the job has terminated
        state = Run.State.TERMINATING
        if termstatus is None:
            termstatus = acctinfo['termstatus']
        else:
            # this should not happen! but one never knows how new
            # versions of the software may break old habits and
            # parsing rules, so better fail loudly here so we get
            # a bug report and a chance to fix...
            assert termstatus == acctinfo['termstatus'], (
                "Status and accounting commands disagree"
                " on job termination status: {0} vs {1}".format(
                    termstatus, acctinfo['termstatus']))

    if termstatus is None:
        # Neither the *stat command nor the *acct command
        # returned correctly.
        try:
            job.stat_failed_at
        except AttributeError:
            # this is the first time `qstat` fails, record a
            # timestamp and retry later
            job.stat_failed_at = time.time()
            return job.state
        if (time.time() - job.stat_failed_at) <= self.accounting_delay:
            # do nothing, let's try later...
            return job.state
        else:
            # accounting info should be there; if it's not,
            # then the job is definitely lost
            job.state = Run.State.UNKNOWN
            raise gc3libs.exceptions.LRMSError(
                "Could not retrieve status information"
                " for task {app}".format(app=app))

    # if we got to this point the job is in TERMINATING state
    # and we know at least the termination status
    assert state == Run.State.TERMINATING
    job.state = state
    job.returncode = termstatus
    job.update(acctinfo)
    return state
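# The status/accounting two-step above depends on backend-specific parsers
# such as `_parse_stat_output`. As an illustration only (not GC3Pie's
# actual code), a Torque-flavored parser might map `qstat` single-letter
# states to `Run.State` like this, returning `(state, termstatus)` with the
# termination status still unknown at this stage; it reuses the module's
# existing `Run` import, and the assumed `qstat` line layout is
# "jobid  name  user  time  S  queue":
def _parse_stat_output_sketch(stdout):
    letter = stdout.split()[4]
    state_map = {
        'Q': Run.State.SUBMITTED,    # queued
        'R': Run.State.RUNNING,      # running
        'H': Run.State.STOPPED,      # held
        'C': Run.State.TERMINATING,  # completed
    }
    # unknown letters are reported as UNKNOWN rather than guessed at
    return state_map.get(letter, Run.State.UNKNOWN), None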