    def test_restart_finishes_job(self):
        test = "restart_finishes"
        with self._setup_app_provider(test) as app_provider:
            job_id = '12345'

            with app_provider.new_app() as app:
                manager = app.only_manager
                job_info = {
                    'job_id': job_id,
                    'command_line': 'sleep 1000',
                    'setup': True,
                }
                submit_job(manager, job_info)
                external_id = None
                for i in range(10):
                    time.sleep(.05)
                    # TODO: unfortunate breaking of abstractions here.
                    external_id = manager._proxied_manager._external_id(job_id)
                    if external_id:
                        break
                if external_id is None:
                    assert False, "Test failed, couldn't get external id for job id."

            drmaa_session = DrmaaSessionFactory().get()
            drmaa_session.kill(external_id)
            drmaa_session.close()

            consumer = self._status_update_consumer(test)
            consumer.start()

            with app_provider.new_app() as app:
                consumer.wait_for_messages()

            consumer.join()

            assert len(consumer.messages) == 1, len(consumer.messages)
            assert consumer.messages[0]["status"] == "complete"
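# The polling loop above (ten fixed 0.05 s sleeps against a private attribute)
# is the flakiest part of this test. Below is a minimal sketch of a
# deadline-based helper that could replace it; `get_external_id` is a
# hypothetical zero-argument callable (an assumption, not part of the test)
# that would wrap manager._proxied_manager._external_id(job_id).
import time


def wait_for_external_id(get_external_id, timeout=5.0, poll_interval=0.05):
    """Poll until the DRM assigns an external id or the timeout expires."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        external_id = get_external_id()
        if external_id:
            return external_id
        time.sleep(poll_interval)
    raise AssertionError("Timed out waiting for an external id for the job.")


# Usage sketch inside the test:
#     external_id = wait_for_external_id(
#         lambda: manager._proxied_manager._external_id(job_id))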
def main():
    arg_parser = ArgumentParser(description=DESCRIPTION)
    arg_parser.add_argument("--external_id", required=True)
    args = arg_parser.parse_args()
    external_id = load(args.external_id)
    session = DrmaaSessionFactory().get()
    session.kill(external_id)
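# Hedged usage sketch: the same kill path called programmatically rather than
# through the CLI above. DrmaaSessionFactory is the Pulsar helper used
# elsewhere in this section (pulsar.managers.util.drmaa); the wrapper function
# name is illustrative, not part of the original script.
from pulsar.managers.util.drmaa import DrmaaSessionFactory


def kill_drmaa_job(external_id):
    """Terminate a single DRM job identified by its external id."""
    session = DrmaaSessionFactory().get()
    try:
        session.kill(external_id)
    finally:
        # close() mirrors the explicit cleanup done in the restart test above
        session.close()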
class DRMAAJobRunner( AsynchronousJobRunner ): """ Job runner backed by a finite pool of worker threads. FIFO scheduling """ runner_name = "DRMAARunner" restrict_job_name_length = 15 def __init__( self, app, nworkers, **kwargs ): """Start the job runner""" global drmaa runner_param_specs = { 'drmaa_library_path': dict( map=str, default=os.environ.get( 'DRMAA_LIBRARY_PATH', None ) ) } for retry_exception in RETRY_EXCEPTIONS_LOWER: runner_param_specs[retry_exception + '_state'] = dict( map=str, valid=lambda x: x in ( model.Job.states.OK, model.Job.states.ERROR ), default=model.Job.states.OK ) runner_param_specs[retry_exception + '_retries'] = dict( map=int, valid=lambda x: int >= 0, default=0 ) if 'runner_param_specs' not in kwargs: kwargs[ 'runner_param_specs' ] = dict() kwargs[ 'runner_param_specs' ].update( runner_param_specs ) super( DRMAAJobRunner, self ).__init__( app, nworkers, **kwargs ) # This allows multiple drmaa runners (although only one per handler) in the same job config file if 'drmaa_library_path' in kwargs: log.info( 'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path ) os.environ['DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path # Import is delayed until runner initialization to allow for the # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH try: drmaa = __import__( "drmaa" ) except (ImportError, RuntimeError) as exc: raise exc.__class__('The Python drmaa package is required to use this ' 'feature, please install it or correct the ' 'following error:\n%s: %s' % (exc.__class__.__name__, str(exc))) from pulsar.managers.util.drmaa import DrmaaSessionFactory # Subclasses may need access to state constants self.drmaa_job_states = drmaa.JobState # Descriptive state strings pulled from the drmaa lib itself self.drmaa_job_state_strings = { drmaa.JobState.UNDETERMINED: 'process status cannot be determined', drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', drmaa.JobState.RUNNING: 'job is running', drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', drmaa.JobState.USER_SUSPENDED: 'job is user suspended', drmaa.JobState.DONE: 'job finished normally', drmaa.JobState.FAILED: 'job finished, but failed', } # Ensure a DRMAA session exists and is initialized self.ds = DrmaaSessionFactory().get() self.userid = None self._init_monitor_thread() self._init_worker_threads() def url_to_destination(self, url): """Convert a legacy URL to a job destination""" if not url: return native_spec = url.split('/')[2] if native_spec: params = dict( nativeSpecification=native_spec ) log.debug( "Converted URL '%s' to destination runner=drmaa, params=%s" % ( url, params ) ) return JobDestination( runner='drmaa', params=params ) else: log.debug( "Converted URL '%s' to destination runner=drmaa" % url ) return JobDestination( runner='drmaa' ) def get_native_spec( self, url ): """Get any native DRM arguments specified by the site configuration""" try: return url.split('/')[2] or None except: return None def queue_job( self, job_wrapper ): """Create job script and submit it to the DRM""" # prepare the job # external_runJob_script can be None, in which case it's not used. 
external_runjob_script = job_wrapper.get_destination_configuration("drmaa_external_runjob_script", None) include_metadata = asbool( job_wrapper.job_destination.params.get( "embed_metadata_in_job", True) ) if not self.prepare_job( job_wrapper, include_metadata=include_metadata): return # get configured job destination job_destination = job_wrapper.job_destination # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers. galaxy_id_tag = job_wrapper.get_id_tag() job_name = self._job_name(job_wrapper) ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name ) # set up the drmaa job template jt = dict( remoteCommand=ajs.job_file, jobName=ajs.job_name, workingDirectory=job_wrapper.working_directory, outputPath=":%s" % ajs.output_file, errorPath=":%s" % ajs.error_file ) # Avoid a jt.exitCodePath for now - it's only used when finishing. native_spec = job_destination.params.get('nativeSpecification', None) if native_spec is not None: jt['nativeSpecification'] = native_spec # fill in the DRM's job run template script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file) try: self.write_executable_script( ajs.job_file, script ) except: job_wrapper.fail( "failure preparing job script", exception=True ) log.exception( "(%s) failure writing job script" % galaxy_id_tag ) return # job was deleted while we were preparing it if job_wrapper.get_state() == model.Job.states.DELETED: log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag ) if job_wrapper.cleanup_job in ( "always", "onsuccess" ): job_wrapper.cleanup() return log.debug( "(%s) submitting file %s", galaxy_id_tag, ajs.job_file ) if native_spec: log.debug( "(%s) native specification is: %s", galaxy_id_tag, native_spec ) # runJob will raise if there's a submit problem if external_runjob_script is None: # TODO: create a queue for retrying submission indefinitely # TODO: configurable max tries and sleep trynum = 0 external_job_id = None fail_msg = None while external_job_id is None and trynum < 5: try: external_job_id = self.ds.run_job(**jt) break except ( drmaa.InternalException, drmaa.DeniedByDrmException ) as e: trynum += 1 log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e ) fail_msg = "Unable to run this job due to a cluster error, please retry it later" time.sleep( 5 ) except: log.exception( '(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag ) trynum = 5 else: log.error( "(%s) All attempts to submit job failed" % galaxy_id_tag ) if not fail_msg: fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE job_wrapper.fail( fail_msg ) return else: job_wrapper.change_ownership_for_run() # if user credentials are not available, use galaxy credentials (if permitted) allow_guests = asbool(job_wrapper.job_destination.params.get( "allow_guests", False) ) pwent = job_wrapper.user_system_pwent if pwent is None: if not allow_guests: fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." 
% job_wrapper.user job_wrapper.fail( fail_msg ) return pwent = job_wrapper.galaxy_system_pwent log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, pwent[0], pwent[2] ) ) filename = self.store_jobtemplate(job_wrapper, jt) self.userid = pwent[2] external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2]).strip() log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) ) # store runner information for tracking if Galaxy restarts job_wrapper.set_job_destination( job_destination, external_job_id ) # Store DRM related state information for job ajs.job_id = external_job_id ajs.old_state = 'new' ajs.job_destination = job_destination # Add to our 'queue' of jobs to monitor self.monitor_queue.put( ajs ) def _complete_terminal_job( self, ajs, drmaa_state, **kwargs ): """ Handle a job upon its termination in the DRM. This method is meant to be overridden by subclasses to improve post-mortem and reporting of failures. Returns True if job was not actually terminal, None otherwise. """ if drmaa_state == drmaa.JobState.FAILED: if ajs.job_wrapper.get_state() != model.Job.states.DELETED: ajs.stop_job = False ajs.fail_message = "The cluster DRM system terminated this job" self.work_queue.put( ( self.fail_job, ajs ) ) elif drmaa_state == drmaa.JobState.DONE: # External metadata processing for external runjobs external_metadata = not asbool( ajs.job_wrapper.job_destination.params.get( "embed_metadata_in_job", True) ) if external_metadata: self._handle_metadata_externally( ajs.job_wrapper, resolve_requirements=True ) if ajs.job_wrapper.get_state() != model.Job.states.DELETED: self.work_queue.put( ( self.finish_job, ajs ) ) def check_watched_items( self ): """ Called by the monitor thread to look at each watched job and deal with state changes. """ new_watched = [] for ajs in self.watched: external_job_id = ajs.job_id galaxy_id_tag = ajs.job_wrapper.get_id_tag() old_state = ajs.old_state try: assert external_job_id not in ( None, 'None' ), '(%s/%s) Invalid job id' % ( galaxy_id_tag, external_job_id ) state = self.ds.job_status( external_job_id ) # Reset exception retries for retry_exception in RETRY_EXCEPTIONS_LOWER: setattr( ajs, retry_exception + '_retries', 0) except ( drmaa.InternalException, drmaa.InvalidJobException ) as e: ecn = type(e).__name__ retry_param = ecn.lower() + '_retries' state_param = ecn.lower() + '_state' retries = getattr( ajs, retry_param, 0 ) log.warning("(%s/%s) unable to check job status because of %s exception for %d consecutive tries: %s", galaxy_id_tag, external_job_id, ecn, retries + 1, e) if self.runner_params[ retry_param ] > 0: if retries < self.runner_params[ retry_param ]: # will retry check on next iteration setattr( ajs, retry_param, retries + 1 ) new_watched.append( ajs ) continue if self.runner_params[ state_param ] == model.Job.states.OK: log.warning( "(%s/%s) job will now be finished OK", galaxy_id_tag, external_job_id ) self.work_queue.put( ( self.finish_job, ajs ) ) elif self.runner_params[ state_param ] == model.Job.states.ERROR: log.warning( "(%s/%s) job will now be errored", galaxy_id_tag, external_job_id ) self.work_queue.put( ( self.fail_job, ajs ) ) else: raise Exception( "%s is set to an invalid value (%s), this should not be possible. 
See galaxy.jobs.drmaa.__init__()", state_param, self.runner_params[ state_param ] ) continue except drmaa.DrmCommunicationException as e: log.warning( "(%s/%s) unable to communicate with DRM: %s", galaxy_id_tag, external_job_id, e ) new_watched.append( ajs ) continue except Exception as e: # so we don't kill the monitor thread log.exception( "(%s/%s) unable to check job status: %s" % ( galaxy_id_tag, external_job_id, e ) ) log.warning( "(%s/%s) job will now be errored" % ( galaxy_id_tag, external_job_id ) ) ajs.fail_message = "Cluster could not complete job" self.work_queue.put( ( self.fail_job, ajs ) ) continue if state != old_state: log.debug( "(%s/%s) state change: %s" % ( galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state] ) ) if state == drmaa.JobState.RUNNING and not ajs.running: ajs.running = True ajs.job_wrapper.change_state( model.Job.states.RUNNING ) if state in ( drmaa.JobState.FAILED, drmaa.JobState.DONE ): if self._complete_terminal_job( ajs, drmaa_state=state ) is not None: # job was not actually terminal state = ajs.old_state else: continue if ajs.check_limits(): self.work_queue.put( ( self.fail_job, ajs ) ) continue ajs.old_state = state new_watched.append( ajs ) # Replace the watch list with the updated version self.watched = new_watched def stop_job( self, job ): """Attempts to delete a job from the DRM queue""" try: ext_id = job.get_job_runner_external_id() assert ext_id not in ( None, 'None' ), 'External job id is None' kill_script = job.get_destination_configuration(self.app.config, "drmaa_external_killjob_script", None) if kill_script is None: self.ds.kill( ext_id ) else: # FIXME: hardcoded path subprocess.Popen( [ '/usr/bin/sudo', '-E', kill_script, str( ext_id ), str( self.userid ) ], shell=False ) log.debug( "(%s/%s) Removed from DRM queue at user's request" % ( job.get_id(), ext_id ) ) except drmaa.InvalidJobException: log.debug( "(%s/%s) User killed running job, but it was already dead" % ( job.get_id(), ext_id ) ) except Exception as e: log.debug( "(%s/%s) User killed running job, but error encountered removing from DRM queue: %s" % ( job.get_id(), ext_id, e ) ) def recover( self, job, job_wrapper ): """Recovers jobs stuck in the queued/running state when Galaxy started""" job_id = job.get_job_runner_external_id() if job_id is None: self.put( job_wrapper ) return ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper ) ajs.job_id = str( job_id ) ajs.command_line = job.get_command_line() ajs.job_wrapper = job_wrapper ajs.job_destination = job_wrapper.job_destination if job.state == model.Job.states.RUNNING: log.debug( "(%s/%s) is still in running state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) ) ajs.old_state = drmaa.JobState.RUNNING ajs.running = True self.monitor_queue.put( ajs ) elif job.get_state() == model.Job.states.QUEUED: log.debug( "(%s/%s) is still in DRM queued state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) ) ajs.old_state = drmaa.JobState.QUEUED_ACTIVE ajs.running = False self.monitor_queue.put( ajs ) def store_jobtemplate(self, job_wrapper, jt): """ Stores the content of a DRMAA JobTemplate object in a file as a JSON string. Path is hard-coded, but it's no worse than other path in this module. 
        Uses Galaxy's JobID, so file is expected to be unique."""
        filename = "%s/%s.jt_json" % (self.app.config.cluster_files_directory, job_wrapper.get_id_tag())
        with open(filename, 'w+') as fp:
            json.dump(jt, fp)
        log.debug( '(%s) Job script for external submission is: %s' % ( job_wrapper.job_id, filename ) )
        return filename

    def external_runjob(self, external_runjob_script, jobtemplate_filename, username):
        """ Runs an external script that will QSUB a new job.
        The external script will be run with sudo, and will setuid() to the specified user.
        Effectively, will QSUB as a different user (than the one used by Galaxy).
        """
        script_parts = external_runjob_script.split()
        script = script_parts[0]
        command = [ '/usr/bin/sudo', '-E', script]
        for script_argument in script_parts[1:]:
            command.append(script_argument)
        command.extend( [ str(username), jobtemplate_filename ] )
        log.info("Running command %s" % command)
        p = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = p.communicate()
        exitcode = p.returncode
        # os.unlink(jobtemplate_filename)
        if exitcode != 0:
            # There was an error in the child process
            raise RuntimeError("External_runjob failed (exit code %s)\nChild process reported error:\n%s" % (str(exitcode), stderrdata))
        if not stdoutdata.strip():
            raise RuntimeError("External_runjob did not return a job id: %s" % (stdoutdata))
        # The expected output is a single line containing a single numeric value:
        # the DRMAA job-ID. If that is not the case, an error is raised.
        jobId = stdoutdata
        return jobId

    def _job_name(self, job_wrapper):
        external_runjob_script = job_wrapper.get_destination_configuration("drmaa_external_runjob_script", None)
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if external_runjob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join( map( lambda x: x if x in ( string.letters + string.digits + '_' ) else '_', job_name ) )
        if self.restrict_job_name_length:
            job_name = job_name[:self.restrict_job_name_length]
        return job_name
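# _complete_terminal_job() above is documented as the hook subclasses override
# to improve post-mortem reporting. A minimal sketch of such a subclass,
# assuming only names already used in this module (the module-level log,
# drmaa_job_state_strings, ajs.job_wrapper); the class name and log message
# are illustrative.
class VerboseDRMAAJobRunner( DRMAAJobRunner ):
    """Example subclass logging the descriptive DRMAA state string before
    delegating to the default terminal-job handling."""

    def _complete_terminal_job( self, ajs, drmaa_state, **kwargs ):
        log.debug( "(%s) job reached terminal DRM state: %s",
                   ajs.job_wrapper.get_id_tag(),
                   self.drmaa_job_state_strings[ drmaa_state ] )
        return super( VerboseDRMAAJobRunner, self )._complete_terminal_job( ajs, drmaa_state=drmaa_state, **kwargs )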
class DRMAAJobRunner(AsynchronousJobRunner): """ Job runner backed by a finite pool of worker threads. FIFO scheduling """ runner_name = "DRMAARunner" def __init__(self, app, nworkers, **kwargs): """Start the job runner""" global drmaa runner_param_specs = dict( drmaa_library_path=dict(map=str, default=os.environ.get( 'DRMAA_LIBRARY_PATH', None)), invalidjobexception_state=dict( map=str, valid=lambda x: x in (model.Job.states.OK, model.Job.states.ERROR), default=model.Job.states.OK), invalidjobexception_retries=dict(map=int, valid=lambda x: int >= 0, default=0), internalexception_state=dict( map=str, valid=lambda x: x in (model.Job.states.OK, model.Job.states.ERROR), default=model.Job.states.OK), internalexception_retries=dict(map=int, valid=lambda x: int >= 0, default=0)) if 'runner_param_specs' not in kwargs: kwargs['runner_param_specs'] = dict() kwargs['runner_param_specs'].update(runner_param_specs) super(DRMAAJobRunner, self).__init__(app, nworkers, **kwargs) # This allows multiple drmaa runners (although only one per handler) in the same job config file if 'drmaa_library_path' in kwargs: log.info( 'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path) os.environ[ 'DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path # Import is delayed until runner initialization to allow for the # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH try: drmaa = __import__("drmaa") except (ImportError, RuntimeError) as exc: raise exc.__class__( 'The Python drmaa package is required to use this ' 'feature, please install it or correct the ' 'following error:\n%s: %s' % (exc.__class__.__name__, str(exc))) # Subclasses may need access to state constants self.drmaa_job_states = drmaa.JobState # Descriptive state strings pulled from the drmaa lib itself self.drmaa_job_state_strings = { drmaa.JobState.UNDETERMINED: 'process status cannot be determined', drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', drmaa.JobState.RUNNING: 'job is running', drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', drmaa.JobState.USER_SUSPENDED: 'job is user suspended', drmaa.JobState.DONE: 'job finished normally', drmaa.JobState.FAILED: 'job finished, but failed', } # Ensure a DRMAA session exists and is initialized self.ds = DrmaaSessionFactory().get() self.userid = None self._init_monitor_thread() self._init_worker_threads() def url_to_destination(self, url): """Convert a legacy URL to a job destination""" if not url: return native_spec = url.split('/')[2] if native_spec: params = dict(nativeSpecification=native_spec) log.debug( "Converted URL '%s' to destination runner=drmaa, params=%s" % (url, params)) return JobDestination(runner='drmaa', params=params) else: log.debug("Converted URL '%s' to destination runner=drmaa" % url) return JobDestination(runner='drmaa') def get_native_spec(self, url): """Get any native DRM arguments specified by the site configuration""" try: return url.split('/')[2] or None except: return None def queue_job(self, job_wrapper): """Create job script and submit it to the DRM""" # prepare the job # external_runJob_script can be None, in which case it's not used. 
external_runjob_script = job_wrapper.get_destination_configuration( "drmaa_external_runjob_script", None) include_metadata = asbool( job_wrapper.job_destination.params.get("embed_metadata_in_job", True)) if not self.prepare_job(job_wrapper, include_metadata=include_metadata): return # get configured job destination job_destination = job_wrapper.job_destination # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers. galaxy_id_tag = job_wrapper.get_id_tag() # define job attributes job_name = 'g%s' % galaxy_id_tag if job_wrapper.tool.old_id: job_name += '_%s' % job_wrapper.tool.old_id if external_runjob_script is None: job_name += '_%s' % job_wrapper.user job_name = ''.join( map( lambda x: x if x in (string.letters + string.digits + '_') else '_', job_name)) ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name) # set up the drmaa job template jt = dict(remoteCommand=ajs.job_file, jobName=ajs.job_name, workingDirectory=job_wrapper.working_directory, outputPath=":%s" % ajs.output_file, errorPath=":%s" % ajs.error_file) # Avoid a jt.exitCodePath for now - it's only used when finishing. native_spec = job_destination.params.get('nativeSpecification', None) if native_spec is not None: jt['nativeSpecification'] = native_spec # fill in the DRM's job run template script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file) try: self.write_executable_script(ajs.job_file, script) except: job_wrapper.fail("failure preparing job script", exception=True) log.exception("(%s) failure writing job script" % galaxy_id_tag) return # job was deleted while we were preparing it if job_wrapper.get_state() == model.Job.states.DELETED: log.debug("(%s) Job deleted by user before it entered the queue" % galaxy_id_tag) if job_wrapper.cleanup_job in ("always", "onsuccess"): job_wrapper.cleanup() return log.debug("(%s) submitting file %s", galaxy_id_tag, ajs.job_file) if native_spec: log.debug("(%s) native specification is: %s", galaxy_id_tag, native_spec) # runJob will raise if there's a submit problem if external_runjob_script is None: # TODO: create a queue for retrying submission indefinitely # TODO: configurable max tries and sleep trynum = 0 external_job_id = None fail_msg = None while external_job_id is None and trynum < 5: try: external_job_id = self.ds.run_job(**jt) break except (drmaa.InternalException, drmaa.DeniedByDrmException) as e: trynum += 1 log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e) fail_msg = "Unable to run this job due to a cluster error, please retry it later" time.sleep(5) except: log.exception( '(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag) trynum = 5 else: log.error("(%s) All attempts to submit job failed" % galaxy_id_tag) if not fail_msg: fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE job_wrapper.fail(fail_msg) return else: job_wrapper.change_ownership_for_run() # if user credentials are not available, use galaxy credentials (if permitted) allow_guests = asbool( job_wrapper.job_destination.params.get("allow_guests", False)) pwent = job_wrapper.user_system_pwent if pwent is None: if not allow_guests: fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." 
% job_wrapper.user job_wrapper.fail(fail_msg) return pwent = job_wrapper.galaxy_system_pwent log.debug('(%s) submitting with credentials: %s [uid: %s]' % (galaxy_id_tag, pwent[0], pwent[2])) filename = self.store_jobtemplate(job_wrapper, jt) self.userid = pwent[2] external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2]).strip() log.info("(%s) queued as %s" % (galaxy_id_tag, external_job_id)) # store runner information for tracking if Galaxy restarts job_wrapper.set_job_destination(job_destination, external_job_id) # Store DRM related state information for job ajs.job_id = external_job_id ajs.old_state = 'new' ajs.job_destination = job_destination # Add to our 'queue' of jobs to monitor self.monitor_queue.put(ajs) def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): """ Handle a job upon its termination in the DRM. This method is meant to be overridden by subclasses to improve post-mortem and reporting of failures. """ if drmaa_state == drmaa.JobState.FAILED: if ajs.job_wrapper.get_state() != model.Job.states.DELETED: ajs.stop_job = False ajs.fail_message = "The cluster DRM system terminated this job" self.work_queue.put((self.fail_job, ajs)) elif drmaa_state == drmaa.JobState.DONE: # External metadata processing for external runjobs external_metadata = not asbool( ajs.job_wrapper.job_destination.params.get( "embed_metadata_in_job", True)) if external_metadata: self._handle_metadata_externally(ajs.job_wrapper, resolve_requirements=True) super(DRMAAJobRunner, self)._complete_terminal_job(ajs) def check_watched_items(self): """ Called by the monitor thread to look at each watched job and deal with state changes. """ new_watched = [] for ajs in self.watched: external_job_id = ajs.job_id galaxy_id_tag = ajs.job_wrapper.get_id_tag() old_state = ajs.old_state try: assert external_job_id not in ( None, 'None'), '(%s/%s) Invalid job id' % (galaxy_id_tag, external_job_id) state = self.ds.job_status(external_job_id) except (drmaa.InternalException, drmaa.InvalidJobException) as e: if isinstance(e, drmaa.InvalidJobException): ecn = "InvalidJobException".lower() else: ecn = "InternalException".lower() retry_param = ecn.lower() + '_retries' state_param = ecn.lower() + '_state' retries = getattr(ajs, retry_param, 0) if self.runner_params[retry_param] > 0: if retries < self.runner_params[retry_param]: # will retry check on next iteration setattr(ajs, retry_param, retries + 1) continue if self.runner_params[state_param] == model.Job.states.OK: log.info( "(%s/%s) job left DRM queue with following message: %s", galaxy_id_tag, external_job_id, e) self.work_queue.put((self.finish_job, ajs)) elif self.runner_params[state_param] == model.Job.states.ERROR: log.info( "(%s/%s) job check resulted in %s after %s tries: %s", galaxy_id_tag, external_job_id, ecn, retries, e) self.work_queue.put((self.fail_job, ajs)) else: raise Exception( "%s is set to an invalid value (%s), this should not be possible. 
See galaxy.jobs.drmaa.__init__()", state_param, self.runner_params[state_param]) continue except drmaa.DrmCommunicationException as e: log.warning("(%s/%s) unable to communicate with DRM: %s", galaxy_id_tag, external_job_id, e) new_watched.append(ajs) continue except Exception as e: # so we don't kill the monitor thread log.exception("(%s/%s) Unable to check job status: %s" % (galaxy_id_tag, external_job_id, str(e))) log.warning("(%s/%s) job will now be errored" % (galaxy_id_tag, external_job_id)) ajs.fail_message = "Cluster could not complete job" self.work_queue.put((self.fail_job, ajs)) continue if state != old_state: log.debug("(%s/%s) state change: %s" % (galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state])) if state == drmaa.JobState.RUNNING and not ajs.running: ajs.running = True ajs.job_wrapper.change_state(model.Job.states.RUNNING) if state in (drmaa.JobState.FAILED, drmaa.JobState.DONE): self._complete_terminal_job(ajs, drmaa_state=state) continue if ajs.check_limits(): self.work_queue.put((self.fail_job, ajs)) continue ajs.old_state = state new_watched.append(ajs) # Replace the watch list with the updated version self.watched = new_watched def stop_job(self, job): """Attempts to delete a job from the DRM queue""" try: ext_id = job.get_job_runner_external_id() assert ext_id not in (None, 'None'), 'External job id is None' kill_script = job.get_destination_configuration( self.app.config, "drmaa_external_killjob_script", None) if kill_script is None: self.ds.kill(ext_id) else: # FIXME: hardcoded path subprocess.Popen([ '/usr/bin/sudo', '-E', kill_script, str(ext_id), str(self.userid) ], shell=False) log.debug("(%s/%s) Removed from DRM queue at user's request" % (job.get_id(), ext_id)) except drmaa.InvalidJobException: log.debug( "(%s/%s) User killed running job, but it was already dead" % (job.get_id(), ext_id)) except Exception as e: log.debug( "(%s/%s) User killed running job, but error encountered removing from DRM queue: %s" % (job.get_id(), ext_id, e)) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" job_id = job.get_job_runner_external_id() if job_id is None: self.put(job_wrapper) return ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper) ajs.job_id = str(job_id) ajs.command_line = job.get_command_line() ajs.job_wrapper = job_wrapper ajs.job_destination = job_wrapper.job_destination if job.state == model.Job.states.RUNNING: log.debug( "(%s/%s) is still in running state, adding to the DRM queue" % (job.get_id(), job.get_job_runner_external_id())) ajs.old_state = drmaa.JobState.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.get_state() == model.Job.states.QUEUED: log.debug( "(%s/%s) is still in DRM queued state, adding to the DRM queue" % (job.get_id(), job.get_job_runner_external_id())) ajs.old_state = drmaa.JobState.QUEUED_ACTIVE ajs.running = False self.monitor_queue.put(ajs) def store_jobtemplate(self, job_wrapper, jt): """ Stores the content of a DRMAA JobTemplate object in a file as a JSON string. Path is hard-coded, but it's no worse than other path in this module. 
        Uses Galaxy's JobID, so file is expected to be unique."""
        filename = "%s/%s.jt_json" % (self.app.config.cluster_files_directory,
                                      job_wrapper.get_id_tag())
        with open(filename, 'w+') as fp:
            json.dump(jt, fp)
        log.debug('(%s) Job script for external submission is: %s' %
                  (job_wrapper.job_id, filename))
        return filename

    def external_runjob(self, external_runjob_script, jobtemplate_filename,
                        username):
        """ Runs an external script that will QSUB a new job.
        The external script will be run with sudo, and will setuid() to the
        specified user. Effectively, will QSUB as a different user (than the
        one used by Galaxy).
        """
        script_parts = external_runjob_script.split()
        script = script_parts[0]
        command = ['/usr/bin/sudo', '-E', script]
        for script_argument in script_parts[1:]:
            command.append(script_argument)
        command.extend([str(username), jobtemplate_filename])
        log.info("Running command %s" % command)
        p = subprocess.Popen(command,
                             shell=False,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        (stdoutdata, stderrdata) = p.communicate()
        exitcode = p.returncode
        # os.unlink(jobtemplate_filename)
        if exitcode != 0:
            # There was an error in the child process
            raise RuntimeError(
                "External_runjob failed (exit code %s)\nChild process reported error:\n%s"
                % (str(exitcode), stderrdata))
        if not stdoutdata.strip():
            raise RuntimeError("External_runjob did not return a job id: %s" %
                               (stdoutdata))
        # The expected output is a single line containing a single numeric value:
        # the DRMAA job-ID. If that is not the case, an error is raised.
        jobId = stdoutdata
        return jobId
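# The invalidjobexception_*/internalexception_* runner params declared in
# __init__ above normally arrive through the job configuration. A hedged
# sketch of the equivalent keyword arguments the runner could be constructed
# with; the values and library path are illustrative assumptions, only the
# keys come from runner_param_specs above.
example_runner_kwargs = {
    'drmaa_library_path': '/opt/sge/lib/lx-amd64/libdrmaa.so',  # assumed path
    'invalidjobexception_state': model.Job.states.OK,    # finish job OK once retries are exhausted
    'invalidjobexception_retries': 5,
    'internalexception_state': model.Job.states.ERROR,   # error job once retries are exhausted
    'internalexception_retries': 5,
}
# e.g. DRMAAJobRunner(app, nworkers=4, **example_runner_kwargs)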
class DRMAAJobRunner(AsynchronousJobRunner): """ Job runner backed by a finite pool of worker threads. FIFO scheduling """ runner_name = "DRMAARunner" restrict_job_name_length = 15 def __init__(self, app, nworkers, **kwargs): """Start the job runner""" global drmaa runner_param_specs = { 'drmaa_library_path': dict(map=str, default=os.environ.get('DRMAA_LIBRARY_PATH', None)) } for retry_exception in RETRY_EXCEPTIONS_LOWER: runner_param_specs[retry_exception + '_state'] = dict( map=str, valid=lambda x: x in (model.Job.states.OK, model.Job.states.ERROR), default=model.Job.states.OK) runner_param_specs[retry_exception + '_retries'] = dict( map=int, valid=lambda x: int(x) >= 0, default=0) if 'runner_param_specs' not in kwargs: kwargs['runner_param_specs'] = dict() kwargs['runner_param_specs'].update(runner_param_specs) super().__init__(app, nworkers, **kwargs) # This allows multiple drmaa runners (although only one per handler) in the same job config file if 'drmaa_library_path' in kwargs: log.info( 'Overriding DRMAA_LIBRARY_PATH due to runner plugin parameter: %s', self.runner_params.drmaa_library_path) os.environ[ 'DRMAA_LIBRARY_PATH'] = self.runner_params.drmaa_library_path # Import is delayed until runner initialization to allow for the # drmaa_library_path plugin param to override $DRMAA_LIBRARY_PATH try: drmaa = __import__("drmaa") except (ImportError, RuntimeError) as exc: raise exc.__class__( 'The Python drmaa package is required to use this ' 'feature, please install it or correct the ' 'following error:\n%s: %s' % (exc.__class__.__name__, str(exc))) from pulsar.managers.util.drmaa import DrmaaSessionFactory # make the drmaa library also available to subclasses self.drmaa = drmaa # Subclasses may need access to state constants self.drmaa_job_states = drmaa.JobState # Descriptive state strings pulled from the drmaa lib itself self.drmaa_job_state_strings = { drmaa.JobState.UNDETERMINED: 'process status cannot be determined', drmaa.JobState.QUEUED_ACTIVE: 'job is queued and active', drmaa.JobState.SYSTEM_ON_HOLD: 'job is queued and in system hold', drmaa.JobState.USER_ON_HOLD: 'job is queued and in user hold', drmaa.JobState.USER_SYSTEM_ON_HOLD: 'job is queued and in user and system hold', drmaa.JobState.RUNNING: 'job is running', drmaa.JobState.SYSTEM_SUSPENDED: 'job is system suspended', drmaa.JobState.USER_SUSPENDED: 'job is user suspended', drmaa.JobState.DONE: 'job finished normally', drmaa.JobState.FAILED: 'job finished, but failed', } # Ensure a DRMAA session exists and is initialized self.ds = DrmaaSessionFactory().get() self.userid = None self._init_monitor_thread() self._init_worker_threads() self.redact_email_in_job_name = self.app.config.redact_email_in_job_name def url_to_destination(self, url): """Convert a legacy URL to a job destination""" if not url: return native_spec = url.split('/')[2] if native_spec: params = dict(nativeSpecification=native_spec) log.debug( f"Converted URL '{url}' to destination runner=drmaa, params={params}" ) return JobDestination(runner='drmaa', params=params) else: log.debug("Converted URL '%s' to destination runner=drmaa" % url) return JobDestination(runner='drmaa') def get_native_spec(self, url): """Get any native DRM arguments specified by the site configuration""" try: return url.split('/')[2] or None except Exception: return None def queue_job(self, job_wrapper): """Create job script and submit it to the DRM""" # prepare the job # external_runJob_script can be None, in which case it's not used. 
external_runjob_script = job_wrapper.get_destination_configuration( "drmaa_external_runjob_script", None) include_metadata = asbool( job_wrapper.job_destination.params.get("embed_metadata_in_job", True)) if not self.prepare_job(job_wrapper, include_metadata=include_metadata): return # get configured job destination job_destination = job_wrapper.job_destination # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers. galaxy_id_tag = job_wrapper.get_id_tag() job_name = self._job_name(job_wrapper) ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name) # set up the drmaa job template jt = dict(remoteCommand=ajs.job_file, jobName=ajs.job_name, workingDirectory=job_wrapper.working_directory, outputPath=":%s" % ajs.output_file, errorPath=":%s" % ajs.error_file) # Avoid a jt.exitCodePath for now - it's only used when finishing. native_spec = job_destination.params.get('nativeSpecification', None) if native_spec is None: native_spec = job_destination.params.get('native_specification', None) if native_spec is not None: jt['nativeSpecification'] = native_spec # fill in the DRM's job run template script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file, shell=job_wrapper.shell) try: self.write_executable_script(ajs.job_file, script) except Exception: job_wrapper.fail("failure preparing job script", exception=True) log.exception("(%s) failure writing job script" % galaxy_id_tag) return # job was deleted while we were preparing it if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED): log.debug( "(%s) Job deleted/stopped by user before it entered the queue", galaxy_id_tag) if job_wrapper.cleanup_job in ("always", "onsuccess"): job_wrapper.cleanup() return log.debug("(%s) submitting file %s", galaxy_id_tag, ajs.job_file) if native_spec: log.debug("(%s) native specification is: %s", galaxy_id_tag, native_spec) # runJob will raise if there's a submit problem if external_runjob_script is None: # TODO: create a queue for retrying submission indefinitely # TODO: configurable max tries and sleep trynum = 0 external_job_id = None fail_msg = None while external_job_id is None and trynum < 5: try: external_job_id = self.ds.run_job(**jt) break except (drmaa.InternalException, drmaa.DeniedByDrmException) as e: trynum += 1 log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e) fail_msg = "Unable to run this job due to a cluster error, please retry it later" time.sleep(5) except Exception: log.exception( '(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag) trynum = 5 else: log.error("(%s) All attempts to submit job failed" % galaxy_id_tag) if not fail_msg: fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE job_wrapper.fail(fail_msg) return else: job_wrapper.change_ownership_for_run() # if user credentials are not available, use galaxy credentials (if permitted) allow_guests = asbool( job_wrapper.job_destination.params.get("allow_guests", False)) pwent = job_wrapper.user_system_pwent if pwent is None: if not allow_guests: fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." 
% job_wrapper.user job_wrapper.fail(fail_msg) return pwent = job_wrapper.galaxy_system_pwent log.debug('({}) submitting with credentials: {} [uid: {}]'.format( galaxy_id_tag, pwent[0], pwent[2])) filename = self.store_jobtemplate(job_wrapper, jt) self.userid = pwent[2] external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2]) if external_job_id is None: job_wrapper.fail("(%s) could not queue job" % galaxy_id_tag) return log.info(f"({galaxy_id_tag}) queued as {external_job_id}") # store runner information for tracking if Galaxy restarts job_wrapper.set_external_id(external_job_id) # Store DRM related state information for job ajs.job_id = external_job_id ajs.old_state = 'new' ajs.job_destination = job_destination # Add to our 'queue' of jobs to monitor self.monitor_queue.put(ajs) def _complete_terminal_job(self, ajs, drmaa_state, **kwargs): """ Handle a job upon its termination in the DRM. This method is meant to be overridden by subclasses to improve post-mortem and reporting of failures. Returns True if job was not actually terminal, None otherwise. (Note: This function always returns None. Hence this function actually does not determine if a job was terminal, but the implementation in the subclasses is supposed to do this.) """ job_state = ajs.job_wrapper.get_state() if drmaa_state == drmaa.JobState.FAILED and job_state != model.Job.states.STOPPED: if job_state != model.Job.states.DELETED: ajs.stop_job = False ajs.fail_message = "The cluster DRM system terminated this job" self.work_queue.put((self.fail_job, ajs)) elif drmaa_state == drmaa.JobState.DONE or job_state == model.Job.states.STOPPED: # External metadata processing for external runjobs external_metadata = not asbool( ajs.job_wrapper.job_destination.params.get( "embed_metadata_in_job", True)) if external_metadata: self._handle_metadata_externally(ajs.job_wrapper, resolve_requirements=True) if job_state != model.Job.states.DELETED: self.work_queue.put((self.finish_job, ajs)) def check_watched_item(self, ajs, new_watched): """ look at a single watched job, determine its state, and deal with errors that could happen in this process. 
to be called from check_watched_items() returns the state or None if exceptions occurred in the latter case the job is appended to new_watched if a 1 drmaa.InternalException, 2 drmaa.InvalidJobExceptionnot, or 3 drmaa.DrmCommunicationException occurred (which causes the job to be tested again in the next iteration of check_watched_items) - the job is finished as errored if any other exception occurs - the job is finished OK or errored after the maximum number of retries depending on the exception Note that None is returned in all cases where the loop in check_watched_items is to be continued """ external_job_id = ajs.job_id galaxy_id_tag = ajs.job_wrapper.get_id_tag() state = None try: assert external_job_id not in ( None, 'None'), f'({galaxy_id_tag}/{external_job_id}) Invalid job id' state = self.ds.job_status(external_job_id) # Reset exception retries for retry_exception in RETRY_EXCEPTIONS_LOWER: setattr(ajs, retry_exception + '_retries', 0) except (drmaa.InternalException, drmaa.InvalidJobException) as e: ecn = type(e).__name__ retry_param = ecn.lower() + '_retries' state_param = ecn.lower() + '_state' retries = getattr(ajs, retry_param, 0) log.warning( "(%s/%s) unable to check job status because of %s exception for %d consecutive tries: %s", galaxy_id_tag, external_job_id, ecn, retries + 1, e) if self.runner_params[retry_param] > 0: if retries < self.runner_params[retry_param]: # will retry check on next iteration setattr(ajs, retry_param, retries + 1) new_watched.append(ajs) return None if self.runner_params[state_param] == model.Job.states.OK: log.warning("(%s/%s) job will now be finished OK", galaxy_id_tag, external_job_id) self.work_queue.put((self.finish_job, ajs)) elif self.runner_params[state_param] == model.Job.states.ERROR: log.warning("(%s/%s) job will now be errored", galaxy_id_tag, external_job_id) self.work_queue.put((self.fail_job, ajs)) else: raise Exception( "%s is set to an invalid value (%s), this should not be possible. See galaxy.jobs.drmaa.__init__()", state_param, self.runner_params[state_param]) return None except drmaa.DrmCommunicationException as e: log.warning("(%s/%s) unable to communicate with DRM: %s", galaxy_id_tag, external_job_id, e) new_watched.append(ajs) return None except Exception: # so we don't kill the monitor thread log.exception( f"({galaxy_id_tag}/{external_job_id}) unable to check job status" ) log.warning( f"({galaxy_id_tag}/{external_job_id}) job will now be errored") ajs.fail_message = "Cluster could not complete job" self.work_queue.put((self.fail_job, ajs)) return None return state def check_watched_items(self): """ Called by the monitor thread to look at each watched job and deal with state changes. 
""" new_watched = [] for ajs in self.watched: external_job_id = ajs.job_id galaxy_id_tag = ajs.job_wrapper.get_id_tag() old_state = ajs.old_state state = self.check_watched_item(ajs, new_watched) if state is None: continue if state != old_state: log.debug("({}/{}) state change: {}".format( galaxy_id_tag, external_job_id, self.drmaa_job_state_strings[state])) if state == drmaa.JobState.RUNNING and not ajs.running: ajs.running = True ajs.job_wrapper.change_state(model.Job.states.RUNNING) if state in (drmaa.JobState.FAILED, drmaa.JobState.DONE): if self._complete_terminal_job(ajs, drmaa_state=state) is not None: # job was not actually terminal state = ajs.old_state else: continue if ajs.running: # TODO: stop checking at some point ajs.job_wrapper.check_for_entry_points() if ajs.check_limits(): self.work_queue.put((self.fail_job, ajs)) continue ajs.old_state = state new_watched.append(ajs) # Replace the watch list with the updated version self.watched = new_watched def stop_job(self, job_wrapper): """Attempts to delete a job from the DRM queue""" job = job_wrapper.get_job() try: ext_id = job.get_job_runner_external_id() assert ext_id not in (None, 'None'), 'External job id is None' kill_script = job_wrapper.get_destination_configuration( "drmaa_external_killjob_script") if kill_script is None: self.ds.kill(ext_id) else: cmd = shlex.split(kill_script) cmd.extend([str(ext_id), str(self.userid)]) commands.execute(cmd) log.info( f"({job.id}/{ext_id}) Removed from DRM queue at user's request" ) except drmaa.InvalidJobException: log.exception( f"({job.id}/{ext_id}) User killed running job, but it was already dead" ) except commands.CommandLineException as e: log.error( "({}/{}) User killed running job, but command execution failed: {}" .format(job.id, ext_id, unicodify(e))) except Exception: log.exception( f"({job.id}/{ext_id}) User killed running job, but error encountered removing from DRM queue" ) def recover(self, job, job_wrapper): """Recovers jobs stuck in the queued/running state when Galaxy started""" job_id = job.get_job_runner_external_id() if job_id is None: self.put(job_wrapper) return ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper) ajs.job_id = str(job_id) ajs.command_line = job.get_command_line() ajs.job_wrapper = job_wrapper ajs.job_destination = job_wrapper.job_destination if job.state in (model.Job.states.RUNNING, model.Job.states.STOPPED): log.debug( f"({job.id}/{job.get_job_runner_external_id()}) is still in {job.state} state, adding to the DRM queue" ) ajs.old_state = drmaa.JobState.RUNNING ajs.running = True self.monitor_queue.put(ajs) elif job.get_state() == model.Job.states.QUEUED: log.debug( f"({job.id}/{job.get_job_runner_external_id()}) is still in DRM queued state, adding to the DRM queue" ) ajs.old_state = drmaa.JobState.QUEUED_ACTIVE ajs.running = False self.monitor_queue.put(ajs) def store_jobtemplate(self, job_wrapper, jt): """ Stores the content of a DRMAA JobTemplate object in a file as a JSON string. Path is hard-coded, but it's no worse than other path in this module. Uses Galaxy's JobID, so file is expected to be unique.""" filename = f"{self.app.config.cluster_files_directory}/{job_wrapper.get_id_tag()}.jt_json" with open(filename, 'w+') as fp: json.dump(jt, fp) log.debug( f'({job_wrapper.job_id}) Job script for external submission is: {filename}' ) return filename def external_runjob(self, external_runjob_script, jobtemplate_filename, username): """ runs an external script that will QSUB a new job. 
        The external script needs to be run with sudo, and will setuid() to the specified user.
        Effectively, will QSUB as a different user (than the one used by Galaxy).
        """
        cmd = shlex.split(external_runjob_script)
        cmd.extend([str(username), jobtemplate_filename])
        log.info("Running command %s" % cmd)
        try:
            stdoutdata = commands.execute(cmd).strip()
        except commands.CommandLineException:
            log.exception("External_runjob failed")
            return None
        # The expected output is a single line containing a single numeric value:
        # the DRMAA job-ID. If that is not the case, treat it as a failed submission.
        if not stdoutdata:
            log.error("External_runjob returned nothing instead of a job id")
            return None
        return stdoutdata

    def _job_name(self, job_wrapper):
        external_runjob_script = job_wrapper.get_destination_configuration(
            "drmaa_external_runjob_script", None)
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if not self.redact_email_in_job_name and external_runjob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join(x if x in (string.ascii_letters + string.digits + '_')
                           else '_' for x in job_name)
        if self.restrict_job_name_length:
            job_name = job_name[:self.restrict_job_name_length]
        return job_name
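# _job_name() above sanitizes and truncates DRM job names. A standalone sketch
# of the same transformation for illustration; the function name, default
# length, and sample value are assumptions, while the character rules mirror
# _job_name / restrict_job_name_length above.
import string


def sanitize_job_name(job_name, max_length=15):
    """Replace characters the DRM may reject with '_' and truncate."""
    allowed = string.ascii_letters + string.digits + '_'
    cleaned = ''.join(c if c in allowed else '_' for c in job_name)
    return cleaned[:max_length] if max_length else cleaned


# e.g. sanitize_job_name('g42_my-tool_user@example.org') -> 'g42_my_tool_use'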