def _job_state( self, job, job_wrapper ):
    job_state = AsynchronousJobState()
    # TODO: Determine why this is set when using normal message queue updates
    # but not CLI submitted MQ updates...
    raw_job_id = job.get_job_runner_external_id() or job_wrapper.job_id
    job_state.job_id = str( raw_job_id )
    job_state.runner_url = job_wrapper.get_job_runner_url()
    job_state.job_destination = job_wrapper.job_destination
    job_state.job_wrapper = job_wrapper
    return job_state
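# NOTE: hedged, illustrative sketch only -- not part of the runner code above.
# It shows how a helper like _job_state() could be used by an external status
# update handler to rebuild per-job state before handing it back to the
# asynchronous monitor loop. The method name handle_status_update() and its
# arguments are hypothetical.
def handle_status_update(self, job, job_wrapper):
    # Rebuild the AsynchronousJobState from the persisted Galaxy job record.
    job_state = self._job_state(job, job_wrapper)
    # Re-queue the reconstructed state for monitoring.
    self.monitor_queue.put(job_state)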
def recover(self, job, job_wrapper):
    """Recovers jobs stuck in the queued/running state when Galaxy started"""
    # TODO this needs to be implemented to override unimplemented base method
    job_id = job.get_job_runner_external_id()
    if job_id is None:
        self.put(job_wrapper)
        return
    log.debug("k8s trying to recover job: " + job_id)
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper)
    ajs.job_id = str(job_id)
    ajs.command_line = job.command_line
    ajs.job_wrapper = job_wrapper
    ajs.job_destination = job_wrapper.job_destination
    if job.state == model.Job.states.RUNNING:
        log.debug("(%s/%s) is still in running state, adding to the runner monitor queue" % (
            job.id, job.job_runner_external_id))
        ajs.old_state = model.Job.states.RUNNING
        ajs.running = True
        self.monitor_queue.put(ajs)
    elif job.state == model.Job.states.QUEUED:
        log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue" % (
            job.id, job.job_runner_external_id))
        ajs.old_state = model.Job.states.QUEUED
        ajs.running = False
        self.monitor_queue.put(ajs)
def recover(self, job, job_wrapper):
    # Recovers jobs in the queued/running state when Galaxy started
    # What is 'job' an instance of???
    # Could be model.Job?
    # Fetch the job id used by JSE-Drop
    job_name = job.get_job_runner_external_id()
    # Get the job destination
    job_destination = job_wrapper.job_destination
    # Fetch the drop dir
    drop_off_dir = self._get_drop_dir()
    log.debug("recover: drop-off dir = %s" % drop_off_dir)
    jse_drop = JSEDrop(drop_off_dir)
    # Store state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_name
    job_state.job_destination = job_destination
    # Sort out the status
    if job.state == model.Job.states.RUNNING:
        job_state.old_state = True
        job_state.running = True
    elif job.get_state() == model.Job.states.QUEUED:
        job_state.old_state = True
        job_state.running = False
    # Add to the queue of jobs to monitor
    self.monitor_queue.put(job_state)
def recover( self, job, job_wrapper ):
    """Recovers jobs stuck in the queued/running state when Galaxy started"""
    job_id = job.get_job_runner_external_id()
    if job_id is None:
        self.put( job_wrapper )
        return
    ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )
    ajs.job_id = str( job_id )
    ajs.command_line = job.get_command_line()
    ajs.job_wrapper = job_wrapper
    ajs.job_destination = job_wrapper.job_destination
    if job.state == model.Job.states.RUNNING:
        log.debug( "(%s/%s) is still in running state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
        ajs.old_state = drmaa.JobState.RUNNING
        ajs.running = True
        self.monitor_queue.put( ajs )
    elif job.get_state() == model.Job.states.QUEUED:
        log.debug( "(%s/%s) is still in DRM queued state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
        ajs.old_state = drmaa.JobState.QUEUED_ACTIVE
        ajs.running = False
        self.monitor_queue.put( ajs )
def recover( self, job, job_wrapper ):
    """Recovers jobs stuck in the queued/running state when Galaxy started"""
    job_state = AsynchronousJobState()
    job_state.job_id = str( job.get_job_runner_external_id() )
    job_state.runner_url = job_wrapper.get_job_runner_url()
    job_state.job_destination = job_wrapper.job_destination
    job_wrapper.command_line = job.get_command_line()
    job_state.job_wrapper = job_wrapper
    state = job.get_state()
    if state in [ model.Job.states.RUNNING, model.Job.states.QUEUED ]:
        log.debug( "(LWR/%s) is still in running state, adding to the LWR queue" % ( job.get_id() ) )
        job_state.old_state = True
        job_state.running = state == model.Job.states.RUNNING
        self.monitor_queue.put( job_state )
def recover(self, job, job_wrapper):
    msg = ('({name!r}/{runner!r}) is still in {state!s} state, adding to'
           ' the runner monitor queue')
    job_id = job.get_job_runner_external_id()
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                               job_wrapper=job_wrapper)
    ajs.job_id = self.JOB_NAME_PREFIX + str(job_id)
    ajs.command_line = job.command_line
    ajs.job_wrapper = job_wrapper
    ajs.job_destination = job_wrapper.job_destination
    if job.state == model.Job.states.RUNNING:
        LOGGER.debug(msg.format(
            name=job.id, runner=job.job_runner_external_id, state='running'))
        ajs.old_state = model.Job.states.RUNNING
        ajs.running = True
        self.monitor_queue.put(ajs)
    elif job.state == model.Job.states.QUEUED:
        LOGGER.debug(msg.format(
            name=job.id, runner=job.job_runner_external_id, state='queued'))
        ajs.old_state = model.Job.states.QUEUED
        ajs.running = False
        self.monitor_queue.put(ajs)
def recover(self, job, job_wrapper):
    """Recovers jobs stuck in the queued/running state when Galaxy started.

    This method is called by Galaxy at startup time. Jobs in RUNNING and
    QUEUED state in Galaxy are put back on the monitor_queue by creating an
    AsynchronousJobState object for each of them.
    """
    job_id = job_wrapper.job_id
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper)
    ajs.job_id = str(job_id)
    ajs.job_destination = job_wrapper.job_destination
    job_wrapper.command_line = job.command_line
    ajs.job_wrapper = job_wrapper
    if job.state == model.Job.states.RUNNING:
        log.debug("(%s/%s) is still in running state, adding to the god queue" % (
            job.id, job.get_job_runner_external_id()))
        ajs.old_state = 'R'
        ajs.running = True
        self.monitor_queue.put(ajs)
    elif job.state == model.Job.states.QUEUED:
        log.debug("(%s/%s) is still in god queued state, adding to the god queue" % (
            job.id, job.get_job_runner_external_id()))
        ajs.old_state = 'Q'
        ajs.running = False
        self.monitor_queue.put(ajs)
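# NOTE: hedged summary sketch, not taken verbatim from any single runner above.
# The recover() variants in this section share one pattern: rebuild an
# AsynchronousJobState from the persisted Galaxy job, record whether it was
# RUNNING or QUEUED, and put it back on the monitor queue. A minimal generic
# version might look like the following (it assumes the AsynchronousJobState,
# model and monitor_queue names already used throughout this section).
def recover(self, job, job_wrapper):
    """Recover a queued/running job after a Galaxy restart (generic pattern)."""
    external_id = job.get_job_runner_external_id()
    if external_id is None:
        # The job never reached the external system; resubmit it from scratch.
        self.put(job_wrapper)
        return
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper)
    ajs.job_id = str(external_id)
    ajs.job_destination = job_wrapper.job_destination
    if job.state in (model.Job.states.RUNNING, model.Job.states.QUEUED):
        ajs.old_state = job.state
        ajs.running = job.state == model.Job.states.RUNNING
        self.monitor_queue.put(ajs)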
def queue_job(self, job_wrapper):
    job_destination = job_wrapper.job_destination
    self._populate_parameter_defaults(job_destination)

    command_line, client, remote_job_config, compute_environment = self.__prepare_job(job_wrapper, job_destination)

    if not command_line:
        return

    try:
        dependencies_description = PulsarJobRunner.__dependencies_description(client, job_wrapper)
        rewrite_paths = not PulsarJobRunner.__rewrite_parameters(client)
        unstructured_path_rewrites = {}
        output_names = []
        if compute_environment:
            unstructured_path_rewrites = compute_environment.unstructured_path_rewrites
            output_names = compute_environment.output_names()

        client_job_description = ClientJobDescription(
            command_line=command_line,
            input_files=self.get_input_files(job_wrapper),
            client_outputs=self.__client_outputs(client, job_wrapper),
            working_directory=job_wrapper.tool_working_directory,
            metadata_directory=job_wrapper.working_directory,
            tool=job_wrapper.tool,
            config_files=job_wrapper.extra_filenames,
            dependencies_description=dependencies_description,
            env=client.env,
            rewrite_paths=rewrite_paths,
            arbitrary_files=unstructured_path_rewrites,
            touch_outputs=output_names,
        )
        job_id = pulsar_submit_job(client, client_job_description, remote_job_config)
        log.info("Pulsar job submitted with job_id %s" % job_id)
        job_wrapper.set_job_destination(job_destination, job_id)
        job_wrapper.change_state(model.Job.states.QUEUED)
    except Exception:
        job_wrapper.fail("failure running job", exception=True)
        log.exception("failure running job %d", job_wrapper.job_id)
        return

    pulsar_job_state = AsynchronousJobState()
    pulsar_job_state.job_wrapper = job_wrapper
    pulsar_job_state.job_id = job_id
    pulsar_job_state.old_state = True
    pulsar_job_state.running = False
    pulsar_job_state.job_destination = job_destination
    self.monitor_job(pulsar_job_state)
def queue_job(self, job_wrapper):
    command_line = ''
    job_destination = job_wrapper.job_destination

    try:
        job_wrapper.prepare()
        if hasattr(job_wrapper, 'prepare_input_files_cmds') and job_wrapper.prepare_input_files_cmds is not None:
            for cmd in job_wrapper.prepare_input_files_cmds:  # run the commands to stage the input files
                # log.debug( 'executing: %s' % cmd )
                if 0 != os.system(cmd):
                    raise Exception('Error running file staging command: %s' % cmd)
            job_wrapper.prepare_input_files_cmds = None  # prevent them from being used in-line
        command_line = self.build_command_line( job_wrapper, include_metadata=False, include_work_dir_outputs=False )
    except:
        job_wrapper.fail( "failure preparing job", exception=True )
        log.exception("failure running job %d" % job_wrapper.job_id)
        return

    # If we were able to get a command line, run the job
    if not command_line:
        job_wrapper.finish( '', '' )
        return

    try:
        client = self.get_client_from_wrapper(job_wrapper)
        output_files = self.get_output_files(job_wrapper)
        input_files = job_wrapper.get_input_fnames()
        working_directory = job_wrapper.working_directory
        tool = job_wrapper.tool
        file_stager = FileStager(client, tool, command_line, job_wrapper.extra_filenames, input_files, output_files, working_directory)
        rebuilt_command_line = file_stager.get_rewritten_command_line()
        job_id = file_stager.job_id
        client.launch( rebuilt_command_line )
        job_wrapper.set_job_destination( job_destination, job_id )
        job_wrapper.change_state( model.Job.states.QUEUED )
    except:
        job_wrapper.fail( "failure running job", exception=True )
        log.exception("failure running job %d" % job_wrapper.job_id)
        return

    lwr_job_state = AsynchronousJobState()
    lwr_job_state.job_wrapper = job_wrapper
    lwr_job_state.job_id = job_id
    lwr_job_state.old_state = True
    lwr_job_state.running = False
    lwr_job_state.job_destination = job_destination
    self.monitor_job(lwr_job_state)
def queue_job(self, job_wrapper):
    job_destination = job_wrapper.job_destination

    command_line, client, remote_job_config = self.__prepare_job( job_wrapper, job_destination )

    if not command_line:
        return

    try:
        dependency_resolution = LwrJobRunner.__dependency_resolution( client )
        remote_dependency_resolution = dependency_resolution == "remote"
        requirements = job_wrapper.tool.requirements if remote_dependency_resolution else []
        client_job_description = ClientJobDescription(
            command_line=command_line,
            output_files=self.get_output_files(job_wrapper),
            input_files=job_wrapper.get_input_fnames(),
            working_directory=job_wrapper.working_directory,
            tool=job_wrapper.tool,
            config_files=job_wrapper.extra_filenames,
            requirements=requirements,
            version_file=job_wrapper.get_version_string_path(),
        )
        job_id = lwr_submit_job(client, client_job_description, remote_job_config)
        log.info("lwr job submitted with job_id %s" % job_id)
        job_wrapper.set_job_destination( job_destination, job_id )
        job_wrapper.change_state( model.Job.states.QUEUED )
    except Exception:
        job_wrapper.fail( "failure running job", exception=True )
        log.exception("failure running job %d" % job_wrapper.job_id)
        return

    lwr_job_state = AsynchronousJobState()
    lwr_job_state.job_wrapper = job_wrapper
    lwr_job_state.job_id = job_id
    lwr_job_state.old_state = True
    lwr_job_state.running = False
    lwr_job_state.job_destination = job_destination
    self.monitor_job(lwr_job_state)
def queue_job( self, job_wrapper ):
    """Create job script and submit it to the DRM"""
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=True ):
        return

    # command line has been added to the wrapper by prepare_job()
    command_line = job_wrapper.runner_command_line

    # Get shell and job execution interface
    job_destination = job_wrapper.job_destination
    shell_params, job_params = self.parse_destination_params(job_destination.params)
    shell, job_interface = self.get_cli_plugins(shell_params, job_params)

    # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
    galaxy_id_tag = job_wrapper.get_id_tag()

    # define job attributes
    ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )

    job_file_kwargs = job_interface.job_script_kwargs(ajs.output_file, ajs.error_file, ajs.job_name)
    script = self.get_job_file( job_wrapper, exit_code_path=ajs.exit_code_file, **job_file_kwargs )

    try:
        fh = file(ajs.job_file, "w")
        fh.write(script)
        fh.close()
    except:
        log.exception("(%s) failure writing job script" % galaxy_id_tag )
        job_wrapper.fail("failure preparing job script", exception=True)
        return

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.info("(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
        if self.app.config.cleanup_job in ("always", "onsuccess"):
            job_wrapper.cleanup()
        return

    log.debug( "(%s) submitting file: %s" % ( galaxy_id_tag, ajs.job_file ) )

    cmd_out = shell.execute(job_interface.submit(ajs.job_file))
    if cmd_out.returncode != 0:
        log.error('(%s) submission failed (stdout): %s' % (galaxy_id_tag, cmd_out.stdout))
        log.error('(%s) submission failed (stderr): %s' % (galaxy_id_tag, cmd_out.stderr))
        job_wrapper.fail("failure submitting job")
        return
    # Some job runners return something like 'Submitted batch job XXXX'
    # Strip and split to get job ID.
    external_job_id = cmd_out.stdout.strip().split()[-1]
    if not external_job_id:
        log.error('(%s) submission did not return a job identifier, failing job' % galaxy_id_tag)
        job_wrapper.fail("failure submitting job")
        return

    log.info("(%s) queued with identifier: %s" % ( galaxy_id_tag, external_job_id ) )

    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_job_destination( job_destination, external_job_id )

    # Store state information for job
    ajs.job_id = external_job_id
    ajs.old_state = 'new'
    ajs.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( ajs )
def queue_job( self, job_wrapper ):
    """Create job script and submit it to the DRM"""
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=True ):
        return

    # command line has been added to the wrapper by prepare_job()
    command_line = job_wrapper.runner_command_line

    # get configured job destination
    job_destination = job_wrapper.job_destination

    # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
    galaxy_id_tag = job_wrapper.get_id_tag()

    # define job attributes
    job_name = 'g%s' % galaxy_id_tag
    if job_wrapper.tool.old_id:
        job_name += '_%s' % job_wrapper.tool.old_id
    if self.external_runJob_script is None:
        job_name += '_%s' % job_wrapper.user
    job_name = ''.join( map( lambda x: x if x in ( string.letters + string.digits + '_' ) else '_', job_name ) )
    ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name )

    # set up the drmaa job template
    jt = self.ds.createJobTemplate()
    jt.remoteCommand = ajs.job_file
    jt.jobName = ajs.job_name
    jt.outputPath = ":%s" % ajs.output_file
    jt.errorPath = ":%s" % ajs.error_file

    # Avoid a jt.exitCodePath for now - it's only used when finishing.
    native_spec = job_destination.params.get('nativeSpecification', None)
    if native_spec is not None:
        jt.nativeSpecification = native_spec

    # fill in the DRM's job run template
    script = drm_template % ( job_wrapper.galaxy_lib_dir,
                              job_wrapper.get_env_setup_clause(),
                              os.path.abspath( job_wrapper.working_directory ),
                              command_line,
                              ajs.exit_code_file )

    try:
        fh = file( ajs.job_file, "w" )
        fh.write( script )
        fh.close()
        os.chmod( ajs.job_file, 0755 )
    except:
        job_wrapper.fail( "failure preparing job script", exception=True )
        log.exception( "(%s) failure writing job script" % galaxy_id_tag )
        return

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
        if self.app.config.cleanup_job in ( "always", "onsuccess" ):
            job_wrapper.cleanup()
        return

    log.debug( "(%s) submitting file %s" % ( galaxy_id_tag, ajs.job_file ) )
    log.debug( "(%s) command is: %s" % ( galaxy_id_tag, command_line ) )

    # runJob will raise if there's a submit problem
    if self.external_runJob_script is None:
        external_job_id = self.ds.runJob(jt)
    else:
        job_wrapper.change_ownership_for_run()
        log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, job_wrapper.user_system_pwent[0], job_wrapper.user_system_pwent[2] ) )
        filename = self.store_jobtemplate(job_wrapper, jt)
        self.userid = job_wrapper.user_system_pwent[2]
        external_job_id = self.external_runjob(filename, job_wrapper.user_system_pwent[2]).strip()
    log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) )

    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_job_destination( job_destination, external_job_id )

    # Store DRM related state information for job
    ajs.job_id = external_job_id
    ajs.old_state = 'new'
    ajs.job_destination = job_destination

    # delete the job template
    self.ds.deleteJobTemplate( jt )

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( ajs )
def recover(self, job, job_wrapper):
    """Recovers jobs stuck in the queued/running state when Galaxy started"""
    job_id = job.get_job_runner_external_id()
    pbs_job_state = AsynchronousJobState()
    pbs_job_state.output_file = "%s/%s.o" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.error_file = "%s/%s.e" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.exit_code_file = "%s/%s.ec" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.job_id = str(job_id)
    pbs_job_state.runner_url = job_wrapper.get_job_runner_url()
    pbs_job_state.job_destination = job_wrapper.job_destination
    job_wrapper.command_line = job.command_line
    pbs_job_state.job_wrapper = job_wrapper
    if job.state == model.Job.states.RUNNING:
        log.debug("(%s/%s) is still in running state, adding to the PBS queue" % (job.id, job.get_job_runner_external_id()))
        pbs_job_state.old_state = 'R'
        pbs_job_state.running = True
        self.monitor_queue.put(pbs_job_state)
    elif job.state == model.Job.states.QUEUED:
        log.debug("(%s/%s) is still in PBS queued state, adding to the PBS queue" % (job.id, job.get_job_runner_external_id()))
        pbs_job_state.old_state = 'Q'
        pbs_job_state.running = False
        self.monitor_queue.put(pbs_job_state)
def queue_job(self, job_wrapper):
    """Create PBS script for a job and submit it to the PBS queue"""
    # prepare the job
    if not self.prepare_job(job_wrapper, include_metadata=not (self.app.config.pbs_stage_path)):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect(util.smart_str(pbs_server_name))
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail("Unable to queue job for execution. Resubmitting the job may succeed.")
        log.error("Connection to PBS server for submit failed: %s: %s" % (errno, text))
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [str(o) for o in output_fnames]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() + output_files, symlink=True)
        stageout = self.get_stage_in_out(output_files)
        attrs = [
            dict(name=pbs.ATTR_o, value=pbs_ofile),
            dict(name=pbs.ATTR_e, value=pbs_efile),
            dict(name=pbs.ATTR_stagein, value=stagein),
            dict(name=pbs.ATTR_stageout, value=stageout),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict(name=pbs.ATTR_o, value=ofile),
            dict(name=pbs.ATTR_e, value=efile),
        ]

    # define PBS job options
    attrs.append(dict(name=pbs.ATTR_N, value=str("%s_%s_%s" % (job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user))))
    job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
    for i, attr in enumerate(attrs + pbs_options):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']
    exec_dir = os.path.abspath(job_wrapper.working_directory)

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with open(ecfile, 'a'):
            os.utime(ecfile, None)

        stage_commands = pbs_symlink_template % (
            " ".join(job_wrapper.get_input_fnames() + output_files),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [stage_commands]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script(job_file, script)

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug("Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id)
        pbs.pbs_disconnect(c)
        if job_wrapper.cleanup_job in ("always", "onsuccess"):
            self.cleanup((ofile, efile, ecfile, job_file))
            job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text))
        time.sleep(2)
    else:
        log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
        job_wrapper.fail("Unable to run this job due to a cluster error, please retry it later")
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id))
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id))

    # persist destination
    job_wrapper.set_job_destination(job_destination, job_id)

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put(job_state)
def recover( self, job, job_wrapper ):
    """Recovers jobs stuck in the queued/running state when Galaxy started"""
    job_id = job.get_job_runner_external_id()
    pbs_job_state = AsynchronousJobState()
    pbs_job_state.output_file = "%s/%s.o" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.error_file = "%s/%s.e" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.exit_code_file = "%s/%s.ec" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.job_id = str( job_id )
    pbs_job_state.runner_url = job_wrapper.get_job_runner_url()
    pbs_job_state.job_destination = job_wrapper.job_destination
    job_wrapper.command_line = job.command_line
    pbs_job_state.job_wrapper = job_wrapper
    if job.state == model.Job.states.RUNNING:
        log.debug( "(%s/%s) is still in running state, adding to the PBS queue" % ( job.id, job.get_job_runner_external_id() ) )
        pbs_job_state.old_state = 'R'
        pbs_job_state.running = True
        self.monitor_queue.put( pbs_job_state )
    elif job.state == model.Job.states.QUEUED:
        log.debug( "(%s/%s) is still in PBS queued state, adding to the PBS queue" % ( job.id, job.get_job_runner_external_id() ) )
        pbs_job_state.old_state = 'Q'
        pbs_job_state.running = False
        self.monitor_queue.put( pbs_job_state )
def queue_job(self, job_wrapper):
    """Create job script and submit it to Kubernetes cluster"""
    # prepare the job
    # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same
    # where galaxy will expect results.
    log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}")
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                               job_wrapper=job_wrapper,
                               job_destination=job_wrapper.job_destination)

    # Kubernetes doesn't really produce meaningful "job stdout", but file needs to be present
    with open(ajs.output_file, 'w'):
        pass
    with open(ajs.error_file, 'w'):
        pass

    if not self.prepare_job(
        job_wrapper,
        include_metadata=False,
        modify_command_for_container=False,
    ):
        return

    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file, shell=job_wrapper.shell, galaxy_virtual_env=None)
    try:
        self.write_executable_script(ajs.job_file, script, job_io=job_wrapper.job_io)
    except Exception:
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception(f"({job_wrapper.get_id_tag()}) failure writing job script")
        return

    # Construction of Kubernetes objects follow: https://kubernetes.io/docs/concepts/workloads/controllers/job/
    if self.__has_guest_ports(job_wrapper):
        try:
            self.__configure_port_routing(ajs)
        except HTTPError:
            log.exception("Kubernetes failed to expose tool ports as services, HTTP exception encountered")
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to export tool ports as services."
            self.mark_as_failed(ajs)
            return

    k8s_job_prefix = self.__produce_k8s_job_prefix()
    k8s_job_obj = job_object_dict(
        self.runner_params,
        k8s_job_prefix,
        self.__get_k8s_job_spec(ajs)
    )

    job = Job(self._pykube_api, k8s_job_obj)
    try:
        job.create()
    except HTTPError:
        log.exception("Kubernetes failed to create job, HTTP exception encountered")
        ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
        ajs.fail_message = "Kubernetes failed to create job."
        self.mark_as_failed(ajs)
        return
    if not job.name:
        log.exception(f"Kubernetes failed to create job, empty name encountered: [{job.obj}]")
        ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
        ajs.fail_message = "Kubernetes failed to create job."
        self.mark_as_failed(ajs)
        return
    job_id = job.name

    # define job attributes in the AsyncronousJobState for follow-up
    ajs.job_id = job_id
    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_external_id(job_id)
    self.monitor_queue.put(ajs)
def queue_job( self, job_wrapper ):
    """Create job script and submit it to the DRM"""
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=True ):
        return

    # Get shell and job execution interface
    job_destination = job_wrapper.job_destination
    shell_params, job_params = self.parse_destination_params(job_destination.params)
    shell, job_interface = self.get_cli_plugins(shell_params, job_params)

    # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
    galaxy_id_tag = job_wrapper.get_id_tag()

    # define job attributes
    ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )

    job_file_kwargs = job_interface.job_script_kwargs(ajs.output_file, ajs.error_file, ajs.job_name)
    script = self.get_job_file( job_wrapper, exit_code_path=ajs.exit_code_file, **job_file_kwargs )

    try:
        fh = file(ajs.job_file, "w")
        fh.write(script)
        fh.close()
    except:
        log.exception("(%s) failure writing job script" % galaxy_id_tag )
        job_wrapper.fail("failure preparing job script", exception=True)
        return

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.info("(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
        if self.app.config.cleanup_job in ("always", "onsuccess"):
            job_wrapper.cleanup()
        return

    log.debug( "(%s) submitting file: %s" % ( galaxy_id_tag, ajs.job_file ) )

    cmd_out = shell.execute(job_interface.submit(ajs.job_file))
    if cmd_out.returncode != 0:
        log.error('(%s) submission failed (stdout): %s' % (galaxy_id_tag, cmd_out.stdout))
        log.error('(%s) submission failed (stderr): %s' % (galaxy_id_tag, cmd_out.stderr))
        job_wrapper.fail("failure submitting job")
        return
    # Some job runners return something like 'Submitted batch job XXXX'
    # Strip and split to get job ID.
    external_job_id = cmd_out.stdout.strip().split()[-1]
    if not external_job_id:
        log.error('(%s) submission did not return a job identifier, failing job' % galaxy_id_tag)
        job_wrapper.fail("failure submitting job")
        return

    log.info("(%s) queued with identifier: %s" % ( galaxy_id_tag, external_job_id ) )

    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_job_destination( job_destination, external_job_id )

    # Store state information for job
    ajs.job_id = external_job_id
    ajs.old_state = 'new'
    ajs.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( ajs )
def queue_job( self, job_wrapper ):
    """Create PBS script for a job and submit it to the PBS queue"""
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail( "Unable to queue job for execution. Resubmitting the job may succeed." )
        log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [ str( o ) for o in output_fnames ]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
        stageout = self.get_stage_in_out( output_files )
        attrs = [
            dict( name=pbs.ATTR_o, value=pbs_ofile ),
            dict( name=pbs.ATTR_e, value=pbs_efile ),
            dict( name=pbs.ATTR_stagein, value=stagein ),
            dict( name=pbs.ATTR_stageout, value=stageout ),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict( name=pbs.ATTR_o, value=ofile ),
            dict( name=pbs.ATTR_e, value=efile ),
        ]

    # define PBS job options
    attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
    job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
    for i, attr in enumerate( attrs + pbs_options ):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']
    exec_dir = os.path.abspath( job_wrapper.working_directory )

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with open(ecfile, 'a'):
            os.utime(ecfile, None)

        stage_commands = pbs_symlink_template % (
            " ".join( job_wrapper.get_input_fnames() + output_files ),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [ stage_commands ]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script( job_file, script )

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
        pbs.pbs_disconnect(c)
        if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
            self.cleanup( ( ofile, efile, ecfile, job_file ) )
            job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
        time.sleep(2)
    else:
        log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
        job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

    # persist destination
    job_wrapper.set_job_destination( job_destination, job_id )

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( job_state )
def queue_job(self, job_wrapper):
    """Create job script and submit it to Kubernetes cluster"""
    # prepare the job
    # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same
    # where galaxy will expect results.
    log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                               job_wrapper=job_wrapper,
                               job_destination=job_wrapper.job_destination)

    if not self.prepare_job(job_wrapper,
                            include_metadata=False,
                            modify_command_for_container=False,
                            stdout_file=ajs.output_file,
                            stderr_file=ajs.error_file):
        return

    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file, shell=job_wrapper.shell, galaxy_virtual_env=None)
    try:
        self.write_executable_script(ajs.job_file, script)
    except Exception:
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception("(%s) failure writing job script" % job_wrapper.get_id_tag())
        return

    # Construction of the Kubernetes Job object follows: http://kubernetes.io/docs/user-guide/persistent-volumes/
    k8s_job_name = self.__produce_unique_k8s_job_name(job_wrapper.get_id_tag())
    k8s_job_obj = job_object_dict(
        self.runner_params,
        k8s_job_name,
        self.__get_k8s_job_spec(ajs)
    )

    # Checks if job exists and is trusted, or if it needs re-creation.
    job = Job(self._pykube_api, k8s_job_obj)
    job_exists = job.exists()
    if job_exists and not self._galaxy_instance_id:
        # if galaxy instance id is not set, then we don't trust matching jobs and we simply delete and
        # re-create the job
        log.debug("Matching job exists, but Job is not trusted, so it will be deleted and a new one created.")
        job.delete()
        elapsed_seconds = 0
        while job.exists():
            sleep(3)
            elapsed_seconds += 3
            if elapsed_seconds > self.runner_params['k8s_timeout_seconds_job_deletion']:
                log.debug("Timed out before k8s could delete existing untrusted job " + k8s_job_name +
                          ", not queuing associated Galaxy job.")
                return
            log.debug("Waiting for job to be deleted " + k8s_job_name)

        Job(self._pykube_api, k8s_job_obj).create()
    elif job_exists and self._galaxy_instance_id:
        # The job exists and we trust the identifier.
        log.debug("Matching job exists, but Job is trusted, so we simply use the existing one for " + k8s_job_name)
        # We simply leave the k8s job to be handled later on by check_watched_item().
    else:
        # Creates the Kubernetes Job if it doesn't exist.
        job.create()

    # define job attributes in the AsyncronousJobState for follow-up
    ajs.job_id = k8s_job_name
    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_external_id(k8s_job_name)
    self.monitor_queue.put(ajs)
def queue_job(self, job_wrapper):
    """Write JSE-Drop file to drop location
    """
    # Get the configured job destination
    job_destination = job_wrapper.job_destination
    # Get the parameters defined for this destination
    # i.e. location of the drop-off directory etc
    drop_off_dir = self._get_drop_dir()
    virtual_env = self._get_virtual_env()
    qsub_options = self._get_qsub_options(job_destination)
    galaxy_slots = self._get_galaxy_slots(job_destination)
    galaxy_id = self._get_galaxy_id()
    log.debug("queue_job: drop-off dir = %s" % drop_off_dir)
    log.debug("queue_job: virtual_env = %s" % virtual_env)
    log.debug("queue_job: qsub options = %s" % qsub_options)
    log.debug("queue_job: galaxy_slots = %s" % galaxy_slots)
    log.debug("queue_job: galaxy_id = %s" % galaxy_id)
    # ID and name for job
    galaxy_id_tag = job_wrapper.get_id_tag()
    log.debug("ID tag: %s" % galaxy_id_tag)
    job_name = self._get_job_name(galaxy_id_tag,
                                  job_wrapper.tool.old_id,
                                  galaxy_id)
    log.debug("Job name: %s" % job_name)
    if drop_off_dir is None:
        # Can't locate drop-off dir
        job_wrapper.fail("failure preparing job script (no JSE-drop "
                         "directory defined)", exception=True)
        log.exception("(%s/%s) failure writing job script (no "
                      "JSE-drop directory defined)" % (galaxy_id_tag, job_name))
        return
    # Initialise JSE-drop wrapper
    jse_drop = JSEDrop(drop_off_dir)
    # Prepare the job wrapper (or abort)
    if not self.prepare_job(job_wrapper):
        return
    # Sort out the slots (see e.g. condor.py for example)
    if galaxy_slots:
        galaxy_slots_statement = 'GALAXY_SLOTS="%s"; export GALAXY_SLOTS_CONFIGURED="1"' % galaxy_slots
    else:
        galaxy_slots_statement = 'GALAXY_SLOTS="1"'
    # Create script contents
    script = self.get_job_file(job_wrapper,
                               galaxy_virtual_env=virtual_env,
                               slots_statement=galaxy_slots_statement,
                               exit_code_path=None)
    # Separate leading shell specification from generated script
    shell = '\n'.join(filter(lambda x: x.startswith('#!'),
                             script.split('\n')))
    script = '\n'.join(filter(lambda x: not x.startswith('#!'),
                              script.split('\n')))
    # Create header with embedded qsub flags
    qsub_header = ["-V",
                   "-wd %s" % job_wrapper.working_directory]
    if qsub_options:
        qsub_header.append(qsub_options)
    qsub_header = '\n'.join(["#$ %s" % opt for opt in qsub_header])
    log.debug("qsub_header: %s" % qsub_header)
    # Reassemble the script components
    script = "\n".join((shell, qsub_header, script))
    # Create the drop file to submit the job
    try:
        drop_file = jse_drop.run(job_name, script)
        log.debug("created drop file %s" % drop_file)
        log.info("(%s) submitted as %s" % (galaxy_id_tag, job_name))
    except:
        # Some problem writing the qsub file
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception("(%s/%s) failure writing job script" % (galaxy_id_tag, job_name))
        return
    # External job id (i.e. id used by JSE-Drop as a handle to
    # identify the job) is the same as the job name here
    external_job_id = job_name
    # Store runner information for tracking if Galaxy restarts
    job_wrapper.set_job_destination(job_destination, external_job_id)
    # Store state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_name
    job_state.old_state = True
    job_state.running = False
    job_state.job_destination = job_destination
    # Add to the queue of jobs to monitor
    self.monitor_job(job_state)
    log.info("%s: queued" % job_name)
def queue_job(self, job_wrapper):
    """Create job script and submit it to the DRM"""
    # prepare the job
    include_metadata = asbool(job_wrapper.job_destination.params.get("embed_metadata_in_job", DEFAULT_EMBED_METADATA_IN_JOB))
    if not self.prepare_job(job_wrapper, include_metadata=include_metadata):
        return

    # Get shell and job execution interface
    job_destination = job_wrapper.job_destination
    shell_params, job_params = self.parse_destination_params(job_destination.params)
    shell, job_interface = self.get_cli_plugins(shell_params, job_params)

    # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
    galaxy_id_tag = job_wrapper.get_id_tag()

    # define job attributes
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper)

    job_file_kwargs = job_interface.job_script_kwargs(ajs.output_file, ajs.error_file, ajs.job_name)
    script = self.get_job_file(
        job_wrapper,
        exit_code_path=ajs.exit_code_file,
        shell=job_wrapper.shell,
        **job_file_kwargs
    )

    try:
        self.write_executable_script(ajs.job_file, script)
    except Exception:
        log.exception(f"({galaxy_id_tag}) failure writing job script")
        job_wrapper.fail("failure preparing job script", exception=True)
        return

    # job was deleted while we were preparing it
    if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED):
        log.debug("(%s) Job deleted/stopped by user before it entered the queue", galaxy_id_tag)
        if job_wrapper.cleanup_job in ("always", "onsuccess"):
            job_wrapper.cleanup()
        return

    log.debug(f"({galaxy_id_tag}) submitting file: {ajs.job_file}")

    returncode, stdout = self.submit(shell, job_interface, ajs.job_file, galaxy_id_tag, retry=MAX_SUBMIT_RETRY)
    if returncode != 0:
        job_wrapper.fail("failure submitting job")
        return
    # Some job runners return something like 'Submitted batch job XXXX'
    # Strip and split to get job ID.
    external_job_id = stdout.strip().split()[-1]
    if not external_job_id:
        log.error(f'({galaxy_id_tag}) submission did not return a job identifier, failing job')
        job_wrapper.fail("failure submitting job")
        return

    log.info(f"({galaxy_id_tag}) queued with identifier: {external_job_id}")

    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_external_id(external_job_id)

    # Store state information for job
    ajs.job_id = external_job_id
    ajs.old_state = 'new'
    ajs.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put(ajs)
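# NOTE: hedged sketch. The queue_job() above delegates submission to
# self.submit(shell, job_interface, job_file, galaxy_id_tag, retry=MAX_SUBMIT_RETRY);
# the helper below is a hypothetical illustration of such a retry wrapper, not the
# actual Galaxy implementation. The one-second backoff and the reuse of the module's
# log and time names are assumptions.
def submit(self, shell, job_interface, job_file, galaxy_id_tag, retry=3, timeout=1):
    """Submit the job script, retrying on a non-zero return code."""
    for attempt in range(1, retry + 1):
        # Each attempt re-runs the plugin's submit command through the shell interface.
        cmd_out = shell.execute(job_interface.submit(job_file))
        if cmd_out.returncode == 0:
            return cmd_out.returncode, cmd_out.stdout
        log.warning("(%s) submission failed (attempt %d of %d): %s",
                    galaxy_id_tag, attempt, retry, cmd_out.stderr)
        if attempt < retry:
            time.sleep(timeout)
    # Give up after the final attempt and let the caller fail the job.
    return cmd_out.returncode, cmd_out.stdout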
def queue_job(self, job_wrapper):
    """Create job script and submit it to the DRM"""
    # prepare the job
    # external_runJob_script can be None, in which case it's not used.
    external_runjob_script = job_wrapper.get_destination_configuration("drmaa_external_runjob_script", None)
    include_metadata = asbool(job_wrapper.job_destination.params.get("embed_metadata_in_job", True))
    if not self.prepare_job(job_wrapper, include_metadata=include_metadata):
        return

    # get configured job destination
    job_destination = job_wrapper.job_destination

    # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
    galaxy_id_tag = job_wrapper.get_id_tag()

    job_name = self._job_name(job_wrapper)
    ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name)

    # set up the drmaa job template
    jt = dict(
        remoteCommand=ajs.job_file,
        jobName=ajs.job_name,
        workingDirectory=job_wrapper.working_directory,
        outputPath=f":{ajs.output_file}",
        errorPath=f":{ajs.error_file}"
    )

    # Avoid a jt.exitCodePath for now - it's only used when finishing.
    native_spec = job_destination.params.get('nativeSpecification', None)
    if native_spec is None:
        native_spec = job_destination.params.get('native_specification', None)
    if native_spec is not None:
        jt['nativeSpecification'] = native_spec

    # fill in the DRM's job run template
    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file, shell=job_wrapper.shell)
    try:
        self.write_executable_script(ajs.job_file, script, job_io=job_wrapper.job_io)
    except Exception:
        job_wrapper.fail("failure preparing job script", exception=True)
        log.exception(f"({galaxy_id_tag}) failure writing job script")
        return

    # job was deleted while we were preparing it
    if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED):
        log.debug("(%s) Job deleted/stopped by user before it entered the queue", galaxy_id_tag)
        if job_wrapper.cleanup_job in ("always", "onsuccess"):
            job_wrapper.cleanup()
        return

    log.debug("(%s) submitting file %s", galaxy_id_tag, ajs.job_file)
    if native_spec:
        log.debug("(%s) native specification is: %s", galaxy_id_tag, native_spec)

    # runJob will raise if there's a submit problem
    if external_runjob_script is None:
        # TODO: create a queue for retrying submission indefinitely
        # TODO: configurable max tries and sleep
        trynum = 0
        external_job_id = None
        fail_msg = None
        while external_job_id is None and trynum < 5:
            try:
                external_job_id = self.ds.run_job(**jt)
                break
            except (drmaa.InternalException, drmaa.DeniedByDrmException) as e:
                trynum += 1
                log.warning('(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e)
                fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                time.sleep(5)
            except Exception:
                log.exception('(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag)
                trynum = 5
        else:
            log.error(f"({galaxy_id_tag}) All attempts to submit job failed")
            if not fail_msg:
                fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
            job_wrapper.fail(fail_msg)
            return
    else:
        job_wrapper.change_ownership_for_run()
        # if user credentials are not available, use galaxy credentials (if permitted)
        allow_guests = asbool(job_wrapper.job_destination.params.get("allow_guests", False))
        pwent = job_wrapper.user_system_pwent
        if pwent is None:
            if not allow_guests:
                fail_msg = f"User {job_wrapper.user} is not mapped to any real user, and not permitted to start jobs."
                job_wrapper.fail(fail_msg)
                return
            pwent = job_wrapper.galaxy_system_pwent
        log.debug(f'({galaxy_id_tag}) submitting with credentials: {pwent[0]} [uid: {pwent[2]}]')
        filename = self.store_jobtemplate(job_wrapper, jt)
        self.userid = pwent[2]
        external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2])
        if external_job_id is None:
            job_wrapper.fail(f"({galaxy_id_tag}) could not queue job")
            return
    log.info(f"({galaxy_id_tag}) queued as {external_job_id}")

    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_external_id(external_job_id)

    # Store DRM related state information for job
    ajs.job_id = external_job_id
    ajs.old_state = 'new'
    ajs.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put(ajs)
def queue_job( self, job_wrapper ):
    """Create job script and submit it to the DRM"""
    # prepare the job
    if not self.prepare_job( job_wrapper, include_metadata=True ):
        return

    # command line has been added to the wrapper by prepare_job()
    command_line = job_wrapper.runner_command_line

    # Get shell and job execution interface
    job_destination = job_wrapper.job_destination
    shell_params, job_params = self.parse_destination_params(job_destination.params)
    shell, job_interface = self.get_cli_plugins(shell_params, job_params)

    # Updated by jinchao
    print "\nQueue job shell/job interface"
    print "shell: %s | job: %s" % (shell_params['plugin'], job_params['plugin'])

    # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
    galaxy_id_tag = job_wrapper.get_id_tag()

    # define job attributes
    ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )

    # fill in the DRM's job run template
    script = job_interface.get_job_template(ajs.output_file, ajs.error_file, ajs.job_name, job_wrapper, command_line, ajs.exit_code_file)

    # Updated by jinchao
    print "\nQueue job script"
    print script

    try:
        fh = file(ajs.job_file, "w")
        fh.write(script)
        fh.close()
    except:
        log.exception("(%s) failure writing job script" % galaxy_id_tag )
        job_wrapper.fail("failure preparing job script", exception=True)
        return

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.info("(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
        if self.app.config.cleanup_job in ("always", "onsuccess"):
            job_wrapper.cleanup()
        return

    log.debug( "(%s) submitting file: %s" % ( galaxy_id_tag, ajs.job_file ) )
    log.debug( "(%s) command is: %s" % ( galaxy_id_tag, command_line ) )

    cmd_out = shell.execute(job_interface.submit(ajs.job_file, job_wrapper))

    # Updated by jinchao
    print "\nQueue job cmd_out"
    print cmd_out

    if cmd_out.returncode != 0:
        log.error('(%s) submission failed (stdout): %s' % (galaxy_id_tag, cmd_out.stdout))
        log.error('(%s) submission failed (stderr): %s' % (galaxy_id_tag, cmd_out.stderr))
        job_wrapper.fail("failure submitting job")
        return

    # Updated by jinchao
    print "\nQueue job cmd_out returncode:"
    print cmd_out.returncode
    print "\nQueue job cmd_out stdout:"
    print cmd_out.stdout
    print "\nQueue job cmd_out stderr:"
    print cmd_out.stderr
    print "\nQueue job cmd_out job params"
    print job_params

    # external_job_id = cmd_out.stdout.strip()
    # Updated by jinchao: handle job info to get job id
    if job_params['plugin'] == 'Hadoop':
        # The stdout was sent to stderr under hadoop job plugin
        external_job_id = job_interface.parse_job_info(cmd_out.stderr)
    else:
        external_job_id = job_interface.parse_job_info(cmd_out.stdout)

    print "\nQueue job JOB ID:"
    print external_job_id

    if not external_job_id:
        log.error('(%s) submission did not return a job identifier, failing job' % galaxy_id_tag)
        job_wrapper.fail("failure submitting job")
        return

    log.info("(%s) queued with identifier: %s" % ( galaxy_id_tag, external_job_id ) )

    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_job_destination( job_destination, external_job_id )

    # Store state information for job
    ajs.job_id = external_job_id
    ajs.old_state = 'new'
    ajs.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( ajs )
def queue_job( self, job_wrapper ):
    """Create job script and submit it to the DRM"""
    # prepare the job
    include_metadata = asbool( job_wrapper.job_destination.params.get( "embed_metadata_in_job", True ) )
    if not self.prepare_job( job_wrapper, include_metadata=include_metadata ):
        return

    # get configured job destination
    job_destination = job_wrapper.job_destination

    # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
    galaxy_id_tag = job_wrapper.get_id_tag()

    # define job attributes
    job_name = 'g%s' % galaxy_id_tag
    if job_wrapper.tool.old_id:
        job_name += '_%s' % job_wrapper.tool.old_id
    if self.external_runJob_script is None:
        job_name += '_%s' % job_wrapper.user
    job_name = ''.join( map( lambda x: x if x in ( string.letters + string.digits + '_' ) else '_', job_name ) )
    ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name )

    # set up the drmaa job template
    jt = self.ds.createJobTemplate()
    jt.remoteCommand = ajs.job_file
    jt.jobName = ajs.job_name
    jt.workingDirectory = job_wrapper.working_directory
    jt.outputPath = ":%s" % ajs.output_file
    jt.errorPath = ":%s" % ajs.error_file

    # Avoid a jt.exitCodePath for now - it's only used when finishing.
    native_spec = job_destination.params.get('nativeSpecification', None)
    if native_spec is not None:
        jt.nativeSpecification = native_spec

    # fill in the DRM's job run template
    script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file)

    try:
        fh = file( ajs.job_file, "w" )
        fh.write( script )
        fh.close()
        os.chmod( ajs.job_file, 0o755 )
    except:
        job_wrapper.fail( "failure preparing job script", exception=True )
        log.exception( "(%s) failure writing job script" % galaxy_id_tag )
        return

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
        if self.app.config.cleanup_job in ( "always", "onsuccess" ):
            job_wrapper.cleanup()
        return

    log.debug( "(%s) submitting file %s", galaxy_id_tag, ajs.job_file )
    if native_spec:
        log.debug( "(%s) native specification is: %s", galaxy_id_tag, native_spec )

    # runJob will raise if there's a submit problem
    if self.external_runJob_script is None:
        # TODO: create a queue for retrying submission indefinitely
        # TODO: configurable max tries and sleep
        trynum = 0
        external_job_id = None
        fail_msg = None
        while external_job_id is None and trynum < 5:
            try:
                external_job_id = self.ds.runJob(jt)
                break
            except ( drmaa.InternalException, drmaa.DeniedByDrmException ) as e:
                trynum += 1
                log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e )
                fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                time.sleep( 5 )
            except:
                log.exception( '(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag )
                trynum = 5
        else:
            log.error( "(%s) All attempts to submit job failed" % galaxy_id_tag )
            if not fail_msg:
                fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
            job_wrapper.fail( fail_msg )
            self.ds.deleteJobTemplate( jt )
            return
    else:
        job_wrapper.change_ownership_for_run()
        # if user credentials are not available, use galaxy credentials (if permitted)
        allow_guests = asbool( job_wrapper.job_destination.params.get( "allow_guests", False ) )
        pwent = job_wrapper.user_system_pwent
        if pwent is None:
            if not allow_guests:
                fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." % job_wrapper.user
                job_wrapper.fail( fail_msg )
                self.ds.deleteJobTemplate( jt )
                return
            pwent = job_wrapper.galaxy_system_pwent
        log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, pwent[0], pwent[2] ) )
        filename = self.store_jobtemplate(job_wrapper, jt)
        self.userid = pwent[2]
        external_job_id = self.external_runjob(filename, pwent[2]).strip()
    log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) )

    # store runner information for tracking if Galaxy restarts
    job_wrapper.set_job_destination( job_destination, external_job_id )

    # Store DRM related state information for job
    ajs.job_id = external_job_id
    ajs.old_state = 'new'
    ajs.job_destination = job_destination

    # delete the job template
    self.ds.deleteJobTemplate( jt )

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put( ajs )
def queue_job(self, job_wrapper):
    job_destination = job_wrapper.job_destination
    self._populate_parameter_defaults(job_destination)

    command_line, client, remote_job_config, compute_environment, remote_container = self.__prepare_job(job_wrapper, job_destination)

    if not command_line:
        return

    try:
        dependencies_description = PulsarJobRunner.__dependencies_description(client, job_wrapper)
        rewrite_paths = not PulsarJobRunner.__rewrite_parameters(client)
        path_rewrites_unstructured = {}
        output_names = []
        if compute_environment:
            path_rewrites_unstructured = compute_environment.path_rewrites_unstructured
            output_names = compute_environment.output_names()

            client_inputs_list = []
            for input_dataset_wrapper in job_wrapper.get_input_paths():
                # str here to resolve false_path if set on a DatasetPath object.
                path = str(input_dataset_wrapper)
                object_store_ref = {
                    "dataset_id": input_dataset_wrapper.dataset_id,
                    "dataset_uuid": str(input_dataset_wrapper.dataset_uuid),
                    "object_store_id": input_dataset_wrapper.object_store_id,
                }
                client_inputs_list.append(ClientInput(path, CLIENT_INPUT_PATH_TYPES.INPUT_PATH, object_store_ref=object_store_ref))

            for input_extra_path in compute_environment.path_rewrites_input_extra.keys():
                # TODO: track dataset for object_store_ref...
                client_inputs_list.append(ClientInput(input_extra_path, CLIENT_INPUT_PATH_TYPES.INPUT_EXTRA_FILES_PATH))

            for input_metadata_path in compute_environment.path_rewrites_input_metadata.keys():
                # TODO: track dataset for object_store_ref...
                client_inputs_list.append(ClientInput(input_metadata_path, CLIENT_INPUT_PATH_TYPES.INPUT_METADATA_PATH))

            input_files = None
            client_inputs = ClientInputs(client_inputs_list)
        else:
            input_files = self.get_input_files(job_wrapper)
            client_inputs = None

        if self.app.config.metadata_strategy == "legacy":
            # Drop this branch in 19.09.
            metadata_directory = job_wrapper.working_directory
        else:
            metadata_directory = os.path.join(job_wrapper.working_directory, "metadata")

        remote_pulsar_app_config = job_destination.params.get("pulsar_app_config", {})
        job_directory_files = []
        config_files = job_wrapper.extra_filenames
        tool_script = os.path.join(job_wrapper.working_directory, "tool_script.sh")
        if os.path.exists(tool_script):
            log.debug("Registering tool_script for Pulsar transfer [%s]" % tool_script)
            job_directory_files.append(tool_script)
        client_job_description = ClientJobDescription(
            command_line=command_line,
            input_files=input_files,
            client_inputs=client_inputs,  # Only one of these input defs should be non-None
            client_outputs=self.__client_outputs(client, job_wrapper),
            working_directory=job_wrapper.tool_working_directory,
            metadata_directory=metadata_directory,
            tool=job_wrapper.tool,
            config_files=config_files,
            dependencies_description=dependencies_description,
            env=client.env,
            rewrite_paths=rewrite_paths,
            arbitrary_files=path_rewrites_unstructured,
            touch_outputs=output_names,
            remote_pulsar_app_config=remote_pulsar_app_config,
            job_directory_files=job_directory_files,
            container=None if not remote_container else remote_container.container_id,
        )
        job_id = pulsar_submit_job(client, client_job_description, remote_job_config)
        log.info("Pulsar job submitted with job_id %s" % job_id)
        job_wrapper.set_job_destination(job_destination, job_id)
        job_wrapper.change_state(model.Job.states.QUEUED)
    except Exception:
        job_wrapper.fail("failure running job", exception=True)
        log.exception("failure running job %d", job_wrapper.job_id)
        return

    pulsar_job_state = AsynchronousJobState()
    pulsar_job_state.job_wrapper = job_wrapper
    pulsar_job_state.job_id = job_id
    pulsar_job_state.old_state = True
    pulsar_job_state.running = False
    pulsar_job_state.job_destination = job_destination
    self.monitor_job(pulsar_job_state)