def recover(self, job, job_wrapper):
    """Recovers jobs stuck in the queued/running state when Galaxy started"""
    job_id = job.get_job_runner_external_id()
    pbs_job_state = AsynchronousJobState()
    pbs_job_state.output_file = "%s/%s.o" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.error_file = "%s/%s.e" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.exit_code_file = "%s/%s.ec" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job.id)
    pbs_job_state.job_id = str(job_id)
    pbs_job_state.runner_url = job_wrapper.get_job_runner_url()
    pbs_job_state.job_destination = job_wrapper.job_destination
    job_wrapper.command_line = job.command_line
    pbs_job_state.job_wrapper = job_wrapper
    if job.state == model.Job.states.RUNNING:
        log.debug(
            "(%s/%s) is still in running state, adding to the PBS queue"
            % (job.id, job.get_job_runner_external_id())
        )
        pbs_job_state.old_state = "R"
        pbs_job_state.running = True
        self.monitor_queue.put(pbs_job_state)
    elif job.state == model.Job.states.QUEUED:
        log.debug(
            "(%s/%s) is still in PBS queued state, adding to the PBS queue"
            % (job.id, job.get_job_runner_external_id())
        )
        pbs_job_state.old_state = "Q"
        pbs_job_state.running = False
        self.monitor_queue.put(pbs_job_state)
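# Per-job files written under cluster_files_directory, as used by recover()
# above and queue_job() below:
#   <cluster_files_directory>/<job id>.o   - job stdout
#   <cluster_files_directory>/<job id>.e   - job stderr
#   <cluster_files_directory>/<job id>.ec  - job exit code
#   <cluster_files_directory>/<job id>.sh  - submitted job script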
def queue_job(self, job_wrapper):
    """Create PBS script for a job and submit it to the PBS queue"""
    # prepare the job
    if not self.prepare_job(job_wrapper, include_metadata=not self.app.config.pbs_stage_path):
        return

    job_destination = job_wrapper.job_destination

    # Determine the job's PBS destination (server/queue) and options from the job destination definition
    pbs_queue_name = None
    pbs_server_name = self.default_pbs_server
    pbs_options = []
    if '-q' in job_destination.params and 'destination' not in job_destination.params:
        job_destination.params['destination'] = job_destination.params.pop('-q')
    if 'destination' in job_destination.params:
        if '@' in job_destination.params['destination']:
            # Destination includes a server
            pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
            if pbs_queue_name == '':
                # e.g. `qsub -q @server`
                pbs_queue_name = None
        else:
            # Destination is just a queue
            pbs_queue_name = job_destination.params['destination']
        job_destination.params.pop('destination')

    # Parse PBS params
    pbs_options = self.parse_destination_params(job_destination.params)

    # Explicitly set the determined PBS destination in the persisted job destination for recovery
    job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

    c = pbs.pbs_connect(util.smart_str(pbs_server_name))
    if c <= 0:
        errno, text = pbs.error()
        job_wrapper.fail("Unable to queue job for execution. Resubmitting the job may succeed.")
        log.error("Connection to PBS server for submit failed: %s: %s" % (errno, text))
        return

    # define job attributes
    ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

    output_fnames = job_wrapper.get_output_fnames()

    # If an application server is set, we're staging
    if self.app.config.pbs_application_server:
        pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
        pbs_efile = self.app.config.pbs_application_server + ':' + efile
        output_files = [str(o) for o in output_fnames]
        output_files.append(ecfile)
        stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() + output_files, symlink=True)
        stageout = self.get_stage_in_out(output_files)
        attrs = [
            dict(name=pbs.ATTR_o, value=pbs_ofile),
            dict(name=pbs.ATTR_e, value=pbs_efile),
            dict(name=pbs.ATTR_stagein, value=stagein),
            dict(name=pbs.ATTR_stageout, value=stageout),
        ]
    # If not, we're using NFS
    else:
        attrs = [
            dict(name=pbs.ATTR_o, value=ofile),
            dict(name=pbs.ATTR_e, value=efile),
        ]

    # define PBS job options
    attrs.append(
        dict(
            name=pbs.ATTR_N,
            value=str("%s_%s_%s" % (job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user)),
        )
    )
    job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
    for i, attr in enumerate(attrs + pbs_options):
        job_attrs[i].name = attr['name']
        job_attrs[i].value = attr['value']
        if 'resource' in attr:
            job_attrs[i].resource = attr['resource']
    exec_dir = os.path.abspath(job_wrapper.working_directory)

    # write the job script
    if self.app.config.pbs_stage_path != '':
        # touch the ecfile so that it gets staged
        with open(ecfile, 'a'):
            os.utime(ecfile, None)

        # output_files was populated in the staging branch above; staging
        # assumes pbs_application_server is also set
        stage_commands = pbs_symlink_template % (
            " ".join(job_wrapper.get_input_fnames() + output_files),
            self.app.config.pbs_stage_path,
            exec_dir,
        )
    else:
        stage_commands = ''

    env_setup_commands = [stage_commands]
    script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
    job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
    self.write_executable_script(job_file, script)

    # job was deleted while we were preparing it
    if job_wrapper.get_state() == model.Job.states.DELETED:
        log.debug("Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id)
        pbs.pbs_disconnect(c)
        if job_wrapper.cleanup_job in ("always", "onsuccess"):
            self.cleanup((ofile, efile, ecfile, job_file))
            job_wrapper.cleanup()
        return

    # submit
    # The job tag includes the job and the task identifier
    # (if a TaskWrapper was passed in):
    galaxy_job_id = job_wrapper.get_id_tag()
    log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

    tries = 0
    while tries < 5:
        job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
        tries += 1
        if job_id:
            pbs.pbs_disconnect(c)
            break
        errno, text = pbs.error()
        log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text))
        time.sleep(2)
    else:
        # while/else: reached only if the loop never hit `break`, i.e. all
        # five submission attempts failed
        log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
        job_wrapper.fail("Unable to run this job due to a cluster error, please retry it later")
        return

    if pbs_queue_name is None:
        log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id))
    else:
        log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id))

    # persist destination
    job_wrapper.set_job_destination(job_destination, job_id)

    # Store PBS related state information for job
    job_state = AsynchronousJobState()
    job_state.job_wrapper = job_wrapper
    job_state.job_id = job_id
    job_state.job_file = job_file
    job_state.output_file = ofile
    job_state.error_file = efile
    job_state.exit_code_file = ecfile
    job_state.old_state = 'N'
    job_state.running = False
    job_state.job_destination = job_destination

    # Add to our 'queue' of jobs to monitor
    self.monitor_queue.put(job_state)
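# Illustrative mapping of job destination params to a PBS queue/server, per
# the parsing logic at the top of queue_job() (server names here are
# hypothetical examples, not defaults):
#   {'-q': 'batch'}                          -> queue 'batch' on the default server
#   {'destination': 'batch@pbs.example.org'} -> queue 'batch' on 'pbs.example.org'
#   {'destination': '@pbs.example.org'}      -> default queue on 'pbs.example.org'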