Example #1
 def _job_state( self, job, job_wrapper ):
     job_state = AsynchronousJobState()
     # TODO: Determine why this is set when using normal message queue updates
     # but not CLI submitted MQ updates...
     raw_job_id = job.get_job_runner_external_id() or job_wrapper.job_id
     job_state.job_id = str( raw_job_id )
     job_state.runner_url = job_wrapper.get_job_runner_url()
     job_state.job_destination = job_wrapper.job_destination
     job_state.job_wrapper = job_wrapper
     return job_state
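A minimal usage sketch (hypothetical, not taken from any of the runners shown here) of how a helper like _job_state() is typically consumed on restart; it assumes a runner that, like the recover() examples below, has a monitor_queue and access to Galaxy's model module:

def recover(self, job, job_wrapper):
    # Sketch: rebuild the persisted job state and re-queue it for monitoring,
    # mirroring the recover() implementations in the examples that follow.
    job_state = self._job_state(job, job_wrapper)
    job_state.old_state = job.state
    job_state.running = job.state == model.Job.states.RUNNING
    self.monitor_queue.put(job_state)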
Example #2
 def recover(self, job, job_wrapper):
     """Recovers jobs stuck in the queued/running state when Galaxy started"""
     # TODO this needs to be implemented to override unimplemented base method
     job_id = job.get_job_runner_external_id()
     log.debug("k8s trying to recover job: " + job_id)
     if job_id is None:
         self.put(job_wrapper)
         return
     ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper)
     ajs.job_id = str(job_id)
     ajs.command_line = job.command_line
     ajs.job_wrapper = job_wrapper
     ajs.job_destination = job_wrapper.job_destination
     if job.state == model.Job.states.RUNNING:
         log.debug("(%s/%s) is still in running state, adding to the runner monitor queue" % (
             job.id, job.job_runner_external_id))
         ajs.old_state = model.Job.states.RUNNING
         ajs.running = True
         self.monitor_queue.put(ajs)
     elif job.state == model.Job.states.QUEUED:
         log.debug("(%s/%s) is still in queued state, adding to the runner monitor queue" % (
             job.id, job.job_runner_external_id))
         ajs.old_state = model.Job.states.QUEUED
         ajs.running = False
         self.monitor_queue.put(ajs)
Example #3
 def recover(self, job, job_wrapper):
     # Recovers jobs in the queued/running state when Galaxy started
     # ('job' is expected to be a model.Job instance)
     # Fetch the job id used by JSE-Drop
     job_name = job.get_job_runner_external_id()
     # Get the job destination
     job_destination = job_wrapper.job_destination
     # Fetch the drop dir
     drop_off_dir = self._get_drop_dir()
     log.debug("recover: drop-off dir = %s" % drop_off_dir)
     jse_drop = JSEDrop(drop_off_dir)
     # Store state information for job
     job_state = AsynchronousJobState()
     job_state.job_wrapper = job_wrapper
     job_state.job_id = job_name
     job_state.job_destination = job_destination
     # Sort out the status
     if job.state == model.Job.states.RUNNING:
         job_state.old_state = True
         job_state.running = True
     elif job.get_state() == model.Job.states.QUEUED:
         job_state.old_state = True
         job_state.running = False
     # Add to the queue of jobs to monitor
     self.monitor_queue.put(job_state)
Example #4
 def recover( self, job, job_wrapper ):
     """Recovers jobs stuck in the queued/running state when Galaxy started"""
     job_id = job.get_job_runner_external_id()
     if job_id is None:
         self.put( job_wrapper )
         return
     ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )
     ajs.job_id = str( job_id )
     ajs.command_line = job.get_command_line()
     ajs.job_wrapper = job_wrapper
     ajs.job_destination = job_wrapper.job_destination
     if job.state == model.Job.states.RUNNING:
         log.debug( "(%s/%s) is still in running state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
         ajs.old_state = drmaa.JobState.RUNNING
         ajs.running = True
         self.monitor_queue.put( ajs )
     elif job.get_state() == model.Job.states.QUEUED:
         log.debug( "(%s/%s) is still in DRM queued state, adding to the DRM queue" % ( job.get_id(), job.get_job_runner_external_id() ) )
         ajs.old_state = drmaa.JobState.QUEUED_ACTIVE
         ajs.running = False
         self.monitor_queue.put( ajs )
Example #5
 def recover( self, job, job_wrapper ):
     """Recovers jobs stuck in the queued/running state when Galaxy started"""
     job_state = AsynchronousJobState()
     job_state.job_id = str( job.get_job_runner_external_id() )
     job_state.runner_url = job_wrapper.get_job_runner_url()
     job_state.job_destination = job_wrapper.job_destination
     job_wrapper.command_line = job.get_command_line()
     job_state.job_wrapper = job_wrapper
     state = job.get_state()
     if state in [model.Job.states.RUNNING, model.Job.states.QUEUED]:
         log.debug( "(LWR/%s) is still in running state, adding to the LWR queue" % ( job.get_id()) )
         job_state.old_state = True
         job_state.running = state == model.Job.states.RUNNING
         self.monitor_queue.put( job_state )
Example #6
 def recover(self, job, job_wrapper):
     msg = ('({name!r}/{runner!r}) is still in {state!s} state, adding to'
            ' the runner monitor queue')
     job_id = job.get_job_runner_external_id()
     ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                job_wrapper=job_wrapper)
     ajs.job_id = self.JOB_NAME_PREFIX + str(job_id)
     ajs.command_line = job.command_line
     ajs.job_wrapper = job_wrapper
     ajs.job_destination = job_wrapper.job_destination
     if job.state == model.Job.states.RUNNING:
         LOGGER.debug(msg.format(
             name=job.id, runner=job.job_runner_external_id,
             state='running'))
         ajs.old_state = model.Job.states.RUNNING
         ajs.running = True
         self.monitor_queue.put(ajs)
     elif job.state == model.Job.states.QUEUED:
         LOGGER.debug(msg.format(
             name=job.id, runner=job.job_runner_external_id,
             state='queued'))
         ajs.old_state = model.Job.states.QUEUED
         ajs.running = False
         self.monitor_queue.put(ajs)
Example #7
    def recover(self, job, job_wrapper):
        """ Recovers jobs stuck in the queued/running state when Galaxy started """
        """ This method is called by galaxy at the time of startup.
            Jobs in Running & Queued status in galaxy are put in the monitor_queue by creating an AsynchronousJobState object
        """
        job_id = job_wrapper.job_id
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper)
        ajs.job_id = str(job_id)
        ajs.job_destination = job_wrapper.job_destination
        job_wrapper.command_line = job.command_line
        ajs.job_wrapper = job_wrapper
        if job.state == model.Job.states.RUNNING:
            log.debug("(%s/%s) is still in running state, adding to the god queue" % (job.id, job.get_job_runner_external_id()))
            ajs.old_state = 'R'
            ajs.running = True
            self.monitor_queue.put(ajs)

        elif job.state == model.Job.states.QUEUED:
            log.debug("(%s/%s) is still in god queued state, adding to the god queue" % (job.id, job.get_job_runner_external_id()))
            ajs.old_state = 'Q'
            ajs.running = False
            self.monitor_queue.put(ajs)
Example #8
    def queue_job(self, job_wrapper):
        job_destination = job_wrapper.job_destination
        self._populate_parameter_defaults(job_destination)

        command_line, client, remote_job_config, compute_environment = self.__prepare_job(job_wrapper, job_destination)

        if not command_line:
            return

        try:
            dependencies_description = PulsarJobRunner.__dependencies_description(client, job_wrapper)
            rewrite_paths = not PulsarJobRunner.__rewrite_parameters(client)
            unstructured_path_rewrites = {}
            output_names = []
            if compute_environment:
                unstructured_path_rewrites = compute_environment.unstructured_path_rewrites
                output_names = compute_environment.output_names()

            client_job_description = ClientJobDescription(
                command_line=command_line,
                input_files=self.get_input_files(job_wrapper),
                client_outputs=self.__client_outputs(client, job_wrapper),
                working_directory=job_wrapper.tool_working_directory,
                metadata_directory=job_wrapper.working_directory,
                tool=job_wrapper.tool,
                config_files=job_wrapper.extra_filenames,
                dependencies_description=dependencies_description,
                env=client.env,
                rewrite_paths=rewrite_paths,
                arbitrary_files=unstructured_path_rewrites,
                touch_outputs=output_names,
            )
            job_id = pulsar_submit_job(client, client_job_description, remote_job_config)
            log.info("Pulsar job submitted with job_id %s" % job_id)
            job_wrapper.set_job_destination(job_destination, job_id)
            job_wrapper.change_state(model.Job.states.QUEUED)
        except Exception:
            job_wrapper.fail("failure running job", exception=True)
            log.exception("failure running job %d", job_wrapper.job_id)
            return

        pulsar_job_state = AsynchronousJobState()
        pulsar_job_state.job_wrapper = job_wrapper
        pulsar_job_state.job_id = job_id
        pulsar_job_state.old_state = True
        pulsar_job_state.running = False
        pulsar_job_state.job_destination = job_destination
        self.monitor_job(pulsar_job_state)
Example #9
    def queue_job(self, job_wrapper):
        command_line = ''
        job_destination = job_wrapper.job_destination

        try:
            job_wrapper.prepare()
            if hasattr(job_wrapper, 'prepare_input_files_cmds') and job_wrapper.prepare_input_files_cmds is not None:
                for cmd in job_wrapper.prepare_input_files_cmds:  # run the commands to stage the input files
                    #log.debug( 'executing: %s' % cmd )
                    if 0 != os.system(cmd):
                        raise Exception('Error running file staging command: %s' % cmd)
                job_wrapper.prepare_input_files_cmds = None  # prevent them from being used in-line
            command_line = self.build_command_line( job_wrapper, include_metadata=False, include_work_dir_outputs=False )
        except:
            job_wrapper.fail( "failure preparing job", exception=True )
            log.exception("failure running job %d" % job_wrapper.job_id)
            return

        # If we were able to get a command line, run the job
        if not command_line:
            job_wrapper.finish( '', '' )
            return

        try:
            client = self.get_client_from_wrapper(job_wrapper)
            output_files = self.get_output_files(job_wrapper)
            input_files = job_wrapper.get_input_fnames()
            working_directory = job_wrapper.working_directory
            tool = job_wrapper.tool
            file_stager = FileStager(client, tool, command_line, job_wrapper.extra_filenames, input_files, output_files, working_directory)
            rebuilt_command_line = file_stager.get_rewritten_command_line()
            job_id = file_stager.job_id
            client.launch( rebuilt_command_line )
            job_wrapper.set_job_destination( job_destination, job_id )
            job_wrapper.change_state( model.Job.states.QUEUED )
        except:
            job_wrapper.fail( "failure running job", exception=True )
            log.exception("failure running job %d" % job_wrapper.job_id)
            return

        lwr_job_state = AsynchronousJobState()
        lwr_job_state.job_wrapper = job_wrapper
        lwr_job_state.job_id = job_id
        lwr_job_state.old_state = True
        lwr_job_state.running = False
        lwr_job_state.job_destination = job_destination
        self.monitor_job(lwr_job_state)
Example #10
    def queue_job(self, job_wrapper):
        job_destination = job_wrapper.job_destination

        command_line, client, remote_job_config = self.__prepare_job( job_wrapper, job_destination )

        if not command_line:
            return

        try:
            dependency_resolution = LwrJobRunner.__dependency_resolution( client )
            remote_dependency_resolution = dependency_resolution == "remote"
            requirements = job_wrapper.tool.requirements if remote_dependency_resolution else []
            client_job_description = ClientJobDescription(
                command_line=command_line,
                output_files=self.get_output_files(job_wrapper),
                input_files=job_wrapper.get_input_fnames(),
                working_directory=job_wrapper.working_directory,
                tool=job_wrapper.tool,
                config_files=job_wrapper.extra_filenames,
                requirements=requirements,
                version_file=job_wrapper.get_version_string_path(),
            )
            job_id = lwr_submit_job(client, client_job_description, remote_job_config)
            log.info("lwr job submitted with job_id %s" % job_id)
            job_wrapper.set_job_destination( job_destination, job_id )
            job_wrapper.change_state( model.Job.states.QUEUED )
        except Exception:
            job_wrapper.fail( "failure running job", exception=True )
            log.exception("failure running job %d" % job_wrapper.job_id)
            return

        lwr_job_state = AsynchronousJobState()
        lwr_job_state.job_wrapper = job_wrapper
        lwr_job_state.job_id = job_id
        lwr_job_state.old_state = True
        lwr_job_state.running = False
        lwr_job_state.job_destination = job_destination
        self.monitor_job(lwr_job_state)
Example #11
    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=True ):
            return

        # command line has been added to the wrapper by prepare_job()
        command_line = job_wrapper.runner_command_line

        # Get shell and job execution interface
        job_destination = job_wrapper.job_destination
        shell_params, job_params = self.parse_destination_params(job_destination.params)
        shell, job_interface = self.get_cli_plugins(shell_params, job_params)

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )

        job_file_kwargs = job_interface.job_script_kwargs(ajs.output_file, ajs.error_file, ajs.job_name)
        script = self.get_job_file(
            job_wrapper,
            exit_code_path=ajs.exit_code_file,
            **job_file_kwargs
        )

        try:
            fh = file(ajs.job_file, "w")
            fh.write(script)
            fh.close()
        except:
            log.exception("(%s) failure writing job script" % galaxy_id_tag )
            job_wrapper.fail("failure preparing job script", exception=True)
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.info("(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if self.app.config.cleanup_job in ("always", "onsuccess"):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file: %s" % ( galaxy_id_tag, ajs.job_file ) )

        cmd_out = shell.execute(job_interface.submit(ajs.job_file))
        if cmd_out.returncode != 0:
            log.error('(%s) submission failed (stdout): %s' % (galaxy_id_tag, cmd_out.stdout))
            log.error('(%s) submission failed (stderr): %s' % (galaxy_id_tag, cmd_out.stderr))
            job_wrapper.fail("failure submitting job")
            return
        # Some job runners return something like 'Submitted batch job XXXX' 
        # Strip and split to get job ID.
        external_job_id = cmd_out.stdout.strip().split()[-1]
        if not external_job_id:
            log.error('(%s) submission did not return a job identifier, failing job' % galaxy_id_tag)
            job_wrapper.fail("failure submitting job")
            return

        log.info("(%s) queued with identifier: %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )
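Note that this example (and Example #17 below) uses the Python 2-only file() builtin to write the job script. On Python 3 the equivalent write, reusing the ajs, script, galaxy_id_tag and job_wrapper names from the example above, would be a with-statement around open():

try:
    with open(ajs.job_file, "w") as fh:
        fh.write(script)
except Exception:
    log.exception("(%s) failure writing job script" % galaxy_id_tag)
    job_wrapper.fail("failure preparing job script", exception=True)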
Example #12
    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=True ):
            return

        # command line has been added to the wrapper by prepare_job()
        command_line = job_wrapper.runner_command_line
        
        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if self.external_runJob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join( map( lambda x: x if x in ( string.letters + string.digits + '_' ) else '_', job_name ) )
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name )

        # set up the drmaa job template
        jt = self.ds.createJobTemplate()
        jt.remoteCommand = ajs.job_file
        jt.jobName = ajs.job_name
        jt.outputPath = ":%s" % ajs.output_file
        jt.errorPath = ":%s" % ajs.error_file

        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get('nativeSpecification', None)
        if native_spec is not None:
            jt.nativeSpecification = native_spec

        # fill in the DRM's job run template
        script = drm_template % ( job_wrapper.galaxy_lib_dir,
                                  job_wrapper.get_env_setup_clause(),
                                  os.path.abspath( job_wrapper.working_directory ),
                                  command_line,
                                  ajs.exit_code_file )

        try:
            fh = file( ajs.job_file, "w" )
            fh.write( script )
            fh.close()
            os.chmod( ajs.job_file, 0755 )
        except:
            job_wrapper.fail( "failure preparing job script", exception=True )
            log.exception( "(%s) failure writing job script" % galaxy_id_tag )
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if self.app.config.cleanup_job in ( "always", "onsuccess" ):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file %s" % ( galaxy_id_tag, ajs.job_file ) )
        log.debug( "(%s) command is: %s" % ( galaxy_id_tag, command_line ) )

        # runJob will raise if there's a submit problem
        if self.external_runJob_script is None:
            external_job_id = self.ds.runJob(jt)
        else:
            job_wrapper.change_ownership_for_run()
            log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, job_wrapper.user_system_pwent[0], job_wrapper.user_system_pwent[2] ) )
            filename = self.store_jobtemplate(job_wrapper, jt)
            self.userid =  job_wrapper.user_system_pwent[2]
            external_job_id = self.external_runjob(filename, job_wrapper.user_system_pwent[2]).strip()
        log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination
        
        # delete the job template
        self.ds.deleteJobTemplate( jt )

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )
Example #13
 def recover(self, job, job_wrapper):
     """Recovers jobs stuck in the queued/running state when Galaxy started"""
     job_id = job.get_job_runner_external_id()
     pbs_job_state = AsynchronousJobState()
     pbs_job_state.output_file = "%s/%s.o" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.error_file = "%s/%s.e" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.exit_code_file = "%s/%s.ec" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_file = "%s/%s.sh" % (
         self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_id = str(job_id)
     pbs_job_state.runner_url = job_wrapper.get_job_runner_url()
     pbs_job_state.job_destination = job_wrapper.job_destination
     job_wrapper.command_line = job.command_line
     pbs_job_state.job_wrapper = job_wrapper
     if job.state == model.Job.states.RUNNING:
         log.debug(
             "(%s/%s) is still in running state, adding to the PBS queue" %
             (job.id, job.get_job_runner_external_id()))
         pbs_job_state.old_state = 'R'
         pbs_job_state.running = True
         self.monitor_queue.put(pbs_job_state)
     elif job.state == model.Job.states.QUEUED:
         log.debug(
             "(%s/%s) is still in PBS queued state, adding to the PBS queue"
             % (job.id, job.get_job_runner_external_id()))
         pbs_job_state.old_state = 'Q'
         pbs_job_state.running = False
         self.monitor_queue.put(pbs_job_state)
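The PBS examples build the .o/.e/.ec/.sh paths from cluster_files_directory by hand in several places; a hypothetical helper (not part of the PBS runner) that captures that convention:

def cluster_file_paths(cluster_files_directory, job_id):
    # Conventional stdout/stderr/exit-code/script paths used by the PBS examples.
    base = "%s/%s" % (cluster_files_directory, job_id)
    return {
        "output_file": base + ".o",
        "error_file": base + ".e",
        "exit_code_file": base + ".ec",
        "job_file": base + ".sh",
    }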
Example #14
    def queue_job(self, job_wrapper):
        """Create PBS script for a job and submit it to the PBS queue"""
        # prepare the job
        if not self.prepare_job(
                job_wrapper,
                include_metadata=not (self.app.config.pbs_stage_path)):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        pbs_options = []
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop(
                '-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params[
                    'destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or
                                                           '', pbs_server_name)

        c = pbs.pbs_connect(util.smart_str(pbs_server_name))
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail(
                "Unable to queue job for execution.  Resubmitting the job may succeed."
            )
            log.error("Connection to PBS server for submit failed: %s: %s" %
                      (errno, text))
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory,
                             job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory,
                               job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            output_files = [str(o) for o in output_fnames]
            output_files.append(ecfile)
            stagein = self.get_stage_in_out(job_wrapper.get_input_fnames() +
                                            output_files,
                                            symlink=True)
            stageout = self.get_stage_in_out(output_files)
            attrs = [
                dict(name=pbs.ATTR_o, value=pbs_ofile),
                dict(name=pbs.ATTR_e, value=pbs_efile),
                dict(name=pbs.ATTR_stagein, value=stagein),
                dict(name=pbs.ATTR_stageout, value=stageout),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict(name=pbs.ATTR_o, value=ofile),
                dict(name=pbs.ATTR_e, value=efile),
            ]

        # define PBS job options
        attrs.append(
            dict(name=pbs.ATTR_N,
                 value=str("%s_%s_%s" %
                           (job_wrapper.job_id, job_wrapper.tool.id,
                            job_wrapper.user))))
        job_attrs = pbs.new_attropl(len(attrs) + len(pbs_options))
        for i, attr in enumerate(attrs + pbs_options):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath(job_wrapper.working_directory)

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with open(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join(job_wrapper.get_input_fnames() + output_files),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [stage_commands]
        script = self.get_job_file(job_wrapper,
                                   exit_code_path=ecfile,
                                   env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory,
                                 job_wrapper.job_id)
        self.write_executable_script(job_file, script)
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug(
                "Job %s deleted by user before it entered the PBS queue" %
                job_wrapper.job_id)
            pbs.pbs_disconnect(c)
            if job_wrapper.cleanup_job in ("always", "onsuccess"):
                self.cleanup((ofile, efile, ecfile, job_file))
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % (galaxy_job_id, job_file))

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name,
                                    None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning("(%s) pbs_submit failed (try %d/5), PBS error %d: %s" %
                        (galaxy_job_id, tries, errno, text))
            time.sleep(2)
        else:
            log.error("(%s) All attempts to submit job failed" % galaxy_job_id)
            job_wrapper.fail(
                "Unable to run this job due to a cluster error, please retry it later"
            )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" %
                      (galaxy_job_id, job_id))
        else:
            log.debug("(%s) queued in %s queue as %s" %
                      (galaxy_job_id, pbs_queue_name, job_id))

        # persist destination
        job_wrapper.set_job_destination(job_destination, job_id)

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(job_state)
Example #15
 def recover( self, job, job_wrapper ):
     """Recovers jobs stuck in the queued/running state when Galaxy started"""
     job_id = job.get_job_runner_external_id()
     pbs_job_state = AsynchronousJobState()
     pbs_job_state.output_file = "%s/%s.o" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.error_file = "%s/%s.e" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.exit_code_file = "%s/%s.ec" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job.id)
     pbs_job_state.job_id = str( job_id )
     pbs_job_state.runner_url = job_wrapper.get_job_runner_url()
     pbs_job_state.job_destination = job_wrapper.job_destination
     job_wrapper.command_line = job.command_line
     pbs_job_state.job_wrapper = job_wrapper
     if job.state == model.Job.states.RUNNING:
         log.debug( "(%s/%s) is still in running state, adding to the PBS queue" % ( job.id, job.get_job_runner_external_id() ) )
         pbs_job_state.old_state = 'R'
         pbs_job_state.running = True
         self.monitor_queue.put( pbs_job_state )
     elif job.state == model.Job.states.QUEUED:
         log.debug( "(%s/%s) is still in PBS queued state, adding to the PBS queue" % ( job.id, job.get_job_runner_external_id() ) )
         pbs_job_state.old_state = 'Q'
         pbs_job_state.running = False
         self.monitor_queue.put( pbs_job_state )
Example #16
    def queue_job(self, job_wrapper):
        """Create job script and submit it to Kubernetes cluster"""
        # prepare the job
        # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same
        # where galaxy will expect results.
        log.debug(f"Starting queue_job for job {job_wrapper.get_id_tag()}")

        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper,
                                   job_destination=job_wrapper.job_destination)
        # Kubernetes doesn't really produce meaningful "job stdout", but file needs to be present
        with open(ajs.output_file, 'w'):
            pass
        with open(ajs.error_file, 'w'):
            pass

        if not self.prepare_job(
            job_wrapper,
            include_metadata=False,
            modify_command_for_container=False,
        ):
            return

        script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file, shell=job_wrapper.shell, galaxy_virtual_env=None)
        try:
            self.write_executable_script(ajs.job_file, script, job_io=job_wrapper.job_io)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception(f"({job_wrapper.get_id_tag()}) failure writing job script")
            return

        # Construction of Kubernetes objects follow: https://kubernetes.io/docs/concepts/workloads/controllers/job/
        if self.__has_guest_ports(job_wrapper):
            try:
                self.__configure_port_routing(ajs)
            except HTTPError:
                log.exception("Kubernetes failed to expose tool ports as services, HTTP exception encountered")
                ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
                ajs.fail_message = "Kubernetes failed to export tool ports as services."
                self.mark_as_failed(ajs)
                return

        k8s_job_prefix = self.__produce_k8s_job_prefix()
        k8s_job_obj = job_object_dict(
            self.runner_params,
            k8s_job_prefix,
            self.__get_k8s_job_spec(ajs)
        )

        job = Job(self._pykube_api, k8s_job_obj)
        try:
            job.create()
        except HTTPError:
            log.exception("Kubernetes failed to create job, HTTP exception encountered")
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to create job."
            self.mark_as_failed(ajs)
            return
        if not job.name:
            log.exception(f"Kubernetes failed to create job, empty name encountered: [{job.obj}]")
            ajs.runner_state = JobState.runner_states.UNKNOWN_ERROR
            ajs.fail_message = "Kubernetes failed to create job."
            self.mark_as_failed(ajs)
            return
        job_id = job.name

        # define job attributes in the AsynchronousJobState for follow-up
        ajs.job_id = job_id
        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_external_id(job_id)
        self.monitor_queue.put(ajs)
Example #17
    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=True ):
            return

        # Get shell and job execution interface
        job_destination = job_wrapper.job_destination
        shell_params, job_params = self.parse_destination_params(job_destination.params)
        shell, job_interface = self.get_cli_plugins(shell_params, job_params)

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )

        job_file_kwargs = job_interface.job_script_kwargs(ajs.output_file, ajs.error_file, ajs.job_name)
        script = self.get_job_file(
            job_wrapper,
            exit_code_path=ajs.exit_code_file,
            **job_file_kwargs
        )

        try:
            fh = file(ajs.job_file, "w")
            fh.write(script)
            fh.close()
        except:
            log.exception("(%s) failure writing job script" % galaxy_id_tag )
            job_wrapper.fail("failure preparing job script", exception=True)
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.info("(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if self.app.config.cleanup_job in ("always", "onsuccess"):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file: %s" % ( galaxy_id_tag, ajs.job_file ) )

        cmd_out = shell.execute(job_interface.submit(ajs.job_file))
        if cmd_out.returncode != 0:
            log.error('(%s) submission failed (stdout): %s' % (galaxy_id_tag, cmd_out.stdout))
            log.error('(%s) submission failed (stderr): %s' % (galaxy_id_tag, cmd_out.stderr))
            job_wrapper.fail("failure submitting job")
            return
        # Some job runners return something like 'Submitted batch job XXXX'
        # Strip and split to get job ID.
        external_job_id = cmd_out.stdout.strip().split()[-1]
        if not external_job_id:
            log.error('(%s) submission did not return a job identifier, failing job' % galaxy_id_tag)
            job_wrapper.fail("failure submitting job")
            return

        log.info("(%s) queued with identifier: %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )
Example #18
    def queue_job( self, job_wrapper ):
        """Create PBS script for a job and submit it to the PBS queue"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=not( self.app.config.pbs_stage_path ) ):
            return

        job_destination = job_wrapper.job_destination

        # Determine the job's PBS destination (server/queue) and options from the job destination definition
        pbs_queue_name = None
        pbs_server_name = self.default_pbs_server
        pbs_options = []
        if '-q' in job_destination.params and 'destination' not in job_destination.params:
            job_destination.params['destination'] = job_destination.params.pop('-q')
        if 'destination' in job_destination.params:
            if '@' in job_destination.params['destination']:
                # Destination includes a server
                pbs_queue_name, pbs_server_name = job_destination.params['destination'].split('@')
                if pbs_queue_name == '':
                    # e.g. `qsub -q @server`
                    pbs_queue_name = None
            else:
                # Destination is just a queue
                pbs_queue_name = job_destination.params['destination']
            job_destination.params.pop('destination')

        # Parse PBS params
        pbs_options = self.parse_destination_params(job_destination.params)

        # Explicitly set the determined PBS destination in the persisted job destination for recovery
        job_destination.params['destination'] = '%s@%s' % (pbs_queue_name or '', pbs_server_name)

        c = pbs.pbs_connect( util.smart_str( pbs_server_name ) )
        if c <= 0:
            errno, text = pbs.error()
            job_wrapper.fail( "Unable to queue job for execution.  Resubmitting the job may succeed." )
            log.error( "Connection to PBS server for submit failed: %s: %s" % ( errno, text ) )
            return

        # define job attributes
        ofile = "%s/%s.o" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        efile = "%s/%s.e" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        ecfile = "%s/%s.ec" % (self.app.config.cluster_files_directory, job_wrapper.job_id)

        output_fnames = job_wrapper.get_output_fnames()

        # If an application server is set, we're staging
        if self.app.config.pbs_application_server:
            pbs_ofile = self.app.config.pbs_application_server + ':' + ofile
            pbs_efile = self.app.config.pbs_application_server + ':' + efile
            output_files = [ str( o ) for o in output_fnames ]
            output_files.append(ecfile)
            stagein = self.get_stage_in_out( job_wrapper.get_input_fnames() + output_files, symlink=True )
            stageout = self.get_stage_in_out( output_files )
            attrs = [
                dict( name=pbs.ATTR_o, value=pbs_ofile ),
                dict( name=pbs.ATTR_e, value=pbs_efile ),
                dict( name=pbs.ATTR_stagein, value=stagein ),
                dict( name=pbs.ATTR_stageout, value=stageout ),
            ]
        # If not, we're using NFS
        else:
            attrs = [
                dict( name=pbs.ATTR_o, value=ofile ),
                dict( name=pbs.ATTR_e, value=efile ),
            ]

        # define PBS job options
        attrs.append( dict( name=pbs.ATTR_N, value=str( "%s_%s_%s" % ( job_wrapper.job_id, job_wrapper.tool.id, job_wrapper.user ) ) ) )
        job_attrs = pbs.new_attropl( len( attrs ) + len( pbs_options ) )
        for i, attr in enumerate( attrs + pbs_options ):
            job_attrs[i].name = attr['name']
            job_attrs[i].value = attr['value']
            if 'resource' in attr:
                job_attrs[i].resource = attr['resource']
        exec_dir = os.path.abspath( job_wrapper.working_directory )

        # write the job script
        if self.app.config.pbs_stage_path != '':
            # touch the ecfile so that it gets staged
            with open(ecfile, 'a'):
                os.utime(ecfile, None)

            stage_commands = pbs_symlink_template % (
                " ".join( job_wrapper.get_input_fnames() + output_files ),
                self.app.config.pbs_stage_path,
                exec_dir,
            )
        else:
            stage_commands = ''

        env_setup_commands = [ stage_commands ]
        script = self.get_job_file(job_wrapper, exit_code_path=ecfile, env_setup_commands=env_setup_commands)
        job_file = "%s/%s.sh" % (self.app.config.cluster_files_directory, job_wrapper.job_id)
        self.write_executable_script( job_file, script )
        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "Job %s deleted by user before it entered the PBS queue" % job_wrapper.job_id )
            pbs.pbs_disconnect(c)
            if job_wrapper.cleanup_job in ( "always", "onsuccess" ):
                self.cleanup( ( ofile, efile, ecfile, job_file ) )
                job_wrapper.cleanup()
            return

        # submit
        # The job tag includes the job and the task identifier
        # (if a TaskWrapper was passed in):
        galaxy_job_id = job_wrapper.get_id_tag()
        log.debug("(%s) submitting file %s" % ( galaxy_job_id, job_file ) )

        tries = 0
        while tries < 5:
            job_id = pbs.pbs_submit(c, job_attrs, job_file, pbs_queue_name, None)
            tries += 1
            if job_id:
                pbs.pbs_disconnect(c)
                break
            errno, text = pbs.error()
            log.warning( "(%s) pbs_submit failed (try %d/5), PBS error %d: %s" % (galaxy_job_id, tries, errno, text) )
            time.sleep(2)
        else:
            log.error( "(%s) All attempts to submit job failed" % galaxy_job_id )
            job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
            return

        if pbs_queue_name is None:
            log.debug("(%s) queued in default queue as %s" % (galaxy_job_id, job_id) )
        else:
            log.debug("(%s) queued in %s queue as %s" % (galaxy_job_id, pbs_queue_name, job_id) )

        # persist destination
        job_wrapper.set_job_destination( job_destination, job_id )

        # Store PBS related state information for job
        job_state = AsynchronousJobState()
        job_state.job_wrapper = job_wrapper
        job_state.job_id = job_id
        job_state.job_file = job_file
        job_state.output_file = ofile
        job_state.error_file = efile
        job_state.exit_code_file = ecfile
        job_state.old_state = 'N'
        job_state.running = False
        job_state.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( job_state )
Example #19
    def queue_job(self, job_wrapper):
        """Create job script and submit it to Kubernetes cluster"""
        # prepare the job
        # We currently don't need to include_metadata or include_work_dir_outputs, as working directory is the same
        # where galaxy will expect results.
        log.debug("Starting queue_job for job " + job_wrapper.get_id_tag())
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory,
                                   job_wrapper=job_wrapper,
                                   job_destination=job_wrapper.job_destination)

        if not self.prepare_job(job_wrapper,
                                include_metadata=False,
                                modify_command_for_container=False,
                                stdout_file=ajs.output_file,
                                stderr_file=ajs.error_file):
            return

        script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file, shell=job_wrapper.shell, galaxy_virtual_env=None)
        try:
            self.write_executable_script(ajs.job_file, script)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception("(%s) failure writing job script" % job_wrapper.get_id_tag())
            return

        # Construction of the Kubernetes Job object follows: http://kubernetes.io/docs/user-guide/persistent-volumes/
        k8s_job_name = self.__produce_unique_k8s_job_name(job_wrapper.get_id_tag())
        k8s_job_obj = job_object_dict(
            self.runner_params,
            k8s_job_name,
            self.__get_k8s_job_spec(ajs)
        )

        # Checks if job exists and is trusted, or if it needs re-creation.
        job = Job(self._pykube_api, k8s_job_obj)
        job_exists = job.exists()
        if job_exists and not self._galaxy_instance_id:
            # if galaxy instance id is not set, then we don't trust matching jobs and we simply delete and
            # re-create the job
            log.debug("Matching job exists, but Job is not trusted, so it will be deleted and a new one created.")
            job.delete()
            elapsed_seconds = 0
            while job.exists():
                sleep(3)
                elapsed_seconds += 3
                if elapsed_seconds > self.runner_params['k8s_timeout_seconds_job_deletion']:
                    log.debug("Timed out before k8s could delete existing untrusted job " + k8s_job_name +
                              ", not queuing associated Galaxy job.")
                    return
                log.debug("Waiting for job to be deleted " + k8s_job_name)

            Job(self._pykube_api, k8s_job_obj).create()
        elif job_exists and self._galaxy_instance_id:
            # The job exists and we trust the identifier.
            log.debug("Matching job exists, but Job is trusted, so we simply use the existing one for " + k8s_job_name)
            # We simply leave the k8s job to be handled later on by check_watched_item().
        else:
            # Creates the Kubernetes Job if it doesn't exist.
            job.create()

        # define job attributes in the AsynchronousJobState for follow-up
        ajs.job_id = k8s_job_name
        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_external_id(k8s_job_name)
        self.monitor_queue.put(ajs)
Example #20
 def queue_job(self, job_wrapper):
     """Write JSE-Drop file to drop location
     """
     # Get the configured job destination
     job_destination = job_wrapper.job_destination
     # Get the parameters defined for this destination
     # i.e. location of the drop-off directory etc
     drop_off_dir = self._get_drop_dir()
     virtual_env = self._get_virtual_env()
     qsub_options = self._get_qsub_options(job_destination)
     galaxy_slots = self._get_galaxy_slots(job_destination)
     galaxy_id = self._get_galaxy_id()
     log.debug("queue_job: drop-off dir = %s" % drop_off_dir)
     log.debug("queue_job: virtual_env  = %s" % virtual_env)
     log.debug("queue_job: qsub options = %s" % qsub_options)
     log.debug("queue_job: galaxy_slots = %s" % galaxy_slots)
     log.debug("queue_job: galaxy_id    = %s" % galaxy_id)
     if drop_off_dir is None:
         # Can't locate drop-off dir
         job_wrapper.fail("failure preparing job script (no JSE-drop "
                          "directory defined)",exception=True )
         log.exception("(%s/%s) failure writing job script (no "
                       "JSE-drop directory defined)" %
                       (galaxy_id_tag,job_name))
         return
     # Initialise JSE-drop wrapper
     jse_drop = JSEDrop(drop_off_dir)
     # ID and name for job
     galaxy_id_tag = job_wrapper.get_id_tag()
     log.debug("ID tag: %s" % galaxy_id_tag)
     job_name = self._get_job_name(galaxy_id_tag,
                                   job_wrapper.tool.old_id,
                                   galaxy_id)
     log.debug("Job name: %s" % job_name)
     # Prepare the job wrapper (or abort)
     if not self.prepare_job(job_wrapper):
         return
     # Sort out the slots (see e.g. condor.py for example)
     if galaxy_slots:
         galaxy_slots_statement = 'GALAXY_SLOTS="%s"; export GALAXY_SLOTS_CONFIGURED="1"' % galaxy_slots
     else:
         galaxy_slots_statement = 'GALAXY_SLOTS="1"'
     # Create script contents
     script = self.get_job_file(job_wrapper,
                                galaxy_virtual_env=virtual_env,
                                slots_statement=galaxy_slots_statement,
                                exit_code_path=None)
     # Separate leading shell specification from generated script
     shell = '\n'.join(filter(lambda x: x.startswith('#!'),
                              script.split('\n')))
     script = '\n'.join(filter(lambda x: not x.startswith('#!'),
                               script.split('\n')))
     # Create header with embedded qsub flags
     qsub_header = ["-V",
                    "-wd %s" % job_wrapper.working_directory]
     if qsub_options:
         qsub_header.append(qsub_options)
     qsub_header = '\n'.join(["#$ %s" % opt for opt in qsub_header])
     log.debug("qsub_header: %s" % qsub_header)
     # Reassemble the script components
     script = "\n".join((shell,qsub_header,script))
     # Create the drop file to submit the job
     try:
         drop_file = jse_drop.run(job_name,script)
         log.debug("created drop file %s" % drop_file)
         log.info("(%s) submitted as %s" % (galaxy_id_tag,job_name))
     except:
         # Some problem writing the qsub file
         job_wrapper.fail("failure preparing job script",
                          exception=True )
         log.exception("(%s/%s) failure writing job script" %
                       (galaxy_id_tag,job_name))
         return
     # External job id (i.e. id used by JSE-Drop as a handle to
     # identify the job) is the same as the job name here
     external_job_id = job_name
     # Store runner information for tracking if Galaxy restarts
     job_wrapper.set_job_destination(job_destination,
                                     external_job_id)
     # Store state information for job
     job_state = AsynchronousJobState()
     job_state.job_wrapper = job_wrapper
     job_state.job_id = job_name
     job_state.old_state = True
     job_state.running = False
     job_state.job_destination = job_destination
     # Add to the queue of jobs to monitor
     self.monitor_job(job_state)
     log.info("%s: queued" % job_name)
Example #21
    def queue_job(self, job_wrapper):
        """Create job script and submit it to the DRM"""
        # prepare the job
        include_metadata = asbool(job_wrapper.job_destination.params.get("embed_metadata_in_job", DEFAULT_EMBED_METADATA_IN_JOB))
        if not self.prepare_job(job_wrapper, include_metadata=include_metadata):
            return

        # Get shell and job execution interface
        job_destination = job_wrapper.job_destination
        shell_params, job_params = self.parse_destination_params(job_destination.params)
        shell, job_interface = self.get_cli_plugins(shell_params, job_params)

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper)

        job_file_kwargs = job_interface.job_script_kwargs(ajs.output_file, ajs.error_file, ajs.job_name)
        script = self.get_job_file(
            job_wrapper,
            exit_code_path=ajs.exit_code_file,
            shell=job_wrapper.shell,
            **job_file_kwargs
        )

        try:
            self.write_executable_script(ajs.job_file, script)
        except Exception:
            log.exception(f"({galaxy_id_tag}) failure writing job script")
            job_wrapper.fail("failure preparing job script", exception=True)
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED):
            log.debug("(%s) Job deleted/stopped by user before it entered the queue", galaxy_id_tag)
            if job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_wrapper.cleanup()
            return

        log.debug(f"({galaxy_id_tag}) submitting file: {ajs.job_file}")

        returncode, stdout = self.submit(shell, job_interface, ajs.job_file, galaxy_id_tag, retry=MAX_SUBMIT_RETRY)
        if returncode != 0:
            job_wrapper.fail("failure submitting job")
            return
        # Some job runners return something like 'Submitted batch job XXXX'
        # Strip and split to get job ID.
        external_job_id = stdout.strip().split()[-1]
        if not external_job_id:
            log.error(f'({galaxy_id_tag}) submission did not return a job identifier, failing job')
            job_wrapper.fail("failure submitting job")
            return

        log.info(f"({galaxy_id_tag}) queued with identifier: {external_job_id}")

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_external_id(external_job_id)

        # Store state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(ajs)
Example #22
    def queue_job(self, job_wrapper):
        """Create job script and submit it to the DRM"""
        # prepare the job

        # external_runJob_script can be None, in which case it's not used.
        external_runjob_script = job_wrapper.get_destination_configuration("drmaa_external_runjob_script", None)

        include_metadata = asbool(job_wrapper.job_destination.params.get("embed_metadata_in_job", True))
        if not self.prepare_job(job_wrapper, include_metadata=include_metadata):
            return

        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        job_name = self._job_name(job_wrapper)
        ajs = AsynchronousJobState(files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name)

        # set up the drmaa job template
        jt = dict(
            remoteCommand=ajs.job_file,
            jobName=ajs.job_name,
            workingDirectory=job_wrapper.working_directory,
            outputPath=f":{ajs.output_file}",
            errorPath=f":{ajs.error_file}"
        )

        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get('nativeSpecification', None)
        if native_spec is None:
            native_spec = job_destination.params.get('native_specification', None)
        if native_spec is not None:
            jt['nativeSpecification'] = native_spec

        # fill in the DRM's job run template
        script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file, shell=job_wrapper.shell)
        try:
            self.write_executable_script(ajs.job_file, script, job_io=job_wrapper.job_io)
        except Exception:
            job_wrapper.fail("failure preparing job script", exception=True)
            log.exception(f"({galaxy_id_tag}) failure writing job script")
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() in (model.Job.states.DELETED, model.Job.states.STOPPED):
            log.debug("(%s) Job deleted/stopped by user before it entered the queue", galaxy_id_tag)
            if job_wrapper.cleanup_job in ("always", "onsuccess"):
                job_wrapper.cleanup()
            return

        log.debug("(%s) submitting file %s", galaxy_id_tag, ajs.job_file)
        if native_spec:
            log.debug("(%s) native specification is: %s", galaxy_id_tag, native_spec)

        # runJob will raise if there's a submit problem
        if external_runjob_script is None:
            # TODO: create a queue for retrying submission indefinitely
            # TODO: configurable max tries and sleep
            trynum = 0
            external_job_id = None
            fail_msg = None
            while external_job_id is None and trynum < 5:
                try:
                    external_job_id = self.ds.run_job(**jt)
                    break
                except (drmaa.InternalException, drmaa.DeniedByDrmException) as e:
                    trynum += 1
                    log.warning('(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e)
                    fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                    time.sleep(5)
                except Exception:
                    log.exception('(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag)
                    trynum = 5
            else:
                log.error(f"({galaxy_id_tag}) All attempts to submit job failed")
                if not fail_msg:
                    fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
                job_wrapper.fail(fail_msg)
                return
        else:
            job_wrapper.change_ownership_for_run()
            # if user credentials are not available, use galaxy credentials (if permitted)
            allow_guests = asbool(job_wrapper.job_destination.params.get("allow_guests", False))
            pwent = job_wrapper.user_system_pwent
            if pwent is None:
                if not allow_guests:
                    fail_msg = f"User {job_wrapper.user} is not mapped to any real user, and not permitted to start jobs."
                    job_wrapper.fail(fail_msg)
                    return
                pwent = job_wrapper.galaxy_system_pwent
            log.debug(f'({galaxy_id_tag}) submitting with credentials: {pwent[0]} [uid: {pwent[2]}]')
            filename = self.store_jobtemplate(job_wrapper, jt)
            self.userid = pwent[2]
            external_job_id = self.external_runjob(external_runjob_script, filename, pwent[2])
            if external_job_id is None:
                job_wrapper.fail(f"({galaxy_id_tag}) could not queue job")
                return
        log.info(f"({galaxy_id_tag}) queued as {external_job_id}")

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_external_id(external_job_id)

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put(ajs)
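Example #22 wraps ds.run_job() in a bounded retry loop and relies on Python's while/else clause: the else suite runs only when the loop finishes without hitting break, i.e. after every attempt has failed. A stripped-down sketch of that control flow, assuming a generic submit_once callable and RuntimeError standing in for the drmaa exceptions; none of these names are Galaxy APIs:

import time

MAX_TRIES = 5  # illustrative cap, mirrors the hard-coded 5 above

def submit_with_retry(submit_once, delay=5):
    """Retry a submission callable; return the external job id or None if all attempts fail."""
    external_job_id = None
    trynum = 0
    while external_job_id is None and trynum < MAX_TRIES:
        try:
            external_job_id = submit_once()
            break
        except RuntimeError as exc:  # stands in for drmaa.InternalException / DeniedByDrmException
            trynum += 1
            print(f"submission failed, will retry: {exc}")
            time.sleep(delay)
    else:
        # reached only if the loop never executed `break`, i.e. every attempt failed
        print("all attempts to submit the job failed")
    return external_job_id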
Example #23
0
    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job
        if not self.prepare_job( job_wrapper, include_metadata=True ):
            return

        # command line has been added to the wrapper by prepare_job()
        command_line = job_wrapper.runner_command_line

        # Get shell and job execution interface
        job_destination = job_wrapper.job_destination
        shell_params, job_params = self.parse_destination_params(job_destination.params)
        shell, job_interface = self.get_cli_plugins(shell_params, job_params)

        # Updated by jinchao
        print "\nQueue job shell/job interface"
        print "shell: %s  |  job: %s" % (shell_params['plugin'], job_params['plugin'])

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper )

        # fill in the DRM's job run template
        script = job_interface.get_job_template(ajs.output_file, ajs.error_file, ajs.job_name, job_wrapper, command_line, ajs.exit_code_file)

        # Updated by jinchao
        print "\nQueue job script"
        print script

        try:
            fh = file(ajs.job_file, "w")
            fh.write(script)
            fh.close()
        except:
            log.exception("(%s) failure writing job script" % galaxy_id_tag )
            job_wrapper.fail("failure preparing job script", exception=True)
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.info("(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if self.app.config.cleanup_job in ("always", "onsuccess"):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file: %s" % ( galaxy_id_tag, ajs.job_file ) )
        log.debug( "(%s) command is: %s" % ( galaxy_id_tag, command_line ) )

        cmd_out = shell.execute(job_interface.submit(ajs.job_file, job_wrapper))

        # Updated by jinchao
        print "\nQueue job cmd_out"
        print cmd_out

        if cmd_out.returncode != 0:
            log.error('(%s) submission failed (stdout): %s' % (galaxy_id_tag, cmd_out.stdout))
            log.error('(%s) submission failed (stderr): %s' % (galaxy_id_tag, cmd_out.stderr))
            job_wrapper.fail("failure submitting job")
            return

        # Updated by jinchao
        print "\nQueue job cmd_out returncode:"
        print cmd_out.returncode
        print "\nQueue job cmd_out stdout:"
        print cmd_out.stdout
        print "\nQueue job cmd_out stderr:"
        print cmd_out.stderr
        print "\nQueue job cmd_out job params"
        print job_params

        #external_job_id = cmd_out.stdout.strip()

        # Updated by jinchao: handle job info to get job id
        if job_params['plugin'] == 'Hadoop':
            # The submission output is sent to stderr under the Hadoop job plugin
            external_job_id = job_interface.parse_job_info(cmd_out.stderr)
        else:
            external_job_id = job_interface.parse_job_info(cmd_out.stdout)
        print "\Queue job JOB ID:"
        print external_job_id

        if not external_job_id:
            log.error('(%s) submission did not return a job identifier, failing job' % galaxy_id_tag)
            job_wrapper.fail("failure submitting job")
            return

        log.info("(%s) queued with identifier: %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )
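Example #23's only wrinkle is that the Hadoop CLI plugin reports the submission on stderr rather than stdout, so the runner picks the stream before handing it to the plugin's parse_job_info(). A hedged sketch of that dispatch with a deliberately naive, illustrative parser (the regex and function names are placeholders, not the plugin's real logic):

import re

def parse_job_info(output, pattern=r"(\d+)"):
    """Illustrative parser: return the first numeric token found in the submit output."""
    match = re.search(pattern, output)
    return match.group(1) if match else None

def external_id_from_cmd_out(plugin, stdout, stderr):
    # The Hadoop CLI plugin writes its submission report to stderr; others use stdout.
    return parse_job_info(stderr if plugin == "Hadoop" else stdout)

assert external_id_from_cmd_out("Hadoop", "", "job 4242 submitted") == "4242"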
Example #24
0
    def queue_job( self, job_wrapper ):
        """Create job script and submit it to the DRM"""
        # prepare the job
        include_metadata = asbool( job_wrapper.job_destination.params.get( "embed_metadata_in_job", True) )
        if not self.prepare_job( job_wrapper, include_metadata=include_metadata):
            return

        # get configured job destination
        job_destination = job_wrapper.job_destination

        # wrapper.get_id_tag() instead of job_id for compatibility with TaskWrappers.
        galaxy_id_tag = job_wrapper.get_id_tag()

        # define job attributes
        job_name = 'g%s' % galaxy_id_tag
        if job_wrapper.tool.old_id:
            job_name += '_%s' % job_wrapper.tool.old_id
        if self.external_runJob_script is None:
            job_name += '_%s' % job_wrapper.user
        job_name = ''.join( map( lambda x: x if x in ( string.letters + string.digits + '_' ) else '_', job_name ) )
        ajs = AsynchronousJobState( files_dir=job_wrapper.working_directory, job_wrapper=job_wrapper, job_name=job_name )

        # set up the drmaa job template
        jt = self.ds.createJobTemplate()
        jt.remoteCommand = ajs.job_file
        jt.jobName = ajs.job_name
        jt.workingDirectory = job_wrapper.working_directory
        jt.outputPath = ":%s" % ajs.output_file
        jt.errorPath = ":%s" % ajs.error_file

        # Avoid a jt.exitCodePath for now - it's only used when finishing.
        native_spec = job_destination.params.get('nativeSpecification', None)
        if native_spec is not None:
            jt.nativeSpecification = native_spec

        # fill in the DRM's job run template
        script = self.get_job_file(job_wrapper, exit_code_path=ajs.exit_code_file)
        try:
            fh = file( ajs.job_file, "w" )
            fh.write( script )
            fh.close()
            os.chmod( ajs.job_file, 0o755 )
        except:
            job_wrapper.fail( "failure preparing job script", exception=True )
            log.exception( "(%s) failure writing job script" % galaxy_id_tag )
            return

        # job was deleted while we were preparing it
        if job_wrapper.get_state() == model.Job.states.DELETED:
            log.debug( "(%s) Job deleted by user before it entered the queue" % galaxy_id_tag )
            if self.app.config.cleanup_job in ( "always", "onsuccess" ):
                job_wrapper.cleanup()
            return

        log.debug( "(%s) submitting file %s", galaxy_id_tag, ajs.job_file )
        if native_spec:
            log.debug( "(%s) native specification is: %s", galaxy_id_tag, native_spec )

        # runJob will raise if there's a submit problem
        if self.external_runJob_script is None:
            # TODO: create a queue for retrying submission indefinitely
            # TODO: configurable max tries and sleep
            trynum = 0
            external_job_id = None
            fail_msg = None
            while external_job_id is None and trynum < 5:
                try:
                    external_job_id = self.ds.runJob(jt)
                    break
                except ( drmaa.InternalException, drmaa.DeniedByDrmException ) as e:
                    trynum += 1
                    log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e )
                    fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                    time.sleep( 5 )
                except:
                    log.exception( '(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag )
                    trynum = 5
            else:
                log.error( "(%s) All attempts to submit job failed" % galaxy_id_tag )
                if not fail_msg:
                    fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
                job_wrapper.fail( fail_msg )
                self.ds.deleteJobTemplate( jt )
                return
        else:
            job_wrapper.change_ownership_for_run()
            # if user credentials are not available, use galaxy credentials (if permitted)
            allow_guests = asbool(job_wrapper.job_destination.params.get( "allow_guests", False) )
            pwent = job_wrapper.user_system_pwent
            if pwent is None:
                if not allow_guests:
                    fail_msg = "User %s is not mapped to any real user, and not permitted to start jobs." % job_wrapper.user
                    job_wrapper.fail( fail_msg )
                    self.ds.deleteJobTemplate( jt )
                    return
                pwent = job_wrapper.galaxy_system_pwent
            log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, pwent[0], pwent[2] ) )
            filename = self.store_jobtemplate(job_wrapper, jt)
            self.userid = pwent[2]
            external_job_id = self.external_runjob(filename, pwent[2]).strip()
        log.info( "(%s) queued as %s" % ( galaxy_id_tag, external_job_id ) )

        # store runner information for tracking if Galaxy restarts
        job_wrapper.set_job_destination( job_destination, external_job_id )

        # Store DRM related state information for job
        ajs.job_id = external_job_id
        ajs.old_state = 'new'
        ajs.job_destination = job_destination

        # delete the job template
        self.ds.deleteJobTemplate( jt )

        # Add to our 'queue' of jobs to monitor
        self.monitor_queue.put( ajs )
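Example #24 builds the DRMAA job name from the Galaxy id tag, tool id and user, then replaces every character outside letters, digits and underscore; the string.letters constant it uses exists only in Python 2 (string.ascii_letters is the Python 3 spelling). A small Python 3 sketch of the same sanitization; the function name and sample input are illustrative:

import string

ALLOWED = set(string.ascii_letters + string.digits + "_")

def sanitize_job_name(name):
    """Replace every character outside [A-Za-z0-9_] with an underscore."""
    return "".join(c if c in ALLOWED else "_" for c in name)

assert sanitize_job_name("g123_My Tool v1.2") == "g123_My_Tool_v1_2"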
Example #25
0
    def queue_job(self, job_wrapper):
        job_destination = job_wrapper.job_destination
        self._populate_parameter_defaults(job_destination)

        command_line, client, remote_job_config, compute_environment, remote_container = self.__prepare_job(
            job_wrapper, job_destination)

        if not command_line:
            return

        try:
            dependencies_description = PulsarJobRunner.__dependencies_description(
                client, job_wrapper)
            rewrite_paths = not PulsarJobRunner.__rewrite_parameters(client)
            path_rewrites_unstructured = {}
            output_names = []
            if compute_environment:
                path_rewrites_unstructured = compute_environment.path_rewrites_unstructured
                output_names = compute_environment.output_names()

                client_inputs_list = []
                for input_dataset_wrapper in job_wrapper.get_input_paths():
                    # str here to resolve false_path if set on a DatasetPath object.
                    path = str(input_dataset_wrapper)
                    object_store_ref = {
                        "dataset_id": input_dataset_wrapper.dataset_id,
                        "dataset_uuid": str(input_dataset_wrapper.dataset_uuid),
                        "object_store_id": input_dataset_wrapper.object_store_id,
                    }
                    client_inputs_list.append(
                        ClientInput(path, CLIENT_INPUT_PATH_TYPES.INPUT_PATH, object_store_ref=object_store_ref))

                for input_extra_path in compute_environment.path_rewrites_input_extra.keys():
                    # TODO: track dataset for object_store_ref...
                    client_inputs_list.append(
                        ClientInput(input_extra_path, CLIENT_INPUT_PATH_TYPES.INPUT_EXTRA_FILES_PATH))

                for input_metadata_path in compute_environment.path_rewrites_input_metadata.keys():
                    # TODO: track dataset for object_store_ref...
                    client_inputs_list.append(
                        ClientInput(input_metadata_path, CLIENT_INPUT_PATH_TYPES.INPUT_METADATA_PATH))

                input_files = None
                client_inputs = ClientInputs(client_inputs_list)
            else:
                input_files = self.get_input_files(job_wrapper)
                client_inputs = None

            if self.app.config.metadata_strategy == "legacy":
                # Drop this branch in 19.09.
                metadata_directory = job_wrapper.working_directory
            else:
                metadata_directory = os.path.join(job_wrapper.working_directory, "metadata")

            remote_pulsar_app_config = job_destination.params.get("pulsar_app_config", {})
            job_directory_files = []
            config_files = job_wrapper.extra_filenames
            tool_script = os.path.join(job_wrapper.working_directory, "tool_script.sh")
            if os.path.exists(tool_script):
                log.debug("Registering tool_script for Pulsar transfer [%s]" % tool_script)
                job_directory_files.append(tool_script)
            client_job_description = ClientJobDescription(
                command_line=command_line,
                input_files=input_files,
                client_inputs=client_inputs,  # Only one of these input defs should be non-None
                client_outputs=self.__client_outputs(client, job_wrapper),
                working_directory=job_wrapper.tool_working_directory,
                metadata_directory=metadata_directory,
                tool=job_wrapper.tool,
                config_files=config_files,
                dependencies_description=dependencies_description,
                env=client.env,
                rewrite_paths=rewrite_paths,
                arbitrary_files=path_rewrites_unstructured,
                touch_outputs=output_names,
                remote_pulsar_app_config=remote_pulsar_app_config,
                job_directory_files=job_directory_files,
                container=None if not remote_container else remote_container.container_id,
            )
            job_id = pulsar_submit_job(client, client_job_description, remote_job_config)
            log.info("Pulsar job submitted with job_id %s" % job_id)
            job_wrapper.set_job_destination(job_destination, job_id)
            job_wrapper.change_state(model.Job.states.QUEUED)
        except Exception:
            job_wrapper.fail("failure running job", exception=True)
            log.exception("failure running job %d", job_wrapper.job_id)
            return

        pulsar_job_state = AsynchronousJobState()
        pulsar_job_state.job_wrapper = job_wrapper
        pulsar_job_state.job_id = job_id
        pulsar_job_state.old_state = True
        pulsar_job_state.running = False
        pulsar_job_state.job_destination = job_destination
        self.monitor_job(pulsar_job_state)
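All of these examples end the same way: an AsynchronousJobState (or, for Pulsar, a state handed to monitor_job()) is filled with the external id, destination and old_state, then queued for the runner's monitor thread to poll. The toy stand-in below only illustrates that hand-off shape; it is an assumption about the minimum the monitor needs, not the Galaxy class:

from dataclasses import dataclass
from queue import Queue

@dataclass
class MinimalJobState:
    """Toy stand-in for AsynchronousJobState: just the fields the hand-off uses above."""
    job_id: str
    job_destination: object = None
    old_state: str = "new"
    running: bool = False

monitor_queue = Queue()
monitor_queue.put(MinimalJobState(job_id="12345"))

# A monitor thread would repeatedly drain this queue, query the external
# scheduler for each job_id, and update old_state/running accordingly
# (that behaviour is assumed here, not shown in the examples above).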