Example #1
File: tool.py Project: liek51/civet
        def tokenReplace(m):
            # get current pipeline symbols
            import pipeline_parse as PL
            tok = m.group(1)
            if tok in self.options:
                o = self.options[tok]
                if o.type == 'boolean':
                    if o.value:
                        return o.command_text
                    else:
                        return ''
                elif len(o.command_text) > 0 and (o.command_text[-1] == '=' or
                                                  o.command_text[-1] == ':'):
                    return o.command_text + o.value
                else:
                    return o.command_text + ' ' + o.value
            if tok in self.tool_files:
                f = self.tool_files[tok]
                if f.is_list:
                    if f.list_from_param:
                        return f.path.replace(',', ' ')
                    else:
                        # Emit the code to invoke a file filter.
                        return "$(process_filelist.py {0} '{1}')".format(
                            f.path, f.pattern)
                return f.path

            # We didn't match a known option or file id; report an error and abort.
            print("\n\nUNKNOWN OPTION OR FILE ID: '{}' in file {}".format(
                tok, self.tool.xml_file),
                  file=sys.stderr)
            print('Tool files: {}'.format(self.tool_files), file=sys.stderr)
            print('Options: {}\n\n'.format(self.options), file=sys.stderr)
            PL.abort_submit('UNKNOWN OPTION OR FILE ID: ' + tok)
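tokenReplace above is a substitution callback: it is meant to be handed to re.sub() so each token in a command template is replaced by the matching option text or file path. Below is a minimal, self-contained sketch of that pattern; the {token} placeholder syntax and the lookup tables are illustrative assumptions, not civet's actual structures.

import re

# Hypothetical stand-ins for self.options and self.tool_files.
options = {'threads': '-t 4'}
tool_files = {'in_bam': '/data/sample.bam'}

def token_replace(m):
    tok = m.group(1)
    if tok in options:
        return options[tok]
    if tok in tool_files:
        return tool_files[tok]
    raise KeyError('unknown option or file id: ' + tok)

# Replace every {token} placeholder in a command template.
template = 'samtools view {threads} {in_bam}'
command = re.sub(r'\{(\w+)\}', token_replace, template)
print(command)  # samtools view -t 4 /data/sample.bam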
Example #2
    def submit(self, name_prefix):

        import pipeline_parse as PL

        # Outline:
        #   get the list of files matching the foreach pattern
        #   for each matched file:
        #       build the related filenames
        #       register the file and its related files in the pipeline files list
        #       submit the step(s)
        #       clean the files out of the pipeline files list
        matched_files = []
        job_ids = []
        iteration = 0
        all_files = os.listdir(self.pipelineFiles[self.dir].path)
        for fn in all_files:
            if self.file.pattern.match(fn):
                matched_files.append(fn)

        # figure out if this foreach loop will exceed the limit
        total_jobs = 0
        for s in self.steps:
            for child in s:
                if child.tag == 'tool':
                    total_jobs += 1
        total_jobs *= len(matched_files)

        if total_jobs > ForEach.MAX_JOBS:
            PL.abort_submit("error submitting foreach: {} jobs exceed limit "
                            "(max = {})\n".format(total_jobs, ForEach.MAX_JOBS))

        for fn in matched_files:
            iteration += 1
            cleanups = []
            files_to_delete = []
            iteration_ids = []
            #TODO this is impossible to make sense of, create a static method in
            #PipelineFile that only takes the id, path, file list, and directory
            PipelineFile(self.file.id, fn, self.pipelineFiles, True, False,
                         True, False, False, None, None, None, None,
                         None, None, self.dir, False, False, None)
            cleanups.append(self.file.id)

            for id in self.relatedFiles:
                rel = self.relatedFiles[id]
                rfn = rel.pattern.sub(rel.replace, fn)
                if rel.is_input and not rel.indir:
                    # if no dir is given we assume related input files are in
                    # the same directory as the foreach file they relate to
                    directory = self.dir
                elif rel.indir:
                    directory = rel.indir
                else:
                    #related file is an output file and indir not specified
                    #write it to the default output directory
                    directory = None
                #TODO see comments for PipelineFile above. this is wicked ugly
                PipelineFile(rel.id, rfn, self.pipelineFiles,  True, False,
                             rel.is_input, False, False, None, None, None,
                             None, None, None, directory, False, False, None)
                cleanups.append(rel.id)
                if rel.is_temp:
                    files_to_delete.append(rel.id)
            PipelineFile.fix_up_files(self.pipelineFiles)

            step_iteration = 0
            for s in self.steps:
                step_iteration += 1
                step = Step(s, self.pipelineFiles)
                prefix = "{}-{}_S{}".format(name_prefix, iteration, step_iteration)
                for jid in step.submit(prefix):
                    job_ids.append(jid)
                    iteration_ids.append(jid)

            #submit a job that deletes all of the temporary files
            tmps = []
            for id in files_to_delete:
                tmps.append(self.pipelineFiles[id].path)

            if tmps:
                cmd = 'rm -f ' + ' '.join(tmps)
                cleanup_job = BatchJob(cmd, workdir=PipelineFile.get_output_dir(),
                               depends_on=iteration_ids,
                               name="{}-{}_temp_file_cleanup".format(name_prefix, iteration),
                               walltime="00:30:00",
                               email_list=PL.error_email_address)
                try:
                    cleanup_job_id = PL.job_runner.queue_job(cleanup_job)
                except Exception as e:
                    PL.abort_submit(e, PL.BATCH_ERROR)

                PL.all_batch_jobs.append(cleanup_job_id)

            # remove this iteration's files from the pipeline files list
            for fid in cleanups:
                del self.pipelineFiles[fid]

        # enqueue a "barrier" job that downstream tools can depend on, to wait
        # for every job submitted by this foreach
        barrier_job = BatchJob('echo "placeholder job used for synchronizing foreach jobs"',
                               workdir=PipelineFile.get_output_dir(),
                               depends_on=job_ids,
                               name="{0}_barrier".format(name_prefix),
                               walltime="00:02:00",
                               email_list=PL.error_email_address)
        try:
            job_id = PL.job_runner.queue_job(barrier_job)
        except Exception as e:
            sys.stderr.write(str(e) + '\n')
            sys.exit(PL.BATCH_ERROR)
        PL.all_batch_jobs.append(job_id)
        PL.foreach_barriers[self.id] = job_id
        return job_ids
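The guard near the top of submit() counts the <tool> children in every <step> and multiplies by the number of matched files, so an oversized foreach is rejected before anything is queued. A rough standalone sketch of that check, using ElementTree in place of civet's parsed step elements (the XML and the limit value are made up for illustration):

import xml.etree.ElementTree as ET

MAX_JOBS = 1000  # illustrative; civet uses ForEach.MAX_JOBS

steps = ET.fromstring("""
<foreach>
  <step><tool name="align"/><tool name="sort"/></step>
  <step><tool name="call"/></step>
</foreach>
""")
matched_files = ['s1.fastq', 's2.fastq', 's3.fastq']

# Count tool jobs per iteration, then scale by the number of matched files.
tools_per_iteration = sum(
    1 for step in steps for child in step if child.tag == 'tool')
total_jobs = tools_per_iteration * len(matched_files)

if total_jobs > MAX_JOBS:
    raise RuntimeError('foreach would submit {} jobs (max {})'.format(
        total_jobs, MAX_JOBS))
print(total_jobs)  # 9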
Example #3
File: tool.py Project: liek51/civet
    def submit(self, job_name, silent):
        """
        Submit the commands that comprise the tool as a single cluster job.


        :param job_name:  a unique (to the pipeline) job name.
        :param silent: if true, don't print job ID after it's submitted
        :return: job_id: a value which can be passed in as a depends_on list
                element in a subsequent tool submission.
        """

        # Get the current symbols in the pipeline...
        import pipeline_parse as PL

        # Now it is time to fix up the commands and write the script file.
        # We couldn't do this before, because we have to ensure that ALL
        # pipeline XML processing is done. (Tempfiles need an output dir,
        # and might have been specified before the output dir.)
        # Tempfiles specified in the pipeline have already been fixed up
        # and have paths.  Here in the tool, they appear as normal files.
        # This is different from tempfiles specified in the tool; they
        # really are temp, and can be stored locally on the node and cleaned
        # up on tool exit.
        #
        for c in self.commands:
            c.fixupOptionsFiles()
            # Add the command names to the verify_files list
            p = c.program
            if p and (p not in self.verify_files):
                self.verify_files.append(c.program)

        multi_command = self._build_multi_command()

        # Determine what jobs we depend on based on our input files.
        depends_on = []
        for fid in self.ins:
            f = self.pipeline_files[fid]
            if f.creator_job:
                j = f.creator_job
                if j not in depends_on:
                    depends_on.append(j)
            if f.is_list:
                if f.foreach_dep:
                    depends_on.append(PL.foreach_barriers[f.foreach_dep])

        # Do the actual batch job submission
        submit_threads = self.thread_option_max if self.thread_option_max else self.default_threads
        verify_file_list = self._build_veryify_file_list()

        if PL.delay:
            date_time = PL.delay_timestamp
        else:
            date_time = None

        batch_job = BatchJob(multi_command,
                             workdir=PipelineFile.get_output_dir(),
                             files_to_validate=verify_file_list,
                             ppn=submit_threads,
                             walltime=self.walltime,
                             modules=self.modules,
                             depends_on=depends_on,
                             name=job_name,
                             error_strings=self.error_strings,
                             version_cmds=self.collect_version_commands(),
                             files_to_test=self.exit_if_exists,
                             file_test_logic=self.exit_test_logic,
                             mem=self.mem,
                             date_time=date_time,
                             email_list=PL.error_email_address,
                             info=("Tool Definition File: " +
                                   os.path.abspath(self.xml_file)),
                             tool_path=self.path)

        try:
            job_id = PL.job_runner.queue_job(batch_job)
        except Exception as e:
            PL.abort_submit(e, PL.BATCH_ERROR)

        # Any files that we created and that will be passed to other jobs
        # need to be marked with our job id.  It is OK if we overwrite
        # a previous job.
        for fid in self.outs:
            f = self.pipeline_files[fid]
            f.set_creator_job(job_id)

        # Mark the files we depend on so that they're not cleaned up too
        # early.  Really only needs to be done for temp files, but for
        # simplicity, we mark them all.
        for fid in self.ins:
            f = self.pipeline_files[fid]
            f.add_consumer_job(job_id)

        if not silent:
            print("{0}: {1}".format(job_id, self.name_from_pipeline))
        return job_id
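The depends_on list above is derived from the creator_job recorded on each input file, and set_creator_job/add_consumer_job maintain that bookkeeping for downstream tools. A bare-bones sketch of the idea, using a hypothetical stand-in for PipelineFile's dependency fields:

class FileRecord(object):
    """Hypothetical stand-in for the dependency fields on PipelineFile."""
    def __init__(self):
        self.creator_job = None
        self.consumer_jobs = []

pipeline_files = {'bam': FileRecord(), 'vcf': FileRecord()}

# An upstream tool produced the BAM and recorded its job id on the file.
pipeline_files['bam'].creator_job = 'job-101'

# A downstream tool that reads the BAM depends on whichever job created it.
ins = ['bam']
depends_on = []
for fid in ins:
    creator = pipeline_files[fid].creator_job
    if creator and creator not in depends_on:
        depends_on.append(creator)

print(depends_on)  # ['job-101']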
Example #4
    def submit(self, name_prefix):
        """
        Submit the commands that comprise the tool as a single cluster job.


        :param name_prefix: a string, which when combined with this tool's
                name attribute, will result in a unique (to the pipeline)
                job name for the cluster.
        :return: job_id: a value which can be passed in as a depends_on list
                element in a subsequent tool submission.
        """

        # Get the current symbols in the pipeline...
        import pipeline_parse as PL

        #
        # Now it is time to fix up the commands and write the script file.
        # We couldn't do this before, because we have to ensure that ALL
        # pipeline XML processing is done. (Tempfiles need an output dir,
        # and might have been specified before the output dir.)
        # Tempfiles specified in the pipeline have already been fixed up
        # and have paths.  Here in the tool, they appear as normal files.
        # This is different from tempfiles specified in the tool; they
        # really are temp, and can be stored locally on the node and cleaned
        # up on tool exit.
        #
        for c in self.commands:
            c.fixupOptionsFiles()
            # Add the command names to the verify_files list
            p = c.program
            if p and (p not in self.verify_files):
                self.verify_files.append(c.program)

        # Build the job's command list. Per-command timestamps are currently
        # disabled; see the commented-out 'date' lines below.
        name = '{0}_{1}'.format(name_prefix, self.name_from_pipeline)
        multi_command_list = []
        for c in self.commands:
            # We're calling date too many times.
            # If we decide we need to time each command in the future, just
            # uncomment these two date lines.
            # multi_command_list.append('date')
            multi_command_list.append(c.real_command)
        # multi_command_list.append('date')

        # Tack on a final command to delete our temp files.
        if self.tempfile_ids:
            # Convert from file ids to paths.
            for n in range(len(self.tempfile_ids)):
                self.tempfile_ids[n] = (
                    self.tool_files[self.tempfile_ids[n]].path)

            # Use rm -f because if a command is executed conditionally
            # due to if_exists and if_not_exists, a temp file may not
            # exist.  Without -f the rm command would fail, causing
            # the entire pipeline to fail. We also need -r to take care of
            # <dir> tags declared temp
            rm_cmd = 'rm -rf ' + ' '.join(self.tempfile_ids)
            multi_command_list.append(rm_cmd)

        multi_command = '  && \\\n'.join(multi_command_list)

        # Determine what jobs we depend on based on our input files.
        depends_on = []
        for fid in self.ins:
            f = self.pipeline_files[fid]
            if f.creator_job:
                j = f.creator_job
                if j not in depends_on:
                    depends_on.append(j)
            if f.is_list:
                if f.foreach_dep:
                    depends_on.append(PL.foreach_barriers[f.foreach_dep])

        # Do the actual batch job submission
        if self.thread_option_max:
            submit_threads = self.thread_option_max
        else:
            submit_threads = self.default_threads

        if self.skip_validation:
            verify_file_list = None
        else:
            verify_file_list = self.verify_files
            #do we need to load a Python modulefile?
            need_python = True
            for m in self.modules:
                if m.startswith('python'):
                    need_python = False
            if need_python:
                if config.civet_job_python_module:
                    self.modules.append(config.civet_job_python_module)
                verify_file_list.append('python')

        if PL.delay:
            date_time = PL.delay_timestamp
        else:
            date_time = None

        batch_job = BatchJob(multi_command,
                             workdir=PipelineFile.get_output_dir(),
                             files_to_check=verify_file_list,
                             ppn=submit_threads,
                             walltime=self.walltime,
                             modules=self.modules,
                             depends_on=depends_on,
                             name=name,
                             error_strings=self.error_strings,
                             version_cmds=self.collect_version_commands(),
                             files_to_test=self.exit_if_exists,
                             file_test_logic=self.exit_test_logic,
                             mem=self.mem,
                             date_time=date_time,
                             email_list=PL.error_email_address,
                             info=("Tool Definition File: " +
                                   os.path.abspath(self.xml_file)),
                             tool_path=self.path)

        try:
            job_id = PL.job_runner.queue_job(batch_job)
        except Exception as e:
            PL.abort_submit(e, PL.BATCH_ERROR)

        # Any files that we created and that will be passed to other jobs
        # need to be marked with our job id.  It is OK if we overwrite
        # a previous job.
        for fid in self.outs:
            f = self.pipeline_files[fid]
            f.set_creator_job(job_id)

        # Mark the files we depend on so that they're not cleaned up too
        # early.  Really only needs to be done for temp files, but for
        # simplicity, we mark them all.
        for fid in self.ins:
            f = self.pipeline_files[fid]
            f.add_consumer_job(job_id)

        print("{0}: {1}".format(job_id, self.name_from_pipeline))
        return job_id
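The script body is built by joining the per-command strings with '&&', so the batch script stops at the first failing command and the trailing 'rm -rf' of temp files only runs once everything before it has succeeded. A tiny illustration of the join (the commands themselves are invented):

commands = [
    'bwa mem ref.fa reads.fq > out.sam',
    'samtools sort out.sam -o out.bam',
    'rm -f out.sam',
]
multi_command = '  && \\\n'.join(commands)
print(multi_command)
# bwa mem ref.fa reads.fq > out.sam  && \
# samtools sort out.sam -o out.bam  && \
# rm -f out.sam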