Example #1
    def poll(self, nonBlocking):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """
        text_line = '-' * 120

        # Set a delimiter that distinguishes entries between different jobs
        delimiter = ','
        # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
        # 1) squeue -h -u {{user}} -o '%i %256k'
        #      Collects the list of running jobs
        #        a) -h omits header
        #        b) -u {{user}} looks only for jobs submitted by {{user}}
        #        c) -o '%i %256k' specifies the output format
        #           i)  %i -- job ID (1st column)
        #           ii) %256k -- comment with width of 256 characters (2nd column)
        #               If the job has no comments, the entry simply reads (null)
        # 2) grep {{comment}}
        #       Filter the jobs by the comment which must be unique per sbatchManager instance at all times
        # 3) awk '{print $1}'
        #       Filter only the jobIds out
        # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
        #       Place all job IDs to one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
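        # For illustration only -- with hypothetical values user='jdoe', pool ID 'pool_3f2a' and the
        # default pool ID length of 256, the rendered command would read roughly:
        #   squeue -h -u jdoe -o '%i %256k' | grep pool_3f2a | awk '{print $1}' | sed ':a;N;$!ba;s/\n/,/g'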
        command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                           "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
        command = jinja2.Template(command_template).render(
            user=self.user,
            pool_id_length=self.max_pool_id_length,
            comment=self.pool_id,
            delimiter=delimiter)

        # Initially, all jobs are marked as submitted so we have to go through all jobs and check their exit codes
        # even if some of them have already finished
        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        while nofJobs_left > 0:
            # Get the list of jobs submitted to batch system and convert their jobIds to a set
            poll_result, poll_result_err = '', ''
            while True:
                poll_result, poll_result_err = run_cmd(command,
                                                       do_not_log=False,
                                                       return_stderr=True)
                if not poll_result and poll_result_err:
                    logging.warning(
                        'squeue caught an error: {squeue_error}'.format(
                            squeue_error=poll_result_err))
                else:
                    break
                # sleep a minute and then try again
                # in principle we could limit the number of retries, but hopefully that's not necessary
                logging.debug("sleeping for %i seconds." % 60)
                time.sleep(60)
            polled_ids = set()
            if poll_result != '':
                polled_ids = set(poll_result.split(delimiter))

            # Check if number of jobs submitted to batch system is below maxSubmittedJobs;
            # if it is, take jobs from queuedJobs list and submit them,
            # until a total of maxSubmittedJobs is submitted to batch system
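            # For example (hypothetical numbers): with maxSubmittedJobs = 100, 95 jobs currently in the
            # batch system and 20 jobs still queued, min(20, 100 - 95) = 5 jobs are submitted in this iteration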
            nofJobs_toSubmit = min(len(self.queuedJobs),
                                   self.maxSubmittedJobs - len(polled_ids))
            if nofJobs_toSubmit > 0:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs."
                    .format(len(polled_ids), len(self.queuedJobs),
                            nofJobs_toSubmit))
            else:
                logging.debug(
                    "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing."
                    .format(len(polled_ids), len(self.queuedJobs)))
            for i in range(0, nofJobs_toSubmit):
                # randomly submit a job from the queue
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_idx = len(self.queuedJobs) - 1
                random_idx = random.randint(0, max_idx)
                job = self.queuedJobs.pop(random_idx)
                job['status'] = Status.submitted
                job_id = self.submit(job['sbatch_command'])
                self.submittedJobs[job_id] = job

            # Now check status of jobs submitted to batch system:
            # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
            # jobs that have finished already
            finished_ids = list(jobIds_set - polled_ids)

            # Do not poll anything if currently there are no finished jobs
            if finished_ids:
                # Based on the job's exit code, determine whether the job has failed or completed successfully
                # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
                # Therefore, we want to restrict the output by grepping specific job IDs
                # There's another problem with that: the length of a bash command is limited by ARG_MAX kernel variable,
                # which is of order 2e6
                # This means that we have to split the job IDs into chunks each of which we have to check separately
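                # Illustrative example (hypothetical values): with max_nof_greps = 2 and
                # finished_ids = ['101', '102', '103'], the chunks are [['101', '102'], ['103']],
                # and each chunk is passed to check_job_completion separately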
                finished_ids_chunks = [
                    finished_ids[i:i + self.max_nof_greps]
                    for i in range(0, len(finished_ids), self.max_nof_greps)
                ]
                for finished_ids_chunk in finished_ids_chunks:
                    completion = self.check_job_completion(finished_ids_chunk)
                    completed_jobs, running_jobs, failed_jobs = [], [], []
                    for job_id, details in completion.iteritems():
                        if details.status == Status.completed:
                            completed_jobs.append(job_id)
                        elif details.status == Status.running:
                            running_jobs.append(job_id)
                        else:
                            failed_jobs.append(job_id)
                    # If there are any failed jobs, report them and either resubmit or abort the workflow
                    if failed_jobs:
                        failed_jobs_str = ','.join(failed_jobs)
                        errors = [
                            completion[job_id].status for job_id in failed_jobs
                        ]
                        logging.error(
                            "Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}".format(
                                jobIds=failed_jobs_str,
                                reasons=', '.join(map(Status.toString, errors)),
                            ))

                        # Let's print a table where the first column corresponds to the job ID
                        # and the second column lists the exit code, the derived exit code, the status
                        # and the classification of the failed job
                        logging.error("Error table:")
                        for job_id in failed_jobs:
                            sys.stderr.write(
                                "{jobId} {exitCode} {derivedExitCode} {state} {status}\n".format(
                                    jobId=job_id,
                                    exitCode=completion[job_id].exit_code,
                                    derivedExitCode=completion[job_id].derived_exit_code,
                                    state=completion[job_id].state,
                                    status=Status.toString(completion[job_id].status),
                                ))

                        sys.stderr.write('%s\n' % text_line)
                        for failed_job in failed_jobs:
                            for log in zip(['wrapper', 'executable'],
                                           ['log_wrap', 'log_exec']):
                                logfile = self.submittedJobs[failed_job][log[1]]
                                if os.path.isfile(logfile):
                                    with open(logfile, 'r') as logfile_handle:
                                        logfile_contents = logfile_handle.read()
                                else:
                                    logfile_contents = '<file is missing>'
                                sys.stderr.write(
                                    'Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'
                                    .format(
                                        id=failed_job,
                                        description=log[0],
                                        path=logfile,
                                        log=logfile_contents,
                                        line=text_line,
                                    ))

                            if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                               completion[failed_job].status == Status.io_error:
                                # The job is eligible for resubmission if the job hasn't been resubmitted more
                                # than a preset limit of resubmissions AND if the job failed due to I/O errors
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                    "-> resubmission attempt #{attempt}".format(
                                        id=failed_job,
                                        args=self.submittedJobs[failed_job]['args'],
                                        reason=Status.toString(completion[failed_job].status),
                                        attempt=self.submittedJobs[failed_job]['nof_submissions'],
                                    ))
                                self.submitJob(
                                    *self.submittedJobs[failed_job]['args'])
                                # The old ID must be deleted, b/c otherwise it would be used to compare against
                                # squeue output and we would resubmit the failed job ad infinitum
                                del self.submittedJobs[failed_job]
                            else:
                                # We've exceeded the maximum number of resubmissions -> fail the workflow
                                raise Status.raiseError(
                                    completion[failed_job].status)
                    else:
                        logging.debug(
                            "Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}".format(
                                completedIds=','.join(completed_jobs),
                                runningInfo='(%s still running)' % ','.join(running_jobs) if running_jobs else '',
                            ))
                    # Mark successfully finished jobs as completed so that we don't request their status again;
                    # otherwise they would remain in the 'submitted' state
                    for job_id in completed_jobs:
                        if not all(
                                map(
                                    lambda outputFile: is_file_ok(
                                        outputFile,
                                        validate_outputs=True,
                                        min_file_size=self.min_file_size),
                                    self.submittedJobs[job_id]['outputFiles'])):
                            if self.submittedJobs[job_id]['nof_submissions'] < self.max_resubmissions:
                                logging.warning(
                                    "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                    "-> resubmission attempt #{attempt}".format(
                                        id=job_id,
                                        args=self.submittedJobs[job_id]['args'],
                                        attempt=self.submittedJobs[job_id]['nof_submissions'],
                                    ))
                                self.submitJob(*self.submittedJobs[job_id]['args'])
                                del self.submittedJobs[job_id]
                            else:
                                raise ValueError(
                                    "Job w/ ID {id} FAILED because it repeatedly produces bogus output "
                                    "file {output} yet the job still exits w/o any errors".format(
                                        id=job_id,
                                        output=', '.join(self.submittedJobs[job_id]['outputFiles']),
                                    ))
                        else:
                            # Job completed just fine
                            self.submittedJobs[job_id]['status'] = Status.completed

            jobIds_set = set([
                job_id for job_id in self.submittedJobs
                if self.submittedJobs[job_id]['status'] == Status.submitted
            ])
            nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
            logging.info(
                "Waiting for sbatch to finish (%d job(s) still left) ..." %
                nofJobs_left)
            if nofJobs_left > 0:
                if nonBlocking:
                    return False
                two_pow_sixteen = 65536
                random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
                max_delay = 300
                random_delay = random.randint(0, max_delay)
                logging.debug("sleeping for %i seconds." % random_delay)
                time.sleep(self.poll_interval + random_delay)
            else:
                break

        return True
Example #2
    def submitJob(
        self,
        inputFiles,
        executable,
        command_line_parameter,
        outputFilePath,
        outputFiles,
        scriptFile,
        logFile=None,
        skipIfOutputFileExists=False,
        job_template_file='sbatch-node.sh.template',
        copy_output_file=True,
        nof_submissions=0,
    ):
        """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
        """

        logging.debug("<sbatchManager::submitJob>: job_template_file = '%s'" %
                      job_template_file)

        job_template_file = os.path.join(jinja_template_dir, job_template_file)
        with open(job_template_file, 'r') as job_template_handle:
            job_template = job_template_handle.read()

        # derive the log file name from the script file if none was given (requires logFileDir to be set)
        if not logFile:
            if not self.logFileDir:
                raise ValueError(
                    "Please call 'setLogFileDir' before calling 'submitJob' !!"
                )
            logFile = os.path.join(
                self.logFileDir,
                os.path.basename(scriptFile).replace(".sh", ".log"))

        # skip only if none of the output files are missing in the file system
        outputFiles_fullpath = map(
            lambda outputFile: os.path.join(outputFilePath, outputFile),
            outputFiles)
        if skipIfOutputFileExists:
            outputFiles_missing = [
                outputFile for outputFile in outputFiles_fullpath \
                if not is_file_ok(outputFile, validate_outputs = True, min_file_size = self.min_file_size)
            ]
            if not outputFiles_missing:
                logging.debug(
                  "output file(s) = %s exist(s) --> skipping !!" % \
                  '; '.join(map(lambda x: "'%s'" % x, outputFiles_fullpath))
                )
                return

        if not self.workingDir:
            raise ValueError(
                "Please call 'setWorkingDir' before calling 'submitJob' !!")

        if not self.cmssw_base_dir:
            logging.warning("cmssw_base_dir not set, setting it to '%s'" %
                            os.environ.get('CMSSW_BASE'))
            self.cmssw_base_dir = os.environ.get('CMSSW_BASE')

        job_dir = self.get_job_dir()

        # create script for executing jobs
        wrapper_log_file = logFile.replace('.log', '_wrapper.log')
        executable_log_file = logFile.replace('.log', '_executable.log')
        wrapper_log_file, executable_log_file = get_log_version(
            (wrapper_log_file, executable_log_file))

        sbatch_command = "sbatch --partition={partition} --output={output} --comment='{comment}' " \
                         "{max_mem} {args} {cmd}".format(
          partition = self.queue,
          output    = wrapper_log_file,
          comment   = self.pool_id,
          args      = self.sbatchArgs,
          cmd       = scriptFile,
          max_mem   = '--mem={}'.format(self.max_mem) if self.max_mem else '',
        )
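        # For illustration only -- with hypothetical values (partition 'main', pool ID 'pool_3f2a',
        # max_mem '2048M' and empty sbatchArgs), the command rendered above might read:
        #   sbatch --partition=main --output=/path/to/job_wrapper.log --comment='pool_3f2a' --mem=2048M  /path/to/job.sh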

        two_pow_sixteen = 65536
        random.seed((abs(hash(command_line_parameter))) % two_pow_sixteen)
        max_delay = 60
        random_delay = random.randint(0, max_delay)

        script = jinja2.Template(job_template).render(
            working_dir=self.workingDir,
            cmssw_base_dir=self.cmssw_base_dir,
            job_dir=job_dir,
            job_template_file=job_template_file,
            exec_name=executable,
            command_line_parameter=command_line_parameter,
            inputFiles=" ".join(inputFiles),
            outputDir=outputFilePath,
            outputFiles=" ".join(outputFiles),
            wrapper_log_file=wrapper_log_file,
            executable_log_file=executable_log_file,
            script_file=scriptFile,
            RUNNING_COMMAND=sbatch_command,
            random_sleep=random_delay,
            copy_output_file=copy_output_file,
        )
        logging.debug("writing sbatch script file = '%s'" % scriptFile)
        with codecs.open(scriptFile, "w", "utf-8") as f:
            f.write(script)
            f.flush()
            os.fsync(f.fileno())

        if self.dry_run:
            return

        nof_submissions += 1
        # 'args' is re-used by poll() as self.submitJob(*args) on resubmission, so the order and
        # number of entries must match the submitJob signature
        job = {
            'sbatch_command': sbatch_command,
            'status': Status.in_queue,
            'log_wrap': wrapper_log_file,
            'log_exec': executable_log_file,
            'args': (
                inputFiles,
                executable,
                command_line_parameter,
                outputFilePath,
                outputFiles,
                scriptFile,
                logFile,
                skipIfOutputFileExists,
                job_template_file,
                copy_output_file,
                nof_submissions,
            ),
            'nof_submissions': nof_submissions,
            'outputFiles': outputFiles_fullpath,
        }
        self.queuedJobs.append(job)
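
A minimal sketch of how the two methods above are typically combined, assuming an already configured sbatchManager instance named manager (queue, pool ID etc. set elsewhere); the setter names setWorkingDir and setLogFileDir are taken from the error messages in submitJob, while all file names and arguments below are hypothetical:

    manager.setWorkingDir('/path/to/workdir')
    manager.setLogFileDir('/path/to/logs')
    for idx in range(3):
        # queue three jobs; each call only writes the sbatch script and appends to queuedJobs
        manager.submitJob(
            inputFiles=['input_%d.root' % idx],
            executable='analyze.py',
            command_line_parameter='--index %d' % idx,
            outputFilePath='/path/to/output',
            outputFiles=['output_%d.root' % idx],
            scriptFile='/path/to/scripts/job_%d.sh' % idx,
        )
    # submit the queued jobs in batches and block until everything has finished (or failed for good)
    manager.poll(nonBlocking=False)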