Example #1
    def prepare_outfiles(self, data):
        """
        Construct validated FileSpec objects for output and log files from the raw dict `data`.
        Note: final preparation for output files can only be done after the payload has finished in case the payload
        has produced a job report with e.g. output file guids. This is verified in
        pilot/control/payload/process_job_report().

        :param data: raw job description dict (the job structure sent by PanDA).
        :return: (list of `FileSpec` for output, list of `FileSpec` for log)
        """

        # form raw list data from input comma-separated values for further validation by FileSpec
        kmap = {
            # 'internal_name': 'ext_key_structure'
            'lfn': 'outFiles',
            ##'??': 'destinationDblock', '??define_proper_internal_name': 'destinationDBlockToken',
            'dataset': 'realDatasets', 'scope': 'scopeOut',
            ##'??define_internal_key':'prodDBlocks', '??':'dispatchDBlockTokenForOut',
            'ddmendpoint': 'ddmEndPointOut',
        }

        ksources = dict([k, self.clean_listdata(data.get(k, ''), list, k, [])] for k in kmap.itervalues())

        # unify scopeOut structure: add scope of log file
        log_lfn = data.get('logFile')
        if log_lfn:
            scope_out = []
            for lfn in ksources.get('outFiles', []):
                if lfn == log_lfn:
                    scope_out.append(data.get('scopeLog'))
                else:
                    if not ksources['scopeOut']:
                        raise Exception('Failed to extract scopeOut parameter from Job structure sent by Panda, please check input format!')
                    scope_out.append(ksources['scopeOut'].pop(0))
            ksources['scopeOut'] = scope_out

        ret_output, ret_log = [], []

        lfns = set()
        for ind, lfn in enumerate(ksources['outFiles']):
            if lfn in ['', 'NULL'] or lfn in lfns:  # exclude null data and duplicates
                continue
            lfns.add(lfn)
            idat = {}
            for attrname, k in kmap.iteritems():
                idat[attrname] = ksources[k][ind] if len(ksources[k]) > ind else None

            ftype = 'output'
            ret = ret_output
            if lfn == log_lfn:  # log file case
                ftype = 'log'
                idat['guid'] = data.get('logGUID')
                ret = ret_log
            elif lfn.endswith('.lib.tgz'):  # build job case, generate a guid for the lib file
                idat['guid'] = get_guid()

            finfo = FileSpec(filetype=ftype, **idat)
            ret.append(finfo)

        return ret_output, ret_log
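
A minimal, self-contained sketch of the same data flow follows: the comma-separated PanDA job fields are split into lists and zipped into per-file records, with the log file picked out by name. SimpleFileSpec, the sample data dict and its field values are stand-ins invented for illustration, not the pilot's FileSpec class or a real job description.

from dataclasses import dataclass

@dataclass
class SimpleFileSpec:  # stand-in for the pilot's FileSpec class
    filetype: str
    lfn: str
    dataset: str
    scope: str
    ddmendpoint: str

# hypothetical PanDA-style job description with comma-separated file fields
data = {
    'outFiles': 'out1.root,payload.log',
    'realDatasets': 'mc.dataset.out,mc.dataset.log',
    'scopeOut': 'mc16_13TeV',
    'scopeLog': 'mc16_13TeV',
    'ddmEndPointOut': 'SITE_DATADISK,SITE_DATADISK',
    'logFile': 'payload.log',
}

def split(value):
    """Split a comma-separated job field into a list, dropping empty entries."""
    return [v for v in value.split(',') if v]

lfns = split(data['outFiles'])
datasets = split(data['realDatasets'])
endpoints = split(data['ddmEndPointOut'])
scopes = split(data['scopeOut'])  # one scope per non-log output file

output, log = [], []
for ind, lfn in enumerate(lfns):
    is_log = (lfn == data['logFile'])
    spec = SimpleFileSpec(
        filetype='log' if is_log else 'output',
        lfn=lfn,
        dataset=datasets[ind] if ind < len(datasets) else None,
        scope=data['scopeLog'] if is_log else scopes.pop(0),
        ddmendpoint=endpoints[ind] if ind < len(endpoints) else None,
    )
    (log if is_log else output).append(spec)

print(output)  # one 'output' file record
print(log)     # one 'log' file record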
Example #2
    def _get_all_output(self, ksources, kmap, log_lfn, data):
        """
        Create lists of FileSpec objects for output and log files.
        Helper function for prepare_output().

        :param ksources: dict of lists parsed from the comma-separated job description fields.
        :param kmap: mapping of internal FileSpec attribute names to job description keys.
        :param log_lfn: log file name (string).
        :param data: raw job description dict (the job structure sent by PanDA).
        :return: ret_output (list of FileSpec), ret_log (list of FileSpec)
        """

        ret_output, ret_log = [], []

        lfns = set()
        for ind, lfn in enumerate(ksources['outFiles']):
            if lfn in ['', 'NULL'] or lfn in lfns:  # exclude null data and duplicates
                continue
            lfns.add(lfn)
            idat = {}
            try:
                for attrname, k in list(kmap.items()):  # Python 3
                    idat[attrname] = ksources[k][ind] if len(ksources[k]) > ind else None
            except Exception:
                for attrname, k in kmap.iteritems():  # Python 2
                    idat[attrname] = ksources[k][ind] if len(ksources[k]) > ind else None

            ftype = 'output'
            ret = ret_output
            if lfn == log_lfn:  # log file case
                ftype = 'log'
                idat['guid'] = data.get('logGUID')
                ret = ret_log
            elif lfn.endswith('.lib.tgz'):  # build job case, generate a guid for the lib file
                idat['guid'] = get_guid()

            finfo = FileSpec(filetype=ftype, **idat)
            ret.append(finfo)

        return ret_output, ret_log
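
The try/except around list(kmap.items()) and kmap.iteritems() exists only to run on both Python 3 and Python 2; note that except Exception will also swallow unrelated errors raised inside the first loop. A hedged alternative (a sketch, not part of the pilot code) is an explicit version check:

import sys

def dict_items(d):
    """Iterate over (key, value) pairs on both Python 2 and Python 3."""
    return d.iteritems() if sys.version_info[0] == 2 else d.items()

kmap = {'lfn': 'outFiles', 'dataset': 'realDatasets', 'scope': 'scopeOut'}
for attrname, key in dict_items(kmap):
    print(attrname, key)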
Example #3
def process_metadata_from_xml(job):
    """
    Extract necessary metadata from XML when job report is not available.

    :param job: job object.
    :return: [updated job object - return not needed].
    """

    # get the metadata from the xml file instead, which must exist for most production transforms
    path = os.path.join(job.workdir, config.Payload.metadata)
    if os.path.exists(path):
        job.metadata = read_file(path)
    else:
        if not job.is_analysis() and job.transformation != 'Archive_tf.py':
            diagnostics = 'metadata does not exist: %s' % path
            logger.warning(diagnostics)
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                errors.NOPAYLOADMETADATA)
            job.piloterrorcode = errors.NOPAYLOADMETADATA
            job.piloterrordiag = diagnostics

    # add missing guids
    for dat in job.outdata:
        if not dat.guid:
            # try to read it from the metadata before the last resort of generating it
            metadata = None
            try:
                metadata = get_metadata_from_xml(job.workdir)
            except Exception as e:
                msg = "Exception caught while interpreting XML: %s (ignoring it, but guids must now be generated)" % e
                logger.warning(msg)
            if metadata:
                dat.guid = get_guid_from_xml(metadata, dat.lfn)
                logger.info('read guid for lfn=%s from xml: %s' %
                            (dat.lfn, dat.guid))
            else:
                dat.guid = get_guid()
                logger.info('generated guid for lfn=%s: %s' %
                            (dat.lfn, dat.guid))
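
The helpers get_metadata_from_xml() and get_guid_from_xml() are not shown above. As a rough illustration only, the sketch below assumes a POOL-file-catalog style XML layout and a uuid-based guid generator; the real pilot helpers may parse a different structure.

import uuid
import xml.etree.ElementTree as ET

def generate_guid():
    """Hypothetical stand-in for get_guid(): a random UUID string."""
    return str(uuid.uuid4())

def guid_from_catalog(xml_text, lfn):
    """Return the GUID recorded for `lfn` in a POOL-file-catalog style XML, or None if absent."""
    root = ET.fromstring(xml_text)
    for file_elem in root.iter('File'):
        names = [entry.get('name') for entry in file_elem.iter('lfn')]
        if lfn in names:
            return file_elem.get('ID')
    return None

sample_xml = """<POOLFILECATALOG>
  <File ID="A1B2C3D4-0000-1111-2222-333344445555">
    <logical><lfn name="out1.root"/></logical>
  </File>
</POOLFILECATALOG>"""

guid = guid_from_catalog(sample_xml, 'out1.root') or generate_guid()
print(guid)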
Example #4
def execute_payloads(queues, traces, args):  # noqa: C901
    """
    Execute queued payloads.

    Extract a Job object from the "validated_payloads" queue and put it in the "monitored_payloads" queue. The payload
    stdout/err streams are opened and the pilot state is changed to "starting". A payload executor is selected (for
    executing a normal job, an event service job or event service merge job). After the payload (or rather its executor)
    is started, the thread will wait for it to finish and then check for any failures. A successfully completed job is
    placed in the "finished_payloads" queue, and a failed job will be placed in the "failed_payloads" queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    job = None
    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            job = queues.validated_payloads.get(block=True, timeout=1)

            q_snapshot = list(queues.finished_data_in.queue)
            peek = [s_job for s_job in q_snapshot if job.jobid == s_job.jobid]
            if len(peek) == 0:
                #queues.validated_payloads.put(job)
                put_in_queue(job, queues.validated_payloads)
                for i in range(10):  # Python 3
                    if args.graceful_stop.is_set():
                        break
                    time.sleep(1)
                continue

            # this job is now to be monitored, so add it to the monitored_payloads queue
            #queues.monitored_payloads.put(job)
            put_in_queue(job, queues.monitored_payloads)

            logger.info('job %s added to monitored payloads queue' % job.jobid)

            try:
                out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'wb')
                err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'wb')
            except Exception as e:
                logger.warning('failed to open payload stdout/err: %s' % e)
                out = None
                err = None
            send_state(job, args, 'starting')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning('job state is \'failed\' - abort execute_payloads()')
                break

            payload_executor = get_payload_executor(args, job, out, err, traces)
            logger.info("Got payload executor: %s" % payload_executor)

            show_memory_usage()

            # run the payload and measure the execution time
            job.t0 = os.times()
            exit_code = payload_executor.run()

            set_cpu_consumption_time(job)
            job.transexitcode = exit_code % 255

            out.close()
            err.close()

            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()

            # some HPO jobs will produce new output files (following lfn name pattern), discover those and replace the job.outdata list
            if job.is_hpo:
                user = __import__('pilot.user.%s.common' % pilot_user,
                                  globals(), locals(), [pilot_user],
                                  0)  # Python 2/3
                try:
                    user.update_output_for_hpo(job)
                except Exception as e:
                    logger.warning('exception caught by update_output_for_hpo(): %s' % e)
                else:
                    for dat in job.outdata:
                        if not dat.guid:
                            dat.guid = get_guid()
                            logger.warning(
                                'guid not set: generated guid=%s for lfn=%s' %
                                (dat.guid, dat.lfn))

            #if traces.pilot['nr_jobs'] == 1:
            #    logger.debug('faking job failure in first multi-job')
            #    job.transexitcode = 1
            #    exit_code = 1

            # analyze and interpret the payload execution output
            perform_initial_payload_error_analysis(job, exit_code)

            # was an error already found?
            #if job.piloterrorcodes:
            #    exit_code_interpret = 1
            #else:
            user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(),
                              locals(), [pilot_user], 0)  # Python 2/3
            try:
                exit_code_interpret = user.interpret(job)
            except Exception as e:
                logger.warning('exception caught: %s' % e)
                #exit_code_interpret = -1
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.INTERNALPILOTPROBLEM)

            if job.piloterrorcodes:
                exit_code_interpret = 1

            if exit_code_interpret == 0 and exit_code == 0:
                logger.info('main payload error analysis completed - did not find any errors')

                # update output lists if zipmaps were used
                #job.add_archives_to_output_lists()

                # queues.finished_payloads.put(job)
                put_in_queue(job, queues.finished_payloads)
            else:
                logger.debug(
                    'main payload error analysis completed - adding job to failed_payloads queue'
                )
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)

        except queue.Empty:
            continue
        except Exception as e:
            logger.fatal(
                'execute payloads caught an exception (cannot recover): %s, %s'
                % (e, traceback.format_exc()))
            if job:
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.PAYLOADEXECUTIONEXCEPTION)
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)
            while not args.graceful_stop.is_set():
                # let stage-out of log finish, but stop running payloads as there should be a problem with the pilot
                time.sleep(5)

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.info('[payload] execute_payloads thread has finished')
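
Below is a stripped-down, self-contained sketch of the consume-execute-route pattern the loop above follows (a blocking get with a timeout, a stop event, and routing to finished/failed queues). The names and the callable "payloads" are generic stand-ins, not the pilot's queues, Job objects or executors.

import queue
import threading
import time

def worker(in_q, ok_q, failed_q, stop_event):
    """Consume items from in_q, 'execute' them, and route them to ok_q or failed_q."""
    while not stop_event.is_set():
        try:
            item = in_q.get(block=True, timeout=1)
        except queue.Empty:
            continue
        try:
            exit_code = item()  # the payload: any callable returning an exit code
        except Exception:
            failed_q.put(item)
            continue
        (ok_q if exit_code == 0 else failed_q).put(item)

stop = threading.Event()
in_q, ok_q, failed_q = queue.Queue(), queue.Queue(), queue.Queue()
t = threading.Thread(target=worker, args=(in_q, ok_q, failed_q, stop))
t.start()

in_q.put(lambda: 0)  # a payload that succeeds
in_q.put(lambda: 1)  # a payload that fails
time.sleep(2)
stop.set()
t.join()
print('finished: %d, failed: %d' % (ok_q.qsize(), failed_q.qsize()))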
Example #5
def process_job_report(job):
    """
    Process the job report produced by the payload/transform if it exists.
    Payload error codes and diagnostics, as well as payload metadata (for output files) and stageout type will be
    extracted. The stageout type is either "all" (i.e. stage out both output and log files) or "log" (i.e. only the log
    file will be staged out).
    Note: some fields might be experiment specific. A call to a user function is therefore also done.

    :param job: job object; the function updates it and sets several of its fields.
    :return:
    """

    log = get_logger(job.jobid)

    # get the job report
    path = os.path.join(job.workdir, config.Payload.jobreport)
    if not os.path.exists(path):
        log.warning(
            'job report does not exist: %s (any missing output file guids must be generated)'
            % path)

        # get the metadata from the xml file instead, which must exist for most production transforms
        path = os.path.join(job.workdir, config.Payload.metadata)
        if os.path.exists(path):
            job.metadata = read_file(path)
        else:
            if not job.is_analysis() and job.transformation != 'Archive_tf.py':
                diagnostics = 'metadata does not exist: %s' % path
                log.warning(diagnostics)
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.NOPAYLOADMETADATA)
                job.piloterrorcode = errors.NOPAYLOADMETADATA
                job.piloterrordiag = diagnostics

        # add missing guids
        for dat in job.outdata:
            if not dat.guid:
                dat.guid = get_guid()
                log.warning('guid not set: generated guid=%s for lfn=%s' %
                            (dat.guid, dat.lfn))

    else:
        with open(path) as data_file:
            # compulsory field; the payload must produce a job report (see config file for file name), attach it to the
            # job object
            job.metadata = json.load(data_file)

            #
            update_job_data(job)

            # compulsory fields
            try:
                job.exitcode = job.metadata['exitCode']
            except Exception as e:
                log.warning(
                    'could not find compulsory payload exitCode in job report: %s (will be set to 0)'
                    % e)
                job.exitcode = 0
            else:
                log.info('extracted exit code from job report: %d' %
                         job.exitcode)
            try:
                job.exitmsg = job.metadata['exitMsg']
            except Exception as e:
                log.warning(
                    'could not find compulsory payload exitMsg in job report: %s '
                    '(will be set to empty string)' % e)
                job.exitmsg = ""
            else:
                # assign special payload error code
                if "got a SIGSEGV signal" in job.exitmsg:
                    diagnostics = 'Invalid memory reference or a segmentation fault in payload: %s (job report)' % \
                                  job.exitmsg
                    log.warning(diagnostics)
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                        errors.PAYLOADSIGSEGV)
                    job.piloterrorcode = errors.PAYLOADSIGSEGV
                    job.piloterrordiag = diagnostics
                else:
                    log.info('extracted exit message from job report: %s' %
                             job.exitmsg)
                    if job.exitmsg != 'OK':
                        job.exeerrordiag = job.exitmsg
                        job.exeerrorcode = job.exitcode

            if job.exitcode != 0:
                # get list with identified errors in job report
                job_report_errors = get_job_report_errors(job.metadata, log)

                # is it a bad_alloc failure?
                bad_alloc, diagnostics = is_bad_alloc(job_report_errors, log)
                if bad_alloc:
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                        errors.BADALLOC)
                    job.piloterrorcode = errors.BADALLOC
                    job.piloterrordiag = diagnostics
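
For orientation, a minimal sketch of the kind of input this branch expects: a JSON job report with at least the compulsory exitCode and exitMsg fields, read with the same fall-back behaviour as above. The sample report content is invented for illustration.

import json

# hypothetical, minimal job report (real reports carry much more structure)
report_text = '{"exitCode": 65, "exitMsg": "got a SIGSEGV signal"}'
metadata = json.loads(report_text)

try:
    exitcode = metadata['exitCode']
except KeyError:
    exitcode = 0  # compulsory field missing: fall back to 0, as in process_job_report()

exitmsg = metadata.get('exitMsg', '')
if 'got a SIGSEGV signal' in exitmsg:
    print('segmentation fault reported by payload: %s' % exitmsg)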