Exemplo n.º 1
0
def interrupt(args, signum, frame):
    """
    Interrupt function on the receiving end of kill signals.
    This function is forwarded any incoming signals (SIGINT, SIGTERM, etc) and will set abort_job which instructs
    the threads to abort the job.

    The signal number is translated to its symbolic name (e.g. 'SIGINT') which is stored in args.signal. If the
    number does not correspond to any known signal name, the number itself is stored as a string.

    The function blocks until args.job_aborted has been set, and finally sets args.graceful_stop.

    :param args: pilot arguments.
    :param signum: signal.
    :param frame: stack/execution frame pointing to the frame that was interrupted by the signal.
    :return:
    """

    # only real signal names (SIGINT, SIGTERM, ..) are candidates: other constants in the signal module
    # (e.g. SIG_IGN, SIG_SETMASK, ITIMER_PROF) are plain integers as well and can compare equal to a signal number.
    # vars(..).items() works on both Python 2 and 3; sorting makes the choice between aliases deterministic
    names = sorted(name for name, value in vars(signal).items()
                   if name.startswith('SIG') and not name.startswith('SIG_') and value == signum)
    sig = names[0] if names else str(signum)

    # time stamps are stored for both timing entries '0' and '1'
    add_to_pilot_timing('0', PILOT_KILL_SIGNAL, time(), args)
    add_to_pilot_timing('1', PILOT_KILL_SIGNAL, time(), args)
    logger.warning('caught signal: %s', sig)
    args.signal = sig
    logger.warning('will instruct threads to abort and update the server')
    args.abort_job.set()
    logger.warning('waiting for threads to finish')
    args.job_aborted.wait()
    logger.warning('setting graceful stop (in case it was not set already), pilot will abort')
    args.graceful_stop.set()
Exemplo n.º 2
0
 def post_payload(self, job):
     """
     Perform the pilot actions that follow the payload execution.

     Currently this registers the post-payload time stamp of the job in the pilot timing information.

     :param job: job object
     """
     # register the current time as the post-payload time stamp of this job
     job_id = job.jobid
     add_to_pilot_timing(job_id, PILOT_POST_PAYLOAD, time.time(), self.__args)
Exemplo n.º 3
0
 def post_setup(self, job):
     """
     Perform the pilot actions that follow the setup step.

     Currently this registers the post-setup time stamp of the job in the pilot timing information.

     :param job: job object
     """
     # register the current time as the post-setup time stamp of this job
     job_id = job.jobid
     add_to_pilot_timing(job_id, PILOT_POST_SETUP, time.time(), self.__args)
Exemplo n.º 4
0
    def post_payload(self, job):
        """
        Run the actions that must follow the payload execution.

        At present the only action is to store the post-payload time stamp of the job in the pilot timing
        information.

        :param job: job object
        """
        # register the current time as the post-payload time stamp of this job
        job_id = job.jobid
        add_to_pilot_timing(job_id, PILOT_POST_PAYLOAD, time.time(), self.__args)
Exemplo n.º 5
0
def interrupt(args, signum, frame):
    """
    Interrupt function on the receiving end of kill signals.
    This function is forwarded any incoming signals (SIGINT, SIGTERM, etc) and will set abort_job which instructs
    the threads to abort the job.

    The signal number is translated to its symbolic name (e.g. 'SIGINT') which is stored in args.signal. If the
    number does not correspond to any known signal name, the number itself is stored as a string.

    The time of the first kill signal is remembered in args.kill_time. If a later signal arrives more than
    MAX_KILL_WAIT_TIME + 60 seconds after the first one, the pilot source directory is removed, logging is shut
    down and kill_processes() is called for the current process.

    :param args: pilot arguments.
    :param signum: signal.
    :param frame: stack/execution frame pointing to the frame that was interrupted by the signal.
    :return:
    """

    # only real signal names (SIGINT, SIGTERM, ..) are candidates: other constants in the signal module
    # (e.g. SIG_IGN, SIG_SETMASK, ITIMER_PROF) are plain integers as well and can compare equal to a signal number.
    # vars(..).items() works on both Python 2 and 3; sorting makes the choice between aliases deterministic
    names = sorted(name for name, value in vars(signal).items()
                   if name.startswith('SIG') and not name.startswith('SIG_') and value == signum)
    sig = names[0] if names else str(signum)

    args.signal_counter += 1

    # keep track of when first kill signal arrived, any stuck loops should abort at a defined cut off time
    if args.kill_time == 0:
        args.kill_time = int(time())

    max_kill_wait_time = MAX_KILL_WAIT_TIME + 60  # add another minute of grace to let threads finish
    current_time = int(time())
    if args.kill_time and current_time - args.kill_time > max_kill_wait_time:
        logger.warning('passed maximum waiting time after first kill signal - will commit suicide - farewell')
        # best effort removal of the source directory; a failure must not prevent the shutdown below
        try:
            rmtree(args.sourcedir)
        except Exception as error:
            logger.warning(error)
        logging.shutdown()
        kill_processes(getpid())

    # time stamps are stored for both timing entries '0' and '1'
    add_to_pilot_timing('0', PILOT_KILL_SIGNAL, time(), args)
    add_to_pilot_timing('1', PILOT_KILL_SIGNAL, time(), args)
    logger.warning('caught signal: %s in FRAME=\n%s', sig, '\n'.join(traceback.format_stack(frame)))

    args.signal = sig
    logger.warning('will instruct threads to abort and update the server')
    args.abort_job.set()
    logger.warning('waiting for threads to finish')
    args.job_aborted.wait()
    logger.warning('setting graceful stop (in case it was not set already), pilot will abort')
    args.graceful_stop.set()
Exemplo n.º 6
0
def _stage_in(args, job):
    """
    Stage in the input files of the given job.

    The transfer is either delegated to a middleware script (containerised stage-in) or performed directly with
    a stage-in client, depending on the middleware container type configured for the queue. Exceptions raised
    during the transfer are logged (and, for pilot exceptions in the non-container mode, recorded as pilot error
    codes on the job object); the outcome is decided from the status of the input files afterwards.

    :param args: pilot args object (the queue and input_dir attributes are used).
    :param job: job object.
    :return: True if no input file is left with a status other than 'remote_io', 'transferred' or 'no_transfer',
             False otherwise.
    """
    import traceback

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)

    # any DBRelease files should not be staged in
    skip_special_files(job)

    # now that the trace report has been created, remove any files that are not to be transferred (DBRelease files) from the indata list
    update_indata(job)

    stage_label = 'stage-in'

    # should stage-in be done by a script (for containerisation) or by invoking the API (ie classic mode)?
    if pilot.util.middleware.use_middleware_script(job.infosys.queuedata.container_type.get("middleware")):
        logger.info('stage-in will be done by a script')
        try:
            eventtype, localsite, remotesite = get_trace_report_variables(job, label=stage_label)
            pilot.util.middleware.containerise_middleware(
                job, job.indata, args.queue, eventtype, localsite, remotesite,
                job.infosys.queuedata.container_options, args.input_dir,
                label=stage_label,
                container_type=job.infosys.queuedata.container_type.get("middleware"))
        except PilotException as error:
            logger.warning('stage-in containerisation threw a pilot exception: %s', error)
        except Exception as error:
            logger.warning('stage-in containerisation threw an exception: %s', error)
            logger.error(traceback.format_exc())
    else:
        try:
            logger.info('stage-in will not be done in a container')

            # create the trace report
            trace_report = create_trace_report(job, label=stage_label)

            # event service merge jobs use a dedicated client and activity
            if job.is_eventservicemerge:
                client_class, activity = StageInESClient, 'es_events_read'
            else:
                client_class, activity = StageInClient, 'pr'
            client = client_class(job.infosys, logger=logger, trace_report=trace_report)

            transfer_options = {
                'workdir': job.workdir,
                'cwd': job.workdir,
                'usecontainer': False,
                'use_pcache': job.infosys.queuedata.use_pcache,
                'use_bulk': False,
                'input_dir': args.input_dir,
                'use_vp': job.use_vp,
                'catchall': job.infosys.queuedata.catchall,
            }
            client.prepare_sources(job.indata)
            client.transfer(job.indata, activity=activity, **transfer_options)
        except PilotException as error:
            # keep the full traceback as diagnostics for the error code stored on the job object
            diagnostics = traceback.format_exc()
            logger.error(diagnostics)
            msg = errors.format_diagnostics(error.get_error_code(), diagnostics)
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg)
        except Exception as error:
            logger.error('failed to stage-in: error=%s', error)

    logger.info('summary of transferred files:')
    for fspec in job.indata:
        logger.info(" -- lfn=%s, status_code=%s, status=%s",
                    fspec.lfn, fspec.status_code, fspec.status or "(not transferred)")

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

    # any file without one of the accepted final states means that the stage-in has failed
    accepted_states = ('remote_io', 'transferred', 'no_transfer')
    pending = [fspec for fspec in job.indata if fspec.status not in accepted_states]
    if pending:
        logger.info("stage-in failed")
    else:
        logger.info("stage-in finished")

    return not pending
Exemplo n.º 7
0
def _stage_out_new(job, args):
    """
    Transfer the output files and/or the log file of the given job.

    Only the log is transferred if job.stageout is 'log', or if the job has no output files or is an event
    service job (job.stageout is then forced to 'log'). The log tarball is created before its transfer, and the
    LOG_TRANSFER status of the job is updated along the way. On completion job.fileinfo is filled with the
    details of all transferred files and the pilot state of the job is updated.

    :param job: job object.
    :param args: pilot args object.
    :return: True in case of success, False otherwise.
    """

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    transfers_ok = True

    if not job.outdata or job.is_eventservice:
        logger.info('this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    # stage-out of output files
    if job.stageout != 'log':
        output_ok = _do_stageout(job, job.outdata, ['pw', 'w'], args.queue,
                                 title='output', output_dir=args.output_dir)
        if not output_ok:
            transfers_ok = False
            logger.warning('transfer of output file(s) failed')

    # stage-out of log files
    if job.stageout in ('log', 'all') and job.logdata:
        # a log transfer must only be attempted once
        if job.get_status('LOG_TRANSFER') != LOG_TRANSFER_NOT_DONE:
            logger.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS

        # prepare log file, consider only 1st available log file
        logfile = job.logdata[0]

        try:
            tarball_name = 'tarball_PandaJob_%s_%s' % (job.jobid, job.infosys.pandaqueue)
            create_log(job.workdir, logfile.lfn, tarball_name, args.cleanup,
                       input_files=[fspec.lfn for fspec in job.indata],
                       output_files=[fspec.lfn for fspec in job.outdata],
                       is_looping=errors.LOOPINGJOB in job.piloterrorcodes,
                       debugmode=job.debug)
        except LogFileCreationFailure as error:
            logger.warning('failed to create tar file: %s', error)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE)
            return False

        log_ok = _do_stageout(job, [logfile], ['pl', 'pw', 'w'], args.queue,
                              title='log', output_dir=args.output_dir)
        if log_ok:
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE
        else:
            transfers_ok = False
            logger.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED
    elif not job.logdata:
        logger.info('no log was defined - will not create log file')
        job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be send to Panda (transferred files only)
    job.fileinfo = {
        fspec.lfn: {
            'guid': fspec.guid,
            'fsize': fspec.filesize,
            'adler32': fspec.checksum.get('adler32'),
            'surl': fspec.turl,
        }
        for fspec in job.outdata + job.logdata
        if fspec.status == 'transferred'
    }

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not transfers_ok:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        logger.warning('stage-out failed')
        return False

    logger.info('stage-out finished correctly')

    # is the job state already set? if so, don't change the state (unless it's the stageout state)
    if not job.state or job.state == 'stageout':
        logger.debug('changing job state from %s to finished', job.state)
        set_pilot_state(job=job, state="finished")

    return transfers_ok
Exemplo n.º 8
0
Arquivo: data.py Projeto: ptrlv/pilot2
def _stage_out_new(job, args):
    """
    Transfer the output files and/or the log file of the given job.

    Only the log is transferred if job.stageout is 'log', or if the job has no output files or is an event
    service job (job.stageout is then forced to 'log'). The log tarball is created before its transfer, and the
    LOG_TRANSFER status of the job is updated along the way. On completion job.fileinfo is filled with the
    details of all transferred files and the pilot state of the job is updated.

    :param job: job object.
    :param args: pilot args object.
    :return: True in case of success, False otherwise.
    """

    job_log = get_logger(job.jobid)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)

    transfers_ok = True

    if not job.outdata or job.is_eventservice:
        job_log.info('this job does not have any output files, only stage-out log file')
        job.stageout = 'log'

    # stage-out of output files
    if job.stageout != 'log':
        output_ok = _do_stageout(job, job.outdata, ['pw', 'w'], title='output')
        if not output_ok:
            transfers_ok = False
            job_log.warning('transfer of output file(s) failed')

    # stage-out of log files
    if job.stageout in ('log', 'all') and job.logdata:
        # a log transfer must only be attempted once
        if job.get_status('LOG_TRANSFER') != LOG_TRANSFER_NOT_DONE:
            job_log.warning('log transfer already attempted')
            return False

        job.status['LOG_TRANSFER'] = LOG_TRANSFER_IN_PROGRESS

        # prepare log file, consider only 1st available log file
        logfile = job.logdata[0]

        try:
            create_log(job, logfile, 'tarball_PandaJob_%s_%s' % (job.jobid, job.infosys.pandaqueue))
        except LogFileCreationFailure as error:
            job_log.warning('failed to create tar file: %s' % error)
            set_pilot_state(job=job, state="failed")
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.LOGFILECREATIONFAILURE)
            return False

        if _do_stageout(job, [logfile], ['pl', 'pw', 'w'], title='log'):
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_DONE
        else:
            transfers_ok = False
            job_log.warning('log transfer failed')
            job.status['LOG_TRANSFER'] = LOG_TRANSFER_FAILED

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

    # generate fileinfo details to be send to Panda (transferred files only)
    job.fileinfo = {
        fspec.lfn: {
            'guid': fspec.guid,
            'fsize': fspec.filesize,
            'adler32': fspec.checksum.get('adler32'),
            'surl': fspec.turl,
        }
        for fspec in job.outdata + job.logdata
        if fspec.status == 'transferred'
    }
    job_log.info('prepared job.fileinfo=%s' % job.fileinfo)

    # WARNING THE FOLLOWING RESETS ANY PREVIOUS STAGEOUT ERRORS
    if not transfers_ok:
        # set error code + message (a more precise error code might have been set already)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEOUTFAILED)
        set_pilot_state(job=job, state="failed")
        job_log.warning('stage-out failed')
        return False

    job_log.info('stage-out finished correctly')

    # is the job state already set? if so, don't change the state
    if not job.state:
        set_pilot_state(job=job, state="finished")

    return transfers_ok
Exemplo n.º 9
0
Arquivo: data.py Projeto: ptrlv/pilot2
def _stage_in(args, job):
    """
    Stage in the input files of the given job.

    DBRelease files are flagged as 'no_transfer' and removed from job.indata once the trace report has been
    created. The remaining files are transferred with a stage-in client. Exceptions raised during the transfer
    are logged (pilot exceptions are also recorded as pilot error codes on the job object); the outcome is
    decided from the status of the input files afterwards.

    All messages are sent to the job specific logger.

    :param args: pilot args object (only used for the pilot timing information).
    :param job: job object.
    :return: True if no input file is left with a status other than 'remote_io', 'transferred' or 'no_transfer',
             False otherwise.
    """

    log = get_logger(job.jobid)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)

    # any DBRelease files should not be staged in
    for fspec in job.indata:
        if 'DBRelease' in fspec.lfn:
            fspec.status = 'no_transfer'

    # analysis jobs get their own event type
    event_type = "get_sm"
    if job.is_analysis():
        event_type += "_a"
    rse = get_rse(job.indata)
    localsite = remotesite = rse
    trace_report = TraceReport(pq='', localSite=localsite, remoteSite=remotesite, dataset="", eventType=event_type)
    trace_report.init(job)

    # now that the trace report has been created, remove any files that are not to be transferred (DBRelease files)
    # from the indata list; the files to drop are collected first since the list must not shrink while iterated
    for fspec in [fspec for fspec in job.indata if fspec.status == 'no_transfer']:
        # use the job logger like everywhere else in this function (was the module level logger)
        log.info('removing fspec object (lfn=%s) from list of input files', fspec.lfn)
        job.indata.remove(fspec)

    try:
        # event service merge jobs use a dedicated client and activity
        if job.is_eventservicemerge:
            client = StageInESClient(job.infosys, logger=log, trace_report=trace_report)
            activity = 'es_events_read'
        else:
            client = StageInClient(job.infosys, logger=log, trace_report=trace_report)
            activity = 'pr'
        kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job)

        client.transfer(job.indata, activity=activity, **kwargs)
    except PilotException as error:
        log.error('PilotException caught: %s', error)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code())
    except Exception as error:
        # deliberately best effort: the file statuses checked below decide whether the stage-in has failed
        log.error('failed to stage-in: error=%s', error)

    log.info('summary of transferred files:')
    for fspec in job.indata:
        status = fspec.status if fspec.status else "(not transferred)"
        log.info(" -- lfn=%s, status_code=%s, status=%s", fspec.lfn, fspec.status_code, status)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

    # any file without one of the accepted final states means that the stage-in has failed
    remain_files = [fspec for fspec in job.indata
                    if fspec.status not in ('remote_io', 'transferred', 'no_transfer')]
    if remain_files:
        log.info("stage-in failed")
    else:
        log.info("stage-in finished")

    return not remain_files
Exemplo n.º 10
0
    """

    # get the args from the arg parser
    args = get_args()

    # Define and set the main harvester control boolean
    args.harvester = is_harvester_mode(args)

    # initialize the pilot timing dictionary
    args.timing = {}  # TODO: move to singleton?

    # initialize job status dictionary (e.g. used to keep track of log transfers)
    args.job_status = {}  # TODO: move to singleton or to job object directly?

    # store T0 time stamp
    add_to_pilot_timing('0', PILOT_START_TIME, time.time(), args)
    add_to_pilot_timing('1', PILOT_MULTIJOB_START_TIME, time.time(), args)

    # if requested by the wrapper via a pilot option, create the main pilot workdir and cd into it
    args.sourcedir = getcwd()  #get_pilot_source_dir()

    exit_code, mainworkdir = create_main_work_dir(args)
    if exit_code != 0:
        sys.exit(exit_code)

    # set environment variables (to be replaced with singleton implementation)
    set_environment_variables(args, mainworkdir)

    # setup and establish standard logging
    establish_logging(debug=args.debug, nopilotlog=args.nopilotlog)
Exemplo n.º 11
0
def _make_dir_if_missing(path):
    """Create the directory path (including missing parents) unless it already exists."""
    if not os.path.exists(path):
        os.makedirs(path)


def set_scratch_workdir(job, work_dir, args):
    """
    Copy input files and some db files to RAM disk.

    The scratch area is taken from config.HPC.scratch. A 'tmp' directory, the sqlite and geomDB files and a job
    specific directory holding the input files are placed there, after which the process changes directory to the
    job scratch directory and creates a './poolcond' symbolic link in it. The "scratch_path" entry of every input
    file in job.input_files is set to the job scratch directory, also when the scratch area does not exist.

    :param job: job object.
    :param work_dir: job working directory (permanent FS) (string).
    :param args: args dictionary to collect timing metrics.
    :return: job working directory in scratch (string), or work_dir if the scratch area does not exist.
    :raises FileHandlingFailure: if an I/O error occurs while copying files to the scratch area.
    """

    scratch_path = config.HPC.scratch
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space: {0} used: {1}".format(du.free, du.used))
    job_scratch_dir = os.path.join(scratch_path, str(job.jobid))
    for inp_file in job.input_files:
        job.input_files[inp_file]["scratch_path"] = job_scratch_dir
    logger.debug("Job scratch path: {0}".format(job_scratch_dir))

    if not os.path.exists(scratch_path):
        logger.info('Scratch directory (%s) dos not exist' % scratch_path)
        return work_dir

    # special data, that should be preplaced in RAM disk
    src_file = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/sqlite200/ALLP200.db'
    src_file_2 = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/geomDB/geomDB_sqlite'

    # build all destinations with os.path.join so that they are valid whether or not the configured scratch path
    # ends with a path separator
    tmp_dir = os.path.join(scratch_path, 'tmp/')
    sqlite_dir = os.path.join(scratch_path, 'sqlite200/')
    geomdb_dir = os.path.join(scratch_path, 'geomDB/')

    try:
        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)
        logger.debug("Prepare 'tmp' dir in scratch ")
        _make_dir_if_missing(tmp_dir)

        logger.debug("Prepare dst and copy sqlite db files")
        t0 = time.time()
        _make_dir_if_missing(sqlite_dir)
        shutil.copyfile(src_file, os.path.join(sqlite_dir, 'ALLP200.db'))
        logger.debug("Copy of sqlite files took: {0}".format(time.time() - t0))

        logger.debug("Prepare dst and copy geomDB files")
        t0 = time.time()
        _make_dir_if_missing(geomdb_dir)
        shutil.copyfile(src_file_2, os.path.join(geomdb_dir, 'geomDB_sqlite'))
        logger.debug("Copy of geomDB files took: {0} s".format(time.time() - t0))

        logger.debug("Prepare job scratch dir")
        t0 = time.time()
        _make_dir_if_missing(job_scratch_dir)
        logger.debug("Copy input file")
        for inp_file in job.input_files:
            source = os.path.join(work_dir, inp_file)
            destination_dir = job.input_files[inp_file]["scratch_path"]
            logger.debug("Copy: {0} to {1}".format(source, destination_dir))
            shutil.copyfile(source, os.path.join(destination_dir, inp_file))
        logger.debug("Copy of input files took: {0} s".format(time.time() - t0))
    except IOError as e:
        logger.error("I/O error({0}): {1}".format(e.errno, e.strerror))
        logger.error("Copy to scratch failed, execution terminated': \n %s ", e)
        raise FileHandlingFailure("Copy to RAM disk failed")
    finally:
        # the post stage-in time stamp is stored whether or not the copy succeeded
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

    os.chdir(job_scratch_dir)
    logger.debug("Current directory: {0}".format(os.getcwd()))
    true_dir = '/ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files'
    pseudo_dir = "./poolcond"
    os.symlink(true_dir, pseudo_dir)
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space for job: {0} used: {1}".format(du.free, du.used))

    return job_scratch_dir
Exemplo n.º 12
0
def run(args):
    """
    Main execution function for the generic HPC workflow.

    The function gets a job from the HPC resource plugin, prepares the job working and scratch directories,
    executes the payload, collects the job report, cleans up and tars the working directory, copies the output
    to the shared file system and declares the stage-out. The work report is published at several points along
    the way. Any exception is caught: the work report is then published with the job status 'failed' and the
    traces state is set to FAILURE.

    :param args: pilot arguments.
    :returns: traces object.
    """

    # set communication point. Worker report should be placed there, matched with working directory of Harvester
    communication_point = args.harvester_workdir or os.getcwd()
    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr

    # the traces object must exist before the try block is entered since the exception handler below updates it
    logger.info('setting up tracing')
    traces = namedtuple('traces', ['pilot'])
    traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}

    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))

        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces

        # get the resource reference (level 0, i.e. an absolute import, is accepted by both Python 2 and 3)
        resource = __import__('pilot.resource.%s' % args.hpc_resource,
                              globals(), locals(), [args.hpc_resource], 0)

        # get the user reference
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(),
                          globals(), locals(), [args.pilot_user.lower()], 0)

        # get job (and rank)
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)

        # cd to job working directory
        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be publeshied in: {0}".format(worker_attributes_file))

        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)

        # Get HPC specific setup commands
        logger.info('setup for resource %s: %s', args.hpc_resource, str(resource.get_setup()))
        setup_str = "; ".join(resource.get_setup())

        # Prepare job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)

        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)

        # Basic execution. Should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))

        # the payload stdout/stderr files are closed by the with statement, also if the execution raises
        with open(payload_stdout_file, "w") as payloadstdout, open(payload_stderr_file, "w") as payloadstderr:
            add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
            set_pilot_state(job=job, state="running")
            work_report["jobStatus"] = job.state
            work_report["startTime"] = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            start_time = time.asctime(time.localtime(time.time()))
            job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
            publish_work_report(work_report, worker_attributes_file)

            stime = time.time()
            t0 = os.times()
            exit_code, _, _ = execute(my_command, stdout=payloadstdout, stderr=payloadstderr, shell=True)
            logger.debug("Payload exit code: {0}".format(exit_code))
            t1 = os.times()
            exetime = time.time() - stime
            end_time = time.asctime(time.localtime(time.time()))
            # index 2 of os.times() is the user time of the child processes (the children's system time is,
            # as before, not included in the reported CPU consumption time)
            t_tot = t1[2] - t0[2]
            job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)

        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code

        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode

        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(exit_code, job.jobid)
        log_jobreport += "CPU comsumption time: {0}  JobID: {1} \n".format(t_tot, job.jobid)
        log_jobreport += "Start time: {0}  JobID: {1} \n".format(start_time, job.jobid)
        log_jobreport += "End time: {0}  JobID: {1} \n".format(end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec.  JobID: {1} \n".format(exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(job.startTime, job.endTime)
        logger.debug(log_jobreport)

        # Parse job report file and update of work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir, work_dir)

        resource.postprocess_workdir(job_scratch_dir)

        # output files should not be packed with logs
        # (a real list is needed: the key view returned by Python 3 supports neither remove() nor extend())
        protectedfiles = list(job.output_files.keys())

        # log file not produced (yet), so should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("Log files was not declared")

        logger.info("Cleanup of working directory")

        protectedfiles.extend([worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")

        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)
        # Copy of output to shared FS for stageout
        if job_scratch_dir != work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(), args)
        declare_output(job, work_report, worker_stageout_declaration)

        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(), args)

    except Exception as e:
        # top-level boundary of the workflow: report the failure to Harvester and in the traces
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logger.exception('exception caught:')
        traces.pilot['state'] = FAILURE

    return traces
Exemplo n.º 13
0
def _stage_in(args, job):
    """
    Stage in the input files of the given job.

    Writes the pre/post stage-in time stamps to the pilot timing file, creates
    the trace report, removes the files that are not to be transferred
    (DBRelease files) from job.indata and hands the remaining files over to the
    stage-in client.

    Exceptions raised by the stage-in client are caught and logged here; they
    are not propagated to the caller. For a PilotException, the error code and
    diagnostics are also stored in job.piloterrorcodes / job.piloterrordiags.

    :param args: pilot arguments (forwarded to add_to_pilot_timing()).
    :param job: job object; job.indata is modified in place.
    :return: True in case of success, i.e. when no remaining input file has a
             status other than 'remote_io', 'transferred' or 'no_transfer'.
    """

    # local import: needed by both error handlers below
    import traceback

    log = get_logger(job.jobid)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(), args)

    # any DBRelease files should not be staged in
    for fspec in job.indata:
        if 'DBRelease' in fspec.lfn:
            fspec.status = 'no_transfer'

    event_type = "get_sm_a" if job.is_analysis() else "get_sm"

    # the same RSE is reported as both the local and the remote site
    rse = get_rse(job.indata)
    trace_report = TraceReport(pq=os.environ.get('PILOT_SITENAME', ''),
                               localSite=rse,
                               remoteSite=rse,
                               dataset="",
                               eventType=event_type)
    trace_report.init(job)

    # now that the trace report has been created, remove any files that are
    # not to be transferred (DBRelease files) from the indata list; the list
    # of files to drop is built first so that job.indata is not modified
    # while it is being iterated over
    toberemoved = [
        fspec for fspec in job.indata if fspec.status == 'no_transfer'
    ]
    for fspec in toberemoved:
        log.info('removing fspec object (lfn=%s) from list of input files',
                 fspec.lfn)
        job.indata.remove(fspec)

    # NOTE: a bulk transfer variant (running pilot/scripts/stagein.py as an
    # external command) was tested at this point and worked, but it was not
    # confirmed that the traces were stored correctly; bulk mode is therefore
    # not used (use_bulk=False below)

    try:
        if job.is_eventservicemerge:
            client = StageInESClient(job.infosys,
                                     logger=log,
                                     trace_report=trace_report)
            activity = 'es_events_read'
        else:
            client = StageInClient(job.infosys,
                                   logger=log,
                                   trace_report=trace_report)
            activity = 'pr'
        kwargs = dict(workdir=job.workdir,
                      cwd=job.workdir,
                      usecontainer=False,
                      job=job,
                      use_bulk=False)
        client.prepare_sources(job.indata)
        client.transfer(job.indata, activity=activity, **kwargs)
    except PilotException as error:
        error_msg = traceback.format_exc()
        log.error(error_msg)
        msg = errors.format_diagnostics(error.get_error_code(), error_msg)
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            error.get_error_code(), msg=msg)
    except Exception as error:
        # unexpected failure: no pilot error code is available, but keep the
        # traceback in the log so that the problem can be diagnosed
        log.error('failed to stage-in: error=%s', error)
        log.error(traceback.format_exc())

    log.info('summary of transferred files:')
    for fspec in job.indata:
        status = fspec.status if fspec.status else "(not transferred)"
        log.info(" -- lfn=%s, status_code=%s, status=%s",
                 fspec.lfn, fspec.status_code, status)

    # write time stamps to pilot timing file
    add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(), args)

    # any file that did not reach one of the accepted final states means
    # that the stage-in as a whole has failed
    remain_files = [
        fspec for fspec in job.indata
        if fspec.status not in ('remote_io', 'transferred', 'no_transfer')
    ]
    if remain_files:
        log.warning("stage-in failed")
    else:
        log.info("stage-in finished")

    return not remain_files