def get_postgetjob_time(job_id, args): """ Return the post getjob time. :param job_id: job object. :param args: pilot arguments. :return: post getjob time measurement (int). In case of failure, return None. """ time_measurement = None timing_constant = PILOT_POST_GETJOB if job_id in args.timing: # extract time measurements time_measurement_dictionary = args.timing.get(job_id, None) if time_measurement_dictionary: time_measurement = time_measurement_dictionary.get( timing_constant, None) if not time_measurement: log = get_logger(job_id) log.warning( 'failed to extract time measurement %s from %s (no such key)' % (timing_constant, time_measurement_dictionary)) return time_measurement
def get_time_measurement(timing_constant, time_measurement_dictionary, timing_dictionary, job_id): """ Return a requested time measurement from the time measurement dictionary, read from the pilot timing file. :param timing_constant: timing constant (e.g. PILOT_MULTIJOB_START_TIME) :param time_measurement_dictionary: time measurement dictionary, extracted from pilot timing dictionary. :param timing_dictionary: full timing dictionary from pilot timing file. :param job_id: PanDA job id (string). :return: time measurement (float). """ time_measurement = time_measurement_dictionary.get(timing_constant, None) if not time_measurement: # try to get the measurement for the PILOT_MULTIJOB_START_TIME dictionary i = '0' if timing_constant == PILOT_START_TIME else '1' time_measurement_dictionary_0 = timing_dictionary.get(i, None) if time_measurement_dictionary_0: time_measurement = time_measurement_dictionary_0.get( timing_constant, None) else: log = get_logger(job_id) log.warning( 'failed to extract time measurement %s from %s (no such key)' % (timing_constant, time_measurement_dictionary)) return time_measurement
def get_events(self, req): """ Get events """ self.get_events_lock.acquire() resp = None try: log = get_logger(str(req.jobid), logger) if not req.num_ranges: # ToBeFix num_ranges with corecount req.num_ranges = 1 data = { 'pandaID': req.jobid, 'jobsetID': req.jobsetid, 'taskID': req.taskid, 'nRanges': req.num_ranges } log.info("Downloading new event ranges: %s" % data) res = https.request( '{pandaserver}/server/panda/getEventRanges'.format( pandaserver=config.Pilot.pandaserver), data=data) log.info("Downloaded event ranges: %s" % res) if res['StatusCode'] == 0 or str(res['StatusCode']) == '0': resp_attrs = { 'status': 0, 'content': res['eventRanges'], 'exception': None } else: resp_attrs = { 'status': res['StatusCode'], 'content': None, 'exception': exception.CommunicationFailure( "Get events from panda returns non-zero value: %s" % res['StatusCode']) } resp = CommunicationResponse(resp_attrs) except Exception, e: log.error("Failed to download event ranges: %s, %s" % (e, traceback.format_exc())) resp_attrs = { 'status': -1, 'content': None, 'exception': exception.UnknownException("Failed to get events: %s" % (traceback.format_exc())) } resp = CommunicationResponse(resp_attrs)
def timing_report(job_id, args): """ Write a timing report to the job log and return relevant timing measurements. :param job_id: job id (string). :param args: pilot arguments. :return: time_getjob, time_stagein, time_payload, time_stageout, time_total_setup (integer strings). """ log = get_logger(job_id) # collect pilot timing data time_getjob = get_getjob_time(job_id, args) time_initial_setup = get_initial_setup_time(job_id, args) time_setup = get_setup_time(job_id, args) time_total_setup = time_initial_setup + time_setup time_stagein = get_stagein_time(job_id, args) time_payload = get_payload_execution_time(job_id, args) time_stageout = get_stageout_time(job_id, args) log.info('.' * 30) log.info('. Timing measurements:') log.info('. get job = %d s' % time_getjob) log.info('. initial setup = %d s' % time_initial_setup) log.info('. payload setup = %d s' % time_setup) log.info('. total setup = %d s' % time_total_setup) log.info('. stage-in = %d s' % time_stagein) log.info('. payload execution = %d s' % time_payload) log.info('. stage-out = %d s' % time_stageout) log.info('.' * 30) return time_getjob, time_stagein, time_payload, time_stageout, time_total_setup
def clean(self): """ Clean temp produced files """ job = self.get_job() log = get_logger(job.jobid, logger) for msg in self.__all_out_messages: if msg['status'] in ['failed', 'fatal']: pass elif 'output' in msg: try: log.info("Removing es premerge file: %s" % msg['output']) os.remove(msg['output']) except Exception as e: log.error("Failed to remove file(%s): %s" % (msg['output'], str(e))) self.__queued_out_messages = [] self.__last_stageout_time = None self.__all_out_messages = [] if self.proc: self.proc.stop() while self.proc.is_alive(): time.sleep(0.1) self.stop_communicator()
def get_log_extracts(job, state): """ Extract special warnings and other other info from special logs. This function also discovers if the payload had any outbound connections. :param job: job object. :param state: job state (string). :return: log extracts (string). """ log = get_logger(job.jobid) log.info("building log extracts (sent to the server as \'pilotLog\')") # did the job have any outbound connections? # look for the pandatracerlog.txt file, produced if the user payload attempted any outgoing connections extracts = get_panda_tracer_log(job) # for failed/holding jobs, add extracts from the pilot log file, but always add it to the pilot log itself _extracts = get_pilot_log_extracts(job) if _extracts != "": log.warning( 'detected the following tail of warning/fatal messages in the pilot log:\n%s' % _extracts) if state == 'failed' or state == 'holding': extracts += _extracts # add extracts from payload logs # (see buildLogExtracts in Pilot 1) return extracts
def get_panda_tracer_log(job): """ Return the contents of the PanDA tracer log if it exists. This file will contain information about outbound connections. :param job: job object. :return: log extracts from pandatracerlog.txt (string). """ extracts = "" log = get_logger(job.jobid) tracerlog = os.path.join(job.workdir, "pandatracerlog.txt") if os.path.exists(tracerlog): # only add if file is not empty if os.path.getsize(tracerlog) > 0: message = "PandaID=%s had outbound connections: " % (job.jobid) extracts += message message = read_file(tracerlog) extracts += message log.warning(message) else: log.info( "PanDA tracer log (%s) has zero size (no outbound connections detected)" % tracerlog) else: log.debug("PanDA tracer log does not exist: %s (ignoring)" % tracerlog) return extracts
def verify_looping_job(current_time, mt, job): """ Verify that the job is not looping. :param current_time: current time at the start of the monitoring loop (int). :param mt: measured time object. :param job: job object. :return: exit code (int), error diagnostics (string). """ log = get_logger(job.jobid) looping_verification_time = convert_to_int(config.Pilot.looping_verification_time, default=600) if current_time - mt.get('ct_looping') > looping_verification_time: # is the job looping? try: exit_code, diagnostics = looping_job(job, mt) except Exception as e: diagnostics = 'exception caught in looping job algorithm: %s' % e log.warning(diagnostics) if "No module named" in diagnostics: exit_code = errors.BLACKHOLE else: exit_code = errors.UNKNOWNEXCEPTION return exit_code, diagnostics else: if exit_code != 0: return exit_code, diagnostics # update the ct_proxy with the current time mt.update('ct_looping') return 0, ""
def check_output_file_sizes(job): """ Are the output files within the allowed size limits? :param job: job object. :return: exit code (int), error diagnostics (string) """ exit_code = 0 diagnostics = "" log = get_logger(job.jobid) # loop over all known output files for fspec in job.outdata: path = os.path.join(job.workdir, fspec.lfn) if os.path.exists(path): # get the current file size fsize = get_local_file_size(path) max_fsize = human2bytes(config.Pilot.maximum_output_file_size) if fsize and fsize < max_fsize: log.info('output file %s is within allowed size limit (%d B < %d B)' % (path, fsize, max_fsize)) else: exit_code = errors.OUTPUTFILETOOLARGE diagnostics = 'output file %s is not within allowed size limit (%d B > %d B)' % (path, fsize, max_fsize) log.warning(diagnostics) else: log.info('output file size check: skipping output file %s since it does not exist' % path) return exit_code, diagnostics
def get_job_metrics(job): """ Return a properly formatted job metrics string. Job metrics are highly user specific, so this function merely calls a corresponding get_job_metrics() in the user code. The format of the job metrics string is defined by the server. It will be reported to the server during updateJob. Example of job metrics: Number of events read | Number of events written | vmPeak maximum | vmPeak average | RSS average | .. Format: nEvents=<int> nEventsW=<int> vmPeakMax=<int> vmPeakMean=<int> RSSMean=<int> hs06=<float> shutdownTime=<int> cpuFactor=<float> cpuLimit=<float> diskLimit=<float> jobStart=<int> memLimit=<int> runLimit=<float> :param job: job object. :return: job metrics (string). """ log = get_logger(job.jobid) user = environ.get('PILOT_USER', 'generic').lower() # TODO: replace with singleton try: job_metrics_module = __import__('pilot.user.%s.jobmetrics' % user, globals(), locals(), [user], 0) # Python 2/3 except AttributeError as e: job_metrics = None log.warning('function not implemented in jobmetrics module: %s' % e) else: job_metrics = job_metrics_module.get_job_metrics(job) return job_metrics
def utility_after_payload_started(self, job): """ Functions to run after payload started :param job: job object """ log = get_logger(job.jobid, logger) # get the payload command from the user specific code pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], -1) # should any additional commands be executed after the payload? cmds = user.get_utility_commands_list(order=UTILITY_AFTER_PAYLOAD_STARTED) if cmds != []: for utcmd in cmds: log.info('utility command to be executed after the payload: %s' % utcmd) # how should this command be executed? utilitycommand = user.get_utility_command_setup(utcmd, job) if not utilitycommand: log.warning('empty utility command - nothing to run') return try: proc1 = execute(utilitycommand, workdir=job.workdir, returnproc=True, usecontainer=False, stdout=PIPE, stderr=PIPE, cwd=job.workdir, job=job) except Exception as e: log.error('could not execute: %s' % e) else: # store process handle in job object, and keep track on how many times the command has been launched # also store the full command in case it needs to be restarted later (by the job_monitor() thread) job.utilities[utcmd] = [proc1, 1, utilitycommand]
def find_number_of_events(job): """ Locate the number of events. :param job: job object. :return: """ log = get_logger(job.jobid) if job.nevents: log.info('number of events already known: %d' % job.nevents) return log.info('looking for number of processed events (source #1: jobReport.json)') find_number_of_events_in_jobreport(job) if job.nevents > 0: log.info('found %d processed events' % job.nevents) return log.info('looking for number of processed events (source #2: metadata.xml)') find_number_of_events_in_xml(job) if job.nevents > 0: log.info('found %d processed events' % job.nevents) return log.info('looking for number of processed events (source #3: athena summary file(s)') n1, n2 = process_athena_summary(job) if n1 > 0: job.nevents = n1 log.info('found %d processed (read) events' % job.nevents) if n2 > 0: job.neventsw = n2 log.info('found %d processed (written) events' % job.neventsw)
def get_time_since(job_id, timing_constant, args): """ Return the amount of time that has passed since the time measurement of timing_constant. :param job_id: PanDA job id (string). :param timing_constant: :param args: pilot arguments. :return: time in seconds (int). """ diff = 0 log = get_logger(job_id) if job_id in args.timing: # extract time measurements time_measurement_dictionary = args.timing.get(job_id, None) if time_measurement_dictionary: time_measurement = get_time_measurement(timing_constant, time_measurement_dictionary, args.timing, job_id) if time_measurement: diff = time.time() - time_measurement else: log.warning('failed to extract time measurement dictionary from %s' % str(args.timing)) else: log.warning('job id %s not found in timing dictionary' % job_id) return diff
def perform_initial_payload_error_analysis(job, exit_code): """ Perform an initial analysis of the payload. Singularity errors are caught here. :param job: job object. :param exit_code: exit code from payload execution. :return: """ log = get_logger(job.jobid, logger) if exit_code != 0: log.warning('main payload execution returned non-zero exit code: %d' % exit_code) stderr = read_file( os.path.join(job.workdir, config.Payload.payloadstderr)) if stderr != "": msg = errors.extract_stderr_msg(stderr) if msg != "": log.warning("extracted message from stderr:\n%s" % msg) ec = errors.resolve_transform_error(exit_code, stderr) if ec != 0: job.piloterrorcodes, job.piloterrordiags = errors.add_error_code( ec) else: if job.piloterrorcodes: log.warning('error code(s) already set: %s' % str(job.piloterrorcodes)) else: log.warning('initial error analysis did not resolve the issue') else: log.info( 'main payload execution returned zero exit code, but will check it more carefully' )
def _do_stageout(job, xdata, activity, title): """ Use the `StageOutClient` in the Data API to perform stage-out. :param job: job object. :param xdata: list of FileSpec objects. :param activity: copytool activity or preferred list of activities to resolve copytools :param title: type of stage-out (output, log) (string). :return: True in case of success transfers """ log = get_logger(job.jobid) log.info('prepare to stage-out %d %s file(s)' % (len(xdata), title)) event_type = "put_sm" #if log_transfer: # eventType += '_logs' #if special_log_transfer: # eventType += '_logs_os' if job.is_analysis(): event_type += "_a" rse = get_rse(xdata) localsite = remotesite = rse trace_report = TraceReport(pq=os.environ.get('PILOT_SITENAME', ''), localSite=localsite, remoteSite=remotesite, dataset="", eventType=event_type) trace_report.init(job) try: client = StageOutClient(job.infosys, logger=log, trace_report=trace_report) kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job) #, mode='stage-out') # prod analy unification: use destination preferences from PanDA server for unified queues if job.infosys.queuedata.type != 'unified': client.prepare_destinations(xdata, activity) ## FIX ME LATER: split activities: for astorages and for copytools (to unify with ES workflow) client.transfer(xdata, activity, **kwargs) except PilotException as error: import traceback error_msg = traceback.format_exc() log.error(error_msg) msg = errors.format_diagnostics(error.get_error_code(), error_msg) job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code(), msg=msg) except Exception: import traceback log.error(traceback.format_exc()) # do not raise the exception since that will prevent also the log from being staged out # error = PilotException("stageOut failed with error=%s" % e, code=ErrorCodes.STAGEOUTFAILED) else: log.debug('stage-out client completed') log.info('summary of transferred files:') for e in xdata: if not e.status: status = "(not transferred)" else: status = e.status log.info(" -- lfn=%s, status_code=%s, status=%s" % (e.lfn, e.status_code, status)) remain_files = [e for e in xdata if e.status not in ['transferred']] log.debug('remain_files=%s' % str(remain_files)) log.debug('xdata=%s' % str(xdata)) return not remain_files
def stageout_es_real(self, output_file): """ Stage out event service output file. :param output_file: output file name. """ job = self.get_job() log = get_logger(job.jobid, logger) log.info('prepare to stage-out eventservice files') error = None try: file_data = { 'scope': 'transient', 'lfn': os.path.basename(output_file), } file_spec = FileSpec(filetype='output', **file_data) xdata = [file_spec] client = StageOutESClient(job.infosys, logger=log) kwargs = dict(workdir=job.workdir, cwd=job.workdir, usecontainer=False, job=job) client.transfer(xdata, activity=['es_events', 'pw'], **kwargs) except exception.PilotException, error: log.error(error.get_detail())
def validate_post(queues, traces, args): """ Validate finished payloads. If payload finished correctly, add the job to the data_out queue. If it failed, add it to the data_out queue as well but only for log stage-out (in failed_post() below). :param queues: :param traces: :param args: :return: """ while not args.graceful_stop.is_set(): time.sleep(0.5) # finished payloads try: job = queues.finished_payloads.get(block=True, timeout=1) except queue.Empty: time.sleep(0.1) continue log = get_logger(job.jobid, logger) # by default, both output and log should be staged out job.stageout = 'all' log.debug('adding job to data_out queue') #queues.data_out.put(job) set_pilot_state(job=job, state='stageout') put_in_queue(job, queues.data_out) logger.info('[payload] validate_post thread has finished')
def failed_post(queues, traces, args): """ (add description) :param queues: :param traces: :param args: :return: """ while not args.graceful_stop.is_set(): time.sleep(0.5) # finished payloads try: job = queues.failed_payloads.get(block=True, timeout=1) except queue.Empty: time.sleep(0.1) continue log = get_logger(job.jobid, logger) log.debug('adding log for log stageout') job.stageout = 'log' # only stage-out log file #queues.data_out.put(job) set_pilot_state(job=job, state='stageout') put_in_queue(job, queues.data_out) logger.info('[payload] failed_post thread has finished')
def post_memory_monitor_action(job): """ Perform post action items for memory monitor. :param job: job object. :return: """ log = get_logger(job.jobid) nap = 3 path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename()) path2 = os.environ.get('PILOT_HOME') i = 0 maxretry = 20 while i <= maxretry: if os.path.exists(path1): break log.info("taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)" % (nap, i, maxretry)) time.sleep(nap) i += 1 try: copy(path1, path2) except Exception as e: log.warning('failed to copy memory monitor output: %s' % e)
def _validate_payload(job): """ Perform validation tests for the payload. :param job: job object. :return: boolean. """ status = True log = get_logger(job.jobid, logger) log.debug('PandaID = %s' % os.environ.get('PandaID', 'unknown')) log.debug('PanDA_TaskID = %s' % os.environ.get('PanDA_TaskID', 'unknown')) # perform user specific validation pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0) # Python 2/3 try: status = user.validate(job) except Exception as e: log.fatal('failed to execute user validate() function: %s' % e) status = False return status
def get_looping_job_limit(job): """ Get the time limit for looping job detection. :param job: job object. :return: looping job time limit (int). """ log = get_logger(job.jobid) is_analysis = job.is_analysis() looping_limit = convert_to_int(config.Pilot.looping_limit_default_prod, default=12 * 3600) if is_analysis: looping_limit = convert_to_int(config.Pilot.looping_limit_default_user, default=3 * 3600) if job.maxcpucount and job.maxcpucount >= config.Pilot.looping_limit_min_default: _looping_limit = max(config.Pilot.looping_limit_min_default, job.maxcpucount) else: _looping_limit = max(looping_limit, job.maxcpucount) if _looping_limit != looping_limit: log.info( "task request has updated looping job limit from %d s to %d s using maxCpuCount" % (looping_limit, _looping_limit)) looping_limit = _looping_limit else: log.info("using standard looping job limit: %d s" % looping_limit) return looping_limit
def wait_graceful(self, args, proc, job): """ (add description) :param args: :param proc: :param job: :return: """ log = get_logger(job.jobid, logger) t1 = time.time() while proc.is_alive(): if args.graceful_stop.is_set(): log.debug("Graceful stop is set, stopping work executor") proc.stop() break if time.time() > t1 + 300: # 5 minutes log.info("Process is still running") t1 = time.time() time.sleep(2) while proc.is_alive(): time.sleep(2) exit_code = proc.get_exit_code() return exit_code
def get_core_count(job): """ Return the core count from ATHENA_PROC_NUMBER. :param job: job object. :return: core count (int). """ log = get_logger(job.jobid) if "HPC_HPC" in job.infosys.queuedata.catchall: if job.corecount is None: job.corecount = 0 else: if job.corecount: # Always use the ATHENA_PROC_NUMBER first, if set if 'ATHENA_PROC_NUMBER' in os.environ: try: job.corecount = int(os.environ.get('ATHENA_PROC_NUMBER')) except Exception as e: log.warning( "ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.corecount value)" % e) else: try: job.corecount = int(os.environ.get('ATHENA_PROC_NUMBER')) except Exception: log.warning( "environment variable ATHENA_PROC_NUMBER is not set. corecount is not set" ) return job.corecount
def is_out_of_memory(job): """ Did the payload run out of memory? :param job: job object. :return: Boolean. (note: True means the error was found) """ out_of_memory = False log = get_logger(job.jobid) stdout = os.path.join(job.workdir, config.Payload.payloadstdout) stderr = os.path.join(job.workdir, config.Payload.payloadstderr) files = {stderr: ["FATAL out of memory: taking the application down"], stdout: ["St9bad_alloc", "std::bad_alloc"]} for path in files: if os.path.exists(path): log.info('looking for out-of-memory errors in %s' % os.path.basename(path)) if os.path.getsize(path) > 0: matched_lines = grep(files[path], path) if len(matched_lines) > 0: log.warning("identified an out of memory error in %s %s:" % (job.payload, os.path.basename(path))) for line in matched_lines: log.info(line) out_of_memory = True else: log.warning('file does not exist: %s (cannot look for out-of-memory error in it)') return out_of_memory
def abort_jobs_in_queues(queues, sig): """ Find all jobs in the queues and abort them. :param queues: queues object. :param sig: detected kill signal. :return: """ jobs_list = [] # loop over all queues and find all jobs for q in queues._fields: _q = getattr(queues, q) jobs = list(_q.queue) for job in jobs: if job not in jobs_list: jobs_list.append(job) logger.info('found %d job(s) in %d queues' % (len(jobs_list), len(queues._fields))) for job in jobs_list: log = get_logger(job.jobid) log.info('aborting job %s' % job.jobid) declare_failed_by_kill(job, queues.failed_jobs, sig)
def run_payload(self, job, out, err): """ (add description) :param job: :param out: :param err: :return: """ log = get_logger(job.jobid, logger) self.pre_setup(job) # get the payload command from the user specific code pilot_user = os.environ.get('PILOT_USER', 'atlas').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], -1) self.post_setup(job) self.utility_before_payload(job) self.utility_with_payload(job) try: executable = user.get_payload_command(job) except exception.PilotException as e: log.fatal('could not define payload command') return None log.info("payload execution command: %s" % executable) try: payload = { 'executable': executable, 'workdir': job.workdir, 'output_file': out, 'error_file': err, 'job': job } log.debug("payload: %s" % payload) log.info("Starting EventService WorkExecutor") executor = WorkExecutor() executor.set_payload(payload) executor.start() log.info("EventService WorkExecutor started") log.info("ESProcess started with pid: %s" % executor.get_pid()) job.pid = executor.get_pid() if job.pid: job.pgrp = os.getpgid(job.pid) self.utility_after_payload_started(job) except Exception as e: log.error('could not execute: %s' % str(e)) return None return executor
def create_log(job, logfile, tarball_name): """ :param job: :param logfile: :param tarball_name: :raises LogFileCreationFailure: in case of log file creation problem :return: """ log = get_logger(job.jobid) log.debug('preparing to create log file') # perform special cleanup (user specific) prior to log file creation pilot_user = os.environ.get('PILOT_USER', 'generic').lower() user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], -1) user.remove_redundant_files(job.workdir) input_files = [e.lfn for e in job.indata] output_files = [e.lfn for e in job.outdata] # remove any present input/output files before tarring up workdir for f in input_files + output_files: path = os.path.join(job.workdir, f) if os.path.exists(path): log.info('removing file: %s' % path) remove(path) # rename the workdir for the tarball creation newworkdir = os.path.join(os.path.dirname(job.workdir), tarball_name) orgworkdir = job.workdir log.debug('renaming %s to %s' % (job.workdir, newworkdir)) os.rename(job.workdir, newworkdir) job.workdir = newworkdir fullpath = os.path.join(job.workdir, logfile.lfn) # /some/path/to/dirname/log.tgz log.info('will create archive %s' % fullpath) try: #newdirnm = "tarball_PandaJob_%s" % job.jobid #tarballnm = "%s.tar.gz" % newdirnm #os.rename(job.workdir, newdirnm) cmd = "pwd;tar cvfz %s %s --dereference --one-file-system; echo $?" % ( fullpath, tarball_name) exit_code, stdout, stderr = execute(cmd) #with closing(tarfile.open(name=fullpath, mode='w:gz', dereference=True)) as archive: # archive.add(os.path.basename(job.workdir), recursive=True) except Exception as e: raise LogFileCreationFailure(e) else: log.debug('stdout = %s' % stdout) log.debug('renaming %s back to %s' % (job.workdir, orgworkdir)) try: os.rename(job.workdir, orgworkdir) except Exception as e: log.debug('exception caught: %s' % e) job.workdir = orgworkdir
def get_time_for_last_touch(job, mt, looping_limit): """ Return the time when the files in the workdir were last touched. in case no file was touched since the last check, the returned value will be the same as the previous time. :param job: job object. :param mt: `MonitoringTime` object. :param looping_limit: looping limit in seconds. :return: time in seconds since epoch (int) (or None in case of failure). """ log = get_logger(job.jobid) pilot_user = os.environ.get('PILOT_USER', 'generic').lower() loopingjob_definitions = __import__( 'pilot.user.%s.loopingjob_definitions' % pilot_user, globals(), locals(), [pilot_user], -1) # locate all files that were modified the last N minutes cmd = "find %s -mmin -%d" % (job.workdir, int(looping_limit / 60)) exit_code, stdout, stderr = execute(cmd) if exit_code == 0: if stdout != "": files = stdout.split( "\n") # find might add a \n even for single entries # remove unwanted list items (*.py, *.pyc, workdir, ...) files = loopingjob_definitions.remove_unwanted_files( job.workdir, files) if files: log.info('found %d files that were recently updated' % len(files)) updated_files = verify_file_list(files) # now get the mod times for these file, and identify the most recently update file latest_modified_file, mtime = find_latest_modified_file( updated_files) if latest_modified_file: log.info( "file %s is the most recently updated file (at time=%d)" % (latest_modified_file, mtime)) else: log.warning( 'looping job algorithm failed to identify latest updated file' ) return mt.ct_looping_last_touched # store the time of the last file modification mt.update('ct_looping_last_touched', modtime=mtime) else: log.warning("found no recently updated files!") else: log.warning('found no recently updated files') else: log.warning('find command failed: %d, %s, %s' % (exit_code, stdout, stderr)) return mt.ct_looping_last_touched
def get_time_difference(job_id, timing_constant_1, timing_constant_2, args): """ Return the positive time difference between the given constants. The order is not important and a positive difference is always returned. The function collects the time measurements corresponding to the given timing constants from the pilot timing file. The job_id is used internally as a dictionary key. The given timing constants and their timing measurements, belong to the given job_id. Structure of pilot timing dictionary: { job_id: { <timing_constant_1>: <time measurement in seconds since epoch>, .. } job_id = 0 means timing information from wrapper. Timing constants are defined in pilot.util.constants. Time measurement are time.time() values. The float value will be converted to an int as a last step. :param job_id: PanDA job id (string). :param timing_constant_1: :param timing_constant_2: :param args: pilot arguments. :return: time difference in seconds (int). """ diff = 0 log = get_logger(job_id) if job_id in args.timing: # extract time measurements time_measurement_dictionary = args.timing.get(job_id, None) if time_measurement_dictionary: time_measurement_1 = get_time_measurement( timing_constant_1, time_measurement_dictionary, args.timing, job_id) time_measurement_2 = get_time_measurement( timing_constant_2, time_measurement_dictionary, args.timing, job_id) if time_measurement_1 and time_measurement_2: diff = time_measurement_2 - time_measurement_1 else: log.warning( 'failed to extract time measurement dictionary from %s' % str(args.timing)) else: log.warning('job id %s not found in timing dictionary' % job_id) # always return a positive number if diff < 0: diff = -diff # convert to int as a last step try: diff = int(diff) except Exception as e: log.warning('failed to convert %s to int: %s (will reset to 0)' % (diff, e)) diff = 0 return diff
def check_work_dir(job): """ Check the size of the work directory. The function also updates the workdirsizes list in the job object. :param job: job object. :return: exit code (int), error diagnostics (string) """ exit_code = 0 diagnostics = "" log = get_logger(job.jobid) if os.path.exists(job.workdir): # get the limit of the workdir maxwdirsize = get_max_allowed_work_dir_size(job.infosys.queuedata) if os.path.exists(job.workdir): workdirsize = get_directory_size(directory=job.workdir) # is user dir within allowed size limit? if workdirsize > maxwdirsize: exit_code = errors.USERDIRTOOLARGE diagnostics = "work directory (%s) is too large: %d B (must be < %d B)" % \ (job.workdir, workdirsize, maxwdirsize) log.fatal("%s" % diagnostics) cmd = 'ls -altrR %s' % job.workdir exit_code, stdout, stderr = execute(cmd, mute=True) log.info("%s: %s" % (cmd + '\n', stdout)) # kill the job # pUtil.createLockFile(True, self.__env['jobDic'][k][1].workdir, lockfile="JOBWILLBEKILLED") kill_processes(job.pid) # remove any lingering input files from the work dir lfns, guids = job.get_lfns_and_guids() if lfns: remove_files(job.workdir, lfns) # remeasure the size of the workdir at this point since the value is stored below workdirsize = get_directory_size(directory=job.workdir) else: log.info( "size of work directory %s: %d B (within %d B limit)" % (job.workdir, workdirsize, maxwdirsize)) # Store the measured disk space (the max value will later be sent with the job metrics) if workdirsize > 0: job.add_workdir_size(workdirsize) else: log.warning('job work dir does not exist: %s' % job.workdir) else: log.warning( 'skipping size check of workdir since it has not been created yet') return exit_code, diagnostics