def get_looping_job_limit(job):
    """
    Get the time limit for looping job detection.

    The standard limit is taken from the configuration (separate values for analysis and production jobs).
    A task may override it through job.maxcpucount (maxCpuCount): the task value is used when it is at least
    config.Pilot.looping_limit_min_default, or when it is larger than the standard limit.

    :param job: job object.
    :return: looping job time limit (int).
    """

    log = get_logger(job.jobid)

    if job.is_analysis():
        looping_limit = convert_to_int(config.Pilot.looping_limit_default_user, default=3 * 3600)
    else:
        looping_limit = convert_to_int(config.Pilot.looping_limit_default_prod, default=12 * 3600)

    # the configured minimum must be converted like every other config value before it is compared with an int
    looping_limit_min_default = convert_to_int(config.Pilot.looping_limit_min_default, default=2 * 3600)

    # an unset maxcpucount (None) is treated as 0 so that it can never win the max() below
    # NOTE(review): assumes job.maxcpucount is an int when set - confirm against the job definition
    maxcpucount = job.maxcpucount or 0

    if maxcpucount and maxcpucount >= looping_limit_min_default:
        _looping_limit = maxcpucount
    else:
        _looping_limit = max(looping_limit, maxcpucount)

    if _looping_limit != looping_limit:
        log.info("task request has updated looping job limit from %d s to %d s using maxCpuCount",
                 looping_limit, _looping_limit)
        looping_limit = _looping_limit
    else:
        log.info("using standard looping job limit: %d s", looping_limit)

    return looping_limit
def get_looping_job_limit():
    """
    Return the time limit used for looping job detection.

    The limit is the larger of the configured default limit and the configured minimum limit.

    :return: looping job time limit in seconds (int).
    """

    fallback = 2 * 3600  # used when a configuration value cannot be converted

    configured_limit = convert_to_int(config.Pilot.looping_limit_default, default=fallback)
    minimum_limit = convert_to_int(config.Pilot.looping_limit_min_default, default=fallback)

    # never go below the configured minimum
    limit = max(configured_limit, minimum_limit)
    logger.info("using looping job limit: %d s", limit)

    return limit
def verify_running_processes(current_time, mt, pid):
    """
    Verify the number of running processes.

    The function sets the environmental variable PILOT_MAXNPROC to the maximum number of found (child) processes
    corresponding to the main payload process id. The check is only performed when more than
    config.Pilot.process_verification_time (default: 300) has passed since the previous check.
    The function does not return an error code (always returns exit code 0).

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param pid: payload process id (int).
    :return: exit code (int), error diagnostics (string).
    """

    process_verification_time = convert_to_int(config.Pilot.process_verification_time, default=300)
    if current_time - mt.get('ct_process') > process_verification_time:
        # time to check the number of processes
        nproc = get_number_of_child_processes(pid)
        try:
            nproc_env = int(os.environ.get('PILOT_MAXNPROC', 0))
        except (TypeError, ValueError) as error:
            # treat an unusable value as "no maximum recorded yet" so that it gets replaced below
            # instead of triggering this warning on every check
            logger.warning('failed to convert PILOT_MAXNPROC to int: %s', error)
            nproc_env = 0

        if nproc > nproc_env:
            # set the maximum number of found processes
            os.environ['PILOT_MAXNPROC'] = str(nproc)
            nproc_env = nproc  # keep the local copy in sync so the value logged below is the current maximum

        if nproc_env > 0:
            logger.info('maximum number of monitored processes: %d', nproc_env)

        # update the ct_process with the current time; without this the interval would only delay the first check
        mt.update('ct_process')

    return 0, ""
def verify_user_proxy(current_time, mt):
    """
    Check that the user proxy is still valid.

    This function is called by the job_monitor_tasks() function. The user specific proxy module is selected with
    the PILOT_USER environment variable ('generic' when it is not set). The proxy is only verified when more than
    config.Pilot.proxy_verification_time (default: 600) has passed since the last successful verification.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :return: exit code (int), error diagnostics (string).
    """

    user = os.environ.get('PILOT_USER', 'generic').lower()
    module_name = 'pilot.user.%s.proxy' % user
    proxy_module = __import__(module_name, globals(), locals(), [user], 0)  # Python 2/3

    # is it time to verify the proxy?
    interval = convert_to_int(config.Pilot.proxy_verification_time, default=600)
    verification_due = current_time - mt.get('ct_proxy') > interval
    if not verification_due:
        return 0, ""

    # is the proxy still valid? (use test=True to test expired proxy)
    exit_code, diagnostics = proxy_module.verify_proxy(test=False)
    if exit_code != 0:
        return exit_code, diagnostics

    # update the ct_proxy with the current time
    mt.update('ct_proxy')

    return 0, ""
def verify_looping_job(current_time, mt, job):
    """
    Check that the job is not looping.

    The looping job algorithm is only run when more than config.Pilot.looping_verification_time (default: 600)
    has passed since the last successful check. An exception raised by the algorithm is reported as
    errors.BLACKHOLE when the message contains "No module named", otherwise as errors.UNKNOWNEXCEPTION.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    interval = convert_to_int(config.Pilot.looping_verification_time, default=600)
    check_due = current_time - mt.get('ct_looping') > interval
    if not check_due:
        return 0, ""

    # is the job looping?
    try:
        exit_code, diagnostics = looping_job(job, mt)
    except Exception as e:
        diagnostics = 'exception caught in looping job algorithm: %s' % e
        logger.warning(diagnostics)
        exit_code = errors.BLACKHOLE if "No module named" in diagnostics else errors.UNKNOWNEXCEPTION
        return exit_code, diagnostics

    if exit_code != 0:
        return exit_code, diagnostics

    # update the ct_looping with the current time
    mt.update('ct_looping')

    return 0, ""
def verify_memory_usage(current_time, mt, job):
    """
    Check the memory usage (optional).

    Note: this function relies on a stand-alone memory monitor tool that may be executed by the Pilot.
    The user specific memory module is selected with the PILOT_USER environment variable ('generic' when it is
    not set). A failed check is only logged; the function always returns exit code 0.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    user = os.environ.get('PILOT_USER', 'generic').lower()
    module_name = 'pilot.user.%s.memory' % user
    memory = __import__(module_name, globals(), locals(), [user], 0)  # Python 2/3

    # the user code may switch off the verification altogether
    if not memory.allow_memory_usage_verifications():
        return 0, ""

    # is it time to verify the memory usage?
    interval = convert_to_int(config.Pilot.memory_usage_verification_time, default=60)
    verification_due = current_time - mt.get('ct_memory') > interval
    if not verification_due:
        return 0, ""

    # is the used memory within the allowed limit?
    try:
        exit_code, _diagnostics = memory.memory_usage(job)
    except Exception as e:
        message = 'caught exception: %s' % e
        logger.warning(message)
        exit_code = -1

    if exit_code != 0:
        # the failure is not propagated to the caller, and the timer is left untouched
        logger.warning('ignoring failure to parse memory monitor output')
    else:
        # update the ct_memory with the current time
        mt.update('ct_memory')

    return 0, ""
def convert_text_file_to_dictionary(path):
    """
    Convert row-column text file to dictionary.

    Use the identifiers in the first non-empty row as dictionary keys. The entries of every following row are
    passed through convert_to_int() and appended, position by position, to the lists stored under those keys.

    Note: file must follow the convention:
      NAME1   NAME2   ..
      value1  value2  ..
      ..      ..      ..
    Columns are separated by one or more tab characters. Blank lines are skipped.

    :param path: path to file (string).
    :return: dictionary (column name -> list of converted values).
    """

    summary_keys = []  # column names, in file order
    header_locked = False  # set once the header row has been read
    dictionary = {}

    with open(path) as f:
        for line in f:
            line = convert_unicode_string(line)
            try:
                # strip the line ending and remove empty entries from the list (caused by multiple \t)
                fields = [field for field in line.replace('\n', '').split('\t') if field]
                if not fields:
                    # blank line (e.g. at the end of the file): nothing to store and nothing to warn about
                    continue

                if not header_locked:
                    # define dictionary keys
                    summary_keys = fields
                    for key in summary_keys:
                        dictionary[key] = []
                    header_locked = True
                else:
                    # sort the measurements into the correct columns
                    for i, entry in enumerate(fields):
                        key_entry = summary_keys[i]  # e.g. Time
                        dictionary[key_entry].append(convert_to_int(entry))
            except Exception:
                # best effort: a malformed row (e.g. more entries than header columns) is reported and skipped over
                logger.warning("unexpected format of utility output: %s", line)

    return dictionary
def verify_disk_usage(current_time, mt, job):
    """
    Check the disk usage.

    The function checks, in this order, 1) payload stdout size, 2) local space, 3) work directory size,
    4) output file sizes, and stops at the first check that reports a non-zero exit code. The checks are only
    run when more than config.Pilot.disk_space_verification_time (default: 300) has passed since the last
    time all of them succeeded.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    interval = convert_to_int(config.Pilot.disk_space_verification_time, default=300)
    check_due = current_time - mt.get('ct_diskspace') > interval
    if not check_due:
        return 0, ""

    # time to check the disk space; each entry is evaluated only when it is reached
    checks = (
        lambda: check_payload_stdout(job),     # size of the payload stdout
        lambda: check_local_space(),           # is there enough local space left to keep running the job?
        lambda: check_work_dir(job),           # size of the workdir
        lambda: check_output_file_sizes(job),  # output file sizes
    )
    for check in checks:
        exit_code, diagnostics = check()
        if exit_code != 0:
            return exit_code, diagnostics

    # update the ct_diskspace with the current time
    mt.update('ct_diskspace')

    return 0, ""
def should_abort_payload(current_time, mt):
    """
    Should the pilot abort the payload?

    In the case of Raythena, the Driver is monitoring the time to end jobs and may decide
    that the pilot should abort the payload. Internally, this is achieved by letting the Actors
    know it's time to end, and they in turn contact the pilot by placing a 'pilot_kill_payload' file
    in the run directory.

    The kill instruction file (config.Pilot.kill_instruction_filename) is looked for in the directory given by
    the PILOT_HOME environment variable. If PILOT_HOME is not set, the file cannot be located and the function
    reports that the payload should not be aborted.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :return: exit code (int), error diagnostics (string).
    """

    # is it time to look for the kill instruction file?
    # NOTE(review): ct_kill is never refreshed in this function, so once the interval has passed the file is
    # looked for on every call - confirm that this is intended
    killing_time = convert_to_int(config.Pilot.kill_instruction_time, default=600)
    if current_time - mt.get('ct_kill') > killing_time:
        pilot_home = os.environ.get('PILOT_HOME')
        if pilot_home is None:
            # os.path.join() raises TypeError for None, which would break the monitoring loop
            logger.warning('PILOT_HOME is not set - cannot look for payload kill instruction file')
            return 0, ""

        path = os.path.join(pilot_home, config.Pilot.kill_instruction_filename)
        if os.path.exists(path):
            logger.info('pilot encountered payload kill instruction file - will abort payload')
            return errors.KILLPAYLOAD, ""  # note, this is not an error

    return 0, ""
def get_average_summary_dictionary_prmon(path):
    """
    Loop over the memory monitor output file and create the averaged summary dictionary.

    prmon keys:
    'Time', 'nprocs', 'nthreads', 'pss', 'rchar', 'read_bytes', 'rss', 'rx_bytes',
    'rx_packets', 'stime', 'swap', 'tx_bytes', 'tx_packets', 'utime', 'vmem', 'wchar',
    'write_bytes', 'wtime'

    The function uses the first line in the output file to define the dictionary keys used
    later in the function. This means that any change in the format such as new columns
    will be handled automatically.

    The returned dictionary has three sections:
      "Max":   maxVMEM, maxPSS, maxRSS, maxSwap (0 when a column is missing or has no usable values),
      "Avg":   avgVMEM, avgPSS, avgRSS, avgSwap (0 when a column is missing or has no usable values),
      "Other": the last value of rchar, wchar, read_bytes and write_bytes (only when that value is truthy).
    An empty dictionary is returned when nothing could be read from the file.

    :param path: path to memory monitor txt output file (string).
    :return: summary dictionary.
    """

    # get the raw memory monitor output as {column name: [values]}
    dictionary = convert_text_file_to_dictionary(path)
    if not dictionary:
        return {}

    def filter_value(value):
        """
        Inline function used to remove any string or None values from data.
        """
        return value is not None and not isinstance(value, str)

    # calculate averages and maxima of the memory columns
    values = {}
    for key in ('vmem', 'pss', 'rss', 'swap'):
        # a missing column is treated like an empty one
        value_list = [value for value in dictionary.get(key, []) if filter_value(value)]
        n = len(value_list)
        average = int(float(sum(value_list)) / float(n)) if n > 0 else 0
        maximum = max(value_list) if n > 0 else 0  # max() raises ValueError for an empty list
        values[key] = {'avg': average, 'max': maximum}

    summary_dictionary = {
        "Max": {"maxVMEM": values['vmem'].get('max'), "maxPSS": values['pss'].get('max'),
                "maxRSS": values['rss'].get('max'), "maxSwap": values['swap'].get('max')},
        "Avg": {"avgVMEM": values['vmem'].get('avg'), "avgPSS": values['pss'].get('avg'),
                "avgRSS": values['rss'].get('avg'), "avgSwap": values['swap'].get('avg')},
        "Other": {},
    }

    # add the last of the rchar, .., values
    # warning: should read_bytes/write_bytes be reported as rbytes/wbytes?
    for key in ('rchar', 'wchar', 'read_bytes', 'write_bytes'):
        value = get_last_value(dictionary.get(key, None))
        if value:
            summary_dictionary["Other"][key] = value

    return summary_dictionary