def get_payload_command(self, job):
    """
    Return the payload command string.

    :param job: job object.
    :return: command (string).
    """

    show_memory_usage()

    cmd = ""
    # for testing looping job: cmd = user.get_payload_command(job) + ';sleep 240'
    try:
        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
        show_memory_usage()
        cmd = user.get_payload_command(job)  # + 'sleep 480'
    except PilotException as error:
        self.post_setup(job)
        import traceback
        logger.error(traceback.format_exc())
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(error.get_error_code())
        self.__traces.pilot['error_code'] = job.piloterrorcodes[0]
        logger.fatal('could not define payload command (traces error set to: %d)' % self.__traces.pilot['error_code'])

    return cmd
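# --- Illustrative sketch, not part of the pilot source ---
# The try block above resolves the experiment-specific module at run time from
# the PILOT_USER environment variable. With a non-empty fromlist, __import__
# returns the leaf module (e.g. pilot.user.generic.common) rather than the
# top-level 'pilot' package. A minimal standalone equivalent of the pattern
# (the helper name is hypothetical):
def _example_load_user_module(name='common'):
    """Resolve pilot.user.<PILOT_USER>.<name> dynamically (illustration only)."""
    import os
    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    return __import__('pilot.user.%s.%s' % (pilot_user, name), globals(), locals(), [pilot_user], 0)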
def verify_memory_usage(current_time, mt, job):
    """
    Verify the memory usage (optional).

    Note: this function relies on a stand-alone memory monitor tool that may be executed by the Pilot.

    :param current_time: current time at the start of the monitoring loop (int).
    :param mt: measured time object.
    :param job: job object.
    :return: exit code (int), error diagnostics (string).
    """

    show_memory_usage()

    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
    memory = __import__('pilot.user.%s.memory' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3

    if not memory.allow_memory_usage_verifications():
        return 0, ""

    # is it time to verify the memory usage?
    memory_verification_time = convert_to_int(config.Pilot.memory_usage_verification_time, default=60)
    if current_time - mt.get('ct_memory') > memory_verification_time:
        # is the used memory within the allowed limit?
        try:
            exit_code, diagnostics = memory.memory_usage(job)
        except Exception as error:
            logger.warning('caught exception: %s', error)
            exit_code = -1
        if exit_code != 0:
            logger.warning('ignoring failure to parse memory monitor output')
            #return exit_code, diagnostics
        else:
            # update ct_memory with the current time
            mt.update('ct_memory')

    return 0, ""
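# --- Illustrative sketch, not part of the pilot source ---
# verify_memory_usage() above only invokes the memory monitor when more than
# memory_verification_time seconds have passed since the last check, then
# stamps the new time. A minimal stand-in for the measured-time object showing
# the get()/update() semantics relied on above (class name and internal layout
# are hypothetical):
import time

class _ExampleMeasuredTime:
    """Checkpoint store with the get()/update() semantics used above."""

    def __init__(self):
        self._checkpoints = {'ct_memory': int(time.time())}

    def get(self, key):
        # return the last recorded timestamp for this checkpoint (0 if unknown)
        return self._checkpoints.get(key, 0)

    def update(self, key):
        # record the current time for this checkpoint
        self._checkpoints[key] = int(time.time())

# usage sketch:
#   mt = _ExampleMeasuredTime()
#   if int(time.time()) - mt.get('ct_memory') > 60:
#       ...  # run the verification, then
#       mt.update('ct_memory')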
def execute_payloads(queues, traces, args):  # noqa: C901
    """
    Execute queued payloads.

    Extract a Job object from the "validated_payloads" queue and put it in the "monitored_jobs" queue. The payload
    stdout/err streams are opened and the pilot state is changed to "starting". A payload executor is selected (for
    executing a normal job, an event service job or event service merge job). After the payload (or rather its
    executor) is started, the thread will wait for it to finish and then check for any failures. A successfully
    completed job is placed in the "finished_payloads" queue, and a failed job will be placed in the "failed_payloads"
    queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    job = None
    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            job = queues.validated_payloads.get(block=True, timeout=1)

            q_snapshot = list(queues.finished_data_in.queue)
            peek = [s_job for s_job in q_snapshot if job.jobid == s_job.jobid]
            if len(peek) == 0:
                #queues.validated_payloads.put(job)
                put_in_queue(job, queues.validated_payloads)
                for i in range(10):
                    if args.graceful_stop.is_set():
                        break
                    time.sleep(1)
                continue

            # this job is now to be monitored, so add it to the monitored_payloads queue
            #queues.monitored_payloads.put(job)
            put_in_queue(job, queues.monitored_payloads)

            logger.info('job %s added to monitored payloads queue' % job.jobid)

            try:
                out = open(os.path.join(job.workdir, config.Payload.payloadstdout), 'wb')
                err = open(os.path.join(job.workdir, config.Payload.payloadstderr), 'wb')
            except Exception as e:
                logger.warning('failed to open payload stdout/err: %s' % e)
                out = None
                err = None

            send_state(job, args, 'starting')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning('job state is \'failed\' - abort execute_payloads()')
                break

            payload_executor = get_payload_executor(args, job, out, err, traces)
            logger.info("Got payload executor: %s" % payload_executor)

            show_memory_usage()

            # run the payload and measure the execution time
            job.t0 = os.times()
            exit_code = payload_executor.run()
            set_cpu_consumption_time(job)
            job.transexitcode = exit_code % 255

            out.close()
            err.close()

            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()

            # some HPO jobs will produce new output files (following lfn name pattern), discover those and replace
            # the job.outdata list
            if job.is_hpo:
                user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
                try:
                    user.update_output_for_hpo(job)
                except Exception as e:
                    logger.warning('exception caught by update_output_for_hpo(): %s' % e)
                else:
                    for dat in job.outdata:
                        if not dat.guid:
                            dat.guid = get_guid()
                            logger.warning('guid not set: generated guid=%s for lfn=%s' % (dat.guid, dat.lfn))

            #if traces.pilot['nr_jobs'] == 1:
            #    logger.debug('faking job failure in first multi-job')
            #    job.transexitcode = 1
            #    exit_code = 1

            # analyze and interpret the payload execution output
            perform_initial_payload_error_analysis(job, exit_code)

            # was an error already found?
            #if job.piloterrorcodes:
            #    exit_code_interpret = 1
            #else:
            user = __import__('pilot.user.%s.diagnose' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
            try:
                exit_code_interpret = user.interpret(job)
            except Exception as e:
                logger.warning('exception caught: %s' % e)
                #exit_code_interpret = -1
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.INTERNALPILOTPROBLEM)

            if job.piloterrorcodes:
                exit_code_interpret = 1

            if exit_code_interpret == 0 and exit_code == 0:
                logger.info('main payload error analysis completed - did not find any errors')

                # update output lists if zipmaps were used
                #job.add_archives_to_output_lists()

                #queues.finished_payloads.put(job)
                put_in_queue(job, queues.finished_payloads)
            else:
                logger.debug('main payload error analysis completed - adding job to failed_payloads queue')
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)

        except queue.Empty:
            continue
        except Exception as e:
            logger.fatal('execute payloads caught an exception (cannot recover): %s, %s' % (e, traceback.format_exc()))
            if job:
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.PAYLOADEXECUTIONEXCEPTION)
                #queues.failed_payloads.put(job)
                put_in_queue(job, queues.failed_payloads)
            while not args.graceful_stop.is_set():
                # let stage-out of log finish, but stop running payloads as there should be a problem with the pilot
                time.sleep(5)

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.info('[payload] execute_payloads thread has finished')
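# --- Illustrative sketch, not part of the pilot source ---
# execute_payloads() above peeks at the finished_data_in queue without
# consuming it, by copying the underlying deque of queue.Queue, and re-queues
# the job when its stage-in has not yet completed. The peek pattern in
# isolation (the helper name is hypothetical):
def _example_stagein_finished(jobid, finished_data_in):
    """Return True if a job with this id is present in the queue (illustration only)."""
    snapshot = list(finished_data_in.queue)  # non-destructive copy of queue.Queue's internal deque
    return any(s_job.jobid == jobid for s_job in snapshot)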
def run(self):  # noqa: C901
    """
    Run all payload processes (including pre- and post-processes, and utilities).

    In the case of HPO jobs, this function will loop over all processes until the preprocess returns a special
    exit code.

    :return:
    """

    # get the payload command from the user specific code
    self.pre_setup(self.__job)
    cmd = self.get_payload_command(self.__job)

    # extract the setup in case the preprocess command needs it
    self.__job.setup = self.extract_setup(cmd)
    self.post_setup(self.__job)

    # a loop is needed for HPO jobs
    # abort when nothing more to run, or when the preprocess returns a special exit code
    iteration = 0
    while True:
        logger.info('payload iteration loop #%d', iteration + 1)
        os.environ['PILOT_EXEC_ITERATION_COUNT'] = '%s' % iteration
        show_memory_usage()

        # first run the preprocess (if necessary) - note: this might update jobparams -> must update cmd
        jobparams_pre = self.__job.jobparams
        exit_code = self.run_preprocess(self.__job)
        jobparams_post = self.__job.jobparams
        if exit_code:
            if 160 <= exit_code <= 162:
                exit_code = 0
                # wipe the output file list since there won't be any new files
                # any output files from previous iterations should have been transferred already
                logger.debug('reset outdata since further output should not be expected after preprocess exit')
                self.__job.outdata = []
            break
        if jobparams_pre != jobparams_post:
            logger.debug('jobparams were updated by utility_before_payload()')
            # must update cmd
            cmd = cmd.replace(jobparams_pre, jobparams_post)

        # now run the main payload; when it finishes, run the postprocess (if necessary)
        # note: no need to run any main payload in HPO Horovod jobs on Kubernetes
        if os.environ.get('HARVESTER_HOROVOD', '') == '':
            #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print "ps -p "$1" -o args --no-headers --cols 300"}\' | sh')
            #logger.debug('[before payload start] stdout=%s', _stdout)
            #logger.debug('[before payload start] stderr=%s', _stderr)
            proc = self.run_payload(self.__job, cmd, self.__out, self.__err)
        else:
            proc = None

        proc_co = None
        if proc is None:
            # run the post-process command even if there was no main payload
            if os.environ.get('HARVESTER_HOROVOD', '') != '':
                logger.info('no need to execute any main payload')
                exit_code = self.run_utility_after_payload_finished(exit_code, True, UTILITY_AFTER_PAYLOAD_FINISHED2)
                self.post_payload(self.__job)
            else:
                break
        else:
            # the process is now running, update the server
            # test 'tobekilled' from here to try payload kill
            send_state(self.__job, self.__args, self.__job.state)

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if self.__job.state == 'failed':
                logger.warning('job state is \'failed\' - abort payload and run()')
                kill_processes(proc.pid)
                break

            # allow for a secondary command to be started after the payload (e.g. a coprocess)
            utility_cmd = self.get_utility_command(order=UTILITY_AFTER_PAYLOAD_STARTED2)
            if utility_cmd:
                logger.debug('starting utility command: %s', utility_cmd)
                label = 'coprocess' if 'coprocess' in utility_cmd else None
                proc_co = self.run_command(utility_cmd, label=label)

            logger.info('will wait for graceful exit')
            exit_code = self.wait_graceful(self.__args, proc)

            # reset error if Raythena decided to kill payload (no error)
            if errors.KILLPAYLOAD in self.__job.piloterrorcodes:
                logger.debug('ignoring KILLPAYLOAD error')
                self.__job.piloterrorcodes, self.__job.piloterrordiags = errors.remove_error_code(
                    errors.KILLPAYLOAD,
                    pilot_error_codes=self.__job.piloterrorcodes,
                    pilot_error_diags=self.__job.piloterrordiags)
                exit_code = 0
                state = 'finished'
            else:
                state = 'finished' if exit_code == 0 else 'failed'

            set_pilot_state(job=self.__job, state=state)
            logger.info('\n\nfinished pid=%s exit_code=%s state=%s\n', proc.pid, exit_code, self.__job.state)

            #exit_code, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print "ps -p "$1" -o args --no-headers --cols 300"}\' | sh')
            #logger.debug('[after payload finish] stdout=%s', _stdout)
            #logger.debug('[after payload finish] stderr=%s', _stderr)

            # stop the utility command (e.g. a coprocess) if necessary
            if proc_co:
                logger.debug('stopping utility command: %s', utility_cmd)
                kill_processes(proc_co.pid)

            if exit_code is None:
                logger.warning('detected unset exit_code from wait_graceful - reset to -1')
                exit_code = -1

            for order in [UTILITY_AFTER_PAYLOAD_FINISHED, UTILITY_AFTER_PAYLOAD_FINISHED2]:
                exit_code = self.run_utility_after_payload_finished(exit_code, state, order)

            self.post_payload(self.__job)

            # stop any running utilities
            if self.__job.utilities != {}:
                self.stop_utilities()

        if self.__job.is_hpo and state != 'failed':
            # in case there are more hyper-parameter points, move away the previous log files
            #self.rename_log_files(iteration)
            iteration += 1
        else:
            break

    return exit_code
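# --- Illustrative sketch, not part of the pilot source ---
# run() above keeps iterating for HPO jobs until the preprocess signals that no
# hyper-parameter points remain (exit codes 160-162 end the loop without
# error). The control flow reduced to a skeleton, with hypothetical callables
# standing in for the pre/main process steps:
def _example_hpo_loop(run_preprocess, run_payload, is_hpo):
    iteration = 0
    while True:
        exit_code = run_preprocess(iteration)
        if 160 <= exit_code <= 162:
            return 0                      # preprocess: nothing more to run
        if exit_code:
            return exit_code              # genuine preprocess failure
        exit_code = run_payload(iteration)
        if is_hpo and exit_code == 0:
            iteration += 1                # proceed to the next hyper-parameter point
        else:
            return exit_code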