def copytool_out(queues, traces, args):
    """
    Main stage-out thread.

    Perform stage-out as soon as a job object can be extracted from the data_out queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')

    # job ids that have already been staged out; guards against double stage-out (see below)
    processed_jobs = []
    while cont:
        time.sleep(0.5)

        # abort if kill signal arrived too long time ago, ie loop is stuck
        current_time = int(time.time())
        if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
            logger.warning('loop has run for too long time after first kill signal - will abort')
            break

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')

        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                # hack to prevent stage-out to be called more than once for same job object (can apparently happen
                # in multi-output jobs)
                # should not be necessary unless job object is added to queues.data_out more than once - check this
                # for multiple output files
                if processed_jobs:
                    if is_already_processed(queues, processed_jobs):
                        continue

                logger.info('will perform stage-out for job id=%s', job.jobid)

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning('copytool_out detected a set abort_job pre stage-out (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                    break

                if _stage_out_new(job, args):
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        logger.warning('copytool_out detected a set abort_job post stage-out (due to a kill signal)')
                        #declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                        break

                    #queues.finished_data_out.put(job)
                    processed_jobs.append(job.jobid)
                    put_in_queue(job, queues.finished_data_out)
                    logger.debug('job object added to finished_data_out queue')
                else:
                    #queues.failed_data_out.put(job)
                    put_in_queue(job, queues.failed_data_out)
                    logger.debug('job object added to failed_data_out queue')
            else:
                logger.debug('no returned job - why no exception?')
        except queue.Empty:
            # nothing in the queue within the 1 s timeout; leave the loop if an abort was requested
            if abort:
                cont = False
                break
            continue

        if abort:
            cont = False
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_out thread has finished')
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if kill signal arrived too long time ago, ie loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning('loop has run for too long time after first kill signal - will abort')
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # does the user want to execute any special commands before stage-in?
            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
            user = __import__('pilot.user.%s.common' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
            cmd = user.get_utility_commands(job=job, order=UTILITY_BEFORE_STAGEIN)
            if cmd:
                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before xcache start] stdout=%s', _stdout)
                #logger.debug('[before xcache start] stderr=%s', _stderr)

                _, stdout, stderr = execute(cmd.get('command'))
                logger.debug('stdout=%s', stdout)
                logger.debug('stderr=%s', stderr)

                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after xcache start] stdout=%s', _stdout)
                #logger.debug('[after xcache start] stderr=%s', _stderr)

                # perform any action necessary after command execution (e.g. stdout processing)
                kwargs = {'label': cmd.get('label', 'utility'), 'output': stdout}
                user.post_prestagein_utility_command(**kwargs)

                # write output to log files
                write_utility_output(job.workdir, cmd.get('label', 'utility'), stdout, stderr)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning('job state is \'failed\' - order log transfer and abort copytool_in()')
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)

                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug('job %s has been removed from the current_data_in queue', _job.jobid)

                # now create input file metadata if required by the payload
                # NOTE(review): presumably skipped for event-service executors - confirm against callers
                if os.environ.get('PILOT_ES_EXECUTOR_TYPE', 'generic') == 'generic':
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s', xml)
            else:
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug('job %s has been removed from the current_data_in queue', _job.jobid)
                logger.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)

                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: pilot args object.
    :return:
    """

    while not args.graceful_stop.is_set():
        try:
            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

                # ready to set the job in running state
                send_state(job, args, 'running')
                os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING
                log = get_logger(job.jobid)

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                if _stage_in(args, job):
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        log.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                        declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                        break

                    put_in_queue(job, queues.finished_data_in)

                    # remove the job from the current stage-in queue
                    _job = queues.current_data_in.get(block=True, timeout=1)
                    if _job:
                        log.debug('job %s has been removed from the current_data_in queue' % _job.jobid)

                    # now create input file metadata if required by the payload
                    try:
                        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                        # bug fix: import level must be 0 (absolute import); the previous level -1
                        # is Python 2 only and raises ValueError on Python 3
                        user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0)
                        _dir = '/srv' if job.usecontainer else job.workdir
                        file_dictionary = get_input_file_dictionary(job.indata, _dir)
                        log.debug('file_dictionary=%s' % str(file_dictionary))
                        xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                        log.info('created input file metadata:\n%s' % xml)
                    except Exception as e:
                        # metadata creation is best-effort, but failures should not be
                        # silently swallowed - log them so they can be diagnosed
                        log.warning('failed to create input file metadata: %s' % e)
                else:
                    log.warning('stage-in failed, adding job object to failed_data_in queue')
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                    set_pilot_state(job=job, state="failed")
                    traces.pilot['error_code'] = job.piloterrorcodes[0]
                    put_in_queue(job, queues.failed_data_in)

                    # do not set graceful stop if pilot has not finished sending the final job update
                    # i.e. wait until SERVER_UPDATE is DONE_FINAL
                    check_for_final_server_update(args.update_server)
                    args.graceful_stop.set()

        except queue.Empty:
            continue

    logger.debug('[data] copytool_in thread has finished')
def copytool_out(queues, traces, args):
    """
    Main stage-out thread.

    Perform stage-out as soon as a job object can be extracted from the data_out queue.

    :param queues: pilot queues object.
    :param traces: pilot traces object.
    :param args: pilot args object.
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')

    first = True
    while cont:
        # log the loop entry only once
        if first:
            first = False
            logger.debug('inside copytool_out() loop')

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')
        if abort:
            logger.debug('will abort ')

        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                log = get_logger(job.jobid)
                log.info('will perform stage-out')

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_out detected a set abort_job pre stage-out (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                    break

                if _stage_out_new(job, args):
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        log.warning('copytool_out detected a set abort_job post stage-out (due to a kill signal)')
                        break

                    put_in_queue(job, queues.finished_data_out)
                    log.debug('job object added to finished_data_out queue')
                else:
                    put_in_queue(job, queues.failed_data_out)
                    log.debug('job object added to failed_data_out queue')
            else:
                # bug fix: 'log' is only bound inside the if-branch above, so using it here
                # raised UnboundLocalError - use the module-level logger instead
                logger.debug('no returned job - why no exception?')
        except queue.Empty:
            # no job within the 1 s timeout; leave the loop if an abort was requested
            if abort:
                logger.debug('aborting')
                cont = False
                break
            continue

        if abort:
            logger.debug('aborting')
            cont = False
            break

    logger.debug('[data] copytool_out thread has finished')
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # bail out if the loop has been stuck for too long after a kill signal
            now = int(time.time())
            if args.kill_time and now - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning('loop has run for too long time after first kill signal - will abort')
                break

            # pick up the next job whose input should be staged in
            job = queues.data_in.get(block=True, timeout=1)

            # register the job in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

                # the job may now be flagged as running
                send_state(job, args, 'running')

                # note: when sending a state change to the server, the server might respond with 'tobekilled'
                if job.state == 'failed':
                    logger.warning('job state is \'failed\' - order log transfer and abort copytool_in()')
                    job.stageout = 'log'  # only stage-out log file
                    put_in_queue(job, queues.data_out)
                    break

                os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                staged_in = _stage_in(args, job)
                if staged_in:
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        logger.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                        declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                        break

                    put_in_queue(job, queues.finished_data_in)

                    # the job is no longer being staged in - drop it from the monitoring queue
                    monitored_job = queues.current_data_in.get(block=True, timeout=1)
                    if monitored_job:
                        logger.debug('job %s has been removed from the current_data_in queue' % monitored_job.jobid)

                    # now create input file metadata if required by the payload
                    if config.Payload.executor_type.lower() != 'raythena':
                        pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                        user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0)  # Python 2/3
                        file_dictionary = get_input_file_dictionary(job.indata)
                        xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                        logger.info('created input file metadata:\n%s' % xml)
                else:
                    logger.warning('stage-in failed, adding job object to failed_data_in queue')
                    job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                    set_pilot_state(job=job, state="failed")
                    traces.pilot['error_code'] = job.piloterrorcodes[0]
                    put_in_queue(job, queues.failed_data_in)

                    # do not set graceful stop if pilot has not finished sending the final job update
                    # i.e. wait until SERVER_UPDATE is DONE_FINAL
                    check_for_final_server_update(args.update_server)
                    args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')