def request_eventranges(job_def):
    """Write (or merge into) the Harvester event-request file.

    Harvester polls ``eventRequestFile``; this function writes a serialized
    dictionary keyed by pandaID asking for more event ranges. If a request
    file already exists, the new request is merged into it (incrementing
    ``nRanges`` for a matching job) rather than overwriting it.

    Args:
        job_def: dict containing at least 'pandaID', 'taskID', 'jobsetID'
            and 'nRanges' describing the request.
    """
    global sfm_har_config, sfm_har_config_done
    # block until the Harvester messenger configuration has been parsed
    sfm_har_config_done.wait()

    # retrieve event request file
    eventRequestFile = sfm_har_config['eventRequestFile']
    # write to a temp file first, then rename, so Harvester never reads a partial file
    eventRequestFile_tmp = eventRequestFile + '.tmp'

    # create event request file
    if not os.path.exists(eventRequestFile):
        # need to output a file containing:
        #    {'nRanges': ???, 'pandaID':???, 'taskID':???, 'jobsetID':???}
        logger.debug(
            'requesting new event ranges by writing to file "%s" with this content: %s',
            eventRequestFile, job_def)

        # get new job definition
        new_job_def = {job_def['pandaID']: job_def}

        # BUGFIX: use a context manager so the handle is closed (and the data
        # flushed to disk) before the rename makes the file visible to Harvester
        with open(eventRequestFile_tmp, 'w') as f:
            f.write(serializer.serialize(new_job_def))

        # now move tmp filename to real filename
        os.rename(eventRequestFile_tmp, eventRequestFile)
    else:
        logger.debug('request file already exists. Adding requests')
        # move current file to temp so Harvester does not read it while we edit
        os.rename(eventRequestFile, eventRequestFile_tmp)

        # BUGFIX: the previous open(...).read() never closed the file handle
        with open(eventRequestFile_tmp) as f:
            requests = serializer.deserialize(f.read())

        pandaID = job_def['pandaID']
        if pandaID in requests:
            logger.debug('adding event range count to existing request')
            thisjob = requests[pandaID]
            if thisjob['jobsetID'] == job_def['jobsetID'] and thisjob['taskID'] == job_def['taskID']:
                thisjob['nRanges'] += job_def['nRanges']
            else:
                logger.warning(
                    'existing request for PandaID %s does not match new request details %s',
                    thisjob, job_def)
        else:
            logger.debug('adding new job definition to existing request')
            requests[pandaID] = job_def

        # output updated requests to file
        with open(eventRequestFile_tmp, 'w') as f:
            f.write(serializer.serialize(requests))

        # now move tmp filename to real filename
        os.rename(eventRequestFile_tmp, eventRequestFile)
def request_eventranges(job_def):
    """Ask Harvester for more event ranges via the shared event-request file.

    Serializes a request dictionary keyed by pandaID into
    ``eventRequestFile``. If the file already exists (a prior request is
    pending), the new request is merged in: a matching job has its
    'nRanges' incremented, otherwise the job is added as a new entry.

    Args:
        job_def: dict with 'pandaID', 'taskID', 'jobsetID' and 'nRanges'.
    """
    global sfm_har_config, sfm_har_config_done
    # wait for the Harvester messenger configuration to be loaded
    sfm_har_config_done.wait()

    # retrieve event request file
    eventRequestFile = sfm_har_config['eventRequestFile']
    # stage all writes through a temp file so Harvester never sees partial content
    eventRequestFile_tmp = eventRequestFile + '.tmp'

    # create event request file
    if not os.path.exists(eventRequestFile):
        # need to output a file containing:
        #    {'nRanges': ???, 'pandaID':???, 'taskID':???, 'jobsetID':???}
        logger.debug('requesting new event ranges by writing to file "%s" with this content: %s', eventRequestFile, job_def)

        # get new job definition
        new_job_def = {job_def['pandaID']: job_def}

        # BUGFIX: context manager guarantees the handle is closed and the
        # content flushed before the rename publishes the file
        with open(eventRequestFile_tmp, 'w') as f:
            f.write(serializer.serialize(new_job_def))

        # now move tmp filename to real filename
        os.rename(eventRequestFile_tmp, eventRequestFile)
    else:
        logger.debug('request file already exists. Adding requests')
        # move current file to temp so Harvester cannot read it mid-edit
        os.rename(eventRequestFile, eventRequestFile_tmp)

        # BUGFIX: previous code leaked the file handle from open(...).read()
        with open(eventRequestFile_tmp) as f:
            requests = serializer.deserialize(f.read())

        pandaID = job_def['pandaID']
        if pandaID in requests:
            logger.debug('adding event range count to existing request')
            thisjob = requests[pandaID]
            if thisjob['jobsetID'] == job_def['jobsetID'] and thisjob['taskID'] == job_def['taskID']:
                thisjob['nRanges'] += job_def['nRanges']
            else:
                logger.warning('existing request for PandaID %s does not match new request details %s', thisjob, job_def)
        else:
            logger.debug('adding new job definition to existing request')
            requests[pandaID] = job_def

        # output updated requests to file
        with open(eventRequestFile_tmp, 'w') as f:
            f.write(serializer.serialize(requests))

        # now move tmp filename to real filename
        os.rename(eventRequestFile_tmp, eventRequestFile)
def send_eventrange(self, eventranges, athpayloadcomm, no_more_events):
    """Pop the next batch of ready event ranges and hand it to AthenaMP.

    If no ranges remain and *no_more_events* is set, AthenaMP is told
    NO_MORE_EVENTS; otherwise the NoMoreEventRanges exception propagates so
    the caller can request more ranges. An out-of-bounds index error is
    converted into NoMoreEventRanges as well.
    """
    logger.debug('sending event to payload')
    try:
        logger.debug('have %d ready event ranges to send to AthenaMP', eventranges.number_ready())
        ranges_to_send = eventranges.get_next()
    except EventRangeList.NoMoreEventRanges:
        logger.debug('there are no more event ranges to process')
        # without the no-more-events flag, propagate so an event request is triggered
        if not no_more_events:
            raise
        # otherwise inform the AthenaMP worker that it will get no further events
        logger.info('sending AthenaMP NO_MORE_EVENTS')
        athpayloadcomm.send(athena_payloadcommunicator.NO_MORE_EVENTS)
        return
    except EventRangeList.RequestedMoreRangesThanAvailable:
        # bad index into the EventRangeList; surface it as "nothing left" so
        # the caller waits for more events
        logger.error('requested more event ranges than available, going to try waiting for more events')
        raise EventRangeList.NoMoreEventRanges()

    logger.info('sending eventranges to AthenaMP: %s', ranges_to_send)
    # AthenaMP needs the full path, so prefix each LFN with the working directory
    workdir = os.getcwd()
    for entry in ranges_to_send:
        entry['PFN'] = os.path.join(workdir, entry['LFN'])
    # ship the serialized ranges to AthenaMP
    athpayloadcomm.send(serializer.serialize(ranges_to_send))
def send_eventrange(self, eventranges, athpayloadcomm, no_more_events):
    """Deliver the next ready event ranges to the AthenaMP payload.

    Raises EventRangeList.NoMoreEventRanges when the list is exhausted and
    more events may still arrive (so the caller can request them); when
    *no_more_events* is true, sends NO_MORE_EVENTS to AthenaMP instead.
    """
    logger.debug('sending event to payload')
    next_ranges = None
    try:
        logger.debug('have %d ready event ranges to send to AthenaMP', eventranges.number_ready())
        next_ranges = eventranges.get_next()
    except EventRangeList.NoMoreEventRanges:
        logger.debug('there are no more event ranges to process')
        if no_more_events:
            # told there is nothing left anywhere: notify the AthenaMP worker
            logger.info('sending AthenaMP NO_MORE_EVENTS')
            athpayloadcomm.send(AthenaPayloadCommunicator.NO_MORE_EVENTS)
        else:
            # more may arrive later; re-raise to trigger an event request
            raise
    except EventRangeList.RequestedMoreRangesThanAvailable:
        # index problem inside EventRangeList; translate to NoMoreEventRanges
        logger.error(
            'requested more event ranges than available, going to try waiting for more events'
        )
        raise EventRangeList.NoMoreEventRanges()

    if next_ranges is not None:
        logger.info('sending eventranges to AthenaMP: %s', next_ranges)
        # AthenaMP expects absolute paths, so build PFN from the current directory
        for rng in next_ranges:
            rng['PFN'] = os.path.join(os.getcwd(), rng['LFN'])
        # hand the serialized ranges to the payload
        athpayloadcomm.send(serializer.serialize(next_ranges))
def stage_out_files(file_list, output_type):
    """Report a batch of output files to Harvester via the event status dump file.

    Builds one file descriptor per entry of *file_list* and merges them, keyed
    by pandaID, into ``eventStatusDumpJsonFile``. All edits go through a
    '.tmp' file so Harvester never reads a half-written dump.

    Args:
        file_list: list of dicts, each with keys 'pandaid', 'eventrangeid',
            'eventstatus' and 'filename' (optionally 'chksum').
        output_type: one of 'output', 'es_output', 'log'.

    Raises:
        Exception: if *output_type* is invalid, or if an existing dump file
            could not be moved aside for editing.
    """
    global sfm_har_config, sfm_har_config_done
    # wait for the Harvester messenger configuration to be loaded
    sfm_har_config_done.wait()

    if output_type not in ['output', 'es_output', 'log']:
        raise Exception('incorrect type provided: %s' % (output_type))

    # load name of eventStatusDumpJsonFile file
    eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

    # build {pandaID: [file_descriptor, ...]} for the new entries
    eventStatusDumpData = {}
    for filedata in file_list:
        # make sure pandaID is a string
        pandaID = str(filedata['pandaid'])
        # idiom: dict.get replaces the explicit membership test
        chksum = filedata.get('chksum')

        # format data for file:
        file_descriptor = {'eventRangeID': filedata['eventrangeid'],
                           'eventStatus': filedata['eventstatus'],
                           'path': filedata['filename'],
                           'type': output_type,
                           'chksum': chksum,
                           'guid': None,
                           }
        try:
            eventStatusDumpData[pandaID].append(file_descriptor)
        except KeyError:
            eventStatusDumpData[pandaID] = [file_descriptor]

    # create a temp file to place contents
    # this avoids Harvester trying to read the file while it is being written
    eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

    # if file does not already exist, new data is just what we have
    if not os.path.exists(eventStatusDumpJsonFile):
        data = eventStatusDumpData
    # if the file exists, move it to a tmp filename, update its contents and then recreate it
    else:
        # first move existing file to tmp so Harvester does not read it while we edit
        try:
            os.rename(eventStatusDumpJsonFile, eventStatusDumpJsonFile_tmp)
        except Exception:
            logger.warning('tried moving %s to a tmp filename to add more output files for Harvester.', eventStatusDumpJsonFile)
            if os.path.exists(eventStatusDumpJsonFile):
                # BUGFIX: previously this path left `data` unbound, producing a
                # confusing NameError below and risking loss of the existing
                # file's records; re-raise the rename failure instead
                raise
            logger.warning('%s file no longer exists so Harvester must have grabbed it. Need to create a new file', eventStatusDumpJsonFile)
            data = eventStatusDumpData
        else:
            # now open and read in the data
            with open(eventStatusDumpJsonFile_tmp, 'r') as f:
                data = serializer.deserialize(f.read())
            logger.debug('found existing data for pandaIDs: %s', data.keys())
            for pandaID in eventStatusDumpData:
                # if the pandaID already exists, just append the new files to that list
                try:
                    logger.debug('addding data to existing panda list')
                    data[pandaID] += eventStatusDumpData[pandaID]
                # if the pandaID does not exist, add a new list
                except KeyError:
                    logger.debug('addding new panda id list')
                    data[pandaID] = eventStatusDumpData[pandaID]

    if logger.getEffectiveLevel() == logging.DEBUG:
        tmpstr = ' '.join('%s:%s' % (x, len(data[x])) for x in data)
        logger.debug('writing output to file %s with keys: %s', eventStatusDumpJsonFile, tmpstr)

    # overwrite the temp file with the updated data
    with open(eventStatusDumpJsonFile_tmp, 'w') as f:
        f.write(serializer.serialize(data, pretty_print=True))

    # move tmp file into place
    os.rename(eventStatusDumpJsonFile_tmp, eventStatusDumpJsonFile)
    logger.debug('done')
def stage_out_file(output_type, output_path, eventRangeID, eventStatus, pandaID, chksum=None):
    """Report a single output file to Harvester via the event status dump file.

    Appends one file descriptor for *pandaID* to ``eventStatusDumpJsonFile``,
    editing through a '.tmp' file so Harvester never reads partial content.

    Args:
        output_type: one of 'output', 'es_output', 'log'.
        output_path: path to the produced file; must exist.
        eventRangeID: event range identifier the file belongs to.
        eventStatus: status string to report (e.g. 'finished').
        pandaID: panda job id (converted to str for use as a dict key).
        chksum: optional checksum recorded in the descriptor.

    Raises:
        Exception: for an invalid *output_type*, a missing *output_path*, or
            a dump file that could not be moved aside for editing.
    """
    global sfm_har_config, sfm_har_config_done
    # wait for the Harvester messenger configuration to be loaded
    sfm_har_config_done.wait()

    if output_type not in ['output', 'es_output', 'log']:
        raise Exception('incorrect type provided: %s' % (output_type))
    if not os.path.exists(output_path):
        raise Exception('output file not found: %s' % (output_path))

    # make sure pandaID is a string
    pandaID = str(pandaID)

    # load name of eventStatusDumpJsonFile file
    eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

    # first create a temp file to place contents
    # this avoids Harvester trying to read the file while it is being written
    eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

    # format data for file:
    file_descriptor = {'eventRangeID': eventRangeID,
                       'eventStatus': eventStatus,
                       'path': output_path,
                       'type': output_type,
                       'chksum': chksum,
                       'guid': None,
                       }

    # if file does not already exist, new data is just what we have
    if not os.path.exists(eventStatusDumpJsonFile):
        data = {pandaID: [file_descriptor]}
    # if the file exists, move it to a tmp filename, update its contents and then recreate it
    else:
        # first move existing file to tmp so Harvester does not read it while we edit
        try:
            os.rename(eventStatusDumpJsonFile, eventStatusDumpJsonFile_tmp)
        except Exception:
            logger.warning('tried moving %s to a tmp filename to add more output files for Harvester.', eventStatusDumpJsonFile)
            if os.path.exists(eventStatusDumpJsonFile):
                # BUGFIX: previously this path left `data` unbound, producing a
                # confusing NameError later and risking loss of the existing
                # file's records; re-raise the rename failure instead
                raise
            logger.warning('%s file no longer exists so Harvester must have grabbed it. Need to create a new file', eventStatusDumpJsonFile)
            data = {pandaID: [file_descriptor]}
        else:
            # now open and read in the data
            with open(eventStatusDumpJsonFile_tmp, 'r') as f:
                data = serializer.deserialize(f.read())
            logger.debug('existing data contains %s', data)

            # if the pandaID already exists, just append the new file to that list
            if pandaID in data:
                logger.debug('addding data to existing panda list')
                data[pandaID].append(file_descriptor)
            # if the pandaID does not exist, add a new list
            else:
                logger.debug('addding new panda id list')
                data[pandaID] = [file_descriptor]

    logger.debug('output to file %s: %s', eventStatusDumpJsonFile, data)

    # overwrite the temp file with the updated data
    with open(eventStatusDumpJsonFile_tmp, 'w') as f:
        f.write(serializer.serialize(data))

    # move tmp file into place
    os.rename(eventStatusDumpJsonFile_tmp, eventStatusDumpJsonFile)
def run(self):
    '''
    Main loop of the JobComm thread (started via jobComm_instance.start()).

    Implements a state machine that, until self.exit is set:
      - receives job definitions and event ranges on the 'JobComm' queue,
      - exchanges messages with the AthenaMP payload over yampl (event-range
        requests in, event ranges / NO_MORE_EVENTS out),
      - collects output-file records from AthenaMP and periodically forwards
        them in batches to Yoda/FileManager via the 'MPIService' queue.
    '''
    self.read_config()

    logger.debug('start yampl payloadcommunicator')
    # yampl-based channel used to talk to the AthenaMP payload
    athpayloadcomm = athena_payloadcommunicator(self.yampl_socket_name)
    payload_msg = ''

    # current list of output files to send via MPI
    output_files = []
    last_output_file_mpi_send = time.time()

    # list of event ranges
    eventranges = EventRangeList.EventRangeList()
    no_more_events = False
    waiting_for_eventranges = False
    # number of READY_FOR_EVENTS requests from AthenaMP not yet answered
    event_range_request_counter = 0

    # current panda job that AthenaMP is configured to run
    current_job = None

    while not self.exit.is_set():
        logger.debug('start loop: state: %s', self.get_state())

        # in debug mode, report evenranges status
        if logger.getEffectiveLevel() == logging.DEBUG:
            ready_events = eventranges.number_ready()
            number_completed = eventranges.number_completed()
            total = len(eventranges)
            logger.debug('number of ready events %s; number of completed events %s; total events %s', ready_events, number_completed, total)

        # don't want to hammer Yoda with lots of little messages for output files
        # so aggregate output files for some time period then send as a group
        if len(output_files) == 0:
            # nothing pending: keep resetting the timer so the aggregation
            # window starts at the first queued file
            last_output_file_mpi_send = time.time()
        elif (time.time() - last_output_file_mpi_send) > self.aggregate_output_files_time:
            # send output file data to Yoda/FileManager
            logger.info('sending %s output files to Yoda/FileManager', len(output_files))
            mpi_message = {'type': MessageTypes.OUTPUT_FILE,
                           'filelist': output_files,
                           'destination_rank': 0
                           }
            self.queues['MPIService'].put(mpi_message)

            # set time for next send
            last_output_file_mpi_send = time.time()
            # reset output file list
            output_files = []

        ##################
        # WAITING_FOR_JOB: waiting for the job definition to arrive, before
        #        it does, it is assumed that there is no payload running
        ######################################################################
        if self.get_state() == self.WAITING_FOR_JOB:
            logger.info(' waiting for job definition, blocking on message queue for %s ', self.loop_timeout)
            try:
                qmsg = self.queues['JobComm'].get(block=True, timeout=self.loop_timeout)
            except Queue.Empty:
                logger.debug('no message on queue')
            else:
                # shorten our message for printing
                if logger.getEffectiveLevel() == logging.DEBUG:
                    tmpmsg = str(qmsg)
                    if len(tmpmsg) > self.debug_message_char_length:
                        tmpslice = slice(0, self.debug_message_char_length)
                        tmpmsg = tmpmsg[tmpslice] + '...'
                    logger.debug('received queue message: %s', tmpmsg)

                # verify message type is as expected
                if 'type' not in qmsg or qmsg['type'] != MessageTypes.NEW_JOB or 'job' not in qmsg:
                    logger.error('received unexpected message format: %s', qmsg)
                else:
                    logger.info('received job definition')
                    current_job = qmsg['job']

                    # change state
                    self.set_state(self.REQUEST_EVENT_RANGES)
                qmsg = None

        ##################
        # REQUEST_EVENT_RANGES: Request event ranges from Yoda
        ######################################################################
        elif self.get_state() == self.REQUEST_EVENT_RANGES:
            if not waiting_for_eventranges:
                logger.info('sending request for event ranges')
                # send MPI message to Yoda for more event ranges
                self.request_events(current_job)
                waiting_for_eventranges = True
            # change state
            self.set_state(self.WAITING_FOR_EVENT_RANGES)

        ##################
        # WAITING_FOR_EVENT_RANGES: Waiting for event ranges from Yoda
        ######################################################################
        elif self.get_state() == self.WAITING_FOR_EVENT_RANGES:
            logger.info('waiting for event ranges, blocking on message queue for %s', self.loop_timeout)
            try:
                qmsg = self.queues['JobComm'].get(block=True, timeout=self.loop_timeout)
            except Queue.Empty:
                logger.debug('no message on queue')
            else:
                # shorten our message for printing
                if logger.getEffectiveLevel() == logging.DEBUG:
                    tmpmsg = str(qmsg)
                    if len(tmpmsg) > self.debug_message_char_length:
                        tmpslice = slice(0, self.debug_message_char_length)
                        tmpmsg = tmpmsg[tmpslice] + '...'
                    logger.debug('received queue message: %s', tmpmsg)

                if 'type' not in qmsg:
                    logger.error('received unexpected message format: %s', qmsg)
                elif qmsg['type'] == MessageTypes.NEW_EVENT_RANGES:
                    logger.info('received event ranges, adding to list')
                    eventranges += EventRangeList.EventRangeList(qmsg['eventranges'])
                    # add event ranges to payload messenger list
                    # payloadcomm.add_eventranges(eventranges)
                    # change state
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
                elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                    logger.info('no more event ranges for PandaID %s', qmsg['PandaID'])
                    no_more_events = True

                    # check for running events
                    if len(eventranges) == eventranges.number_completed():
                        logger.info('no eventranges left to send so triggering exit')
                        self.stop()
                    else:
                        logger.info('still have events to process so continuing')
                        self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
                else:
                    logger.error('unknown message type: %s', qmsg['type'])

                waiting_for_eventranges = False
                qmsg = None

        ##################
        # WAIT_FOR_PAYLOAD_MESSAGE: initiates
        #          a request for a message from the payload
        ######################################################################
        # NOTE: deliberately `if`, not `elif` — a state set above in this same
        # iteration can fall straight through to the payload-message check
        if self.get_state() == self.WAIT_FOR_PAYLOAD_MESSAGE:
            # first check if there is an incoming message
            try:
                logger.debug('checking for queue message')
                qmsg = self.queues['JobComm'].get(block=False)
                if MessageTypes.NEW_EVENT_RANGES in qmsg['type']:
                    logger.info('received new event range')
                    eventranges += EventRangeList.EventRangeList(qmsg['eventranges'])
                    waiting_for_eventranges = False
                elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES:
                    logger.info('no more event ranges for PandaID %s', qmsg['PandaID'])
                    no_more_events = True

                    # check for running events
                    if len(eventranges) == eventranges.number_completed():
                        logger.info('no eventranges left to send so triggering exit')
                        self.stop()
                    else:
                        logger.info('still have events to process so continuing')
                else:
                    logger.error('received message of unknown type: %s', qmsg)
            except Queue.Empty:
                logger.debug('no messages on queue')

            logger.info('checking for message from payload, block for %s, pending event range requests: %s', self.loop_timeout, event_range_request_counter)
            payload_msg = athpayloadcomm.recv(self.loop_timeout)

            if len(payload_msg) > 0:
                logger.debug('received message: %s', payload_msg)
                self.set_state(self.MESSAGE_RECEIVED)
            else:
                logger.debug('did not receive message from payload')
                if event_range_request_counter > 0:
                    # AthenaMP asked for events earlier and none were available;
                    # try to satisfy one of the queued requests now
                    logger.debug('have %s pending event range requests so will try sending one.', event_range_request_counter)
                    self.set_state(self.SEND_EVENT_RANGE)
                # time.sleep(self.loop_timeout)

        ##################
        # MESSAGE_RECEIVED: this state indicates that a message has been
        #          received from the payload and its meaning will be parsed
        ######################################################################
        elif self.get_state() == self.MESSAGE_RECEIVED:
            # if ready for events, send them or wait for some
            if athena_payloadcommunicator.READY_FOR_EVENTS in payload_msg:
                logger.info('payload is ready for event range')
                self.set_state(self.SEND_EVENT_RANGE)
                # increment counter to keep track of how many requests are queued
                event_range_request_counter += 1

            #### OUTPUT File received
            elif len(payload_msg.split(',')) == 4:
                # Athena sent details of an output file
                logger.info('received output file from AthenaMP')
                self.set_state(self.SEND_OUTPUT_FILE)

            else:
                logger.error('failed to parse message from Athena: %s', payload_msg)
                self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

        ##################
        # SEND_EVENT_RANGE: wait until more event ranges are sent by JobComm
        ######################################################################
        elif self.get_state() == self.SEND_EVENT_RANGE:
            logger.debug('sending event to payload')
            # if event ranges available, send one
            try:
                logger.debug('have %d ready event ranges to send to AthenaMP', eventranges.number_ready())
                local_eventranges = eventranges.get_next()
            # no more event ranges available
            except EventRangeList.NoMoreEventRanges:
                logger.debug('there are no more event ranges to process')
                # if we have been told there are no more eventranges, then tell the AthenaMP worker there are no more events
                if no_more_events:
                    logger.info('sending AthenaMP NO_MORE_EVENTS')
                    athpayloadcomm.send(athena_payloadcommunicator.NO_MORE_EVENTS)
                    # return to state requesting a message
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
                # otherwise wait for more events
                else:
                    logger.info('waiting for more events ranges')
                    self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
            # something wrong with the index in the EventRangeList index
            except EventRangeList.RequestedMoreRangesThanAvailable:
                logger.error('requested more event ranges than available, waiting for more event ranges')
                self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
            else:
                logger.info('sending %s eventranges to AthenaMP', len(local_eventranges))
                # append full path to file name for AthenaMP
                # and adjust event counter by the number of files
                # input_files = self.job_def.get()['inFiles'].split(',')
                # logger.debug('%s: found %s input files',self.prelog,len(input_files))
                for evtrg in local_eventranges:
                    evtrg['PFN'] = os.path.join(os.getcwd(), evtrg['LFN'])
                # send AthenaMP the new event ranges
                athpayloadcomm.send(serializer.serialize(local_eventranges))
                # decrement counter since we sent some events
                event_range_request_counter -= 1
                # return to state requesting a message
                self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

            payload_msg = None

        ##################
        # SEND_OUTPUT_FILE: send output file data to MPIService
        ######################################################################
        elif self.get_state() == self.SEND_OUTPUT_FILE:
            logger.debug('send output file information')

            # parse message
            parts = payload_msg.split(',')
            # there should be four parts:
            # "myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1"
            if len(parts) == 4:
                # parse the parts
                outputfilename = parts[0]
                eventrangeid = parts[1].replace('ID:', '')
                cpu = parts[2].replace('CPU:', '')
                wallclock = parts[3].replace('WALL:', '')

                # if staging, stage and change output filename
                if self.stage_outputs:
                    # move file to staging_path
                    logger.debug('shutil.move(%s,%s)', outputfilename, self.staging_path)
                    shutil.move(outputfilename, self.staging_path)
                    # change output filename
                    outputfilename = os.path.join(self.staging_path, os.path.basename(outputfilename))
                    logger.info('outputfilename - %s', outputfilename)

                # build the data for Harvester output file
                output_file_data = {'type': MessageTypes.OUTPUT_FILE,
                                    'filename': outputfilename,
                                    'eventrangeid': eventrangeid,
                                    'cpu': cpu,
                                    'wallclock': wallclock,
                                    'scope': current_job['scopeOut'],
                                    'pandaid': current_job['PandaID'],
                                    'eventstatus': 'finished',
                                    'destination_rank': 0,
                                    }
                # self.output_file_data.set(output_file_data)

                # append output file data to list of files for transfer via MPI
                output_files.append(output_file_data)
                logger.info('received output file from AthenaMP; %s output files now on waiting list', len(output_files))

                # set event range to completed:
                logger.debug('mark event range id %s as completed', output_file_data['eventrangeid'])
                try:
                    eventranges.mark_completed(output_file_data['eventrangeid'])
                except Exception:
                    logger.error('failed to mark eventrangeid %s as completed', output_file_data['eventrangeid'])
                    self.stop()

                # return to state requesting a message
                self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)
            else:
                logger.error('failed to parse output file')
                self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE)

            payload_msg = None

        # if ready_events is below the threshold and the no more events flag has not been set
        # request more event ranges
        if eventranges.number_ready() < self.get_more_events_threshold and not no_more_events and not waiting_for_eventranges and current_job is not None:
            logger.info('number of ready events %s below request threshold %s, asking for more.', eventranges.number_ready(), self.get_more_events_threshold)
            # send MPI message to Yoda for more event ranges
            self.request_events(current_job)
            waiting_for_eventranges = True

        # if the number of completed events equals the number of event ranges
        # available, and no more events flag is set, then kill subprocess and exit.
        elif eventranges.number_ready() == 0 and eventranges.number_completed() == len(eventranges) and no_more_events:
            logger.info('no more events to process, exiting')
            self.stop()
            self.all_work_done.set()
        # else:
        #     logger.info('sleeping for %s',self.loop_timeout)
        #     self.exit.wait(timeout=self.loop_timeout)

    # send any remaining output files to Yoda before exiting.
    # don't want to hammer Yoda with lots of little messages for output files
    # so aggregate output files for some time period then send as a group
    if len(output_files) > 0:
        # send output file data to Yoda/FileManager
        logger.info('sending %s output files to Yoda/FileManager', len(output_files))
        mpi_message = {'type': MessageTypes.OUTPUT_FILE,
                       'filelist': output_files,
                       'destination_rank': 0
                       }
        self.queues['MPIService'].put(mpi_message)
        # reset output file list
        output_files = []

    self.set_state(self.EXITED)
    logger.info('JobComm exiting')
def run(self): # noqa: C901 """ this is the function run as a subthread when the user runs jobComm_instance.start() """ self.read_config() logger.debug('start yampl payloadcommunicator') athpayloadcomm = AthenaPayloadCommunicator(self.yampl_socket_name) payload_msg = '' # current list of output files to send via MPI output_files = [] last_output_file_mpi_send = time.time() # list of event ranges eventranges = EventRangeList.EventRangeList() no_more_events = False waiting_for_eventranges = False event_range_request_counter = 0 # current panda job that AthenaMP is configured to run current_job = None while not self.exit.is_set(): logger.debug('start loop: state: %s', self.get_state()) # in debug mode, report evenranges status if logger.getEffectiveLevel() == logging.DEBUG: ready_events = eventranges.number_ready() number_completed = eventranges.number_completed() total = len(eventranges) logger.debug( 'number of ready events %s; number of completed events %s; total events %s', ready_events, number_completed, total) # don't want to hammer Yoda with lots of little messages for output files # so aggregate output files for some time period then send as a group if len(output_files) == 0: last_output_file_mpi_send = time.time() elif (time.time() - last_output_file_mpi_send ) > self.aggregate_output_files_time: # send output file data to Yoda/FileManager logger.info('sending %s output files to Yoda/FileManager', len(output_files)) mpi_message = { 'type': MessageTypes.OUTPUT_FILE, 'filelist': output_files, 'destination_rank': 0 } self.queues['MPIService'].put(mpi_message) # set time for next send last_output_file_mpi_send = time.time() # reset output file list output_files = [] ################## # WAITING_FOR_JOB: waiting for the job definition to arrive, before # it does, it is assumed that there is no payload running ###################################################################### if self.get_state() == self.WAITING_FOR_JOB: logger.info( ' waiting for job 
definition, blocking on message queue for %s ', self.loop_timeout) try: qmsg = self.queues['JobComm'].get( block=True, timeout=self.loop_timeout) except Queue.Empty: logger.debug('no message on queue') else: # shorten our message for printing if logger.getEffectiveLevel() == logging.DEBUG: tmpmsg = str(qmsg) if len(tmpmsg) > self.debug_message_char_length: tmpslice = slice(0, self.debug_message_char_length) tmpmsg = tmpmsg[tmpslice] + '...' logger.debug('received queue message: %s', tmpmsg) # verify message type is as expected if 'type' not in qmsg or qmsg[ 'type'] != MessageTypes.NEW_JOB or 'job' not in qmsg: logger.error('received unexpected message format: %s', qmsg) else: logger.info('received job definition') current_job = qmsg['job'] # change state self.set_state(self.REQUEST_EVENT_RANGES) qmsg = None ################## # REQUEST_EVENT_RANGES: Request event ranges from Yoda ###################################################################### elif self.get_state() == self.REQUEST_EVENT_RANGES: if not waiting_for_eventranges: logger.info('sending request for event ranges') # send MPI message to Yoda for more event ranges self.request_events(current_job) waiting_for_eventranges = True # change state self.set_state(self.WAITING_FOR_EVENT_RANGES) ################## # WAITING_FOR_EVENT_RANGES: Waiting for event ranges from Yoda ###################################################################### elif self.get_state() == self.WAITING_FOR_EVENT_RANGES: logger.info( 'waiting for event ranges, blocking on message queue for %s', self.loop_timeout) try: qmsg = self.queues['JobComm'].get( block=True, timeout=self.loop_timeout) except Queue.Empty: logger.debug('no message on queue') else: # shorten our message for printing if logger.getEffectiveLevel() == logging.DEBUG: tmpmsg = str(qmsg) if len(tmpmsg) > self.debug_message_char_length: tmpslice = slice(0, self.debug_message_char_length) tmpmsg = tmpmsg[tmpslice] + '...' 
logger.debug('received queue message: %s', tmpmsg) if 'type' not in qmsg: logger.error('received unexpected message format: %s', qmsg) elif qmsg['type'] == MessageTypes.NEW_EVENT_RANGES: logger.info('received event ranges, adding to list') eventranges += EventRangeList.EventRangeList( qmsg['eventranges']) # add event ranges to payload messenger list # payloadcomm.add_eventranges(eventranges) # change state self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES: logger.info('no more event ranges for PandaID %s', qmsg['PandaID']) no_more_events = True # check for running events if len(eventranges) == eventranges.number_completed(): logger.info( 'no eventranges left to send so triggering exit' ) self.stop() else: logger.info( 'still have events to process so continuing') self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) else: logger.error('unknown message type: %s', qmsg['type']) waiting_for_eventranges = False qmsg = None ################## # WAIT_FOR_PAYLOAD_MESSAGE: initiates # a request for a message from the payload ###################################################################### if self.get_state() == self.WAIT_FOR_PAYLOAD_MESSAGE: # first check if there is an incoming message try: logger.debug('checking for queue message') qmsg = self.queues['JobComm'].get(block=False) if MessageTypes.NEW_EVENT_RANGES in qmsg['type']: logger.info('received new event range') eventranges += EventRangeList.EventRangeList( qmsg['eventranges']) waiting_for_eventranges = False elif qmsg['type'] == MessageTypes.NO_MORE_EVENT_RANGES: logger.info('no more event ranges for PandaID %s', qmsg['PandaID']) no_more_events = True # check for running events if len(eventranges) == eventranges.number_completed(): logger.info( 'no eventranges left to send so triggering exit' ) self.stop() else: logger.info( 'still have events to process so continuing') else: logger.error('received message of unknown type: %s', qmsg) except Queue.Empty: 
logger.debug('no messages on queue') logger.info( 'checking for message from payload, block for %s, pending event range requests: %s', self.loop_timeout, event_range_request_counter) payload_msg = athpayloadcomm.recv(self.loop_timeout) if len(payload_msg) > 0: logger.debug('received message: %s', payload_msg) self.set_state(self.MESSAGE_RECEIVED) else: logger.debug('did not receive message from payload') if event_range_request_counter > 0: logger.debug( 'have %s pending event range requests so will try sending one.', event_range_request_counter) self.set_state(self.SEND_EVENT_RANGE) # time.sleep(self.loop_timeout) ################## # MESSAGE_RECEIVED: this state indicates that a message has been # received from the payload and its meaning will be parsed ###################################################################### elif self.get_state() == self.MESSAGE_RECEIVED: # if ready for events, send them or wait for some if AthenaPayloadCommunicator.READY_FOR_EVENTS in payload_msg: logger.info('payload is ready for event range') self.set_state(self.SEND_EVENT_RANGE) # increment counter to keep track of how many requests are queued event_range_request_counter += 1 #### OUTPUT File received elif len(payload_msg.split(',')) == 4: # Athena sent details of an output file logger.info('received output file from AthenaMP') self.set_state(self.SEND_OUTPUT_FILE) else: logger.error('failed to parse message from Athena: %s', payload_msg) self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) ################## # SEND_EVENT_RANGE: wait until more event ranges are sent by JobComm ###################################################################### elif self.get_state() == self.SEND_EVENT_RANGE: logger.debug('sending event to payload') # if event ranges available, send one try: logger.debug( 'have %d ready event ranges to send to AthenaMP', eventranges.number_ready()) local_eventranges = eventranges.get_next() # no more event ranges available except EventRangeList.NoMoreEventRanges: 
logger.debug('there are no more event ranges to process') # if we have been told there are no more eventranges, then tell the AthenaMP worker there are no more events if no_more_events: logger.info('sending AthenaMP NO_MORE_EVENTS') athpayloadcomm.send( AthenaPayloadCommunicator.NO_MORE_EVENTS) # return to state requesting a message self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) # otherwise wait for more events else: logger.info('waiting for more events ranges') self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) # something wrong with the index in the EventRangeList index except EventRangeList.RequestedMoreRangesThanAvailable: logger.error( 'requested more event ranges than available, waiting for more event ranges' ) self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) else: logger.info('sending %s eventranges to AthenaMP', len(local_eventranges)) # append full path to file name for AthenaMP # and adjust event counter by the number of files # input_files = self.job_def.get()['inFiles'].split(',') # logger.debug('%s: found %s input files',self.prelog,len(input_files)) for evtrg in local_eventranges: evtrg['PFN'] = os.path.join(os.getcwd(), evtrg['LFN']) # send AthenaMP the new event ranges athpayloadcomm.send( serializer.serialize(local_eventranges)) # decrement counter since we sent some events event_range_request_counter -= 1 # return to state requesting a message self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) payload_msg = None ################## # SEND_OUTPUT_FILE: send output file data to MPIService ###################################################################### elif self.get_state() == self.SEND_OUTPUT_FILE: logger.debug('send output file information') # parse message parts = payload_msg.split(',') # there should be four parts: # "myHITS.pool.root_000.Range-6,ID:Range-6,CPU:1,WALL:1" if len(parts) == 4: # parse the parts outputfilename = parts[0] eventrangeid = parts[1].replace('ID:', '') cpu = parts[2].replace('CPU:', '') wallclock = parts[3].replace('WALL:', '') # 
if staging, stage and change output filename if self.stage_outputs: # move file to staging_path logger.debug('shutil.move(%s,%s)', outputfilename, self.staging_path) shutil.move(outputfilename, self.staging_path) # change output filename outputfilename = os.path.join( self.staging_path, os.path.basename(outputfilename)) logger.info('outputfilename - %s', outputfilename) # build the data for Harvester output file output_file_data = { 'type': MessageTypes.OUTPUT_FILE, 'filename': outputfilename, 'eventrangeid': eventrangeid, 'cpu': cpu, 'wallclock': wallclock, 'scope': current_job['scopeOut'], 'pandaid': current_job['PandaID'], 'eventstatus': 'finished', 'destination_rank': 0, } # self.output_file_data.set(output_file_data) # append output file data to list of files for transfer via MPI output_files.append(output_file_data) logger.info( 'received output file from AthenaMP; %s output files now on waiting list', len(output_files)) # set event range to completed: logger.debug('mark event range id %s as completed', output_file_data['eventrangeid']) try: eventranges.mark_completed( output_file_data['eventrangeid']) except Exception: logger.error( 'failed to mark eventrangeid %s as completed', output_file_data['eventrangeid']) self.stop() # return to state requesting a message self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) else: logger.error('failed to parse output file') self.set_state(self.WAIT_FOR_PAYLOAD_MESSAGE) payload_msg = None # if ready_events is below the threshold and the no more events flag has not been set # request more event ranges if eventranges.number_ready( ) < self.get_more_events_threshold and not no_more_events and not waiting_for_eventranges and current_job is not None: logger.info( 'number of ready events %s below request threshold %s, asking for more.', eventranges.number_ready(), self.get_more_events_threshold) # send MPI message to Yoda for more event ranges self.request_events(current_job) waiting_for_eventranges = True # if the number of 
completed events equals the number of event ranges # available, and no more events flag is set, then kill subprocess and exit. elif eventranges.number_ready( ) == 0 and eventranges.number_completed() == len( eventranges) and no_more_events: logger.info('no more events to process, exiting') self.stop() self.all_work_done.set() # else: # logger.info('sleeping for %s',self.loop_timeout) # self.exit.wait(timeout=self.loop_timeout) # send any remaining output files to Yoda before exitingn. # don't want to hammer Yoda with lots of little messages for output files # so aggregate output files for some time period then send as a group if len(output_files) > 0: # send output file data to Yoda/FileManager logger.info('sending %s output files to Yoda/FileManager', len(output_files)) mpi_message = { 'type': MessageTypes.OUTPUT_FILE, 'filelist': output_files, 'destination_rank': 0 } self.queues['MPIService'].put(mpi_message) # reset output file list output_files = [] self.set_state(self.EXITED) logger.info('JobComm exiting')
def stage_out_files(file_list, output_type):
    """Report a batch of output files to Harvester via the eventStatusDumpJsonFile.

    Builds a {pandaID: [file_descriptor, ...]} mapping from file_list and
    merges it into the shared eventStatusDumpJsonFile. All writes go through
    a '.tmp' sibling file which is then renamed into place, so Harvester
    never observes a partially written file.

    Args:
        file_list: list of dicts; each must carry 'pandaid', 'eventrangeid',
            'eventstatus' and 'filename', and may carry 'chksum'.
        output_type: one of 'output', 'es_output' or 'log'.

    Raises:
        Exception: if output_type is not one of the accepted values.
    """
    global sfm_har_config, sfm_har_config_done
    # block until the Harvester config has been read by the config thread
    sfm_har_config_done.wait()

    if output_type not in ['output', 'es_output', 'log']:
        raise Exception('incorrect type provided: %s' % (output_type))

    # load name of eventStatusDumpJsonFile file
    eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

    # build the per-PandaID file-descriptor lists Harvester expects
    eventStatusDumpData = {}
    for filedata in file_list:
        # make sure pandaID is a string (it is used as a JSON object key)
        pandaID = str(filedata['pandaid'])

        # format data for file; chksum is optional in the input
        file_descriptor = {
            'eventRangeID': filedata['eventrangeid'],
            'eventStatus': filedata['eventstatus'],
            'path': filedata['filename'],
            'type': output_type,
            'chksum': filedata.get('chksum'),
            'guid': None,
        }
        eventStatusDumpData.setdefault(pandaID, []).append(file_descriptor)

    # create a temp file to place contents
    # this avoids Harvester trying to read the file while it is being written
    eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

    # if file does not already exist, new data is just what we have
    if not os.path.exists(eventStatusDumpJsonFile):
        data = eventStatusDumpData
    # if the file exists, move it to a tmp filename, update its contents and then recreate it.
    else:
        # Fallback so `data` is always defined; previously, when the rename
        # below failed while the file still existed, `data` was never assigned
        # and serializer.serialize(data, ...) raised a NameError.
        data = eventStatusDumpData
        # first move existing file to tmp so Harvester does not read it while we edit
        try:
            os.rename(eventStatusDumpJsonFile, eventStatusDumpJsonFile_tmp)
        except Exception:
            logger.warning(
                'tried moving %s to a tmp filename to add more output files for Harvester.',
                eventStatusDumpJsonFile)
            if not os.path.exists(eventStatusDumpJsonFile):
                logger.warning(
                    '%s file no longer exists so Harvester must have grabbed it. Need to create a new file',
                    eventStatusDumpJsonFile)
            else:
                # rename failed for an unexpected reason; proceed with only the
                # new data. NOTE(review): the rename of the tmp file below may
                # then overwrite the existing file's content — confirm whether
                # re-raising would be preferable here.
                logger.error(
                    'failed to move %s aside and it still exists; writing new data only',
                    eventStatusDumpJsonFile)
        else:
            # rename succeeded: read existing data and merge the new entries in
            with open(eventStatusDumpJsonFile_tmp, 'r') as f:
                data = serializer.deserialize(f.read())
            logger.debug('found existing data for pandaIDs: %s', data.keys())
            for pandaID in eventStatusDumpData:
                # if the pandaID already exists, just append the new files to that list
                try:
                    logger.debug('addding data to existing panda list')
                    data[pandaID] += eventStatusDumpData[pandaID]
                # if the pandaID does not exist, add a new list
                except KeyError:
                    logger.debug('addding new panda id list')
                    data[pandaID] = eventStatusDumpData[pandaID]

    if logger.getEffectiveLevel() == logging.DEBUG:
        tmpstr = ' '.join('%s:%s' % (x, len(data[x])) for x in data)
        logger.debug('writing output to file %s with keys: %s',
                     eventStatusDumpJsonFile, tmpstr)

    # overwrite the temp file with the updated data
    with open(eventStatusDumpJsonFile_tmp, 'w') as f:
        f.write(serializer.serialize(data, pretty_print=True))

    # now move tmp filename into place for Harvester to pick up
    os.rename(eventStatusDumpJsonFile_tmp, eventStatusDumpJsonFile)
    logger.debug('done')
def stage_out_file(
    output_type,
    output_path,
    eventRangeID,
    eventStatus,
    pandaID,
    chksum=None,
):
    """Report a single output file to Harvester via the eventStatusDumpJsonFile.

    Appends one file descriptor for pandaID to the shared
    eventStatusDumpJsonFile. All writes go through a '.tmp' sibling file which
    is then renamed into place, so Harvester never observes a partially
    written file.

    Args:
        output_type: one of 'output', 'es_output' or 'log'.
        output_path: path of the output file; must exist on disk.
        eventRangeID: event range identifier for this output.
        eventStatus: event status string reported to Harvester.
        pandaID: PanDA job id; converted to str for use as the JSON key.
        chksum: optional checksum recorded in the descriptor.

    Raises:
        Exception: if output_type is invalid or output_path does not exist.
    """
    global sfm_har_config, sfm_har_config_done
    # block until the Harvester config has been read by the config thread
    sfm_har_config_done.wait()

    if output_type not in ['output', 'es_output', 'log']:
        raise Exception('incorrect type provided: %s' % (output_type))
    if not os.path.exists(output_path):
        raise Exception('output file not found: %s' % (output_path))

    # make sure pandaID is a string (it is used as a JSON object key)
    pandaID = str(pandaID)

    # load name of eventStatusDumpJsonFile file
    eventStatusDumpJsonFile = sfm_har_config['eventStatusDumpJsonFile']

    # first create a temp file to place contents
    # this avoids Harvester trying to read the file while it is being written
    eventStatusDumpJsonFile_tmp = eventStatusDumpJsonFile + '.tmp'

    # format data for file:
    file_descriptor = {
        'eventRangeID': eventRangeID,
        'eventStatus': eventStatus,
        'path': output_path,
        'type': output_type,
        'chksum': chksum,
        'guid': None,
    }

    # if file does not already exist, new data is just what we have
    if not os.path.exists(eventStatusDumpJsonFile):
        data = {pandaID: [file_descriptor]}
    # if the file exists, move it to a tmp filename, update its contents and then recreate it.
    else:
        # Fallback so `data` is always defined; previously, when the rename
        # below failed while the file still existed, `data` was never assigned
        # and serializer.serialize(data) raised a NameError.
        data = {pandaID: [file_descriptor]}
        # first move existing file to tmp so Harvester does not read it while we edit
        try:
            os.rename(eventStatusDumpJsonFile, eventStatusDumpJsonFile_tmp)
        except Exception:
            logger.warning(
                'tried moving %s to a tmp filename to add more output files for Harvester.',
                eventStatusDumpJsonFile)
            if not os.path.exists(eventStatusDumpJsonFile):
                logger.warning(
                    '%s file no longer exists so Harvester must have grabbed it. Need to create a new file',
                    eventStatusDumpJsonFile)
            else:
                # rename failed for an unexpected reason; proceed with only the
                # new descriptor. NOTE(review): the rename of the tmp file
                # below may then overwrite the existing file's content —
                # confirm whether re-raising would be preferable here.
                logger.error(
                    'failed to move %s aside and it still exists; writing new data only',
                    eventStatusDumpJsonFile)
        else:
            # rename succeeded: read existing data and merge the new entry in
            with open(eventStatusDumpJsonFile_tmp, 'r') as f:
                data = serializer.deserialize(f.read())
            logger.debug('existing data contains %s', data)
            # if the pandaID already exists, just append the new file to that list
            if pandaID in data:
                logger.debug('addding data to existing panda list')
                data[pandaID].append(file_descriptor)
            # if the pandaID does not exist, add a new list
            else:
                logger.debug('addding new panda id list')
                data[pandaID] = [file_descriptor]

    logger.debug('output to file %s: %s', eventStatusDumpJsonFile, data)

    # overwrite the temp file with the updated data
    with open(eventStatusDumpJsonFile_tmp, 'w') as f:
        f.write(serializer.serialize(data))

    # now move tmp filename into place for Harvester to pick up
    os.rename(eventStatusDumpJsonFile_tmp, eventStatusDumpJsonFile)