def on_message(self, headers, message):
    """Handle one FTS completion message from the broker (consumer daemon).

    Decodes the JSON payload, and for transfers that were issued by Rucio
    builds a state-update dict and hands it to update_request_state().

    :param headers: STOMP frame headers; only frames with vo == 'atlas' are processed.
    :param message: raw frame body — JSON terminated by a single EOT character.
    """
    record_counter('daemons.conveyor.consumer.message_all')

    # Only ATLAS-VO messages are relevant to this consumer.
    if 'vo' not in headers or headers['vo'] != 'atlas':
        return

    msg = json.loads(message[:-1])  # message always ends with an unparseable EOT character

    # Only act on transfers that Rucio itself submitted (issuer == 'rucio').
    # str() on both sides normalises unicode vs. byte strings (Python 2).
    if 'job_metadata' in msg.keys() \
       and isinstance(msg['job_metadata'], dict) \
       and 'issuer' in msg['job_metadata'].keys() \
       and str(msg['job_metadata']['issuer']) == str('rucio'):

        # Multi-replica jobs ('job_m_replica' == 'true') are only final once
        # the job has left the ACTIVE state; single-replica jobs always are.
        if 'job_m_replica' in msg.keys() and 'job_state' in msg.keys() \
           and (str(msg['job_m_replica']) == str('false')
                or (str(msg['job_m_replica']) == str('true') and str(msg['job_state']) != str('ACTIVE'))):

            response = {'new_state': None,
                        'transfer_id': msg.get('tr_id').split("__")[-1],
                        'job_state': msg.get('t_final_transfer_state', None),
                        'src_url': msg.get('src_url', None),
                        'dst_url': msg.get('dst_url', None),
                        # FTS timestamps are in milliseconds since epoch.
                        'duration': (float(msg.get('tr_timestamp_complete', 0)) - float(msg.get('tr_timestamp_start', 0))) / 1000,
                        'reason': msg.get('t__error_message', None),
                        'scope': msg['job_metadata'].get('scope', None),
                        'name': msg['job_metadata'].get('name', None),
                        'src_rse': msg['job_metadata'].get('src_rse', None),
                        'dst_rse': msg['job_metadata'].get('dst_rse', None),
                        'request_id': msg['job_metadata'].get('request_id', None),
                        'activity': msg['job_metadata'].get('activity', None),
                        'dest_rse_id': msg['job_metadata'].get('dest_rse_id', None),
                        'previous_attempt_id': msg['job_metadata'].get('previous_attempt_id', None),
                        'adler32': msg['job_metadata'].get('adler32', None),
                        'md5': msg['job_metadata'].get('md5', None),
                        'filesize': msg['job_metadata'].get('filesize', None),
                        'external_host': msg.get('endpnt', None),
                        'job_m_replica': msg.get('job_m_replica', None),
                        'details': {'files': msg['job_metadata']}}

            record_counter('daemons.conveyor.consumer.message_rucio')

            # Map the FTS terminal state onto a Rucio request state.
            # Fix: use the already-fetched optional value instead of
            # msg['t_final_transfer_state'], which raised an unhandled
            # KeyError for messages missing the key.
            final_state = response['job_state']
            if str(final_state) == str(FTSCompleteState.OK):
                response['new_state'] = RequestState.DONE
            elif str(final_state) == str(FTSCompleteState.ERROR):
                response['new_state'] = RequestState.FAILED

            try:
                if response['new_state']:
                    # Log the safely-extracted response values (consistent
                    # with the receiver's on_message) rather than indexing
                    # msg['job_metadata'] directly, which could KeyError.
                    logging.debug('DID %s:%s FROM %s TO %s STATE %s' % (response['scope'],
                                                                        response['name'],
                                                                        response['src_rse'],
                                                                        response['dst_rse'],
                                                                        response['new_state']))
                    update_request_state(response)
                    record_counter('daemons.conveyor.consumer.update_request_state')
            except Exception:  # narrowed from bare except: keep SystemExit/KeyboardInterrupt alive
                logging.critical(traceback.format_exc())
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60):
    """
    Main loop to check the status of a transfer primitive with a transfertool.

    :param once: run a single polling pass and return, instead of looping.
    :param process: index of this process among total_processes.
    :param total_processes: number of poller processes sharing the workload.
    :param thread: index of this thread among total_threads.
    :param total_threads: number of poller threads per process.
    :param bulk: maximum number of requests fetched and queried per pass.
    :param older_than: only poll requests submitted at least this many seconds ago.
    """
    logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads, bulk))

    while not graceful_stop.is_set():
        try:
            ts = time.time()
            logging.debug('%i:%i - start to poll requests older than %i seconds' % (process, thread, older_than))
            reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=RequestState.SUBMITTED,
                                    limit=bulk,
                                    older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                    process=process, total_processes=total_processes,
                                    thread=thread, total_threads=total_threads)
            record_timer('daemons.conveyor.poller.000-get_next', (time.time() - ts) * 1000)

            if reqs:
                logging.debug('%i:%i - polling %i requests' % (process, thread, len(reqs)))
            else:
                if once:
                    break
                logging.debug("no requests found. will sleep 60 seconds")
                time.sleep(60)  # Only sleep if there is nothing to do
                continue

            for xfers in chunks(reqs, bulk):
                try:
                    # Group (request_id, external_id) pairs by their FTS host
                    # so each host is queried with a single bulk call.
                    req_ids = {}
                    for req in xfers:
                        record_counter('daemons.conveyor.poller.query_request')
                        req_ids.setdefault(req['external_host'], []).append((req['request_id'], req['external_id']))

                    responses = {}
                    for external_host in req_ids:
                        ts = time.time()
                        resps = request.bulk_query_requests(external_host, req_ids[external_host], 'fts3')
                        record_timer('daemons.conveyor.poller.001-bulk_query_requests',
                                     (time.time() - ts) * 1000 / len(req_ids[external_host]))
                        # Fix: dict(responses.items() + resps.items()) is
                        # Python-2-only and rebuilds both dicts; update() is
                        # equivalent and portable.
                        responses.update(resps)

                    for external_host in req_ids:
                        for request_id, external_id in req_ids[external_host]:
                            response = responses[request_id]
                            if isinstance(response, Exception):
                                # Query failed for this request: record it and
                                # substitute a neutral 'touch only' response.
                                logging.critical("Failed to poll request(%s) with FTS(%s) job (%s): %s" % (request_id, external_host, external_id, responses[request_id]))
                                record_counter('daemons.conveyor.poller.query_request_exception')
                                response = {'new_state': None,
                                            'request_id': request_id,
                                            'transfer_id': external_id,
                                            'job_state': None}

                            common.update_request_state(response)
                            record_counter('daemons.conveyor.poller.update_request_state')
                            if response['new_state'] == RequestState.LOST:
                                record_counter('daemons.conveyor.poller.request_lost')
                except Exception:  # narrowed from bare except
                    logging.critical(traceback.format_exc())
        except Exception:  # narrowed from bare except: never kill the daemon loop
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.debug('%i:%i - graceful stop requests' % (process, thread))
    logging.debug('%i:%i - graceful stop done' % (process, thread))
def on_message(self, headers, message):
    """Handle one FTS completion message from the broker (receiver daemon).

    Decodes the JSON payload and, for Rucio-issued transfers in a final
    state, builds a state-update dict. In full mode the request state is
    updated directly; otherwise only the transfer's update time is touched
    so the poller will pick it up.

    :param headers: STOMP frame headers; only frames with vo == 'atlas' are processed.
    :param message: raw frame body — JSON terminated by a single EOT character.
    """
    record_counter('daemons.conveyor.receiver.message_all')

    # Only ATLAS-VO messages are relevant to this receiver.
    if 'vo' not in headers or headers['vo'] != 'atlas':
        return

    msg = json.loads(message[:-1])  # message always ends with an unparseable EOT character

    # Only act on transfers that Rucio itself submitted (issuer == 'rucio').
    # str() on both sides normalises unicode vs. byte strings (Python 2).
    if 'job_metadata' in msg.keys() \
       and isinstance(msg['job_metadata'], dict) \
       and 'issuer' in msg['job_metadata'].keys() \
       and str(msg['job_metadata']['issuer']) == str('rucio'):

        # Multi-replica jobs ('job_m_replica' == 'true') are only final once
        # the job has left the ACTIVE state; single-replica jobs always are.
        if 'job_m_replica' in msg.keys() and 'job_state' in msg.keys() \
           and (str(msg['job_m_replica']) == str('false')
                or (str(msg['job_m_replica']) == str('true') and str(msg['job_state']) != str('ACTIVE'))):

            if 'request_id' in msg['job_metadata']:
                # submitted by old submitter: per-file fields live in job_metadata
                response = {'new_state': None,
                            'transfer_id': msg.get('tr_id').split("__")[-1],
                            'job_state': msg.get('t_final_transfer_state', None),
                            'src_url': msg.get('src_url', None),
                            'dst_url': msg.get('dst_url', None),
                            # FTS timestamps are in milliseconds since epoch.
                            'transferred_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_complete', 0)) / 1000),
                            'duration': (float(msg.get('tr_timestamp_complete', 0)) - float(msg.get('tr_timestamp_start', 0))) / 1000,
                            'reason': msg.get('t__error_message', None),
                            'scope': msg['job_metadata'].get('scope', None),
                            'name': msg['job_metadata'].get('name', None),
                            'src_rse': msg['job_metadata'].get('src_rse', None),
                            'dst_rse': msg['job_metadata'].get('dst_rse', None),
                            'request_id': msg['job_metadata'].get('request_id', None),
                            'activity': msg['job_metadata'].get('activity', None),
                            'dest_rse_id': msg['job_metadata'].get('dest_rse_id', None),
                            'previous_attempt_id': msg['job_metadata'].get('previous_attempt_id', None),
                            'adler32': msg['job_metadata'].get('adler32', None),
                            'md5': msg['job_metadata'].get('md5', None),
                            'filesize': msg['job_metadata'].get('filesize', None),
                            'external_host': msg.get('endpnt', None),
                            'job_m_replica': msg.get('job_m_replica', None),
                            'details': {'files': msg['job_metadata']}}
            else:
                # for new submitter, file_metadata replace the job_metadata;
                # this branch additionally carries started_at, src/dst type
                # and src_rse_id.
                # NOTE(review): this path indexes msg['file_metadata'] without
                # checking the key exists — presumably guaranteed for new-style
                # messages; a missing key would raise KeyError here. Confirm.
                response = {'new_state': None,
                            'transfer_id': msg.get('tr_id').split("__")[-1],
                            'job_state': msg.get('t_final_transfer_state', None),
                            'src_url': msg.get('src_url', None),
                            'dst_url': msg.get('dst_url', None),
                            'started_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_start', 0)) / 1000),
                            'transferred_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_complete', 0)) / 1000),
                            'duration': (float(msg.get('tr_timestamp_complete', 0)) - float(msg.get('tr_timestamp_start', 0))) / 1000,
                            'reason': msg.get('t__error_message', None),
                            'scope': msg['file_metadata'].get('scope', None),
                            'name': msg['file_metadata'].get('name', None),
                            'src_type': msg['file_metadata'].get('src_type', None),
                            'dst_type': msg['file_metadata'].get('dst_type', None),
                            'src_rse': msg['file_metadata'].get('src_rse', None),
                            'dst_rse': msg['file_metadata'].get('dst_rse', None),
                            'request_id': msg['file_metadata'].get('request_id', None),
                            'activity': msg['file_metadata'].get('activity', None),
                            'src_rse_id': msg['file_metadata'].get('src_rse_id', None),
                            'dest_rse_id': msg['file_metadata'].get('dest_rse_id', None),
                            'previous_attempt_id': msg['file_metadata'].get('previous_attempt_id', None),
                            'adler32': msg['file_metadata'].get('adler32', None),
                            'md5': msg['file_metadata'].get('md5', None),
                            'filesize': msg['file_metadata'].get('filesize', None),
                            'external_host': msg.get('endpnt', None),
                            'job_m_replica': msg.get('job_m_replica', None),
                            'details': {'files': msg['file_metadata']}}

            record_counter('daemons.conveyor.receiver.message_rucio')

            # Map the FTS terminal state onto a Rucio request state.
            # NOTE(review): direct msg['t_final_transfer_state'] access can
            # KeyError although the key was treated as optional above.
            if str(msg['t_final_transfer_state']) == str(FTSCompleteState.OK):
                response['new_state'] = RequestState.DONE
            elif str(msg['t_final_transfer_state']) == str(FTSCompleteState.ERROR):
                response['new_state'] = RequestState.FAILED

            try:
                if response['new_state']:
                    logging.info('RECEIVED DID %s:%s FROM %s TO %s REQUEST %s TRANSFER_ID %s STATE %s' % (response['scope'],
                                                                                                          response['name'],
                                                                                                          response['src_rse'],
                                                                                                          response['dst_rse'],
                                                                                                          response['request_id'],
                                                                                                          response['transfer_id'],
                                                                                                          response['new_state']))
                    if self.__full_mode:
                        # Full mode: apply the state change to the request directly.
                        ret = common.update_request_state(response)
                        record_counter('daemons.conveyor.receiver.update_request_state.%s' % ret)
                    else:
                        # Light mode: backdate the transfer's update time by 24h
                        # so the poller re-queries it promptly; best-effort only.
                        try:
                            logging.debug("Update request %s update time" % response['request_id'])
                            set_transfer_update_time(response['external_host'],
                                                     response['transfer_id'],
                                                     datetime.datetime.utcnow() - datetime.timedelta(hours=24))
                            record_counter('daemons.conveyor.receiver.set_transfer_update_time')
                        except Exception, e:
                            logging.debug("Failed to update transfer's update time: %s" % str(e))
            # NOTE(review): bare except also swallows SystemExit/KeyboardInterrupt;
            # consider narrowing to 'except Exception'.
            except:
                logging.critical(traceback.format_exc())
def poller_latest(external_hosts, once=False, last_nhours=1, fts_wait=1800): """ Main loop to check the status of a transfer primitive with a transfertool. """ executable = ' '.join(sys.argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() logging.info('polling latest %s hours on hosts: %s' % (last_nhours, external_hosts)) if external_hosts: if type(external_hosts) == str: external_hosts = [external_hosts] while not graceful_stop.is_set(): try: heartbeat.live(executable, hostname, pid, hb_thread) start_time = time.time() for external_host in external_hosts: logging.debug('polling latest %s hours on host: %s' % (last_nhours, external_host)) ts = time.time() resps = None state = [ str(FTSState.FINISHED), str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED) ] try: resps = request.query_latest(external_host, state=state, last_nhours=last_nhours) except: logging.error(traceback.format_exc()) record_timer('daemons.conveyor.poller_latest.000-query_latest', (time.time() - ts) * 1000) if resps: logging.info('poller_latest - polling %i requests' % (len(resps))) if not resps or resps == []: if once: break logging.info("no requests found. will sleep 60 seconds") continue for resp in resps: try: ret = common.update_request_state(resp) # if True, really update request content; if False, only touch request record_counter( 'daemons.conveyor.poller_latest.update_request_state.%s' % ret) except: logging.error(traceback.format_exc()) if once: break time_left = fts_wait - abs(time.time() - start_time) if time_left > 0: logging.debug( "Waiting %s seconds until next FTS terminal state retrieval" % time_left) time.sleep(time_left) except RequestException, e: logging.error("Failed to contact FTS server: %s" % (str(e))) except: