def __update_bulk_replicas(replicas, req_type, rule_id, session=None):
    """
    Used by finisher to handle available and unavailable replicas belonging to the same rule in a bulk way.

    :param replicas: List of replicas.
    :param req_type: Request type: STAGEIN, STAGEOUT, TRANSFER.
    :param rule_id: RULE id.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        replica_core.update_replicas_states(replicas, nowait=True, session=session)
    except ReplicaNotFound as error:
        logging.warning('Failed to bulk update replicas, will do it one by one: %s', str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                     replica['request_id'], replica['scope'], replica['name'],
                     replica['rse_id'], str(replica['state']))
    return True
def __update_bulk_replicas(replicas, session=None, logger=logging.log):
    """
    Used by finisher to handle available and unavailable replicas belonging to the same rule in a bulk way.

    :param replicas: List of replicas.
    :param session: The database session to use.
    :param logger: Optional decorated logging.log object that can be passed from the calling daemon.
    :returns commit_or_rollback: Boolean.
    """
    try:
        replica_core.update_replicas_states(replicas, nowait=True, session=session)
    except ReplicaNotFound as error:
        logger(logging.WARNING, 'Failed to bulk update replicas, will do it one by one: %s', str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logger(logging.INFO, "HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
               replica['request_id'], replica['scope'], replica['name'],
               replica['rse_id'], str(replica['state']))
    return True
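

# A minimal sketch of the `logger` callable convention used above: any function with the
# signature of `logging.log` can be passed in, for example a wrapper that prefixes every
# message with the worker identity. The `prefixed_logger` name and the prefix format are
# illustrative assumptions, not an existing helper in this module.
def prefixed_logger(prefix):
    def log(level, msg, *args, **kwargs):
        # Delegate to logging.log with the worker prefix prepended to the message.
        logging.log(level, prefix + msg, *args, **kwargs)
    return log

# Example (hypothetical worker identity):
# __update_bulk_replicas(replicas, session=session, logger=prefixed_logger('finisher[3/8]: '))
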
def __update_replica(replica, session=None):
    """
    Used by finisher to update a replica to a finished state.

    :param replica: Replica as a dictionary.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        replica_core.update_replicas_states([replica], nowait=True, add_tombstone=True, session=session)
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                     replica['request_id'], replica['scope'], replica['name'],
                     replica['rse_id'], str(replica['state']))
    except (UnsupportedOperation, ReplicaNotFound) as error:
        logging.warning("ERROR WHEN HANDLING REQUEST %s DID %s:%s AT RSE %s STATE %s: %s",
                        replica['request_id'], replica['scope'], replica['name'],
                        replica['rse_id'], str(replica['state']), str(error))
        # Replica cannot be found. Register it and schedule it for deletion.
        try:
            if replica['state'] == ReplicaState.AVAILABLE and replica['request_type'] != RequestType.STAGEIN:
                logging.info("Replica cannot be found. Adding a replica %s:%s AT RSE %s with tombstone=utcnow",
                             replica['scope'], replica['name'], replica['rse_id'])
                replica_core.add_replica(replica['rse_id'],
                                         replica['scope'],
                                         replica['name'],
                                         replica['bytes'],
                                         pfn=replica['pfn'] if 'pfn' in replica else None,
                                         # It will be deleted immediately; do we need to get the accurate account from the rule?
                                         account=InternalAccount('root', vo=replica['scope'].vo),
                                         adler32=replica['adler32'],
                                         tombstone=datetime.datetime.utcnow(),
                                         session=session)
            if not replica['archived']:
                request_core.archive_request(replica['request_id'], session=session)
            logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                         replica['request_id'], replica['scope'], replica['name'],
                         replica['rse_id'], str(replica['state']))
        except Exception as error:
            logging.error('Cannot register replica for DID %s:%s at RSE %s - potential dark data - %s',
                          replica['scope'], replica['name'], replica['rse_id'], str(error))
            raise
    return True
def __update_bulk_replicas(replicas, session=None):
    """
    Used by finisher to handle available and unavailable replicas belonging to the same rule in a bulk way.

    :param replicas: List of replicas.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        # TODO: add_tombstone needs to be changed to True once set_tombstone is fixed
        replica_core.update_replicas_states(replicas, nowait=True, add_tombstone=False, session=session)
    except ReplicaNotFound as error:
        logging.warning('Failed to bulk update replicas, will do it one by one: %s', str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                     replica['request_id'], replica['scope'], replica['name'],
                     replica['rse_id'], str(replica['state']))
    return True
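

# A minimal sketch (an assumed caller, not part of this module) of the fallback pattern
# that the warning above describes: try the bulk update first and, if any replica in the
# batch cannot be found, fall back to handling each replica individually with
# __update_replica so one bad replica does not block the whole rule. The
# `handle_terminal_replicas` name is illustrative.
def handle_terminal_replicas(replicas, session=None):
    try:
        __update_bulk_replicas(replicas, session=session)
    except ReplicaNotFound:
        # Bulk update failed: process the replicas one by one so the rest still progress.
        for replica in replicas:
            try:
                __update_replica(replica, session=session)
            except Exception as error:
                logging.error('Failed to handle replica %s:%s: %s',
                              replica['scope'], replica['name'], str(error))
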
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    """
    Mark a request as failed: set its state, emit a transfer-failed message, archive
    the request, and flag the destination replica and lock accordingly.
    """
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        activity = 'default'
        if req['attributes']:
            if type(req['attributes']) is dict:
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            activity = req_attributes['activity'] if req_attributes['activity'] else 'default'

        tss = time.time()
        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'], req['name'], req['request_id'], detail))

        try:
            replica.update_replicas_states([{'rse': dest_rse,
                                             'scope': req['scope'],
                                             'name': req['name'],
                                             'state': ReplicaState.UNAVAILABLE}],
                                           session=session)
        except Exception:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise

        tss = time.time()
        try:
            lock.failed_transfer(req['scope'], req['name'], req['dest_rse_id'], session=session)
        except Exception:
            logging.warning('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time() - tss) * 1000)
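

# Illustrative call site (assumed; the poller that drives this is not shown here): when a
# transfer is reported in a terminal failed state, the bookkeeping above can be invoked
# as below. The `req`, `dest_rse` and `reason` values are placeholders for the example.
# update_bad_request(req=req, dest_rse=dest_rse, new_state=RequestState.FAILED,
#                    detail=reason, session=session)
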
                                          rse_name))
                raise
        except Exception:
            # could not update successful lock
            record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time() - tss) * 1000)
            logging.warning("Could not update replica state for successful transfer %s:%s at %s: %s" % (response['scope'], response['name'], rse_name, traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time() - tss) * 1000)

        tss = time.time()
        request.archive_request(response['request_id'], session=session)
        record_timer('daemons.conveyor.common.update_request_state.request-archive_request', (time.time() - tss) * 1000)

        add_monitor_message(response, session=session)
    except exception.UnsupportedOperation as e:
        # The replica doesn't exist
        request.archive_request(response['request_id'], session=session)
        logging.warning(str(e).replace('\n', ''))
        return True
    except Exception:
        logging.critical("Something unexpected happened when updating replica state for successful transfer %s:%s at %s (%s)" % (response['scope'], response['name'], rse_name, traceback.format_exc()))
        raise
def submitter(once=False, rses=None, process=0, total_processes=1, thread=0, total_threads=1,
              mock=False, bulk=100, activities=None):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """
    logging.info('submitter starting - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads))

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = 'srm'

    logging.info('submitter started - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads))

    while not graceful_stop.is_set():
        try:
            if activities is None:
                activities = [None]
            for activity in activities:
                if rses is None:
                    rses = [None]
                for rse in rses:
                    if rse:
                        # run in RSE list mode
                        rse_info = rsemgr.get_rse_info(rse['rse'])
                        logging.info("Working on RSE: %s" % rse['rse'])
                        ts = time.time()
                        reqs = get_requests(rse_id=rse['id'],
                                            process=process,
                                            total_processes=total_processes,
                                            thread=thread,
                                            total_threads=total_threads,
                                            mock=mock,
                                            bulk=bulk,
                                            activity=activity)
                        record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000)
                    else:
                        # no RSE list, run in FIFO mode
                        rse_info = None
                        ts = time.time()
                        reqs = get_requests(process=process,
                                            total_processes=total_processes,
                                            thread=thread,
                                            total_threads=total_threads,
                                            mock=mock,
                                            bulk=bulk,
                                            activity=activity)
                        record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000)

                    if not reqs:
                        time.sleep(1)
                        continue
                    logging.debug('%i:%i - submitting %i requests' % (process, thread, len(reqs)))

                    for req in reqs:
                        try:
                            if not rse:
                                # no RSE list, in FIFO mode
                                dest_rse = rse_core.get_rse(rse=None, rse_id=req['dest_rse_id'])
                                rse_info = rsemgr.get_rse_info(dest_rse['rse'])

                            ts = time.time()
                            transfer = get_transfer(rse_info, req, scheme, mock)
                            record_timer('daemons.conveyor.submitter.get_transfer', (time.time() - ts) * 1000)
                            logging.debug('Transfer for request %s: %s' % (req['request_id'], transfer))

                            if transfer is None:
                                logging.warning("Request %s DID %s:%s RSE %s failed to get transfer" % (req['request_id'], req['scope'], req['name'], rse_info['rse']))
                                # TODO: Merge these two calls
                                request.set_request_state(req['request_id'], RequestState.LOST)  # if the DID does not exist anymore
                                request.archive_request(req['request_id'])
                                continue

                            ts = time.time()
                            tmp_metadata = transfer['file_metadata']
                            eids = request.submit_transfers(transfers=[transfer, ],
                                                            transfertool='fts3',
                                                            job_metadata=tmp_metadata)
                            record_timer('daemons.conveyor.submitter.submit_transfer', (time.time() - ts) * 1000)

                            ts = time.time()
                            if req['previous_attempt_id']:
                                logging.info('COPYING RETRY %s REQUEST %s PREVIOUS %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['retry_count'], req['request_id'], req['previous_attempt_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id']))
                            else:
                                logging.info('COPYING REQUEST %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['request_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id']))
                            record_counter('daemons.conveyor.submitter.submit_request')
                        except UnsupportedOperation as e:
                            # The replica doesn't exist, need to cancel the request
                            logging.warning(e)
                            logging.info('Cancelling transfer request %s' % req['request_id'])
                            try:
                                # TODO: for now, there is only ever one destination
                                request.cancel_request_did(req['scope'], req['name'], transfer['dest_urls'][0])
                            except Exception as e:
                                logging.warning('Cannot cancel request: %s' % str(e))
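

# A minimal sketch (assumed, mirroring the usual daemon launch pattern rather than code
# shown above) of how submitter workers might be started: one thread per worker slot,
# each exiting when something calls graceful_stop.set() to end the
# `while not graceful_stop.is_set()` loop. The `run` name is illustrative.
import threading

def run(once=False, total_threads=1):
    threads = [threading.Thread(target=submitter,
                                kwargs={'once': once,
                                        'thread': i,
                                        'total_threads': total_threads})
               for i in range(total_threads)]
    for t in threads:
        t.start()
    # Wait for all workers; a signal handler would call graceful_stop.set() to stop them.
    for t in threads:
        t.join()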