def submit_transfer(transfertool_obj, transfers, job_params, submitter='submitter', timeout=None, logger=logging.log):
    """
    Submit a transfer or staging request via the given transfertool.

    First flips every request into SUBMITTING state (aborting the whole
    submission on failure), then performs one bulk submission; if the bulk
    submission hits a duplicate-file error, falls back to submitting the
    transfers one at a time.

    :param transfertool_obj: The transfertool object to be used for submission
    :param transfers: Transfer objects to be submitted
    :param job_params: Parameters to be used for all transfers in the given job.
    :param submitter: Name of the submitting entity.
    :param timeout: Timeout
    :param logger: Optional decorated logger that can be passed from the calling daemons or servers.
    """
    # Phase 1: move every request to SUBMITTING before touching the transfertool.
    for current_transfer in transfers:
        try:
            transfer_core.mark_submitting_and_prepare_sources_for_transfer(current_transfer, external_host=transfertool_obj.external_host, logger=logger)
        except RequestNotFound as error:
            # Request vanished (e.g. cancelled concurrently): abort silently apart from logging.
            logger(logging.ERROR, str(error))
            return
        except Exception:
            logger(logging.ERROR, 'Failed to prepare requests %s state to SUBMITTING. Mark it SUBMISSION_FAILED and abort submission.' % [str(t.rws) for t in transfers], exc_info=True)
            set_request_state(request_id=current_transfer.rws.request_id, new_state=RequestState.SUBMISSION_FAILED)
            return

    # Phase 2: bulk submit; degrade to one-by-one submission on duplicate-file collisions.
    try:
        _submit_transfers(transfertool_obj, transfers, job_params, submitter, timeout, logger)
    except DuplicateFileTransferSubmission as error:
        logger(logging.WARNING, 'Failed to bulk submit a job because of duplicate file : %s', str(error))
        logger(logging.INFO, 'Submitting files one by one')
        for single_transfer in transfers:
            _submit_transfers(transfertool_obj, [single_transfer], job_params, submitter, timeout, logger)
def update_request_state(response, session=None):
    """
    Used by poller and consumer to update the internal state of requests,
    after the response by the external transfertool.

    :param response: The transfertool response dictionary, retrieved via request.query_request().
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean. True only when the request state was actually updated.
    """
    try:
        if not response['new_state']:
            # No state change reported: just refresh the request's timestamp.
            request_core.touch_request(response['request_id'], session=session)
            return False
        else:
            request = request_core.get_request(response['request_id'], session=session)
            # Only update when the request still exists, belongs to this transfer
            # and is not already in the reported state.
            if request and request['external_id'] == response['transfer_id'] and request['state'] != response['new_state']:
                response['submitted_at'] = request.get('submitted_at', None)
                response['external_host'] = request['external_host']
                transfer_id = response['transfer_id'] if 'transfer_id' in response else None
                logging.info('UPDATING REQUEST %s FOR TRANSFER %s STATE %s' % (str(response['request_id']), transfer_id, str(response['new_state'])))

                job_m_replica = response.get('job_m_replica', None)
                src_url = response.get('src_url', None)
                src_rse = response.get('src_rse', None)
                src_rse_id = response.get('src_rse_id', None)
                started_at = response.get('started_at', None)
                transferred_at = response.get('transferred_at', None)
                scope = response.get('scope', None)
                name = response.get('name', None)
                # If the transfertool picked a source replica itself ("job_m_replica"),
                # resolve the real source RSE from the source URL and correct the response.
                if job_m_replica and (str(job_m_replica).lower() == str('true')) and src_url:
                    try:
                        src_rse_name, src_rse_id = get_source_rse(response['request_id'], scope, name, src_url, session=session)
                    except:
                        # Best effort: a failed lookup must not block the state update.
                        logging.warn('Cannot get correct RSE for source url: %s(%s)' % (src_url, traceback.format_exc()))
                        src_rse_name = None
                    if src_rse_name and src_rse_name != src_rse:
                        response['src_rse'] = src_rse_name
                        response['src_rse_id'] = src_rse_id
                        logging.debug('Correct RSE: %s for source surl: %s' % (src_rse_name, src_url))
                err_msg = get_transfer_error(response['new_state'], response['reason'] if 'reason' in response else None)

                request_core.set_request_state(response['request_id'], response['new_state'], transfer_id=transfer_id, started_at=started_at, transferred_at=transferred_at, src_rse_id=src_rse_id, err_msg=err_msg, session=session)

                # Emit the monitoring/messaging notification for this state change.
                add_monitor_message(request, response, session=session)
                return True
            elif not request:
                logging.debug("Request %s doesn't exist, will not update" % (response['request_id']))
                return False
            elif request['external_id'] != response['transfer_id']:
                # Stale response from a previous submission attempt of this request.
                logging.warning("Reponse %s with transfer id %s is different from the request transfer id %s, will not update" % (response['request_id'], response['transfer_id'], request['external_id']))
                return False
            else:
                logging.debug("Request %s is already in %s state, will not update" % (response['request_id'], response['new_state']))
                return False
    except UnsupportedOperation as error:
        logging.warning("Request %s doesn't exist - Error: %s" % (response['request_id'], str(error).replace('\n', '')))
        return False
    except:
        # Last-resort guard: log everything, let the caller roll back.
        logging.critical(traceback.format_exc())
def set_transfer_state(external_host, transfer_id, state, session=None):
    """
    Used by poller to update the internal state of transfer,
    after the response by the external transfertool.

    For LOST transfers, every request attached to the transfer job is failed
    individually (with a synthesized response dict for monitoring); for any
    other state the transfer-level state is updated in one call.

    :param request_host: Name of the external host.
    :param transfer_id: external transfer job id as a string.
    :param state: request state as a string.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        if state == RequestState.LOST:
            # The job disappeared from the transfertool: fail each attached request.
            reqs = request_core.get_requests_by_transfer(external_host, transfer_id, session=session)
            for req in reqs:
                logging.info('REQUEST %s OF TRANSFER %s ON %s STATE %s' % (str(req['request_id']), external_host, transfer_id, str(state)))
                src_rse_id = req.get('source_rse_id', None)
                dst_rse_id = req.get('dest_rse_id', None)
                src_rse = None
                dst_rse = None
                if src_rse_id:
                    src_rse = rse_core.get_rse_name(src_rse_id, session=session)
                if dst_rse_id:
                    dst_rse = rse_core.get_rse_name(dst_rse_id, session=session)
                # Build a transfertool-like response dict so the monitoring
                # message has the same shape as a real poller response.
                response = {'new_state': state,
                            'transfer_id': transfer_id,
                            'job_state': state,
                            'src_url': None,
                            'dst_url': req['dest_url'],
                            'duration': 0,
                            'reason': "The FTS job lost",
                            'scope': req.get('scope', None),
                            'name': req.get('name', None),
                            'src_rse': src_rse,
                            'dst_rse': dst_rse,
                            'request_id': req.get('request_id', None),
                            'activity': req.get('activity', None),
                            'src_rse_id': req.get('source_rse_id', None),
                            'dst_rse_id': req.get('dest_rse_id', None),
                            'previous_attempt_id': req.get('previous_attempt_id', None),
                            'adler32': req.get('adler32', None),
                            'md5': req.get('md5', None),
                            'filesize': req.get('filesize', None),
                            'external_host': external_host,
                            'job_m_replica': None,
                            'created_at': req.get('created_at', None),
                            'submitted_at': req.get('submitted_at', None),
                            'details': None,
                            'account': req.get('account', None)}

                err_msg = get_transfer_error(response['new_state'], response['reason'] if 'reason' in response else None)
                request_core.set_request_state(req['request_id'],
                                               response['new_state'],
                                               transfer_id=transfer_id,
                                               src_rse_id=src_rse_id,
                                               err_msg=err_msg,
                                               session=session)

                add_monitor_message(req, response, session=session)
        else:
            # Non-LOST states are applied to the transfer as a whole.
            request_core.set_transfer_state(external_host, transfer_id, state, session=session)
        return True

    except UnsupportedOperation as error:
        logging.warning("Transfer %s on %s doesn't exist - Error: %s" % (transfer_id, external_host, str(error).replace('\n', '')))
        return False
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    """
    Fail and archive a request whose DID is bad/unusable.

    Only acts when new_state is FAILED: sets the request state, emits a
    'transfer-failed' message, archives the request, marks the destination
    replica UNAVAILABLE and releases the replica lock.

    :param req: Request dictionary (needs request_id, scope, name, attributes, dest_rse_id).
    :param dest_rse: Destination RSE name.
    :param new_state: The new request state; only RequestState.FAILED triggers any work.
    :param detail: Human-readable failure reason, propagated into the message and log.
    :param session: The database session to use.
    :raises: Re-raises any failure from the replica-state or lock update after logging it.
    """
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        # Resolve the activity from the (possibly JSON-encoded) request attributes.
        activity = 'default'
        if req['attributes']:
            if type(req['attributes']) is dict:
                # Round-trip through JSON to get a plain, detached copy.
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            activity = req_attributes['activity'] if req_attributes['activity'] else 'default'

        tss = time.time()
        # Notify listeners of the failed transfer; most payload fields are
        # unknown for a bad DID, hence the Nones.
        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'], req['name'], req['request_id'], detail))
        try:
            # Mark the destination replica unusable.
            replica.update_replicas_states([{'rse': dest_rse, 'scope': req['scope'], 'name': req['name'], 'state': ReplicaState.UNAVAILABLE}], session=session)
        except:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise

        tss = time.time()
        try:
            # Release the replica lock held for this (now failed) transfer.
            lock.failed_transfer(req['scope'], req['name'], req['dest_rse_id'], session=session)
        except:
            logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)
logging.warning("Request %s doesn't exist anymore, should not be updated again. Error: %s" % (response['request_id'], str(e).replace('\n', ''))) return False if not response['new_state']: return False elif response['new_state'] == RequestState.LOST: logging.debug('UPDATING REQUEST %s FOR STATE %s' % (str(response['request_id']), str(response['new_state']))) elif response['new_state'] and 'job_state' in response and response['job_state']: logging.debug('UPDATING REQUEST %s FOR TRANSFER(%s) STATE %s' % (str(response['request_id']), str(response['transfer_id']), str(response['job_state']))) else: return False if response['new_state']: request.set_request_state(response['request_id'], response['new_state'], session=session) if response['new_state'] == RequestState.DONE: rse_name = response['dst_rse'] rse_update_name = rse_name req = request.get_request(response['request_id'], session=session) if req['request_type'] == RequestType.STAGEIN: rse_update_name = rse_core.list_rse_attributes(response['dst_rse'], session=session)['staging_buffer'] logging.debug('OVERRIDE REPLICA DID %s:%s RSE %s TO %s' % (response['scope'], response['name'], response['dst_rse'], rse_update_name)) try: tss = time.time() logging.debug('UPDATE REPLICA STATE DID %s:%s RSE %s' % (response['scope'], response['name'], rse_update_name))
def submitter(once=False, rses=[], process=0, total_processes=1, thread=0, total_threads=1, mock=False, bulk=100, activities=None): """ Main loop to submit a new transfer primitive to a transfertool. """ logging.info('submitter starting - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads)) try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = 'srm' logging.info('submitter started - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads)) while not graceful_stop.is_set(): try: if activities is None: activities = [None] for activity in activities: if rses is None: rses = [None] for rse in rses: if rse: # run in rse list mode rse_info = rsemgr.get_rse_info(rse['rse']) logging.info("Working on RSE: %s" % rse['rse']) ts = time.time() reqs = get_requests(rse_id=rse['id'], process=process, total_processes=total_processes, thread=thread, total_threads=total_threads, mock=mock, bulk=bulk, activity=activity) record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000) else: # no rse list, run FIFO mode rse_info = None ts = time.time() reqs = get_requests(process=process, total_processes=total_processes, thread=thread, total_threads=total_threads, mock=mock, bulk=bulk, activity=activity) record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000) if reqs: logging.debug('%i:%i - submitting %i requests' % (process, thread, len(reqs))) if not reqs or reqs == []: time.sleep(1) continue for req in reqs: try: if not rse: # no rse list, in FIFO mode dest_rse = rse_core.get_rse(rse=None, rse_id=req['dest_rse_id']) rse_info = rsemgr.get_rse_info(dest_rse['rse']) ts = time.time() transfer = get_transfer(rse_info, req, scheme, mock) record_timer('daemons.conveyor.submitter.get_transfer', (time.time() - ts) * 1000) logging.debug('Transfer for request %s: %s' % (req['request_id'], transfer)) if transfer is None: logging.warn("Request %s DID %s:%s RSE %s failed 
to get transfer" % (req['request_id'], req['scope'], req['name'], rse_info['rse'])) # TODO: Merge these two calls request.set_request_state(req['request_id'], RequestState.LOST) # if the DID does not exist anymore request.archive_request(req['request_id']) continue ts = time.time() tmp_metadata = transfer['file_metadata'] eids = request.submit_transfers(transfers=[transfer, ], transfertool='fts3', job_metadata=tmp_metadata) record_timer('daemons.conveyor.submitter.submit_transfer', (time.time() - ts) * 1000) ts = time.time() if req['previous_attempt_id']: logging.info('COPYING RETRY %s REQUEST %s PREVIOUS %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['retry_count'], req['request_id'], req['previous_attempt_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id'])) else: logging.info('COPYING REQUEST %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['request_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id'])) record_counter('daemons.conveyor.submitter.submit_request') except UnsupportedOperation, e: # The replica doesn't exist, need to cancel the request logging.warning(e) logging.info('Cancelling transfer request %s' % req['request_id']) try: # TODO: for now, there is only ever one destination request.cancel_request_did(req['scope'], req['name'], transfer['dest_urls'][0]) except Exception, e: logging.warning('Cannot cancel request: %s' % str(e))
def __create_missing_replicas_and_requests(
        transfer_path: "List[DirectTransferDefinition]",
        default_tombstone_delay: int,
        logger: "Callable",
        session: "Optional[Session]" = None
) -> "Tuple[bool, bool]":
    """
    Create replicas and requests in the database for the intermediate hops

    :param transfer_path: The multihop path; the last element is the hop for the initial request.
    :param default_tombstone_delay: Tombstone delay used when the destination RSE has no
                                    'multihop_tombstone_delay' attribute.
    :param logger: Decorated logger passed from the calling daemon.
    :param session: The database session to use.
    :returns: Tuple (creation_successful, must_skip_submission). must_skip_submission is True
              when a request already exists for part of the path, so submission must wait for it.
    """
    initial_request_id = transfer_path[-1].rws.request_id
    creation_successful = True
    must_skip_submission = False
    created_requests = []
    # Iterate the path in reverse order. The last hop is the initial request, so
    # next_hop.rws.request_id will always be initialized when handling the current hop.
    for i in reversed(range(len(transfer_path))):
        hop = transfer_path[i]
        rws = hop.rws
        if rws.request_id:
            # This hop already has a DB request (e.g. the initial one): nothing to create.
            continue

        tombstone_delay = rws.dest_rse.attributes.get('multihop_tombstone_delay', default_tombstone_delay)
        try:
            tombstone = tombstone_from_delay(tombstone_delay)
        except ValueError:
            logger(logging.ERROR, "%s: Cannot parse multihop tombstone delay %s", initial_request_id, tombstone_delay)
            creation_successful = False
            break

        # Intermediate-hop replica: created in Copying state ('C') with a tombstone so
        # it is eventually cleaned up.
        files = [{'scope': rws.scope, 'name': rws.name, 'bytes': rws.byte_count, 'adler32': rws.adler32, 'md5': rws.md5, 'tombstone': tombstone, 'state': 'C'}]
        try:
            add_replicas(rse_id=rws.dest_rse.id,
                         files=files,
                         account=rws.account,
                         ignore_availability=False,
                         dataset_meta=None,
                         session=session)
            # Set replica state to Copying in case replica already existed in another state.
            # Can happen when a multihop transfer failed previously, and we are re-scheduling it now.
            update_replica_state(rse_id=rws.dest_rse.id, scope=rws.scope, name=rws.name, state=ReplicaState.COPYING, session=session)
        except Exception as error:
            # Best effort: log and carry on to request creation.
            logger(logging.ERROR, '%s: Problem adding replicas on %s : %s', initial_request_id, rws.dest_rse, str(error))

        rws.attributes['is_intermediate_hop'] = True
        # Pin the source of this hop so the scheduler cannot pick another replica.
        rws.attributes['source_replica_expression'] = hop.src.rse.name
        new_req = queue_requests(requests=[{'dest_rse_id': rws.dest_rse.id,
                                            'scope': rws.scope,
                                            'name': rws.name,
                                            'rule_id': '00000000000000000000000000000000',  # Dummy Rule ID used for multihop. TODO: Replace with actual rule_id once we can flag intermediate requests
                                            'attributes': rws.attributes,
                                            'request_type': rws.request_type,
                                            'retry_count': rws.retry_count,
                                            'account': rws.account,
                                            'requested_at': datetime.datetime.now()}], session=session)
        # If a request already exists, new_req will be an empty list.
        if new_req:
            db_req = new_req[0]
        else:
            db_req = request_core.get_request_by_did(rws.scope, rws.name, rws.dest_rse.id, session=session)
            # A transfer already exists for part of the path. Just construct the remaining
            # path, but don't submit the transfer. We must wait for the existing transfer to be
            # completed before continuing.
            must_skip_submission = True

        # Record the hop linkage; safe to index i + 1 because the last hop was skipped above.
        models.TransferHop(request_id=db_req['id'],
                           next_hop_request_id=transfer_path[i + 1].rws.request_id,
                           initial_request_id=initial_request_id,
                           ).save(session=session, flush=False)
        rws.request_id = db_req['id']
        rws.requested_at = db_req['requested_at']
        logger(logging.DEBUG, '%s: New request created for the transfer between %s and %s : %s', initial_request_id, transfer_path[0].src, transfer_path[-1].dst, rws.request_id)
        set_request_state(rws.request_id, RequestState.QUEUED, session=session, logger=logger)
        created_requests.append(rws.request_id)

    return creation_successful, must_skip_submission