def submit_transfers(transfers, transfertool='fts3', job_metadata=None, session=None):
    """
    Submit transfer requests to a transfertool.

    :param transfers: List of transfer dictionaries, one per request.
    :param transfertool: Transfertool as a string.
    :param job_metadata: Metadata key/value pairs for all files as a dictionary.
    :param session: Database session to use.
    :returns: Dictionary mapping request IDs to their submission details (external_id, external_host, dest_urls).
    """
    record_counter('core.request.submit_transfer')

    job_metadata = job_metadata or {}  # avoid a shared mutable default argument

    if transfertool == 'fts3':
        ts = time.time()
        transfer_ids = fts3.submit_transfers(transfers, job_metadata)
        record_timer('core.request.submit_transfers_fts3', (time.time() - ts) * 1000)

    # transfer_ids maps request IDs to their submission details
    for transfer_id in transfer_ids:
        session.query(models.Request)\
               .filter_by(id=transfer_id)\
               .update({'state': RequestState.SUBMITTED,
                        'external_id': transfer_ids[transfer_id]['external_id'],
                        'external_host': transfer_ids[transfer_id]['external_host'],
                        'dest_url': transfer_ids[transfer_id]['dest_urls'][0]},
                       synchronize_session=False)

    return transfer_ids
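# Hedged usage sketch (not part of the original source; the session factory and all values
# are assumptions/illustrative): submit one queued transfer through the FTS3 branch and read
# back the per-request submission details. The keys below mirror those consumed by the
# fts3.submit_transfers implementation further down in this section.
from rucio.db.sqla.session import get_session  # assumed session factory

session = get_session()
transfers = [{'request_id': 'aaaabbbbccccddddeeeeffff00001111',
              'sources': [('SITE_A_DISK', 'davs://site-a.example.org/f1', 'rse-id', 1)],
              'dest_urls': ['davs://site-b.example.org/f1'],
              'filesize': 1024, 'adler32': 'ac3f4e62', 'md5': None,
              'activity': 'User Subscriptions', 'overwrite': True,
              'dest_spacetoken': None, 'src_spacetoken': None,
              'copy_pin_lifetime': -1, 'bring_online': None,
              'external_host': 'https://fts3.example.org:8446'}]
submitted = submit_transfers(transfers, transfertool='fts3',
                             job_metadata={'issuer': 'rucio'}, session=session)
for request_id, info in submitted.items():
    print(request_id, info['external_id'], info['external_host'])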
def query_latest(external_host, state, last_nhours=1):
    """
    Query the latest transfers in the last n hours with the given state.

    :param external_host: FTS host name as a string.
    :param state: FTS job state as a string or a list of strings.
    :param last_nhours: Latest n hours as an integer.
    :returns: Request status information as a list of dictionaries.
    """
    record_counter('core.request.query_latest')

    start_time = time.time()
    resps = FTS3Transfertool(external_host=external_host).query_latest(state=state, last_nhours=last_nhours)
    record_timer('core.request.query_latest_fts3.%s.%s_hours' % (external_host, last_nhours), (time.time() - start_time) * 1000)

    if not resps:
        return

    ret_resps = []
    for resp in resps:
        if 'job_metadata' not in resp \
                or resp['job_metadata'] is None \
                or 'issuer' not in resp['job_metadata'] \
                or resp['job_metadata']['issuer'] != 'rucio':
            continue

        if 'request_id' not in resp['job_metadata']:
            # submitted by the new submitter
            try:
                logging.debug("Transfer %s on %s is %s, decrease its updated_at." % (resp['job_id'], external_host, resp['job_state']))
                set_transfer_update_time(external_host, resp['job_id'], datetime.datetime.utcnow() - datetime.timedelta(hours=24))
            except Exception as error:
                logging.debug("Exception happened when updating transfer updatetime: %s" % str(error).replace('\n', ''))

    return ret_resps
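# Hedged usage sketch (illustrative host and state names): poll an FTS host for jobs that
# reached a terminal state during the last hour, mirroring how poller_latest drives this
# function further down in this section.
terminal_states = ['FINISHED', 'FAILED', 'FINISHEDDIRTY', 'CANCELED']
latest = query_latest('https://fts3.example.org:8446', state=terminal_states, last_nhours=1)
for resp in latest or []:
    print(resp['job_id'], resp['job_state'])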
def query_request_details(request_id, transfertool='fts3', session=None):
    """
    Query the detailed status of a request. Can also be done after the
    external transfer has finished.

    :param request_id: Request-ID as a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Detailed request status information as a dictionary.
    """
    record_counter('core.request.query_request_details')

    req = get_request(request_id, session=session)

    if not req:
        return

    if transfertool == 'fts3':
        ts = time.time()
        tmp = fts3.query_details(req['external_id'], req['external_host'])
        record_timer('core.request.query_details_fts3', (time.time() - ts) * 1000)
        return tmp

    raise NotImplementedError
def query(tid, session):
    """
    Query the transfer job information of a single job. Has a chance to progress the job from QUEUED to either DONE or FAILED.

    :param tid: The transfer job id.
    :param session: Database session to use.
    :returns: The transfer job information, or None if the job is not in a pollable state.
    """
    record_counter('daemons.mock.fts3.query')

    ts = time.time()
    new_state = random.sample(sum([[FTSState.FINISHED] * 15,
                                   [FTSState.FAILED] * 3,
                                   [FTSState.FINISHEDDIRTY] * 2,
                                   [FTSState.ACTIVE] * 80], []), 1)[0]
    record_timer('daemons.mock.fts3.query.000-random_sample', (time.time() - ts) * 1000)

    ts = time.time()
    query = session.query(test_models.MockFTSTransfer).filter(and_(test_models.MockFTSTransfer.transfer_id == tid,
                                                                   or_(test_models.MockFTSTransfer.state == FTSState.SUBMITTED,
                                                                       test_models.MockFTSTransfer.state == FTSState.ACTIVE)))
    if query.update({'state': new_state,
                     'last_modified': datetime.datetime.utcnow()}) == 0:
        return None

    r = {'job_state': str(new_state)}
    if new_state == FTSState.FAILED or new_state == FTSState.FINISHEDDIRTY:
        r['reason'] = 'Mock FTS decided to kill your transfer.'
        r['files'] = [{'source_surl': 'mock_src',
                       'dest_surl': 'mock_dest',
                       'reason': 'mock failure'}]

    return r
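# Side note (illustrative, not from the original source): the state draw above builds a
# 100-element list and samples one entry, which amounts to a weighted choice of roughly
# 15% FINISHED, 3% FAILED, 2% FINISHEDDIRTY and 80% ACTIVE. On Python 3 the same
# distribution can be expressed directly:
import random

states = ['FINISHED', 'FAILED', 'FINISHEDDIRTY', 'ACTIVE']  # stand-ins for the FTSState members
new_state = random.choices(states, weights=[15, 3, 2, 80], k=1)[0]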
def __update_atime(self): """ Bulk update atime. """ replicas = [] for report in self.__reports: try: # check if scope in report. if not skip this one. if 'scope' not in report: record_counter('daemons.tracer.kronos.missing_scope') continue else: record_counter('daemons.tracer.kronos.with_scope') # for the moment only report with eventType get* are handled. if not report['eventType'].startswith('get'): continue record_counter('daemons.tracer.kronos.total_get') if report['eventType'] == 'get': record_counter('daemons.tracer.kronos.dq2clients') elif report['eventType'] == 'get_sm': record_counter('daemons.tracer.kronos.panda_production') elif report['eventType'] == 'get_sm_a': record_counter('daemons.tracer.kronos.panda_analysis') else: record_counter('daemons.tracer.kronos.other_get') # check if the report has the right state. if report['clientState'] in self.__excluded_states: continue if report['usrdn'] in self.__excluded_usrdns: continue if not report['remoteSite']: continue replicas.append({'name': report['filename'], 'scope': report['scope'], 'rse': report['remoteSite'], 'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix'])}) except (KeyError, AttributeError): logging.error(format_exc()) record_counter('daemons.tracer.kronos.report_error') continue for did in list_parent_dids(report['scope'], report['filename']): if did['type'] != DIDType.DATASET: continue # do not update _dis datasets if did['scope'] == 'panda' and '_dis' in did['name']: continue self.__dataset_queue.put({'scope': did['scope'], 'name': did['name'], 'did_type': did['type'], 'rse': report['remoteSite'], 'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix'])}) logging.info(replicas) try: ts = time() touch_replicas(replicas) record_timer('daemons.tracer.kronos.update_atime', (time() - ts) * 1000) except: logging.error(format_exc()) record_counter('daemons.tracer.kronos.update_error') logging.info('(kronos_file) updated %d replicas' % len(replicas))
def poller_latest(external_hosts, once=False, last_nhours=1, fts_wait=1800): """ Main loop to check the status of a transfer primitive with a transfertool. """ executable = ' '.join(sys.argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() logging.info('polling latest %s hours on hosts: %s' % (last_nhours, external_hosts)) if external_hosts: if type(external_hosts) == str: external_hosts = [external_hosts] while not graceful_stop.is_set(): try: heartbeat.live(executable, hostname, pid, hb_thread) start_time = time.time() for external_host in external_hosts: logging.debug('polling latest %s hours on host: %s' % (last_nhours, external_host)) ts = time.time() resps = None state = [str(FTSState.FINISHED), str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED)] try: resps = transfer.query_latest(external_host, state=state, last_nhours=last_nhours) except: logging.error(traceback.format_exc()) record_timer('daemons.conveyor.poller_latest.000-query_latest', (time.time() - ts) * 1000) if resps: logging.info('poller_latest - polling %i requests' % (len(resps))) if not resps or resps == []: if once: break logging.info("no requests found. will sleep 60 seconds") continue for resp in resps: try: ret = request.update_request_state(resp) # if True, really update request content; if False, only touch request record_counter('daemons.conveyor.poller_latest.update_request_state.%s' % ret) except: logging.error(traceback.format_exc()) if once: break time_left = fts_wait - abs(time.time() - start_time) if time_left > 0: logging.debug("Waiting %s seconds until next FTS terminal state retrieval" % time_left) graceful_stop.wait(time_left) except RequestException, e: logging.error("Failed to contact FTS server: %s" % (str(e))) except:
def _submit_transfers(transfertool_obj, transfers, job_params, submitter='submitter', timeout=None, logger=logging.log): """ helper function for submit_transfers. Performs the actual submission of one or more transfers. If the bulk submission of multiple transfers fails due to duplicate submissions, the exception is propagated to the caller context, which is then responsible for calling this function again for each of the transfers separately. """ logger(logging.INFO, 'About to submit job to %s with timeout %s' % (transfertool_obj, timeout)) # A eid is returned if the job is properly submitted otherwise an exception is raised is_bulk = len(transfers) > 1 eid = None start_time = time.time() state_to_set = RequestState.SUBMISSION_FAILED try: record_counter('core.request.submit_transfer') eid = transfertool_obj.submit(transfers, job_params, timeout) state_to_set = RequestState.SUBMITTED except DuplicateFileTransferSubmission: if is_bulk: raise except (TransferToolTimeout, TransferToolWrongAnswer) as error: logger(logging.ERROR, 'Failed to submit a job with error %s', str(error), exc_info=True) except Exception as error: logger(logging.ERROR, 'Failed to submit a job with error %s', str(error), exc_info=True) # Keep the behavior from before the refactoring: in case of unexpected exception, only # update request state on individual transfers, and do nothing for bulks. # Don't know why it's like that. # # FIXME: shouldn't we always set the state to SUBMISSION_FAILED? if is_bulk: state_to_set = None if eid is not None: duration = time.time() - start_time logger(logging.INFO, 'Submit job %s to %s in %s seconds' % (eid, transfertool_obj, duration)) record_timer('daemons.conveyor.{submitter}.submit_bulk_transfer.per_file', (time.time() - start_time) * 1000 / len(transfers) or 1, labels={'submitter': submitter}) record_counter('daemons.conveyor.{submitter}.submit_bulk_transfer', delta=len(transfers), labels={'submitter': submitter}) record_timer('daemons.conveyor.{submitter}.submit_bulk_transfer.files', len(transfers), labels={'submitter': submitter}) if state_to_set: try: transfer_core.set_transfers_state(transfers, state=state_to_set, external_host=transfertool_obj.external_host, external_id=eid, submitted_at=datetime.datetime.utcnow(), logger=logger) except Exception: logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True) if eid is not None: # The job is still submitted in the file transfer service but the request is not updated. # Possibility to have a double submission during the next cycle. Try to cancel the external request. try: logger(logging.INFO, 'Cancel transfer %s on %s', eid, transfertool_obj) request.cancel_request_external_id(transfertool_obj, eid) except Exception: logger(logging.ERROR, 'Failed to cancel transfers %s on %s with error' % (eid, transfertool_obj), exc_info=True)
def bulk_query_requests(request_host, request_ids, transfertool='fts3', session=None): """ Query the status of a request. :param request_host: Name of the external host. :param request_ids: List of (Request-ID as a 32 character hex string, External-ID as a 32 character hex string) :param transfertool: Transfertool name as a string. :param session: Database session to use. :returns: Request status information as a dictionary. """ record_counter('core.request.query_request') transfer_ids = [] for request_id, external_id in request_ids: if external_id not in transfer_ids: transfer_ids.append(external_id) if transfertool == 'fts3': try: ts = time.time() fts_resps = fts3.bulk_query(transfer_ids, request_host) record_timer('core.request.query_bulk_request_fts3', (time.time() - ts) * 1000 / len(transfer_ids)) except Exception: raise responses = {} for request_id, external_id in request_ids: fts_resp = fts_resps[external_id] if not fts_resp: req_status = {} req_status['new_state'] = RequestState.LOST req_status['request_id'] = request_id elif isinstance(fts_resp, Exception): req_status = fts_resp else: req_status = fts_resp # needed for unfinished jobs req_status['request_id'] = request_id if req_status['job_state'] in (str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED)): req_status['new_state'] = RequestState.FAILED elif req_status['job_state'] == str(FTSState.FINISHED): req_status['new_state'] = RequestState.DONE responses[request_id] = req_status return responses else: raise NotImplementedError return None
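# Hedged usage sketch (illustrative identifiers): query a batch of requests previously
# submitted to the same FTS host and read back the mapped Rucio request states.
pairs = [('aaaabbbbccccddddeeeeffff00001111', '11112222333344445555666677778888')]  # (request_id, external_id)
statuses = bulk_query_requests('https://fts3.example.org:8446', pairs, transfertool='fts3')
for request_id, status in statuses.items():
    print(request_id, status.get('new_state'))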
def rucio_unloadhook():
    """ Rucio unload Hook. """
    duration = time() - ctx.env['start_time']
    ip = ctx.env.get('HTTP_X_FORWARDED_FOR')
    if not ip:
        ip = ctx.ip
    # print ctx.env.get('request_id'), ctx.env.get('REQUEST_METHOD'), ctx.env.get('REQUEST_URI'), ctx.data, duration, ctx.env.get('issuer'), ip
    # Record a time series for each REST operation
    time_serie_name = '.'.join(('http', 'methods', ctx.env.get('REQUEST_METHOD'), 'resources.'))
    time_serie_name += '.'.join(filter(None, ctx.env.get('SCRIPT_NAME').split('/'))[:4])
    if ctx.path == '/list':
        time_serie_name += '.list'
    time_serie_name = time_serie_name.replace('..', '.').lower()
    record_timer(time_serie_name, duration * 1000)
def cancel(tid, session):
    """
    Kills a transfer by setting its state to CANCELED.

    :param tid: The transfer job id.
    :param session: Database session to use.
    """
    record_counter('daemons.mock.fts3.cancel')

    ts = time.time()
    # filter_by() is required here: Query.filter() does not accept keyword arguments, and the
    # MockFTSTransfer column is transfer_id, not tid.
    query = session.query(test_models.MockFTSTransfer).filter_by(transfer_id=tid)
    query.update({'state': FTSState.CANCELED,
                  'last_modified': datetime.datetime.utcnow()})
    record_timer('daemons.mock.fts3.cancel.update_state', (time.time() - ts) * 1000)
def delegate_proxy(self, proxy, ca_path='/etc/grid-security/certificates/', duration_hours=96, timeleft_hours=72):
    """Delegate user proxy to the FTS server if the remaining lifetime is less than timeleft_hours.

    :param proxy: proxy to be delegated
    :type proxy: str
    :param ca_path: ca path for verification, defaults to '/etc/grid-security/certificates/'
    :type ca_path: str, optional
    :param duration_hours: delegation validity duration in hours, defaults to 96
    :type duration_hours: int, optional
    :param timeleft_hours: minimal delegation time left, defaults to 72
    :type timeleft_hours: int, optional
    :return: delegation ID and the FTS context
    :rtype: tuple
    """
    logging.info("Delegating proxy %s to %s", proxy, self.external_host)
    start_time = time.time()
    try:
        context = Context(self.external_host,
                          ucert=proxy,
                          ukey=proxy,
                          verify=True,
                          capath=ca_path)
        delegation_id = delegate(context,
                                 lifetime=datetime.timedelta(hours=duration_hours),
                                 delegate_when_lifetime_lt=datetime.timedelta(hours=timeleft_hours))
        record_timer('transfertool.fts3.delegate_proxy.success.%s' % proxy, (time.time() - start_time))
    except ServerError:
        logging.error("Server side exception during FTS proxy delegation.")
        record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy, (time.time() - start_time))
        raise
    except ClientError:
        logging.error("Config side exception during FTS proxy delegation.")
        record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy, (time.time() - start_time))
        raise
    except BadEndpoint:
        logging.error("Wrong FTS endpoint: %s", self.external_host)
        record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy, (time.time() - start_time))
        raise
    except ReadTimeout as error:
        raise TransferToolTimeout(error)
    except JSONDecodeError as error:
        raise TransferToolWrongAnswer(error)

    logging.info("Delegated proxy %s", delegation_id)

    return delegation_id, context
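# Hedged usage sketch (illustrative host and proxy path): the transfertool is instantiated
# the same way as elsewhere in this section, and the delegation is only refreshed when the
# remaining lifetime drops below timeleft_hours.
tool = FTS3Transfertool(external_host='https://fts3.example.org:8446')
delegation_id, context = tool.delegate_proxy('/tmp/x509up_u1000',
                                             ca_path='/etc/grid-security/certificates/',
                                             duration_hours=96,
                                             timeleft_hours=72)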
def get_requests(rse_id=None, process=0, total_processes=1, thread=0, total_threads=1,
                 mock=False, bulk=100, activity=None):
    """ Fetch the next bulk of queued transfer, stage-in and stage-out requests. """
    ts = time.time()
    reqs = request.get_next(request_type=[RequestType.TRANSFER,
                                          RequestType.STAGEIN,
                                          RequestType.STAGEOUT],
                            state=RequestState.QUEUED,
                            limit=bulk,
                            rse=rse_id,
                            activity=activity,
                            process=process,
                            total_processes=total_processes,
                            thread=thread,
                            total_threads=total_threads)
    record_timer('daemons.conveyor.submitter.get_next', (time.time() - ts) * 1000)
    return reqs
def query_request(request_id, transfertool='fts3', session=None): """ Query the status of a request. :param request_id: Request-ID as a 32 character hex string. :param transfertool: Transfertool name as a string. :param session: Database session to use. :returns: Request status information as a dictionary. """ record_counter('core.request.query_request') req = get_request(request_id, session=session) req_status = {'request_id': request_id, 'new_state': None} if not req: req_status['new_state'] = RequestState.LOST return req_status if transfertool == 'fts3': try: ts = time.time() response = fts3.query(req['external_id'], req['external_host']) record_timer('core.request.query_request_fts3', (time.time() - ts) * 1000) req_status['details'] = response except Exception: raise if not response: req_status['new_state'] = RequestState.LOST else: if 'job_state' not in response: req_status['new_state'] = RequestState.LOST elif response['job_state'] in (str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED)): req_status['new_state'] = RequestState.FAILED elif response['job_state'] == str(FTSState.FINISHED): req_status['new_state'] = RequestState.DONE else: raise NotImplementedError return req_status
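# Hedged usage sketch (illustrative request id): translate the FTS job state of a single
# request into a Rucio RequestState.
status = query_request('aaaabbbbccccddddeeeeffff00001111', transfertool='fts3')
if status['new_state'] is not None:
    print('request moved to state', status['new_state'])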
def poll_transfers(external_host, xfers, process=0, thread=0, timeout=None): try: try: tss = time.time() logging.info('%i:%i - polling %i transfers against %s with timeout %s' % (process, thread, len(xfers), external_host, timeout)) resps = request_core.bulk_query_transfers(external_host, xfers, 'fts3', timeout) record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers)) except RequestException as error: logging.error("Failed to contact FTS server: %s" % (str(error))) return except: logging.error("Failed to query FTS info: %s" % (traceback.format_exc())) return logging.debug('%i:%i - updating %s requests status' % (process, thread, len(xfers))) for transfer_id in resps: try: transf_resp = resps[transfer_id] # transf_resp is None: Lost. # is Exception: Failed to get fts job status. # is {}: No terminated jobs. # is {request_id: {file_status}}: terminated jobs. if transf_resp is None: set_transfer_state(external_host, transfer_id, RequestState.LOST) record_counter('daemons.conveyor.poller.transfer_lost') elif isinstance(transf_resp, Exception): logging.warning("Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp)) record_counter('daemons.conveyor.poller.query_transfer_exception') else: for request_id in transf_resp: ret = update_request_state(transf_resp[request_id]) # if True, really update request content; if False, only touch request record_counter('daemons.conveyor.poller.update_request_state.%s' % ret) # should touch transfers. # Otherwise if one bulk transfer includes many requests and one is not terminated, the transfer will be poll again. touch_transfer(external_host, transfer_id) except (DatabaseException, DatabaseError) as error: if isinstance(error.args[0], tuple) and (match('.*ORA-00054.*', error.args[0][0]) or match('.*ORA-00060.*', error.args[0][0]) or ('ERROR 1205 (HY000)' in error.args[0][0])): logging.warn("Lock detected when handling request %s - skipping" % request_id) else: logging.error(traceback.format_exc()) logging.debug('%i:%i - finished updating %s requests status' % (process, thread, len(xfers))) except: logging.error(traceback.format_exc())
def submit(tinfo, session):
    """
    Create a new transfer job in state QUEUED.

    :param tinfo: The transfer job information as a string.
    :param session: Database session to use.
    :returns: The transfer job id.
    """
    record_counter('daemons.mock.fts3.submit')

    ts = time.time()
    tid = generate_uuid()
    record_timer('daemons.mock.fts3.submit.000-generate_uuid', (time.time() - ts) * 1000)

    ts = time.time()
    new_transfer = test_models.MockFTSTransfer(transfer_id=tid, transfer_metadata=str(tinfo))
    new_transfer.save(session=session)
    record_timer('daemons.mock.fts3.submit.001-new_transfer', (time.time() - ts) * 1000)

    return {'job_id': tid}
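# Hedged sketch of the mock FTS round trip (the session factory is an assumption, the job
# payload is illustrative): submit a fake job, poll it once, then cancel it. query() may
# return None when the row is not in a pollable state.
from rucio.db.sqla.session import get_session  # assumed session factory

session = get_session()
job = submit('{"files": []}', session)
state = query(job['job_id'], session)
cancel(job['job_id'], session)
session.commit()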
def submit_bulk_transfers(external_host, files, transfertool='fts3', job_params=None, timeout=None, user_transfer_job=False):
    """
    Submit transfer requests to a transfertool.

    :param external_host: External host name as a string.
    :param files: List of dictionaries, one per request file.
    :param transfertool: Transfertool as a string.
    :param job_params: Metadata key/value pairs for all files as a dictionary.
    :param timeout: Timeout in seconds for the submission.
    :param user_transfer_job: Whether to submit the job with user credentials.
    :returns: Transfertool external ID.
    """
    record_counter('core.request.submit_transfer')

    job_params = job_params or {}  # avoid a shared mutable default argument
    transfer_id = None

    if transfertool == 'fts3':
        start_time = time.time()
        job_files = []
        for file in files:
            job_file = {}
            for key in file:
                if key == 'sources':
                    # convert sources from (src_rse, url, src_rse_id, rank) to url
                    job_file[key] = []
                    for source in file[key]:
                        job_file[key].append(source[1])
                else:
                    job_file[key] = file[key]
            job_files.append(job_file)
        if not user_transfer_job:
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        elif USER_TRANSFERS == "cms":
            transfer_id = FTS3MyProxyTransfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        else:
            # no valid USER_TRANSFERS case, fall back to the standard submission
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        record_timer('core.request.submit_transfers_fts3', (time.time() - start_time) * 1000 / len(files))

    return transfer_id
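# Hedged usage sketch (illustrative payload): each file dictionary carries its sources as
# (src_rse, url, src_rse_id, rank) tuples, which the function above reduces to plain URLs
# before handing the job to the FTS3 transfertool.
files = [{'sources': [('SITE_A_DISK', 'davs://site-a.example.org/f1', 'rse-id', 1)],
          'destinations': ['davs://site-b.example.org/f1'],
          'metadata': {'request_id': 'aaaabbbbccccddddeeeeffff00001111'},
          'filesize': 1024, 'checksum': 'ADLER32:ac3f4e62',
          'activity': 'User Subscriptions'}]
job_params = {'verify_checksum': True, 'overwrite': True, 'priority': 3,
              'job_metadata': {'issuer': 'rucio'}}
external_id = submit_bulk_transfers('https://fts3.example.org:8446', files,
                                    job_params=job_params, timeout=300)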
def bulk_query_transfers(request_host, transfer_ids, transfertool='fts3', timeout=None):
    """
    Query the status of a batch of transfers.

    :param request_host: Name of the external host.
    :param transfer_ids: List of External-IDs as 32 character hex strings.
    :param transfertool: Transfertool name as a string.
    :param timeout: Timeout in seconds for the query.
    :returns: Request status information as a dictionary.
    """
    record_counter('core.request.bulk_query_transfers')

    if transfertool == 'fts3':
        try:
            start_time = time.time()
            fts_resps = FTS3Transfertool(external_host=request_host).bulk_query(transfer_ids=transfer_ids, timeout=timeout)
            record_timer('core.request.bulk_query_transfers', (time.time() - start_time) * 1000 / len(transfer_ids))
        except Exception:
            raise

        for transfer_id in transfer_ids:
            if transfer_id not in fts_resps:
                fts_resps[transfer_id] = Exception("Transfer id %s is not returned" % transfer_id)
            if fts_resps[transfer_id] and not isinstance(fts_resps[transfer_id], Exception):
                for request_id in fts_resps[transfer_id]:
                    if fts_resps[transfer_id][request_id]['file_state'] in (str(FTSState.FAILED),
                                                                            str(FTSState.FINISHEDDIRTY),
                                                                            str(FTSState.CANCELED)):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.FAILED
                    elif fts_resps[transfer_id][request_id]['file_state'] == str(FTSState.FINISHED):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.DONE
        return fts_resps
    else:
        raise NotImplementedError
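# Hedged usage sketch (illustrative job ids): poll a batch of FTS jobs and inspect the
# per-request outcome that the function above attaches as 'new_state'.
results = bulk_query_transfers('https://fts3.example.org:8446',
                               ['11112222333344445555666677778888'])
for transfer_id, per_request in results.items():
    if per_request is None or isinstance(per_request, Exception):
        continue  # lost job or failed lookup, handled by the caller
    for request_id, file_status in per_request.items():
        print(request_id, file_status.get('new_state'))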
def run_once(bulk, group_bulk, rse_ids, scheme, failover_scheme, transfertool_kwargs, heartbeat_handler, activity): worker_number, total_workers, logger = heartbeat_handler.live() start_time = time.time() transfers = next_transfers_to_submit( total_workers=total_workers, worker_number=worker_number, failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, schemes=scheme, transfertools_by_name={'fts3': FTS3Transfertool}, older_than=None, request_type=RequestType.STAGEIN, logger=logger, ) total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path)) record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (total_transfers if transfers else 1)) record_counter('daemons.conveyor.stager.get_stagein_transfers', total_transfers) record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', total_transfers) logger(logging.INFO, 'Got %s stagein transfers for %s' % (total_transfers, activity)) for builder, transfer_paths in transfers.items(): transfertool_obj = builder.make_transfertool(logger=logger, **transfertool_kwargs.get(builder.transfertool_class, {})) logger(logging.INFO, 'Starting to group transfers for %s (%s)' % (activity, transfertool_obj)) start_time = time.time() grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths) record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfer_paths) or 1)) logger(logging.INFO, 'Starting to submit transfers for %s (%s)' % (activity, transfertool_obj)) for job in grouped_jobs: worker_number, total_workers, logger = heartbeat_handler.live() submit_transfer(transfertool_obj=transfertool_obj, transfers=job['transfers'], job_params=job['job_params'], submitter='transfer_submitter', logger=logger) queue_empty = False if total_transfers < group_bulk: queue_empty = True logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s' % (total_transfers, activity, group_bulk)) return queue_empty
def transmogrifier(bulk=5, once=False, sleep_time=60): """ Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash, identifies the subscriptions matching the DIDs and submit a replication rule for each DID matching a subscription. :param thread: Thread number at startup. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = 'transmogrifier' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) while not graceful_stop.is_set(): heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) dids, subscriptions = [], [] tottime = 0 prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) try: # Get the new DIDs based on the is_new flag for did in list_new_dids(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], chunk_size=bulk, did_type=None): dids.append({ 'scope': did['scope'], 'did_type': str(did['did_type']), 'name': did['name'] }) sub_dict = {3: []} # Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest # The priority is defined as 'policyid' for sub in list_subscriptions(None, None): if sub['state'] != SubscriptionState.INACTIVE and sub[ 'lifetime'] and (datetime.now() > sub['lifetime']): update_subscription( name=sub['name'], account=sub['account'], metadata={'state': SubscriptionState.INACTIVE}) elif sub['state'] in [ SubscriptionState.ACTIVE, SubscriptionState.UPDATED ]: priority = 3 if 'policyid' in sub: if int(sub['policyid']) not in sub_dict: sub_dict[int(sub['policyid'])] = [] priority = int(sub['policyid']) sub_dict[priority].append(sub) priorities = list(sub_dict.keys()) priorities.sort() # Order the subscriptions according to their priority for priority in priorities: subscriptions.extend(sub_dict[priority]) except SubscriptionNotFound as error: logging.warning(prepend_str + 'No subscriptions defined: %s' % (str(error))) time.sleep(10) continue except Exception as error: logging.error( prepend_str + 'Failed to get list of new DIDs or subscriptions: %s' % (str(error))) try: results = {} start_time = time.time() blacklisted_rse_id = [ rse['id'] for rse in list_rses({'availability_write': False}) ] logging.debug(prepend_str + 'In transmogrifier worker') identifiers = [] # Loop over all the new dids for did in dids: did_success = True if did['did_type'] == str( DIDType.DATASET) or did['did_type'] == str( DIDType.CONTAINER): did_tag = '%s:%s' % (did['scope'].internal, did['name']) results[did_tag] = [] try: metadata = get_metadata(did['scope'], did['name']) # Loop over all the subscriptions for subscription in subscriptions: # Check if the DID match the subscription if is_matching_subscription( subscription, did, metadata) is True: filter_string = loads(subscription['filter']) split_rule = filter_string.get( 'split_rule', False) stime = time.time() results[did_tag].append(subscription['id']) logging.info(prepend_str + '%s:%s matches subscription %s' % (did['scope'], did['name'], subscription['name'])) rules = loads( subscription['replication_rules']) created_rules = {} cnt = 0 for rule_dict in rules: cnt += 1 created_rules[cnt] = [] # Get all the rule and subscription parameters grouping = rule_dict.get( 'grouping', 'DATASET') lifetime = rule_dict.get('lifetime', None) ignore_availability = rule_dict.get( 'ignore_availability', None) weight = 
rule_dict.get('weight', None) source_replica_expression = rule_dict.get( 'source_replica_expression', None) locked = rule_dict.get('locked', None) if locked == 'True': locked = True else: locked = False purge_replicas = rule_dict.get( 'purge_replicas', False) if purge_replicas == 'True': purge_replicas = True else: purge_replicas = False rse_expression = str( rule_dict['rse_expression']) comment = str(subscription['comments']) subscription_id = str(subscription['id']) account = subscription['account'] copies = int(rule_dict['copies']) activity = rule_dict.get( 'activity', 'User Subscriptions') try: validate_schema(name='activity', obj=activity) except InputValidationError as error: logging.error( prepend_str + 'Error validating the activity %s' % (str(error))) activity = 'User Subscriptions' if lifetime: lifetime = int(lifetime) str_activity = "".join(activity.split()) success = False nattempt = 5 attemptnr = 0 skip_rule_creation = False selected_rses = [] chained_idx = rule_dict.get( 'chained_idx', None) if chained_idx: params = {} if rule_dict.get( 'associated_site_idx', None): params[ 'associated_site_idx'] = rule_dict.get( 'associated_site_idx', None) logging.debug( '%s Chained subscription identified. Will use %s', prepend_str, str(created_rules[chained_idx])) algorithm = rule_dict.get( 'algorithm', None) selected_rses = select_algorithm( algorithm, created_rules[chained_idx], params) else: # In the case of chained subscription, don't use rseselector but use the rses returned by the algorithm if split_rule: vo = account.vo rses = parse_expression( rse_expression, filter={'vo': vo}) list_of_rses = [ rse['id'] for rse in rses ] # Check that some rule doesn't already exist for this DID and subscription preferred_rse_ids = [] for rule in list_rules( filters={ 'subscription_id': subscription_id, 'scope': did['scope'], 'name': did['name'] }): already_existing_rses = [ (rse['rse'], rse['id']) for rse in parse_expression( rule['rse_expression'], filter={'vo': vo}) ] for rse, rse_id in already_existing_rses: if (rse_id in list_of_rses ) and ( rse_id not in preferred_rse_ids): preferred_rse_ids.append( rse_id) if len(preferred_rse_ids ) >= copies: skip_rule_creation = True rse_id_dict = {} for rse in rses: rse_id_dict[ rse['id']] = rse['rse'] try: rseselector = RSESelector( account=account, rses=rses, weight=weight, copies=copies - len(preferred_rse_ids)) selected_rses = [ rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse( 0, preferred_rse_ids= preferred_rse_ids, copies=copies, blacklist= blacklisted_rse_id) ] except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error: logging.warning( prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites' % (subscription['name'], account, str(error))) # Now including the blacklisted sites try: rseselector = RSESelector( account=account, rses=rses, weight=weight, copies=copies - len(preferred_rse_ids)) selected_rses = [ rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse( 0, preferred_rse_ids= preferred_rse_ids, copies=copies, blacklist=[]) ] ignore_availability = True except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error: logging.error( prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' % (subscription['name'], account, str(error))) monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.%s' % (str(error.__class__. 
__name__)), delta=1) # The DID won't be reevaluated at the next cycle did_success = did_success and True continue for attempt in range(0, nattempt): attemptnr = attempt nb_rule = 0 # Try to create the rule try: if split_rule: if not skip_rule_creation: for rse in selected_rses: if isinstance( selected_rses, dict): source_replica_expression = selected_rses[ rse].get( 'source_replica_expression', None) weight = selected_rses[ rse].get( 'weight', None) logging.info( prepend_str + 'Will insert one rule for %s:%s on %s' % (did['scope'], did['name'], rse)) rule_ids = add_rule( dids=[{ 'scope': did['scope'], 'name': did['name'] }], account=account, copies=1, rse_expression=rse, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id= subscription_id, source_replica_expression =source_replica_expression, activity=activity, purge_replicas= purge_replicas, ignore_availability= ignore_availability, comment=comment) created_rules[ cnt].append( rule_ids[0]) nb_rule += 1 if nb_rule == copies: success = True break else: rule_ids = add_rule( dids=[{ 'scope': did['scope'], 'name': did['name'] }], account=account, copies=copies, rse_expression= rse_expression, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id= subscription['id'], source_replica_expression= source_replica_expression, activity=activity, purge_replicas= purge_replicas, ignore_availability= ignore_availability, comment=comment) created_rules[cnt].append( rule_ids[0]) nb_rule += 1 monitor.record_counter( counters= 'transmogrifier.addnewrule.done', delta=nb_rule) monitor.record_counter( counters= 'transmogrifier.addnewrule.activity.%s' % str_activity, delta=nb_rule) success = True break except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression, StagingAreaRuleRequiresLifetime, DuplicateRule) as error: # Errors that won't be retried success = True logging.error(prepend_str + '%s' % (str(error))) monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.%s' % (str( error.__class__.__name__)), delta=1) break except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs, InsufficientAccountLimit, DatabaseException, RSEBlacklisted, RSEWriteBlocked) as error: # Errors to be retried logging.error( prepend_str + '%s Will perform an other attempt %i/%i' % (str(error), attempt + 1, nattempt)) monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.%s' % (str( error.__class__.__name__)), delta=1) except Exception: # Unexpected errors monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.unknown', delta=1) exc_type, exc_value, exc_traceback = exc_info( ) logging.critical( prepend_str + ''.join( format_exception( exc_type, exc_value, exc_traceback)).strip( )) did_success = (did_success and success) if (attemptnr + 1) == nattempt and not success: logging.error( prepend_str + 'Rule for %s:%s on %s cannot be inserted' % (did['scope'], did['name'], rse_expression)) else: logging.info( prepend_str + '%s rule(s) inserted in %f seconds' % (str(nb_rule), time.time() - stime)) except DataIdentifierNotFound as error: logging.warning(prepend_str + error) if did_success: if did['did_type'] == str(DIDType.FILE): monitor.record_counter( counters='transmogrifier.did.file.processed', delta=1) elif did['did_type'] == str(DIDType.DATASET): monitor.record_counter( counters='transmogrifier.did.dataset.processed', delta=1) elif did['did_type'] == str(DIDType.CONTAINER): monitor.record_counter( counters='transmogrifier.did.container.processed', delta=1) 
monitor.record_counter( counters='transmogrifier.did.processed', delta=1) identifiers.append({ 'scope': did['scope'], 'name': did['name'], 'did_type': DIDType.from_sym(did['did_type']) }) time1 = time.time() # Mark the DIDs as processed for identifier in chunks(identifiers, 100): _retrial(set_new_dids, identifier, None) logging.info(prepend_str + 'Time to set the new flag : %f' % (time.time() - time1)) tottime = time.time() - start_time for sub in subscriptions: update_subscription( name=sub['name'], account=sub['account'], metadata={'last_processed': datetime.now()}) logging.info(prepend_str + 'It took %f seconds to process %i DIDs' % (tottime, len(dids))) logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids))) monitor.record_counter(counters='transmogrifier.job.done', delta=1) monitor.record_timer(stat='transmogrifier.job.duration', time=1000 * tottime) except Exception: exc_type, exc_value, exc_traceback = exc_info() logging.critical(prepend_str + ''.join( format_exception(exc_type, exc_value, exc_traceback)).strip()) monitor.record_counter(counters='transmogrifier.job.error', delta=1) monitor.record_counter(counters='transmogrifier.addnewrule.error', delta=1) if once is True: break if tottime < sleep_time: logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime)) time.sleep(sleep_time - tottime) heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Graceful stop requested') logging.info(prepend_str + 'Graceful stop done')
def submit_bulk_transfers(external_host, files, job_params, timeout=None):
    """
    Submit a transfer to FTS3 via JSON.

    :param external_host: FTS server as a string.
    :param files: List of dictionaries, one per file transfer.
    :param job_params: Dictionary containing key/value pairs, for all transfers.
    :param timeout: Timeout in seconds for the submission.
    :returns: FTS transfer identifier.
    """
    # FTS3 expects 'davs' as the scheme identifier instead of https
    for file in files:
        if not file['sources'] or file['sources'] == []:
            raise Exception('No sources defined')

        new_src_urls = []
        new_dst_urls = []
        for url in file['sources']:
            if url.startswith('https'):
                new_src_urls.append(':'.join(['davs'] + url.split(':')[1:]))
            else:
                new_src_urls.append(url)
        for url in file['destinations']:
            if url.startswith('https'):
                new_dst_urls.append(':'.join(['davs'] + url.split(':')[1:]))
            else:
                new_dst_urls.append(url)

        file['sources'] = new_src_urls
        file['destinations'] = new_dst_urls

    transfer_id = None
    expected_transfer_id = None
    if __USE_DETERMINISTIC_ID:
        job_params = job_params.copy()
        job_params["id_generator"] = "deterministic"
        job_params["sid"] = files[0]['metadata']['request_id']
        expected_transfer_id = get_deterministic_id(external_host, job_params["sid"])
        logging.debug("Submit bulk transfers in deterministic mode, sid %s, expected transfer id: %s" % (job_params["sid"], expected_transfer_id))

    # bulk submission
    params_dict = {'files': files, 'params': job_params}
    params_str = json.dumps(params_dict)

    r = None
    if external_host.startswith('https://'):
        try:
            ts = time.time()
            r = requests.post('%s/jobs' % external_host,
                              verify=False,
                              cert=(__USERCERT, __USERCERT),
                              data=params_str,
                              headers={'Content-Type': 'application/json'},
                              timeout=timeout)
            record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(external_host), (time.time() - ts) * 1000 / len(files))
        except Exception:
            logging.warn('Could not submit transfer to %s - %s' % (external_host, str(traceback.format_exc())))
    else:
        try:
            ts = time.time()
            r = requests.post('%s/jobs' % external_host,
                              data=params_str,
                              headers={'Content-Type': 'application/json'},
                              timeout=timeout)
            record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(external_host), (time.time() - ts) * 1000 / len(files))
        except Exception:
            logging.warn('Could not submit transfer to %s - %s' % (external_host, str(traceback.format_exc())))

    if r and r.status_code == 200:
        record_counter('transfertool.fts3.%s.submission.success' % __extract_host(external_host), len(files))
        transfer_id = str(r.json()['job_id'])
    else:
        if expected_transfer_id:
            transfer_id = expected_transfer_id
            logging.warn("Failed to submit transfer to %s, will use expected transfer id %s, error: %s" % (external_host, transfer_id, r.text if r is not None else r))
        else:
            logging.warn("Failed to submit transfer to %s, error: %s" % (external_host, r.text if r is not None else r))
        record_counter('transfertool.fts3.%s.submission.failure' % __extract_host(external_host), len(files))

    return transfer_id
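# Side note (illustrative helper, not part of the original source): the scheme rewriting at
# the top of the function simply swaps the leading 'https' for 'davs', which FTS3 expects,
# and leaves every other scheme untouched.
def _to_davs(url):
    """Rewrite an https URL to the davs scheme, leaving other schemes untouched."""
    if url.startswith('https'):
        return ':'.join(['davs'] + url.split(':')[1:])
    return url

assert _to_davs('https://site-a.example.org:443/path') == 'davs://site-a.example.org:443/path'
assert _to_davs('gsiftp://site-a.example.org/path') == 'gsiftp://site-a.example.org/path'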
def submit_transfers(transfers, job_metadata): """ Submit a transfer to FTS3 via JSON. :param transfers: Dictionary containing 'request_id', 'src_urls', 'dest_urls', 'filesize', 'md5', 'adler32', 'overwrite', 'job_metadata', 'src_spacetoken', 'dest_spacetoken' :param job_metadata: Dictionary containing key/value pairs, for all transfers. :param transfer_host: FTS server as a string. :returns: List of FTS transfer identifiers """ # Early sanity check for transfer in transfers: if not transfer['sources'] or transfer['sources'] == []: raise Exception('No sources defined') transfer['src_urls'] = [] for source in transfer['sources']: # # convert sources from (src_rse, url, src_rse_id, rank) to url transfer['src_urls'].append(source[1]) # FTS3 expects 'davs' as the scheme identifier instead of https new_src_urls = [] new_dst_urls = [] for transfer in transfers: for url in transfer['src_urls']: if url.startswith('https'): new_src_urls.append(':'.join(['davs'] + url.split(':')[1:])) else: new_src_urls.append(url) for url in transfer['dest_urls']: if url.startswith('https'): new_dst_urls.append(':'.join(['davs'] + url.split(':')[1:])) else: new_dst_urls.append(url) transfer['src_urls'] = new_src_urls transfer['dest_urls'] = new_dst_urls # Rewrite the checksums into FTS3 format, prefer adler32 if available for transfer in transfers: transfer['checksum'] = None if 'md5' in transfer.keys() and transfer['md5']: transfer['checksum'] = 'MD5:%s' % str(transfer['md5']) if 'adler32' in transfer.keys() and transfer['adler32']: transfer['checksum'] = 'ADLER32:%s' % str(transfer['adler32']) transfer_ids = {} job_metadata['issuer'] = 'rucio' job_metadata['previous_attempt_id'] = None # we have to loop until we get proper fts3 bulk submission for transfer in transfers: job_metadata['request_id'] = transfer['request_id'] if 'previous_attempt_id' in transfer.keys(): job_metadata['previous_attempt_id'] = transfer['previous_attempt_id'] params_dict = {'files': [{'sources': transfer['src_urls'], 'destinations': transfer['dest_urls'], 'metadata': {'issuer': 'rucio'}, 'filesize': int(transfer['filesize']), 'checksum': str(transfer['checksum']), 'activity': str(transfer['activity']), 'selection_strategy': transfer.get('selection_strategy', 'auto')}], 'params': {'verify_checksum': True if transfer['checksum'] else False, 'spacetoken': transfer['dest_spacetoken'] if transfer['dest_spacetoken'] else 'null', 'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1, 'bring_online': transfer['bring_online'] if transfer['bring_online'] else None, 'job_metadata': job_metadata, 'source_spacetoken': transfer['src_spacetoken'] if transfer['src_spacetoken'] else None, 'overwrite': transfer['overwrite'], 'priority': 3}} r = None params_str = json.dumps(params_dict) transfer_host = transfer['external_host'] if transfer_host.startswith('https://'): try: ts = time.time() r = requests.post('%s/jobs' % transfer_host, verify=False, cert=(__USERCERT, __USERCERT), data=params_str, headers={'Content-Type': 'application/json'}, timeout=5) record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(transfer_host), (time.time() - ts) * 1000) except: logging.warn('Could not submit transfer to %s' % transfer_host) else: try: ts = time.time() r = requests.post('%s/jobs' % transfer_host, data=params_str, headers={'Content-Type': 'application/json'}, timeout=5) record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(transfer_host), (time.time() - ts) * 1000) except: logging.warn('Could not 
submit transfer to %s' % transfer_host) if r and r.status_code == 200: record_counter('transfertool.fts3.%s.submission.success' % __extract_host(transfer_host)) transfer_ids[transfer['request_id']] = {'external_id': str(r.json()['job_id']), 'dest_urls': transfer['dest_urls'], 'external_host': transfer_host} else: logging.warn("Failed to submit transfer to %s, error: %s" % (transfer_host, r.text if r is not None else r)) record_counter('transfertool.fts3.%s.submission.failure' % __extract_host(transfer_host)) return transfer_ids
def __update_atime(self): """ Bulk update atime. """ replicas = [] rses = [] for report in self.__reports: if 'vo' not in report: report['vo'] = 'def' try: # Identify suspicious files try: if self.__bad_files_patterns and report['eventType'] in [ 'get_sm', 'get_sm_a', 'get' ] and 'clientState' in report and report[ 'clientState'] not in [ 'DONE', 'FOUND_ROOT', 'ALREADY_DONE' ]: for pattern in self.__bad_files_patterns: if 'stateReason' in report and report[ 'stateReason'] and isinstance( report['stateReason'], str) and pattern.match( report['stateReason']): reason = report['stateReason'][:255] if 'url' not in report or not report['url']: self.__logger( logging.ERROR, 'Missing url in the following trace : ' + str(report)) else: try: surl = report['url'] declare_bad_file_replicas( [ surl, ], reason=reason, issuer=InternalAccount( 'root', vo=report['vo']), status=BadFilesStatus.SUSPICIOUS) self.__logger( logging.INFO, 'Declare suspicious file %s with reason %s' % (report['url'], reason)) except Exception as error: self.__logger( logging.ERROR, 'Failed to declare suspicious file' + str(error)) except Exception as error: self.__logger( logging.ERROR, 'Problem with bad trace : %s . Error %s' % (str(report), str(error))) # check if scope in report. if not skip this one. if 'scope' not in report: record_counter('daemons.tracer.kronos.missing_scope') if report['eventType'] != 'touch': continue else: record_counter('daemons.tracer.kronos.with_scope') report['scope'] = InternalScope(report['scope'], report['vo']) # handle all events starting with get* and download and touch events. if not report['eventType'].startswith('get') and not report[ 'eventType'].startswith('sm_get') and not report[ 'eventType'] == 'download' and not report[ 'eventType'] == 'touch': continue if report['eventType'].endswith('_es'): continue record_counter('daemons.tracer.kronos.total_get') if report['eventType'] == 'get': record_counter('daemons.tracer.kronos.dq2clients') elif report['eventType'] == 'get_sm' or report[ 'eventType'] == 'sm_get': if report['eventVersion'] == 'aCT': record_counter( 'daemons.tracer.kronos.panda_production_act') else: record_counter( 'daemons.tracer.kronos.panda_production') elif report['eventType'] == 'get_sm_a' or report[ 'eventType'] == 'sm_get_a': if report['eventVersion'] == 'aCT': record_counter( 'daemons.tracer.kronos.panda_analysis_act') else: record_counter('daemons.tracer.kronos.panda_analysis') elif report['eventType'] == 'download': record_counter('daemons.tracer.kronos.rucio_download') elif report['eventType'] == 'touch': record_counter('daemons.tracer.kronos.rucio_touch') else: record_counter('daemons.tracer.kronos.other_get') if report['eventType'] == 'download' or report[ 'eventType'] == 'touch': report['usrdn'] = report['account'] if report['usrdn'] in self.__excluded_usrdns: continue # handle touch and non-touch traces differently if report['eventType'] != 'touch': # check if the report has the right state. if 'eventVersion' in report: if report['eventVersion'] != 'aCT': if report['clientState'] in self.__excluded_states: continue if 'remoteSite' not in report: continue if not report['remoteSite']: continue if 'filename' not in report: if 'name' in report: report['filename'] = report['name'] rses = report['remoteSite'].strip().split(',') for rse in rses: try: rse_id = get_rse_id(rse=rse, vo=report['vo']) except RSENotFound: self.__logger( logging.WARNING, "Cannot lookup rse_id for %s. 
Will skip this report.", rse) record_counter( 'daemons.tracer.kronos.rse_not_found') continue replicas.append({ 'name': report['filename'], 'scope': report['scope'], 'rse': rse, 'rse_id': rse_id, 'accessed_at': datetime.utcfromtimestamp( report['traceTimeentryUnix']), 'traceTimeentryUnix': report['traceTimeentryUnix'], 'eventVersion': report['eventVersion'] }) else: # if touch event and if datasetScope is in the report then it means # that there is no file scope/name and therefore only the dataset is # put in the queue to be updated and the rest is skipped. rse_id = None rse = None if 'remoteSite' in report: rse = report['remoteSite'] try: rse_id = get_rse_id(rse=rse, vo=report['vo']) except RSENotFound: self.__logger(logging.WARNING, "Cannot lookup rse_id for %s.", rse) record_counter( 'daemons.tracer.kronos.rse_not_found') if 'datasetScope' in report: self.__dataset_queue.put({ 'scope': InternalScope(report['datasetScope'], vo=report['vo']), 'name': report['dataset'], 'rse_id': rse_id, 'accessed_at': datetime.utcfromtimestamp( report['traceTimeentryUnix']) }) continue else: if 'remoteSite' not in report: continue replicas.append({ 'name': report['filename'], 'scope': report['scope'], 'rse': rse, 'rse_id': rse_id, 'accessed_at': datetime.utcfromtimestamp( report['traceTimeentryUnix']) }) except (KeyError, AttributeError): self.__logger(logging.ERROR, "Cannot handle report.", exc_info=True) record_counter('daemons.tracer.kronos.report_error') continue except Exception: self.__logger(logging.ERROR, "Exception", exc_info=True) continue for did in list_parent_dids(report['scope'], report['filename']): if did['type'] != DIDType.DATASET: continue # do not update _dis datasets if did['scope'].external == 'panda' and '_dis' in did['name']: continue for rse in rses: try: rse_id = get_rse_id(rse=rse, vo=report['vo']) except RSENotFound: self.__logger( logging.WARNING, "Cannot lookup rse_id for %s. Will skip this report.", rse) record_counter('daemons.tracer.kronos.rse_not_found') continue self.__dataset_queue.put({ 'scope': did['scope'], 'name': did['name'], 'did_type': did['type'], 'rse_id': rse_id, 'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix']) }) if not len(replicas): return self.__logger(logging.DEBUG, "trying to update replicas: %s", replicas) try: start_time = time() for replica in replicas: # if touch replica hits a locked row put the trace back into queue for later retry if not touch_replica(replica): resubmit = { 'filename': replica['name'], 'scope': replica['scope'].external, 'remoteSite': replica['rse'], 'traceTimeentryUnix': replica['traceTimeentryUnix'], 'eventType': 'get', 'usrdn': 'someuser', 'clientState': 'DONE', 'eventVersion': replica['eventVersion'] } if replica['scope'].vo != 'def': resubmit['vo'] = replica['scope'].vo self.__conn.send(body=jdumps(resubmit), destination=self.__queue, headers={ 'appversion': 'rucio', 'resubmitted': '1' }) record_counter('daemons.tracer.kronos.sent_resubmitted') self.__logger(logging.WARNING, 'hit locked row, resubmitted to queue') record_timer('daemons.tracer.kronos.update_atime', (time() - start_time) * 1000) except Exception: self.__logger(logging.ERROR, "Cannot update replicas.", exc_info=True) record_counter('daemons.tracer.kronos.update_error') self.__logger(logging.INFO, 'updated %d replica(s)' % len(replicas))
logging.error("Invalid RSE exception %s for request %s: %s" % (source_replica_expression, req['request_id'], e)) allowed_source_rses = [] else: allowed_source_rses = [x['rse'] for x in parsed_rses] tmpsrc = [] metadata = {} try: ts = time.time() replications = replica.list_replicas(dids=[{'scope': req['scope'], 'name': req['name'], 'type': DIDType.FILE}], schemes=[scheme, 'gsiftp']) record_timer('daemons.conveyor.submitter.list_replicas', (time.time() - ts) * 1000) # return gracefully if there are no replicas for a DID if not replications: return None, None for source in replications: metadata['filesize'] = long(source['bytes']) metadata['md5'] = source['md5'] metadata['adler32'] = source['adler32'] # TODO: Source protection # we need to know upfront if we are mixed DISK/TAPE source mixed_source = [] for source_rse in source['rses']:
def delete_from_storage(replicas, prot, rse_info, staging_areas, prepend_str): deleted_files = [] rse_name = rse_info['rse'] rse_id = rse_info['id'] try: prot.connect() for replica in replicas: # Physical deletion try: deletion_dict = { 'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_name, 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'protocol': prot.attributes['scheme'] } logging.info('%s Deletion ATTEMPT of %s:%s as %s on %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name) start = time.time() # For STAGING RSEs, no physical deletion if rse_id in staging_areas: logging.warning( '%s Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name) deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) continue if replica['pfn']: pfn = replica['pfn'] # sign the URL if necessary if prot.attributes['scheme'] == 'https' and rse_info[ 'sign_url'] is not None: pfn = get_signed_url(rse_info['sign_url'], 'delete', pfn) prot.delete(pfn) else: logging.warning( '%s Deletion UNAVAILABLE of %s:%s as %s on %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name) monitor.record_timer( 'daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse_name), (time.time() - start) * 1000) duration = time.time() - start deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) deletion_dict['duration'] = duration add_message('deletion-done', deletion_dict) logging.info( '%s Deletion SUCCESS of %s:%s as %s on %s in %s seconds', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, duration) except SourceNotFound: err_msg = '%s Deletion NOTFOUND of %s:%s as %s on %s' % ( prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name) logging.warning(err_msg) deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) if replica['state'] == ReplicaState.AVAILABLE: deletion_dict['reason'] = str(err_msg) add_message('deletion-failed', deletion_dict) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: logging.warning( '%s Deletion NOACCESS of %s:%s as %s on %s: %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, str(error)) deletion_dict['reason'] = str(error) add_message('deletion-failed', deletion_dict) except Exception as error: logging.critical( '%s Deletion CRITICAL of %s:%s as %s on %s: %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, str(traceback.format_exc())) deletion_dict['reason'] = str(error) add_message('deletion-failed', deletion_dict) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: for replica in replicas: logging.warning('%s Deletion NOACCESS of %s:%s as %s on %s: %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, str(error)) add_message( 'deletion-failed', { 'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_name, 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(error), 'protocol': prot.attributes['scheme'] }) finally: prot.close() return deleted_files
def submitter(once=False, rses=[], process=0, total_processes=1, thread=0, total_threads=1, mock=False, bulk=100, activities=None): """ Main loop to submit a new transfer primitive to a transfertool. """ logging.info('submitter starting - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads)) try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = 'srm' logging.info('submitter started - process (%i/%i) thread (%i/%i)' % (process, total_processes, thread, total_threads)) while not graceful_stop.is_set(): try: if activities is None: activities = [None] for activity in activities: if rses is None: rses = [None] for rse in rses: if rse: # run in rse list mode rse_info = rsemgr.get_rse_info(rse['rse']) logging.info("Working on RSE: %s" % rse['rse']) ts = time.time() reqs = get_requests(rse_id=rse['id'], process=process, total_processes=total_processes, thread=thread, total_threads=total_threads, mock=mock, bulk=bulk, activity=activity) record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000) else: # no rse list, run FIFO mode rse_info = None ts = time.time() reqs = get_requests(process=process, total_processes=total_processes, thread=thread, total_threads=total_threads, mock=mock, bulk=bulk, activity=activity) record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000) if reqs: logging.debug('%i:%i - submitting %i requests' % (process, thread, len(reqs))) if not reqs or reqs == []: time.sleep(1) continue for req in reqs: try: if not rse: # no rse list, in FIFO mode dest_rse = rse_core.get_rse(rse=None, rse_id=req['dest_rse_id']) rse_info = rsemgr.get_rse_info(dest_rse['rse']) ts = time.time() transfer = get_transfer(rse_info, req, scheme, mock) record_timer('daemons.conveyor.submitter.get_transfer', (time.time() - ts) * 1000) logging.debug('Transfer for request %s: %s' % (req['request_id'], transfer)) if transfer is None: logging.warn("Request %s DID %s:%s RSE %s failed to get transfer" % (req['request_id'], req['scope'], req['name'], rse_info['rse'])) # TODO: Merge these two calls request.set_request_state(req['request_id'], RequestState.LOST) # if the DID does not exist anymore request.archive_request(req['request_id']) continue ts = time.time() tmp_metadata = transfer['file_metadata'] eids = request.submit_transfers(transfers=[transfer, ], transfertool='fts3', job_metadata=tmp_metadata) record_timer('daemons.conveyor.submitter.submit_transfer', (time.time() - ts) * 1000) ts = time.time() if req['previous_attempt_id']: logging.info('COPYING RETRY %s REQUEST %s PREVIOUS %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['retry_count'], req['request_id'], req['previous_attempt_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id'])) else: logging.info('COPYING REQUEST %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['request_id'], req['scope'], req['name'], transfer['src_urls'], transfer['dest_urls'], eids[req['request_id']]['external_host'], eids[req['request_id']]['external_id'])) record_counter('daemons.conveyor.submitter.submit_request') except UnsupportedOperation, e: # The replica doesn't exist, need to cancel the request logging.warning(e) logging.info('Cancelling transfer request %s' % req['request_id']) try: # TODO: for now, there is only ever one destination request.cancel_request_did(req['scope'], req['name'], transfer['dest_urls'][0]) except Exception, e: 
logging.warning('Cannot cancel request: %s' % str(e))
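# Illustrative sketch (not the Rucio API): the fetch-loop shape of the submitter above.
# Activities are iterated, then either a fixed RSE list or a single FIFO pass ([None]),
# and every non-empty batch of requests is handed to a submit callback. get_batch and
# submit_batch are hypothetical stand-ins for get_requests()/submit_transfers().
def submitter_loop_sketch(activities, rses, get_batch, submit_batch, bulk=100):
    for activity in (activities or [None]):
        for rse in (rses or [None]):              # [None] means FIFO mode, no RSE filter
            reqs = get_batch(rse=rse, activity=activity, bulk=bulk)
            if not reqs:
                continue                          # nothing queued for this (activity, rse)
            submit_batch(reqs)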
monitor.record_counter(counters="transmogrifier.did.dataset.processed", delta=1) elif did["did_type"] == str(DIDType.CONTAINER): monitor.record_counter(counters="transmogrifier.did.container.processed", delta=1) monitor.record_counter(counters="transmogrifier.did.processed", delta=1) identifiers.append( {"scope": did["scope"], "name": did["name"], "did_type": DIDType.from_sym(did["did_type"])} ) time1 = time.time() for id in chunks(identifiers, 100): _retrial(set_new_dids, id, None) logging.info("Thread %i : Time to set the new flag : %f" % (worker_number, time.time() - time1)) tottime = time.time() - start_time logging.info("Thread %i : It took %f seconds to process %i DIDs" % (worker_number, tottime, len(dids))) logging.debug("Thread %i : DIDs processed : %s" % (worker_number, str(dids))) monitor.record_counter(counters="transmogrifier.job.done", delta=1) monitor.record_timer(stat="transmogrifier.job.duration", time=1000 * tottime) except: exc_type, exc_value, exc_traceback = exc_info() logging.critical("".join(format_exception(exc_type, exc_value, exc_traceback)).strip()) monitor.record_counter(counters="transmogrifier.job.error", delta=1) monitor.record_counter(counters="transmogrifier.addnewrule.error", delta=1) logging.info(once) if once is True: break if tottime < 10: time.sleep(10 - tottime) logging.info("Thread %i : Graceful stop requested" % (worker_number)) logging.info("Thread %i : Graceful stop done" % (worker_number)) def run(total_workers=1, chunk_size=100, once=False):
def get_transfer(rse, req, scheme, mock): src_spacetoken = None ts = time.time() sources, metadata = get_sources(rse, scheme, req) record_timer('daemons.conveyor.submitter.get_sources', (time.time() - ts) * 1000) logging.debug('Sources for request %s: %s' % (req['request_id'], sources)) if sources is None: logging.error("Request %s DID %s:%s RSE %s failed to get sources" % (req['request_id'], req['scope'], req['name'], rse['rse'])) return None filesize = metadata['filesize'] md5 = metadata['md5'] adler32 = metadata['adler32'] ts = time.time() destinations, dest_spacetoken = get_destinations(rse, scheme, req, sources) record_timer('daemons.conveyor.submitter.get_destinations', (time.time() - ts) * 1000) logging.debug('Destinations for request %s: %s' % (req['request_id'], destinations)) if destinations is None: logging.error("Request %s DID %s:%s RSE %s failed to get destinations" % (req['request_id'], req['scope'], req['name'], rse['rse'])) return None # Come up with mock sources if necessary if mock: tmp_sources = [] for s in sources: tmp_sources.append((s[0], ':'.join(['mock']+s[1].split(':')[1:]))) sources = tmp_sources tmp_metadata = {'request_id': req['request_id'], 'scope': req['scope'], 'name': req['name'], 'activity': req['activity'], 'src_rse': sources[0][0], 'dst_rse': rse['rse'], 'dest_rse_id': req['dest_rse_id'], 'filesize': filesize, 'md5': md5, 'adler32': adler32} if 'previous_attempt_id' in req and req['previous_attempt_id']: tmp_metadata['previous_attempt_id'] = req['previous_attempt_id'] # Extend the metadata dictionary with request attributes copy_pin_lifetime, overwrite, bring_online = -1, True, None if req['request_type'] == RequestType.STAGEIN: if req['attributes']: if type(req['attributes']) is dict: attr = json.loads(json.dumps(req['attributes'])) else: attr = json.loads(str(req['attributes'])) copy_pin_lifetime = attr.get('lifetime') overwrite = False bring_online = 21000 # if the source for transfer is a tape rse, set bring_online if req['request_type'] == RequestType.TRANSFER\ and rse_core.get_rse(sources[0][0]).rse_type == RSEType.TAPE: bring_online = 21000 # never overwrite on tape destinations if req['request_type'] == RequestType.TRANSFER\ and rse_core.get_rse(None, rse_id=req['dest_rse_id']).rse_type == RSEType.TAPE: overwrite = False # exclude destination replica from source source_surls = [s[1] for s in sources] if req['request_type'] == RequestType.STAGEIN and source_surls.sort() == destinations.sort(): logging.debug('STAGING REQUEST %s - Will not try to ignore equivalent sources' % req['request_id']) elif req['request_type'] == RequestType.STAGEIN: logging.debug('STAGING REQUEST %s - Forcing destination to source' % req['request_id']) destinations = source_surls else: new_sources = source_surls for source_surl in source_surls: if source_surl in destinations: logging.info('Excluding source %s for request %s' % (source_surl, req['request_id'])) new_sources.remove(source_surl) # make sure we only use one source when bring_online is needed if bring_online and len(new_sources) > 1: source_surls = [new_sources[0]] logging.info('Only using first source %s for bring_online request %s' % (source_surls, req['request_id'])) if not source_surls: logging.error('All sources excluded - SKIP REQUEST %s' % req['request_id']) return # Sources are properly set, so now we can finally force the source RSE to the destination RSE for STAGEIN if req['request_type'] == RequestType.STAGEIN: tmp_metadata['dst_rse'] = sources[0][0] # get external host if 
rse_core.get_rse(rse['rse'])['staging_area'] or rse['rse'].endswith("STAGING"): rse_attr = rse_core.list_rse_attributes(sources[0][0]) else: rse_attr = rse_core.list_rse_attributes(rse['rse'], rse['id']) fts_hosts = rse_attr.get('fts', None) retry_count = req['retry_count'] if not retry_count: retry_count = 0 if not fts_hosts: logging.error('Destination RSE %s FTS attribute not defined - SKIP REQUEST %s' % (rse['rse'], req['request_id'])) return fts_list = fts_hosts.split(",") external_host = fts_list[retry_count/len(fts_list)] transfer = {'request_id': req['request_id'], 'src_urls': source_surls, 'dest_urls': destinations, 'filesize': filesize, 'md5': md5, 'adler32': adler32, 'src_spacetoken': src_spacetoken, 'dest_spacetoken': dest_spacetoken, 'activity': req['activity'], 'overwrite': overwrite, 'bring_online': bring_online, 'copy_pin_lifetime': copy_pin_lifetime, 'external_host': external_host, 'file_metadata': tmp_metadata} return transfer
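# Illustrative sketch: how the destination's 'fts' RSE attribute (a comma-separated list
# of servers) combines with the request retry count in get_transfer above. The original
# uses Python 2 '/' (integer division on ints); the '//' and the clamp below are
# assumptions to keep the index valid, not the original behaviour.
def pick_fts_host(fts_attribute, retry_count):
    fts_list = fts_attribute.split(",")
    index = (retry_count or 0) // len(fts_list)
    return fts_list[min(index, len(fts_list) - 1)]


assert pick_fts_host('https://fts-a.example.org:8446,https://fts-b.example.org:8446', 0) == 'https://fts-a.example.org:8446'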
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60): """ Main loop to check the status of a transfer primitive with a transfertool. """ logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads, bulk)) logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads, bulk)) while not graceful_stop.is_set(): try: ts = time.time() logging.debug('%i:%i - start to poll requests older than %i seconds' % (process, thread, older_than)) reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT], state=RequestState.SUBMITTED, limit=bulk, older_than=datetime.datetime.utcnow()-datetime.timedelta(seconds=older_than), process=process, total_processes=total_processes, thread=thread, total_threads=total_threads) record_timer('daemons.conveyor.poller.000-get_next', (time.time()-ts)*1000) if reqs: logging.debug('%i:%i - polling %i requests' % (process, thread, len(reqs))) if not reqs or reqs == []: if once: break logging.debug("no requests found. will sleep 60 seconds") time.sleep(60) # Only sleep if there is nothing to do continue for xfers in chunks(reqs, bulk): try: req_ids = {} for req in xfers: record_counter('daemons.conveyor.poller.query_request') if not req['external_host'] in req_ids: req_ids[req['external_host']] = [] req_ids[req['external_host']].append((req['request_id'], req['external_id'])) responses = {} for external_host in req_ids: ts = time.time() resps = request.bulk_query_requests(external_host, req_ids[external_host], 'fts3') record_timer('daemons.conveyor.poller.001-bulk_query_requests', (time.time()-ts)*1000/len(req_ids[external_host])) responses = dict(responses.items() + resps.items()) for external_host in req_ids: for request_id, external_id in req_ids[external_host]: response = responses[request_id] if isinstance(response, Exception): logging.critical("Failed to poll request(%s) with FTS(%s) job (%s): %s" % (request_id, external_host, external_id, responses[request_id])) record_counter('daemons.conveyor.poller.query_request_exception') response = {'new_state': None, 'request_id': request_id, 'transfer_id': external_id, 'job_state': None} common.update_request_state(response) record_counter('daemons.conveyor.poller.update_request_state') if response['new_state'] == RequestState.LOST: record_counter('daemons.conveyor.poller.request_lost') except: logging.critical(traceback.format_exc()) except: logging.critical(traceback.format_exc()) if once: return logging.debug('%i:%i - graceful stop requests' % (process, thread)) logging.debug('%i:%i - graceful stop done' % (process, thread))
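# Illustrative sketch of the grouping step in the poller above: (request_id, external_id)
# pairs are collected per FTS host so that a single bulk query is issued per host. Note
# that the original merges results with dict(a.items() + b.items()), which only works on
# Python 2; dict.update() is the portable equivalent. bulk_query is a hypothetical
# stand-in for request.bulk_query_requests.
from collections import defaultdict


def poll_by_host_sketch(reqs, bulk_query):
    by_host = defaultdict(list)
    for req in reqs:
        by_host[req['external_host']].append((req['request_id'], req['external_id']))
    responses = {}
    for host, pairs in by_host.items():
        responses.update(bulk_query(host, pairs))    # one round-trip per FTS host
    return responses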
def poll_transfers(external_host, xfers, prepend_str='', request_ids=None, timeout=None): """ Poll a list of transfers from an FTS server :param external_host: The FTS server to query from. :param xfrs: List of transfers to poll. :param process: Process number. :param thread: Thread number. :param timeout: Timeout. """ try: if TRANSFER_TOOL == 'mock': logging.debug(prepend_str + 'Setting %s transfer requests status to DONE per mock tool' % (len(xfers))) for task_id in xfers: ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=RequestState.DONE) record_counter('daemons.conveyor.poller.update_request_state.%s' % ret) return try: tss = time.time() logging.info(prepend_str + 'Polling %i transfers against %s with timeout %s' % (len(xfers), external_host, timeout)) resps = transfer_core.bulk_query_transfers(external_host, xfers, TRANSFER_TOOL, timeout) record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers)) except TransferToolTimeout as error: logging.error(prepend_str + str(error)) return except TransferToolWrongAnswer as error: logging.error(prepend_str + str(error)) logging.error(prepend_str + 'Problem querying %s on %s. All jobs are being checked individually' % (str(xfers), external_host)) for xfer in xfers: try: logging.debug(prepend_str + 'Checking %s on %s' % (xfer, external_host)) status = transfer_core.bulk_query_transfers(external_host, [xfer, ], TRANSFER_TOOL, timeout) if xfer in status and isinstance(status[xfer], Exception): logging.error(prepend_str + 'Problem querying %s on %s . Error returned : %s' % (xfer, external_host, str(status[xfer]))) except Exception as err: logging.error(prepend_str + 'Problem querying %s on %s . Error returned : %s' % (xfer, external_host, str(err))) break return except RequestException as error: logging.error(prepend_str + "Failed to contact FTS server: %s" % (str(error))) return except Exception: logging.error(prepend_str + "Failed to query FTS info: %s" % (traceback.format_exc())) return logging.debug(prepend_str + 'Polled %s transfer requests status in %s seconds' % (len(xfers), (time.time() - tss))) tss = time.time() logging.debug(prepend_str + 'Updating %s transfer requests status' % (len(xfers))) cnt = 0 if TRANSFER_TOOL == 'globus': for task_id in resps: ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=resps[task_id]) record_counter('daemons.conveyor.poller.update_request_state.%s' % ret) else: for transfer_id in resps: try: transf_resp = resps[transfer_id] # transf_resp is None: Lost. # is Exception: Failed to get fts job status. # is {}: No terminated jobs. # is {request_id: {file_status}}: terminated jobs. if transf_resp is None: transfer_core.update_transfer_state(external_host, transfer_id, RequestState.LOST, logging_prepend_str=prepend_str) record_counter('daemons.conveyor.poller.transfer_lost') elif isinstance(transf_resp, Exception): logging.warning(prepend_str + "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp)) record_counter('daemons.conveyor.poller.query_transfer_exception') else: for request_id in transf_resp: if request_id in request_ids: ret = request_core.update_request_state(transf_resp[request_id], logging_prepend_str=prepend_str) # if True, really update request content; if False, only touch request if ret: cnt += 1 record_counter('daemons.conveyor.poller.update_request_state.%s' % ret) # should touch transfers. 
# Otherwise if one bulk transfer includes many requests and one is not terminated, the transfer will be polled again. transfer_core.touch_transfer(external_host, transfer_id) except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]: logging.warn(prepend_str + "Lock detected when handling request %s - skipping" % request_id) else: logging.error(traceback.format_exc()) logging.debug(prepend_str + 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds' % (len(xfers), cnt, (time.time() - tss))) except Exception: logging.error(traceback.format_exc())
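# Illustrative sketch of the degradation path in poll_transfers above: when the bulk
# query returns a malformed answer, every transfer is re-queried on its own so the
# offending job can be identified. query_one is a hypothetical stand-in for a
# single-element bulk_query_transfers call.
import logging


def isolate_bad_transfer_sketch(external_host, transfer_ids, query_one):
    for transfer_id in transfer_ids:
        try:
            status = query_one(external_host, transfer_id)
            if isinstance(status.get(transfer_id), Exception):
                logging.error('Problem querying %s on %s: %s', transfer_id, external_host, status[transfer_id])
        except Exception as error:
            logging.error('Problem querying %s on %s: %s', transfer_id, external_host, str(error))
            break    # the daemon stops at the first hard failure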
def submitter(once=False, rses=[], mock=False, process=0, total_processes=1, total_threads=1, bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto', activities=None, sleep_time=600, max_sources=4, retry_other_fts=False): """ Main loop to submit a new transfer primitive to a transfertool. """ logging.info('Transfer submitter starting - process (%i/%i) threads (%i)' % (process, total_processes, total_threads)) try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = None try: failover_scheme = config_get('conveyor', 'failover_scheme') except NoOptionError: failover_scheme = None try: timeout = config_get('conveyor', 'submit_timeout') timeout = float(timeout) except NoOptionError: timeout = None try: bring_online = config_get('conveyor', 'bring_online') except NoOptionError: bring_online = 43200 try: max_time_in_queue = {} timelife_conf = config_get('conveyor', 'max_time_in_queue') timelife_confs = timelife_conf.split(",") for conf in timelife_confs: act, timelife = conf.split(":") max_time_in_queue[act.strip()] = int(timelife.strip()) except NoOptionError: max_time_in_queue = {} if 'default' not in max_time_in_queue: max_time_in_queue['default'] = 168 logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue) executable = ' '.join(sys.argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) hb = heartbeat.live(executable, hostname, pid, hb_thread) logging.info( 'Transfer submitter started - process (%i/%i) threads (%i/%i) timeout (%s)' % (process, total_processes, hb['assign_thread'], hb['nr_threads'], timeout)) threadPool = ThreadPool(total_threads) activity_next_exe_time = defaultdict(time.time) while not graceful_stop.is_set(): try: hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) if activities is None: activities = [None] if rses: rse_ids = [rse['id'] for rse in rses] else: rse_ids = None for activity in activities: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue user_transfer = False if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']: logging.info("CMS user transfer activity") user_transfer = True logging.info( "%s:%s Starting to get transfer transfers for %s" % (process, hb['assign_thread'], activity)) ts = time.time() transfers = __get_transfers(process=process, total_processes=total_processes, thread=hb['assign_thread'], total_threads=hb['nr_threads'], failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, schemes=scheme, mock=mock, max_sources=max_sources, bring_online=bring_online, retry_other_fts=retry_other_fts) record_timer( 'daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1)) record_counter( 'daemons.conveyor.transfer_submitter.get_transfers', len(transfers)) record_timer( 'daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers)) logging.info( "%s:%s Got %s transfers for %s" % (process, hb['assign_thread'], len(transfers), activity)) # group transfers logging.info("%s:%s Starting to group transfers for %s" % (process, hb['assign_thread'], activity)) ts = time.time() grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue) record_timer( 'daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1)) 
logging.info("%s:%s Starting to submit transfers for %s" % (process, hb['assign_thread'], activity)) for external_host in grouped_jobs: if not user_transfer: for job in grouped_jobs[external_host]: # submit transfers job_requests = makeRequests( submit_transfer, args_list=[((), { 'external_host': external_host, 'job': job, 'submitter': 'transfer_submitter', 'process': process, 'thread': hb['assign_thread'], 'timeout': timeout })]) [ threadPool.putRequest(job_req) for job_req in job_requests ] else: for user, jobs in grouped_jobs[ external_host].iteritems(): # submit transfers for job in jobs: job_requests = makeRequests( submit_transfer, args_list=[((), { 'external_host': external_host, 'job': job, 'submitter': 'transfer_submitter', 'process': process, 'thread': hb['assign_thread'], 'timeout': timeout, 'user_transfer_job': user_transfer })]) [ threadPool.putRequest(job_req) for job_req in job_requests ] threadPool.wait() if len(transfers) < group_bulk: logging.info( '%i:%i - only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (process, hb['assign_thread'], len(transfers), activity, group_bulk, sleep_time)) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time( ) + sleep_time except: logging.critical( '%s:%s %s' % (process, hb['assign_thread'], traceback.format_exc())) if once: break logging.info('%s:%s graceful stop requested' % (process, hb['assign_thread'])) threadPool.dismissWorkers(total_threads, do_join=True) heartbeat.die(executable, hostname, pid, hb_thread) logging.info('%s:%s graceful stop done' % (process, hb['assign_thread'])) return
def submit_transfer(external_host, job, submitter='submitter', timeout=None, user_transfer_job=False, logger=logging.log, transfertool=TRANSFER_TOOL): """ Submit a transfer or staging request :param external_host: FTS server to submit to. :param job: Job dictionary. :param submitter: Name of the submitting entity. :param timeout: Timeout :param user_transfer_job: Parameter for transfer with user credentials :param logger: Optional decorated logger that can be passed from the calling daemons or servers. """ # Prepare submitting xfers_ret = {} try: for t_file in job['files']: file_metadata = t_file['metadata'] request_id = file_metadata['request_id'] log_str = 'PREPARING REQUEST %s DID %s:%s TO SUBMITTING STATE PREVIOUS %s FROM %s TO %s USING %s ' % (file_metadata['request_id'], file_metadata['scope'], file_metadata['name'], file_metadata['previous_attempt_id'] if 'previous_attempt_id' in file_metadata else None, t_file['sources'], t_file['destinations'], external_host) xfers_ret[request_id] = {'state': RequestState.SUBMITTING, 'external_host': external_host, 'external_id': None, 'dest_url': t_file['destinations'][0]} logger(logging.INFO, "%s", log_str) xfers_ret[request_id]['file'] = t_file logger(logging.DEBUG, 'Start to prepare transfer') transfer_core.prepare_sources_for_transfers(xfers_ret) logger(logging.DEBUG, 'Finished to prepare transfer') except RequestNotFound as error: logger(logging.ERROR, str(error)) return except Exception: logger(logging.ERROR, 'Failed to prepare requests %s state to SUBMITTING (Will not submit jobs but return directly) with error: %s' % (list(xfers_ret.keys())), exc_info=True) return # Prepare the dictionary for xfers results xfers_ret = {} for t_file in job['files']: file_metadata = t_file['metadata'] request_id = file_metadata['request_id'] xfers_ret[request_id] = {'scope': file_metadata['scope'], 'name': file_metadata['name'], 'external_host': external_host, 'external_id': None, 'request_type': t_file.get('request_type', None), 'dst_rse': file_metadata.get('dst_rse', None), 'src_rse': file_metadata.get('src_rse', None), 'src_rse_id': file_metadata['src_rse_id'], 'metadata': file_metadata} # Submit the job try: start_time = time.time() logger(logging.INFO, 'About to submit job to %s with timeout %s' % (external_host, timeout)) # A eid is returned if the job is properly submitted otherwise an exception is raised eid = transfer_core.submit_bulk_transfers(external_host, files=job['files'], transfertool=transfertool, job_params=job['job_params'], timeout=timeout, user_transfer_job=user_transfer_job) duration = time.time() - start_time logger(logging.INFO, 'Submit job %s to %s in %s seconds' % (eid, external_host, duration)) record_timer('daemons.conveyor.%s.submit_bulk_transfer.per_file' % submitter, (time.time() - start_time) * 1000 / len(job['files'])) record_counter('daemons.conveyor.%s.submit_bulk_transfer' % submitter, len(job['files'])) record_timer('daemons.conveyor.%s.submit_bulk_transfer.files' % submitter, len(job['files'])) # Update all the requests to SUBMITTED and cancel the transfer job in case the update failed try: for t_file in job['files']: file_metadata = t_file['metadata'] request_id = file_metadata['request_id'] xfers_ret[request_id]['state'] = RequestState.SUBMITTED xfers_ret[request_id]['external_id'] = eid logger(logging.INFO, 'COPYING REQUEST %s DID %s:%s USING %s with state(%s) with eid(%s)' % (file_metadata['request_id'], file_metadata['scope'], file_metadata['name'], external_host, RequestState.SUBMITTED, eid)) logger(logging.DEBUG, 
'Start to bulk register transfer state for eid %s' % eid) transfer_core.set_transfers_state(xfers_ret, datetime.datetime.utcnow()) logger(logging.DEBUG, 'Finished to register transfer state',) except Exception: logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True) try: logger(logging.INFO, 'Cancel transfer %s on %s', eid, external_host) request.cancel_request_external_id(eid, external_host) except Exception: # The job is still submitted in the file transfer service but the request is not updated. Possibility to have a double submission during the next cycle logger(logging.ERROR, 'Failed to cancel transfers %s on %s with error' % (eid, external_host), exc_info=True) # This exception is raised if one job is already submitted for one file except DuplicateFileTransferSubmission as error: logger(logging.WARNING, 'Failed to submit a job because of duplicate file : %s', str(error)) logger(logging.INFO, 'Submitting files one by one') try: # In this loop we submit the jobs and update the requests state one by one single_xfers_ret = {} for t_file in job['files']: single_xfers_ret = {} single_xfers_ret[request_id] = xfers_ret[request_id] file_metadata = t_file['metadata'] request_id = file_metadata['request_id'] start_time = time.time() logger(logging.INFO, 'About to submit job to %s with timeout %s', external_host, timeout) eid = transfer_core.submit_bulk_transfers(external_host, files=[t_file], transfertool=transfertool, job_params=job['job_params'], timeout=timeout, user_transfer_job=user_transfer_job) duration = time.time() - start_time logger(logging.INFO, 'Submit job %s to %s in %s seconds' % (eid, external_host, duration)) record_timer('daemons.conveyor.%s.submit_bulk_transfer.per_file' % submitter, (time.time() - start_time) * 1000) record_counter('daemons.conveyor.%s.submit_bulk_transfer' % submitter, 1) record_timer('daemons.conveyor.%s.submit_bulk_transfer.files' % submitter, 1) single_xfers_ret[request_id]['state'] = RequestState.SUBMITTED single_xfers_ret[request_id]['external_id'] = eid logger(logging.INFO, 'COPYING REQUEST %s DID %s:%s USING %s with state(%s) with eid(%s)' % (file_metadata['request_id'], file_metadata['scope'], file_metadata['name'], external_host, RequestState.SUBMITTED, eid)) try: logger(logging.DEBUG, 'Start to register transfer state') transfer_core.set_transfers_state(single_xfers_ret, datetime.datetime.utcnow()) logger(logging.DEBUG, 'Finished to register transfer state') except Exception: logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True) try: logger(logging.INFO, 'Cancel transfer %s on %s', eid, external_host) request.cancel_request_external_id(eid, external_host) except Exception: logger(logging.ERROR, 'Failed to cancel transfers %s on %s with error' % (eid, external_host), exc_info=True) except (DuplicateFileTransferSubmission, TransferToolTimeout, TransferToolWrongAnswer, Exception) as error: request_id = single_xfers_ret.keys()[0] single_xfers_ret[request_id]['state'] = RequestState.SUBMISSION_FAILED logger(logging.ERROR, 'Cannot submit the job for request %s : %s', request_id, str(error)) try: logger(logging.DEBUG, 'Update the transfer state to fail') transfer_core.set_transfers_state(single_xfers_ret, datetime.datetime.utcnow()) logger(logging.DEBUG, 'Finished to register transfer state') except Exception: logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True) # No need to cancel the job here # The following exceptions are raised if the job failed to be 
submitted except (TransferToolTimeout, TransferToolWrongAnswer) as error: logger(logging.ERROR, str(error)) try: for t_file in job['files']: file_metadata = t_file['metadata'] request_id = file_metadata['request_id'] xfers_ret[request_id]['state'] = RequestState.SUBMISSION_FAILED logger(logging.DEBUG, 'Start to register transfer state') transfer_core.set_transfers_state(xfers_ret, datetime.datetime.utcnow()) logger(logging.DEBUG, 'Finished to register transfer state') except Exception: logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True) # No need to cancel the job here except Exception as error: logger(logging.ERROR, 'Failed to submit a job with error %s', str(error), exc_info=True)
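# Illustrative sketch of the submission fallback in submit_transfer above: submit the
# whole job in one bulk call and, on a duplicate-submission error, retry the files one
# by one so that only the duplicate fails. The exception class below is a local
# stand-in for Rucio's DuplicateFileTransferSubmission; submit_bulk and mark_submitted
# are hypothetical stand-ins for transfer_core.submit_bulk_transfers / set_transfers_state.
class DuplicateSubmissionSketch(Exception):
    pass


def submit_with_fallback_sketch(external_host, files, submit_bulk, mark_submitted):
    try:
        eid = submit_bulk(external_host, files)
        mark_submitted(files, eid)
        return [eid]
    except DuplicateSubmissionSketch:
        eids = []
        for one_file in files:                    # one job per file, duplicates fail alone
            try:
                eid = submit_bulk(external_host, [one_file])
                mark_submitted([one_file], eid)
                eids.append(eid)
            except Exception:
                continue
        return eids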
def handle_requests(reqs, suspicious_patterns): """ used by finisher to handle terminated requests, :param reqs: List of requests. """ undeterministic_rses = get_undeterministic_rses() rses_info, protocols = {}, {} replicas = {} for req in reqs: try: replica = {'scope': req['scope'], 'name': req['name'], 'rse_id': req['dest_rse_id'], 'bytes': req['bytes'], 'adler32': req['adler32'], 'request_id': req['request_id']} replica['pfn'] = req['dest_url'] replica['request_type'] = req['request_type'] replica['error_message'] = None if req['request_type'] not in replicas: replicas[req['request_type']] = {} if req['rule_id'] not in replicas[req['request_type']]: replicas[req['request_type']][req['rule_id']] = [] if req['state'] == RequestState.DONE: replica['state'] = ReplicaState.AVAILABLE replica['archived'] = False # for TAPE, replica path is needed if req['request_type'] == RequestType.TRANSFER and req['dest_rse_id'] in undeterministic_rses: if req['dest_rse_id'] not in rses_info: dest_rse = rse_core.get_rse_name(rse_id=req['dest_rse_id']) rses_info[req['dest_rse_id']] = rsemanager.get_rse_info(dest_rse) pfn = req['dest_url'] scheme = urlparse(pfn).scheme dest_rse_id_scheme = '%s_%s' % (req['dest_rse_id'], scheme) if dest_rse_id_scheme not in protocols: protocols[dest_rse_id_scheme] = rsemanager.create_protocol(rses_info[req['dest_rse_id']], 'write', scheme) path = protocols[dest_rse_id_scheme].parse_pfns([pfn])[pfn]['path'] replica['path'] = os.path.join(path, os.path.basename(pfn)) # replica should not be added to replicas until all info are filled replicas[req['request_type']][req['rule_id']].append(replica) elif req['state'] == RequestState.FAILED: check_suspicious_files(req, suspicious_patterns) if request_core.should_retry_request(req): tss = time.time() new_req = request_core.requeue_and_archive(req['request_id']) record_timer('daemons.conveyor.common.update_request_state.request-requeue_and_archive', (time.time() - tss) * 1000) if new_req: # should_retry_request and requeue_and_archive are not in one session, # another process can requeue_and_archive and this one will return None. 
logging.warn('REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'], req['name'], req['request_id'], new_req['request_id'], new_req['retry_count'])) else: logging.warn('EXCEEDED DID %s:%s REQUEST %s' % (req['scope'], req['name'], req['request_id'])) replica['state'] = ReplicaState.UNAVAILABLE replica['archived'] = False replica['error_message'] = req['err_msg'] if req['err_msg'] else get_transfer_error(req['state']) replicas[req['request_type']][req['rule_id']].append(replica) elif req['state'] == RequestState.SUBMITTING or req['state'] == RequestState.SUBMISSION_FAILED or req['state'] == RequestState.LOST: if req['updated_at'] > (datetime.datetime.utcnow() - datetime.timedelta(minutes=120)): continue if request_core.should_retry_request(req): tss = time.time() new_req = request_core.requeue_and_archive(req['request_id']) record_timer('daemons.conveyor.common.update_request_state.request-requeue_and_archive', (time.time() - tss) * 1000) if new_req: logging.warn('REQUEUED SUBMITTING DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'], req['name'], req['request_id'], new_req['request_id'], new_req['retry_count'])) else: logging.warn('EXCEEDED SUBMITTING DID %s:%s REQUEST %s' % (req['scope'], req['name'], req['request_id'])) replica['state'] = ReplicaState.UNAVAILABLE replica['archived'] = False replica['error_message'] = req['err_msg'] if req['err_msg'] else get_transfer_error(req['state']) replicas[req['request_type']][req['rule_id']].append(replica) elif req['state'] == RequestState.NO_SOURCES or req['state'] == RequestState.ONLY_TAPE_SOURCES or req['state'] == RequestState.MISMATCH_SCHEME: if request_core.should_retry_request(req): tss = time.time() new_req = request_core.requeue_and_archive(req['request_id']) record_timer('daemons.conveyor.common.update_request_state.request-requeue_and_archive', (time.time() - tss) * 1000) if new_req: # should_retry_request and requeue_and_archive are not in one session, # another process can requeue_and_archive and this one will return None. logging.warn('REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'], req['name'], req['request_id'], new_req['request_id'], new_req['retry_count'])) else: logging.warn('EXCEEDED DID %s:%s REQUEST %s' % (req['scope'], req['name'], req['request_id'])) replica['state'] = ReplicaState.UNAVAILABLE # should be broken here replica['archived'] = False replica['error_message'] = req['err_msg'] if req['err_msg'] else get_transfer_error(req['state']) replicas[req['request_type']][req['rule_id']].append(replica) except: logging.error("Something unexpected happened when handling request %s(%s:%s) at %s: %s" % (req['request_id'], req['scope'], req['name'], req['dest_rse_id'], traceback.format_exc())) handle_terminated_replicas(replicas)
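# Illustrative sketch of the grouping used by handle_requests above: terminated requests
# are bucketed by request type and then by rule id, so handle_terminated_replicas can
# update the catalog rule by rule. Plain dicts stand in for the Rucio request objects.
from collections import defaultdict


def group_replicas_sketch(reqs):
    replicas = defaultdict(lambda: defaultdict(list))    # request_type -> rule_id -> [replica]
    for req in reqs:
        replicas[req['request_type']][req['rule_id']].append(
            {'scope': req['scope'], 'name': req['name'],
             'rse_id': req['dest_rse_id'], 'request_id': req['request_id']})
    return replicas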
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, once=False, dataset_lifetime=None): sleep(sleep_time * (total_workers - worker_number) / total_workers) while not graceful_stop.is_set(): starttime = time() logging.info('Thread [%i/%i] : Getting data distribution' % (worker_number, total_workers)) probabilities, data = get_data_distribution(inputfile) logging.debug('Thread [%i/%i] : Probabilities %s' % (worker_number, total_workers, probabilities)) account = 'root' scope = 'tests' now = datetime.now() dsn_extension = '%s.%s.%s.%s' % (now.year, now.month, now.day, generate_uuid()) totretries = 3 status = False for site in sites: for retry in xrange(0, totretries): ts = time() tmpdir = tempfile.mkdtemp() logging.info('Thread [%i/%i] : Running on site %s' % (worker_number, total_workers, site)) d = choose_element(probabilities, data) metadata = d['metadata'] metadata['version'] = str(random.randint(0, 1000)) metadata['run_number'] = str(random.randint(0, 100000)) uuid = generate_uuid() metadata['stream_name'] = 'automatix_stream' metadata['campaign'] = 'automatix_campaign' try: nbfiles = d['nbfiles'] except KeyError: nbfiles = 2 logging.warning('Thread [%i/%i] : No nbfiles defined in the configuration, will use 2' % (worker_number, total_workers)) try: filesize = d['filesize'] except KeyError: filesize = 1000000 logging.warning('Thread [%i/%i] : No filesize defined in the configuration, will use 1M files' % (worker_number, total_workers)) dsn = 'tests:%s.%s.%s.%s.%s.%s' % (metadata['project'], metadata['run_number'], metadata['stream_name'], metadata['prod_step'], metadata['datatype'], metadata['version']) fnames = [] lfns = [] for nb in xrange(nbfiles): fname = '%s.%s' % (metadata['datatype'], generate_uuid()) lfns.append(fname) fname = '%s/%s' % (tmpdir, fname) logging.info('Thread [%(worker_number)s/%(total_workers)s] : Generating file %(fname)s in dataset %(dsn)s' % locals()) generate_file(fname, filesize) fnames.append(fname) logging.info('Thread [%i/%i] : Upload %s to %s' % (worker_number, total_workers, dsn, site)) status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, worker_number=worker_number, total_workers=total_workers, dataset_lifetime=dataset_lifetime, did=dsn) for fname in fnames: remove(fname) rmdir(tmpdir) if status: monitor.record_counter(counters='automatix.addnewdataset.done', delta=1) monitor.record_counter(counters='automatix.addnewfile.done', delta=nbfiles) monitor.record_timer('automatix.datasetinjection', (time() - ts) * 1000) break else: logging.info('Thread [%i/%i] : Failed to upload files. Will retry another time (attempt %s/%s)' % (worker_number, total_workers, str(retry + 1), str(totretries))) if once is True: logging.info('Thread [%i/%i] : Run with once mode. Exiting' % (worker_number, total_workers)) break tottime = time() - starttime if status: logging.info('Thread [%i/%i] : It took %s seconds to upload one dataset on %s' % (worker_number, total_workers, str(tottime), str(sites))) if sleep_time > tottime: logging.info('Thread [%i/%i] : Will sleep for %s seconds' % (worker_number, total_workers, str(sleep_time - tottime))) sleep(sleep_time - tottime) else: logging.info('Thread [%i/%i] : Retrying a new upload' % (worker_number, total_workers))
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, scope='tests', once=False, dataset_lifetime=None, set_metadata=False): sleep(sleep_time * (total_workers - worker_number) / total_workers) executable = 'automatix' hostname = socket.getfqdn() pid = getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers) logger = formatted_logger(logging.log, prefix + '%s') while not GRACEFUL_STOP.is_set(): heartbeat.live(executable, hostname, pid, hb_thread) starttime = time() prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Getting data distribution') probabilities, data = get_data_distribution(inputfile) logger(logging.DEBUG, 'Probabilities %s', probabilities) totretries = 3 status = False for site in sites: for retry in range(0, totretries): start_time = time() tmpdir = tempfile.mkdtemp() logger(logging.INFO, 'Running on site %s', site) dic = choose_element(probabilities, data) metadata = dic['metadata'] try: nbfiles = dic['nbfiles'] except KeyError: nbfiles = 2 logger( logging.WARNING, 'No nbfiles defined in the configuration, will use 2') try: filesize = dic['filesize'] except KeyError: filesize = 1000000 logger( logging.WARNING, 'No filesize defined in the configuration, will use 1M files' ) dsn = generate_didname(metadata, None, 'dataset') fnames = [] lfns = [] physical_fnames = [] for _ in range(nbfiles): fname = generate_didname(metadata=metadata, dsn=dsn, did_type='file') lfns.append(fname) logger(logging.INFO, 'Generating file %s in dataset %s', fname, dsn) physical_fname = '%s/%s' % (tmpdir, "".join( fname.split('/'))) physical_fnames.append(physical_fname) generate_file(physical_fname, filesize, logger=logger) fnames.append(fname) logger(logging.INFO, 'Upload %s to %s', dsn, site) dsn = '%s:%s' % (scope, dsn) status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, dataset_lifetime=dataset_lifetime, did=dsn, set_metadata=set_metadata, logger=logger) for physical_fname in physical_fnames: remove(physical_fname) rmdir(tmpdir) if status: monitor.record_counter( counters='automatix.addnewdataset.done', delta=1) monitor.record_counter( counters='automatix.addnewfile.done', delta=nbfiles) monitor.record_timer('automatix.datasetinjection', (time() - start_time) * 1000) break else: logger( logging.INFO, 'Failed to upload files. Will retry another time (attempt %s/%s)', str(retry + 1), str(totretries)) if once is True: logger(logging.INFO, 'Run with once mode. Exiting') break tottime = time() - starttime if status: logger(logging.INFO, 'It took %s seconds to upload one dataset on %s', str(tottime), str(sites)) if sleep_time > tottime: logger(logging.INFO, 'Will sleep for %s seconds', str(sleep_time - tottime)) sleep(sleep_time - tottime) else: logger(logging.INFO, 'Retrying a new upload') heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done')
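# Illustrative sketch of the retry loop in automatix above: per site, up to totretries
# attempts are made to generate and upload a fresh dataset, stopping at the first
# success. make_dataset and upload_dataset are hypothetical stand-ins for the
# file-generation and upload steps of the daemon.
def inject_with_retries_sketch(site, make_dataset, upload_dataset, totretries=3):
    for _ in range(totretries):
        dsn, files = make_dataset()
        if upload_dataset(site, dsn, files):
            return True                           # success, no further retry on this site
    return False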
def get_destinations(rse_info, scheme, req, sources): dsn = 'other' pfn = {} paths = {} if not rse_info['deterministic']: ts = time.time() # get rule scope and name if req['attributes']: if type(req['attributes']) is dict: req_attributes = json.loads(json.dumps(req['attributes'])) else: req_attributes = json.loads(str(req['attributes'])) if 'ds_name' in req_attributes: dsn = req_attributes["ds_name"] if dsn == 'other': # select a containing dataset for parent in did.list_parent_dids(req['scope'], req['name']): if parent['type'] == DIDType.DATASET: dsn = parent['name'] break record_timer('daemons.conveyor.submitter.list_parent_dids', (time.time() - ts) * 1000) # always use SRM ts = time.time() nondet = rsemgr.create_protocol(rse_info, 'write', scheme='srm') record_timer('daemons.conveyor.submitter.create_protocol', (time.time() - ts) * 1000) # if there exists a prefix for SRM, use it prefix = '' for s in rse_info['protocols']: if s['scheme'] == 'srm': prefix = s['prefix'] # DQ2 path always starts with /, but prefix might not end with / path = construct_surl_DQ2(dsn, req['name']) # retrial transfers to tape need a new filename - add timestamp if req['request_type'] == RequestType.TRANSFER\ and 'previous_attempt_id' in req\ and req['previous_attempt_id']\ and rse_info['rse_type'] == 'TAPE': # TODO: RUCIO-809 - rsemanager: get_rse_info -> rse_type is string instead of RSEType path = '%s_%i' % (path, int(time.time())) logging.debug('Retrial transfer request %s DID %s:%s to tape %s renamed to %s' % (req['request_id'], req['scope'], req['name'], rse_info['rse'], path)) tmp_path = '%s%s' % (prefix[:-1], path) if prefix[-1] != '/': tmp_path = '%s%s' % (prefix, path) paths[req['scope'], req['name']] = path # add the hostname pfn['%s:%s' % (req['scope'], req['name'])] = nondet.path2pfn(tmp_path) if req['request_type'] == RequestType.STAGEIN: if len(sources) == 1: pfn['%s:%s' % (req['scope'], req['name'])] = sources[0][1] else: # TODO: need to check return None, None # we must set the destination path for nondeterministic replicas explicitly replica.update_replicas_paths([{'scope': req['scope'], 'name': req['name'], 'rse_id': req['dest_rse_id'], 'path': path}]) else: ts = time.time() try: pfn = rsemgr.lfns2pfns(rse_info, lfns=[{'scope': req['scope'], 'name': req['name']}], scheme=scheme) except RSEProtocolNotSupported: logging.warn('%s not supported by %s' % (scheme, rse_info['rse'])) return None, None record_timer('daemons.conveyor.submitter.lfns2pfns', (time.time() - ts) * 1000) destinations = [] for k in pfn: if isinstance(pfn[k], (str, unicode)): destinations.append(pfn[k]) elif isinstance(pfn[k], (tuple, list)): for url in pfn[k]: destinations.append(pfn[k][url]) protocols = None try: protocols = rsemgr.select_protocol(rse_info, 'write', scheme=scheme) except RSEProtocolNotSupported: logging.warn('%s not supported by %s' % (scheme, rse_info['rse'])) return None, None # we need to set the spacetoken if we use SRM dest_spacetoken = None if scheme == 'srm': dest_spacetoken = protocols['extended_attributes']['space_token'] return destinations, dest_spacetoken
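# Illustrative sketch of the path assembly for non-deterministic RSEs in
# get_destinations above: the DQ2-style path always starts with '/', the protocol
# prefix may or may not end with '/', and the two are joined without doubling the
# slash (mirrors the prefix[:-1] handling). The example prefix is made up.
def join_prefix_and_path(prefix, path):
    if prefix.endswith('/'):
        return prefix[:-1] + path
    return prefix + path


assert join_prefix_and_path('/dpm/example.org/home/rucio/', '/tests/file') == '/dpm/example.org/home/rucio/tests/file'
assert join_prefix_and_path('/dpm/example.org/home/rucio', '/tests/file') == '/dpm/example.org/home/rucio/tests/file'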
def submitter(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, max_sources=4, retry_other_fts=False): """ Main loop to submit a new transfer primitive to a transfertool. """ try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = None try: failover_scheme = config_get('conveyor', 'failover_scheme') except NoOptionError: failover_scheme = None try: timeout = config_get('conveyor', 'submit_timeout') timeout = float(timeout) except NoOptionError: timeout = None try: bring_online = config_get('conveyor', 'bring_online') except NoOptionError: bring_online = 43200 try: max_time_in_queue = {} timelife_conf = config_get('conveyor', 'max_time_in_queue') timelife_confs = timelife_conf.split(",") for conf in timelife_confs: act, timelife = conf.split(":") max_time_in_queue[act.strip()] = int(timelife.strip()) except NoOptionError: max_time_in_queue = {} if 'default' not in max_time_in_queue: max_time_in_queue['default'] = 168 logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue) activity_next_exe_time = defaultdict(time.time) executable = sys.argv[0] if activities: activities.sort() executable += '--activities ' + str(activities) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info('%s Submitter starting with timeout %s', prepend_str, timeout) time.sleep( 10 ) # To prevent running on the same partition if all the poller restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info('%s Transfer submitter started', prepend_str) while not graceful_stop.is_set(): try: heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) if activities is None: activities = [None] if rses: rse_ids = [rse['id'] for rse in rses] else: rse_ids = None for activity in activities: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue user_transfer = False if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']: logging.info('%s CMS user transfer activity', prepend_str) user_transfer = True logging.info('%s Starting to get transfer transfers for %s', prepend_str, activity) start_time = time.time() transfers = __get_transfers( total_workers=heart_beat['nr_threads'] - 1, worker_number=heart_beat['assign_thread'], failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, schemes=scheme, mock=mock, max_sources=max_sources, bring_online=bring_online, retry_other_fts=retry_other_fts) record_timer( 'daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1)) record_counter( 'daemons.conveyor.transfer_submitter.get_transfers', len(transfers)) GET_TRANSFERS_COUNTER.inc(len(transfers)) record_timer( 'daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers)) logging.info('%s Got %s transfers for %s in %s seconds', prepend_str, len(transfers), activity, time.time() - start_time) # group transfers logging.info('%s Starting to group transfers 
for %s', prepend_str, activity) start_time = time.time() grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue) record_timer( 'daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1)) logging.info('%s Starting to submit transfers for %s', prepend_str, activity) if TRANSFER_TOOL == 'fts3': for external_host in grouped_jobs: if not user_transfer: for job in grouped_jobs[external_host]: # submit transfers submit_transfer( external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout) else: for _, jobs in iteritems( grouped_jobs[external_host]): # submit transfers for job in jobs: submit_transfer( external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout, user_transfer_job=user_transfer) elif TRANSFER_TOOL == 'globus': if TRANSFER_TYPE == 'bulk': # build bulk job file list per external host to send to submit_transfer for external_host in grouped_jobs: # pad the job with job_params; irrelevant for globus but needed for further rucio parsing submitjob = { 'files': [], 'job_params': grouped_jobs[''][0].get('job_params') } for job in grouped_jobs[external_host]: submitjob.get('files').append( job.get('files')[0]) logging.debug('submitjob: %s' % submitjob) submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout) else: # build single job files and individually send to submit_transfer job_params = grouped_jobs[''][0].get( 'job_params') if grouped_jobs else None for external_host in grouped_jobs: for job in grouped_jobs[external_host]: for file in job['files']: singlejob = { 'files': [file], 'job_params': job_params } logging.debug('singlejob: %s' % singlejob) submit_transfer( external_host=external_host, job=singlejob, submitter='transfer_submitter', logging_prepend_str=prepend_str, timeout=timeout) else: logging.error(prepend_str + 'Unknown transfer tool') if len(transfers) < group_bulk: logging.info( '%s Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', prepend_str, len(transfers), activity, group_bulk, sleep_time) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time( ) + sleep_time except Exception: logging.critical('%s %s', prepend_str, str(traceback.format_exc())) if once: break logging.info('%s Graceful stop requested', prepend_str) heartbeat.die(executable, hostname, pid, hb_thread) logging.info('%s Graceful stop done', prepend_str) return
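# Illustrative sketch of the Globus branch above: in bulk mode the first file of every
# grouped job for a host is folded into one submit job (re-using job_params), while in
# single mode every file becomes its own job. grouped_jobs mirrors the structure
# produced by bulk_group_transfer; the '' key used by the original for job_params is
# simplified here to the first job of each host.
def build_globus_jobs_sketch(grouped_jobs, bulk_mode=True):
    jobs = []
    for external_host, host_jobs in grouped_jobs.items():
        if bulk_mode:
            submitjob = {'files': [], 'job_params': host_jobs[0].get('job_params') if host_jobs else None}
            for job in host_jobs:
                submitjob['files'].append(job['files'][0])
            jobs.append((external_host, submitjob))
        else:
            for job in host_jobs:
                for one_file in job['files']:
                    jobs.append((external_host, {'files': [one_file], 'job_params': job.get('job_params')}))
    return jobs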
def delete_from_storage(replicas, prot, rse_info, staging_areas, auto_exclude_threshold, logger=logging.log): deleted_files = [] rse_name = rse_info['rse'] rse_id = rse_info['id'] noaccess_attempts = 0 pfns_to_bulk_delete = [] try: prot.connect() for replica in replicas: # Physical deletion try: deletion_dict = { 'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_name, 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': deletion_dict['vo'] = replica['scope'].vo logger(logging.DEBUG, 'Deletion ATTEMPT of %s:%s as %s on %s', replica['scope'], replica['name'], replica['pfn'], rse_name) start = time.time() # For STAGING RSEs, no physical deletion if rse_id in staging_areas: logger( logging.WARNING, 'Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', replica['scope'], replica['name'], replica['pfn'], rse_name) deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) continue if replica['pfn']: pfn = replica['pfn'] # sign the URL if necessary if prot.attributes['scheme'] == 'https' and rse_info[ 'sign_url'] is not None: pfn = get_signed_url(rse_id, rse_info['sign_url'], 'delete', pfn) if prot.attributes['scheme'] == 'globus': pfns_to_bulk_delete.append(replica['pfn']) else: prot.delete(pfn) else: logger(logging.WARNING, 'Deletion UNAVAILABLE of %s:%s as %s on %s', replica['scope'], replica['name'], replica['pfn'], rse_name) monitor.record_timer('daemons.reaper.delete.{scheme}.{rse}', (time.time() - start) * 1000, labels={ 'scheme': prot.attributes['scheme'], 'rse': rse_name }) duration = time.time() - start deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) deletion_dict['duration'] = duration add_message('deletion-done', deletion_dict) logger( logging.INFO, 'Deletion SUCCESS of %s:%s as %s on %s in %.2f seconds', replica['scope'], replica['name'], replica['pfn'], rse_name, duration) except SourceNotFound: duration = time.time() - start err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s in %.2f seconds' % ( replica['scope'], replica['name'], replica['pfn'], rse_name, duration) logger(logging.WARNING, '%s', err_msg) deletion_dict['reason'] = 'File Not Found' deletion_dict['duration'] = duration add_message('deletion-not-found', deletion_dict) deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: duration = time.time() - start logger(logging.WARNING, 'Deletion NOACCESS of %s:%s as %s on %s: %s in %.2f', replica['scope'], replica['name'], replica['pfn'], rse_name, str(error), duration) deletion_dict['reason'] = str(error) deletion_dict['duration'] = duration add_message('deletion-failed', deletion_dict) noaccess_attempts += 1 if noaccess_attempts >= auto_exclude_threshold: logger( logging.INFO, 'Too many (%d) NOACCESS attempts for %s. 
RSE will be temporarily excluded.', noaccess_attempts, rse_name) REGION.set('temporary_exclude_%s' % rse_id, True) labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(1) break except Exception as error: duration = time.time() - start logger( logging.CRITICAL, 'Deletion CRITICAL of %s:%s as %s on %s in %.2f seconds : %s', replica['scope'], replica['name'], replica['pfn'], rse_name, duration, str(traceback.format_exc())) deletion_dict['reason'] = str(error) deletion_dict['duration'] = duration add_message('deletion-failed', deletion_dict) if pfns_to_bulk_delete and prot.attributes['scheme'] == 'globus': logger(logging.DEBUG, 'Attempting bulk delete on RSE %s for scheme %s', rse_name, prot.attributes['scheme']) prot.bulk_delete(pfns_to_bulk_delete) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: for replica in replicas: logger(logging.WARNING, 'Deletion NOACCESS of %s:%s as %s on %s: %s', replica['scope'], replica['name'], replica['pfn'], rse_name, str(error)) payload = { 'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_name, 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(error), 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-failed', payload) logger(logging.INFO, 'Cannot connect to %s. RSE will be temporarily excluded.', rse_name) REGION.set('temporary_exclude_%s' % rse_id, True) labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(1) finally: prot.close() return deleted_files
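# Illustrative sketch of the auto-exclusion used by delete_from_storage above: after too
# many NOACCESS errors, or when the protocol cannot connect at all, the RSE is flagged
# in a cache region so later cycles skip it, and a monitoring gauge is raised. A plain
# dict stands in for REGION, and set_gauge for the Prometheus EXCLUDED_RSE_GAUGE.
REGION_SKETCH = {}


def exclude_rse_sketch(rse_id, rse_name, set_gauge):
    REGION_SKETCH['temporary_exclude_%s' % rse_id] = True    # picked up by the next reaper cycle
    set_gauge(rse=rse_name, value=1)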
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, delay_seconds=0): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param worker_number: The worker number. :param child_number: The child number. :param total_children: The total number of children created per worker. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. """ logging.info('Starting Reaper: Worker %(worker_number)s, ' 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses])) pid = os.getpid() thread = threading.current_thread() hostname = socket.gethostname() executable = ' '.join(sys.argv) # Generate a hash just for the subset of RSEs rse_names = [rse['rse'] for rse in rses] hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest() sanity_check(executable=None, hostname=hostname) nothing_to_do = {} while not GRACEFUL_STOP.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) checkpoint_time = datetime.datetime.now() # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals())) max_deleting_rate = 0 for rse in sort_rses(rses): try: if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now(): heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals())) checkpoint_time = datetime.datetime.now() if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now(): continue logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number, rse['rse'], nothing_to_do.get(rse['id'])) rse_info = rsemgr.get_rse_info(rse_id=rse['id']) rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id']) if not rse_protocol['availability_delete']: logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse']) nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30) continue # Temporary hack to force gfal for deletion for protocol in rse_info['protocols']: if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default': protocol['impl'] = 'rucio.rse.protocols.gfal.Default' needed_free_space, max_being_deleted_files = None, 100 needed_free_space_per_child = None if not greedy: max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id']) logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals()) if needed_free_space <= 0: needed_free_space, needed_free_space_per_child = 0, 0 logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse']) else: if total_children and total_children > 0: needed_free_space_per_child = needed_free_space / float(total_children) start = time.time() with 
monitor.record_timer_block('reaper.list_unlocked_replicas'): replicas = list_unlocked_replicas(rse_id=rse['id'], bytes=needed_free_space_per_child, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds) logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas)) if not replicas: nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30) logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s', worker_number, child_number, rse['rse'], nothing_to_do[rse['id']]) continue prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme) for files in chunks(replicas, chunk_size): logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files)) try: update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True) for replica in files: try: replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}], operation='delete', scheme=scheme).values())[0]) except (ReplicaUnAvailable, ReplicaNotFound) as error: err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error)) logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg) replica['pfn'] = None monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files)) try: deleted_files = [] prot.connect() for replica in files: try: deletion_dict = {'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'protocol': prot.attributes['scheme']} if replica['scope'].vo != 'def': deletion_dict['vo'] = replica['scope'].vo logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) start = time.time() if rse['staging_area'] or rse['rse'].endswith("STAGING"): logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) else: if replica['pfn']: pfn = replica['pfn'] # sign the URL if necessary if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None: pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn) prot.delete(pfn) else: logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000) duration = time.time() - start deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) deletion_dict['duration'] = duration add_message('deletion-done', deletion_dict) logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration) except SourceNotFound: err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % 
(replica['scope'], replica['name'], replica['pfn'], rse['rse']) logging.warning(err_msg) deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) if replica['state'] == ReplicaState.AVAILABLE: deletion_dict['reason'] = str(err_msg) add_message('deletion-failed', deletion_dict) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error)) deletion_dict['reason'] = str(error) add_message('deletion-failed', deletion_dict) except Exception as error: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())) deletion_dict['reason'] = str(error) add_message('deletion-failed', deletion_dict) except: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: for replica in files: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error)) payload = {'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(error), 'protocol': prot.attributes['scheme']} if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-failed', payload) break finally: prot.close() start = time.time() with monitor.record_timer_block('reaper.delete_replicas'): delete_replicas(rse_id=rse['id'], files=deleted_files) logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start) monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files)) except DatabaseException as error: logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error)) except UnsupportedOperation as error: logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error)) except: logging.critical(traceback.format_exc()) except RSENotFound as error: logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error)) except: logging.critical(traceback.format_exc()) if once: break time.sleep(1) except DatabaseException as error: logging.warning('Reaper: %s', str(error)) except: logging.critical(traceback.format_exc()) die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) logging.info('Graceful stop requested') logging.info('Graceful stop done') return
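The deletion loop above consumes the unlocked replicas in fixed-size batches. The following is a minimal, stand-alone sketch of that chunking pattern; the real daemon imports its own chunks helper, so this version is illustrative only.

# Minimal sketch of the chunking pattern used by the deletion loop above.
def chunks(items, chunk_size):
    """Yield successive slices of at most chunk_size elements."""
    for index in range(0, len(items), chunk_size):
        yield items[index:index + chunk_size]

replicas = [{'scope': 'mock', 'name': 'file-%03d' % num} for num in range(7)]
for batch in chunks(replicas, 3):
    print([replica['name'] for replica in batch])   # batches of 3, 3 and 1 replicas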
def __handle_requests(reqs, suspicious_patterns, retry_protocol_mismatches, logger=logging.log): """ Used by finisher to handle terminated requests, :param reqs: List of requests. :param suspicious_patterns: List of suspicious patterns. :param retry_protocol_mismatches: Boolean to retry the transfer in case of protocol mismatch. :param prepend_str: String to prepend to logging. """ failed_during_submission = [ RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED, RequestState.LOST ] failed_no_submission_attempts = [ RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES, RequestState.MISMATCH_SCHEME ] undeterministic_rses = __get_undeterministic_rses(logger=logger) rses_info, protocols = {}, {} replicas = {} for req in reqs: try: replica = { 'scope': req['scope'], 'name': req['name'], 'rse_id': req['dest_rse_id'], 'bytes': req['bytes'], 'adler32': req['adler32'], 'request_id': req['request_id'] } replica['pfn'] = req['dest_url'] replica['request_type'] = req['request_type'] replica['error_message'] = None if req['request_type'] not in replicas: replicas[req['request_type']] = {} if req['rule_id'] not in replicas[req['request_type']]: replicas[req['request_type']][req['rule_id']] = [] if req['state'] == RequestState.DONE: replica['state'] = ReplicaState.AVAILABLE replica['archived'] = False # for TAPE, replica path is needed if req['request_type'] in ( RequestType.TRANSFER, RequestType.STAGEIN ) and req['dest_rse_id'] in undeterministic_rses: if req['dest_rse_id'] not in rses_info: rses_info[ req['dest_rse_id']] = rsemanager.get_rse_info( rse_id=req['dest_rse_id']) pfn = req['dest_url'] scheme = urlparse(pfn).scheme dest_rse_id_scheme = '%s_%s' % (req['dest_rse_id'], scheme) if dest_rse_id_scheme not in protocols: protocols[ dest_rse_id_scheme] = rsemanager.create_protocol( rses_info[req['dest_rse_id']], 'write', scheme) path = protocols[dest_rse_id_scheme].parse_pfns( [pfn])[pfn]['path'] replica['path'] = os.path.join(path, os.path.basename(pfn)) # replica should not be added to replicas until all info are filled replicas[req['request_type']][req['rule_id']].append(replica) # Standard failure from the transfer tool elif req['state'] == RequestState.FAILED: __check_suspicious_files(req, suspicious_patterns, logger=logger) tss = time.time() try: if request_core.should_retry_request( req, retry_protocol_mismatches): new_req = request_core.requeue_and_archive( req, source_ranking_update=True, retry_protocol_mismatches=retry_protocol_mismatches, logger=logger) # should_retry_request and requeue_and_archive are not in one session, # another process can requeue_and_archive and this one will return None. 
record_timer( 'daemons.conveyor.common.update_request_state.request_requeue_and_archive', (time.time() - tss) * 1000) logger( logging.WARNING, 'REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'], req['name'], req['request_id'], new_req['request_id'], new_req['retry_count'])) else: # No new_req is return if should_retry_request returns False logger( logging.WARNING, 'EXCEEDED SUBMITTING DID %s:%s REQUEST %s in state %s', req['scope'], req['name'], req['request_id'], req['state']) replica['state'] = ReplicaState.UNAVAILABLE replica['archived'] = False replica['error_message'] = req['err_msg'] if req[ 'err_msg'] else request_core.get_transfer_error( req['state']) replicas[req['request_type']][req['rule_id']].append( replica) except RequestNotFound: logger(logging.WARNING, 'Cannot find request %s anymore', req['request_id']) # All other failures elif req['state'] in failed_during_submission or req[ 'state'] in failed_no_submission_attempts: if req['state'] in failed_during_submission and req[ 'updated_at'] > (datetime.datetime.utcnow() - datetime.timedelta(minutes=120)): # To prevent race conditions continue try: tss = time.time() if request_core.should_retry_request( req, retry_protocol_mismatches): new_req = request_core.requeue_and_archive( req, source_ranking_update=False, retry_protocol_mismatches=retry_protocol_mismatches, logger=logger) record_timer( 'daemons.conveyor.common.update_request_state.request_requeue_and_archive', (time.time() - tss) * 1000) logger( logging.WARNING, 'REQUEUED SUBMITTING DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'], req['name'], req['request_id'], new_req['request_id'], new_req['retry_count'])) else: # No new_req is return if should_retry_request returns False logger( logging.WARNING, 'EXCEEDED SUBMITTING DID %s:%s REQUEST %s in state %s', req['scope'], req['name'], req['request_id'], req['state']) replica['state'] = ReplicaState.UNAVAILABLE replica['archived'] = False replica['error_message'] = req['err_msg'] if req[ 'err_msg'] else request_core.get_transfer_error( req['state']) replicas[req['request_type']][req['rule_id']].append( replica) except RequestNotFound: logger(logging.WARNING, 'Cannot find request %s anymore', req['request_id']) except Exception as error: logger( logging.ERROR, "Something unexpected happened when handling request %s(%s:%s) at %s: %s" % (req['request_id'], req['scope'], req['name'], req['dest_rse_id'], str(error))) __handle_terminated_replicas(replicas, logger=logger)
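Before handing anything to __handle_terminated_replicas, the function above collects replicas into a nested mapping keyed first by request type and then by rule id. Below is a minimal sketch of that grouping, assuming simplified request dictionaries that carry only the fields the grouping needs.

# Sketch of the nested grouping used by __handle_requests; the request rows
# are hypothetical stand-ins, not the full request records.
from collections import defaultdict

def group_replicas(requests):
    replicas = defaultdict(lambda: defaultdict(list))
    for req in requests:
        replica = {'scope': req['scope'], 'name': req['name'],
                   'rse_id': req['dest_rse_id'], 'request_id': req['request_id']}
        replicas[req['request_type']][req['rule_id']].append(replica)
    return replicas

requests = [{'request_type': 'TRANSFER', 'rule_id': 'rule-1', 'scope': 'mock',
             'name': 'file-a', 'dest_rse_id': 'rse-1', 'request_id': 'req-1'},
            {'request_type': 'TRANSFER', 'rule_id': 'rule-1', 'scope': 'mock',
             'name': 'file-b', 'dest_rse_id': 'rse-1', 'request_id': 'req-2'}]
print(len(group_replicas(requests)['TRANSFER']['rule-1']))   # 2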
def submit(self, files, job_params, timeout=None): """ Submit transfers to FTS3 via JSON. :param files: List of dictionaries describing the file transfers. :param job_params: Dictionary containing key/value pairs, for all transfers. :param timeout: Timeout in seconds. :returns: FTS transfer identifier. """ # FTS3 expects 'davs' as the scheme identifier instead of https for transfer_file in files: if not transfer_file['sources'] or transfer_file['sources'] == []: raise Exception('No sources defined') new_src_urls = [] new_dst_urls = [] for url in transfer_file['sources']: if url.startswith('https'): new_src_urls.append(':'.join(['davs'] + url.split(':')[1:])) else: new_src_urls.append(url) for url in transfer_file['destinations']: if url.startswith('https'): new_dst_urls.append(':'.join(['davs'] + url.split(':')[1:])) else: new_dst_urls.append(url) transfer_file['sources'] = new_src_urls transfer_file['destinations'] = new_dst_urls transfer_id = None expected_transfer_id = None if self.deterministic_id: job_params = job_params.copy() job_params["id_generator"] = "deterministic" job_params["sid"] = files[0]['metadata']['request_id'] expected_transfer_id = self.__get_deterministic_id(job_params["sid"]) logging.debug("Submit bulk transfers in deterministic mode, sid %s, expected transfer id: %s", job_params["sid"], expected_transfer_id) # bulk submission params_dict = {'files': files, 'params': job_params} params_str = json.dumps(params_dict, cls=APIEncoder) post_result = None try: start_time = time.time() post_result = requests.post('%s/jobs' % self.external_host, verify=self.verify, cert=self.cert, data=params_str, headers={'Content-Type': 'application/json'}, timeout=timeout) record_timer('transfertool.fts3.submit_transfer.%s' % self.__extract_host(self.external_host), (time.time() - start_time) * 1000 / len(files)) except ReadTimeout as error: raise TransferToolTimeout(error) except JSONDecodeError as error: raise TransferToolWrongAnswer(error) except Exception as error: logging.warn('Could not submit transfer to %s - %s' % (self.external_host, str(error))) if post_result and post_result.status_code == 200: record_counter('transfertool.fts3.%s.submission.success' % self.__extract_host(self.external_host), len(files)) transfer_id = str(post_result.json()['job_id']) elif post_result and post_result.status_code == 409: record_counter('transfertool.fts3.%s.submission.failure' % self.__extract_host(self.external_host), len(files)) raise DuplicateFileTransferSubmission() else: if expected_transfer_id: transfer_id = expected_transfer_id logging.warn("Failed to submit transfer to %s, will use expected transfer id %s, error: %s", self.external_host, transfer_id, post_result.text if post_result is not None else post_result) else: logging.warn("Failed to submit transfer to %s, error: %s", self.external_host, post_result.text if post_result is not None else post_result) record_counter('transfertool.fts3.%s.submission.failure' % self.__extract_host(self.external_host), len(files)) if not transfer_id: raise TransferToolWrongAnswer('No transfer id returned by %s' % self.external_host) return transfer_id
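The only URL manipulation done before submission is the https-to-davs rewrite on sources and destinations. A stand-alone sketch of that rewrite follows; the URLs are hypothetical examples.

# FTS3 expects 'davs' rather than 'https'; other schemes pass through untouched.
def to_davs(url):
    if url.startswith('https'):
        return ':'.join(['davs'] + url.split(':')[1:])
    return url

print(to_davs('https://webdav.example.org:443/rucio/file'))   # davs://webdav.example.org:443/rucio/file
print(to_davs('srm://se.example.org:8446/rucio/file'))        # unchanged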
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param worker_number: The worker number. :param child_number: The child number. :param total_children: The total number of children created per worker. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. """ logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals()) while not graceful_stop.is_set(): try: max_deleting_rate = 0 for rse in rses: deleting_rate = 0 rse_info = rsemgr.get_rse_info(rse['rse']) rse_protocol = rse_core.get_rse_protocols(rse['rse']) if not rse_protocol['availability_delete']: logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse'])) continue # Temporary hack to force gfal for deletion for protocol in rse_info['protocols']: if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default': protocol['impl'] = 'rucio.rse.protocols.gfal.Default' logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse'])) try: needed_free_space, max_being_deleted_files = None, 10000 if not greedy: max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id']) logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals()) if needed_free_space <= 0: needed_free_space = 0 logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse'])) s = time.time() with monitor.record_timer_block('reaper.list_unlocked_replicas'): replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds) logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas))) if not replicas: logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse'])) continue p = rsemgr.create_protocol(rse_info, 'delete', scheme=None) for files in chunks(replicas, chunk_size): logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files))) try: s = time.time() update_replicas_states(replicas=[dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files]) for replica in files: try: replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values()[0]) except ReplicaUnAvailable as e: err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e)) logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg)) replica['pfn'] = None add_message('deletion-planned', {'scope': 
replica['scope'], 'name': replica['name'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'rse': rse_info['rse']}) # logging.debug('update_replicas_states %s' % (time.time() - s)) monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files)) if not scheme: try: deleted_files = [] p.connect() for replica in files: try: logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) s = time.time() if rse['staging_area'] or rse['rse'].endswith("STAGING"): logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) else: if replica['pfn']: p.delete(replica['pfn']) else: logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000) duration = time.time() - s deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) add_message('deletion-done', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'duration': duration}) logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) except SourceNotFound: err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) logging.warning(err_msg) deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': err_msg}) except (ServiceUnavailable, RSEAccessDenied) as e: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e))) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)}) except Exception as e: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)}) except: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))) except (ServiceUnavailable, RSEAccessDenied) as e: for replica in files: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e))) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)}) finally: p.close() s = time.time() 
with monitor.record_timer_block('reaper.delete_replicas'): delete_replicas(rse=rse['rse'], files=deleted_files) logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s)) monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files)) deleting_rate += len(deleted_files) except: logging.critical(traceback.format_exc()) deleting_rate = deleting_rate * 1.0 / max_being_deleted_files if deleting_rate > max_deleting_rate: max_deleting_rate = deleting_rate except: logging.critical(traceback.format_exc()) if once: break logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate)) sleep_time = int((1 - max_deleting_rate) * 60 + 1) time.sleep(sleep_time) except: logging.critical(traceback.format_exc()) logging.info('Graceful stop requested') logging.info('Graceful stop done')
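Each cycle of the loop above ends with an adaptive back-off: the larger the fraction of the deletion quota that was actually used, the shorter the sleep. A minimal sketch of that computation for a single RSE:

# Sketch of the adaptive sleep at the end of the reaper cycle: the sleep
# shrinks as the fraction of max_being_deleted_files actually deleted grows.
def reaper_sleep_time(deleted_files, max_being_deleted_files):
    deleting_rate = deleted_files * 1.0 / max_being_deleted_files
    return int((1 - deleting_rate) * 60 + 1)

print(reaper_sleep_time(0, 10000))       # 61 seconds when nothing was deleted
print(reaper_sleep_time(10000, 10000))   # 1 second when the whole quota was used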
def test_record_timer_message(self): """MONITOR (CORE): Send a timer message to graphite """ monitor.record_timer('test.runtime', 500)
tss = time.time() logging.debug('UPDATE REPLICA STATE DID %s:%s RSE %s' % (response['scope'], response['name'], rse_update_name)) # make sure we do not leave the transaction try: # try quickly replica.update_replicas_states([{'rse': rse_update_name, 'scope': response['scope'], 'name': response['name'], 'state': ReplicaState.AVAILABLE}], nowait=False, session=session) except UnsupportedOperation: # replica not found, but it has been transferred because cancel came too late. # so we need to register the replica and schedule it for deletion again record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time()-tss)*1000) logging.warn('DID %s:%s AT RSE %s NOT FOUND - Registering replica and scheduling for immediate deletion' % (response['scope'], response['name'], rse_name)) did_meta = None try: did_meta = did.get_metadata(response['scope'], response['name'], session=session) except: logging.critical('DID %s:%s NOT FOUND - Cannot re-register replica - potential dark data' % (response['scope'], response['name'])) raise if did_meta: try:
def poller(once=False, activities=None, sleep_time=60, fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None): """ Main loop to check the status of a transfer primitive with a transfertool. """ try: timeout = config_get('conveyor', 'poll_timeout') timeout = float(timeout) except NoOptionError: timeout = None executable = 'conveyor-poller' if activities: activities.sort() executable += '--activities ' + str(activities) if activity_shares: activities.sort() executable += '--activity_shares' + str(activity_shares) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info(prepend_str + 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' % (db_bulk, fts_bulk, timeout)) time.sleep(10) # To prevent running on the same partition if all the poller restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info(prepend_str + 'Poller started') activity_next_exe_time = defaultdict(time.time) while not graceful_stop.is_set(): try: heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) if activities is None: activities = [None] for activity in activities: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue start_time = time.time() logging.debug(prepend_str + 'Start to poll transfers older than %i seconds for activity %s' % (older_than, activity)) transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT], state=[RequestState.SUBMITTED], limit=db_bulk, older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than), total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], mode_all=False, hash_variable='id', activity=activity, activity_shares=activity_shares) record_timer('daemons.conveyor.poller.000-get_next', (time.time() - start_time) * 1000) if transfs: logging.debug(prepend_str + 'Polling %i transfers for activity %s' % (len(transfs), activity)) xfers_ids = {} for transf in transfs: if not transf['external_host'] in xfers_ids: xfers_ids[transf['external_host']] = [] xfers_ids[transf['external_host']].append((transf['external_id'], transf['request_id'])) for external_host in xfers_ids: external_ids = list({trf[0] for trf in xfers_ids[external_host]}) request_ids = [trf[1] for trf in xfers_ids[external_host]] for xfers in chunks(external_ids, fts_bulk): # poll transfers poll_transfers(external_host=external_host, xfers=xfers, prepend_str=prepend_str, request_ids=request_ids, timeout=timeout) if len(transfs) < fts_bulk / 2: logging.info(prepend_str + "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (len(transfs), activity, fts_bulk, sleep_time)) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time() + sleep_time except Exception: logging.critical(prepend_str + "%s" % (traceback.format_exc())) if once: break logging.info(prepend_str + 'Graceful stop requested') heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Graceful stop done')
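Before polling, the transfers fetched from the database are grouped per FTS host and the job ids of each host are deduplicated, so a bulk query is issued once per job rather than once per request. A minimal sketch of that grouping, using hypothetical transfer rows:

# Sketch of the per-host grouping done by the poller before bulk polling.
def group_by_host(transfers):
    xfers_ids = {}
    for transf in transfers:
        xfers_ids.setdefault(transf['external_host'], []).append(
            (transf['external_id'], transf['request_id']))
    return xfers_ids

transfers = [
    {'external_host': 'https://fts1.example.org:8446', 'external_id': 'job-1', 'request_id': 'req-1'},
    {'external_host': 'https://fts1.example.org:8446', 'external_id': 'job-1', 'request_id': 'req-2'},
    {'external_host': 'https://fts2.example.org:8446', 'external_id': 'job-2', 'request_id': 'req-3'},
]
for host, pairs in group_by_host(transfers).items():
    external_ids = list({pair[0] for pair in pairs})   # deduplicated FTS job ids
    print(host, external_ids)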
def update_bad_request(req, dest_rse, new_state, detail, session=None): if new_state == RequestState.FAILED: request.set_request_state(req['request_id'], new_state, session=session) activity = 'default' if req['attributes']: if type(req['attributes']) is dict: req_attributes = json.loads(json.dumps(req['attributes'])) else: req_attributes = json.loads(str(req['attributes'])) activity = req_attributes['activity'] if req_attributes['activity'] else 'default' tss = time.time() add_message('transfer-failed', {'activity': activity, 'request-id': req['request_id'], 'checksum-adler': None, 'checksum-md5': None, 'dst-rse': dest_rse, 'dst-url': None, 'name': req['name'], 'guid': None, 'file-size': None, 'previous-request-id': req['request_id'], 'protocol': None, 'reason': detail, 'transfer-link': None, 'scope': req['scope'], 'src-rse': None, 'src-url': None, 'tool-id': 'rucio-conveyor', 'transfer-endpoint': config_get('conveyor', 'ftshosts'), 'transfer-id': None}, session=session) request.archive_request(req['request_id'], session=session) logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'], req['name'], req['request_id'], detail)) try: replica.update_replicas_states([{'rse': dest_rse, 'scope': req['scope'], 'name': req['name'], 'state': ReplicaState.UNAVAILABLE}], session=session) except: logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'], req['name'], dest_rse, traceback.format_exc())) raise tss = time.time() try: lock.failed_transfer(req['scope'], req['name'], req['dest_rse_id'], session=session) except: logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'], req['name'], dest_rse, traceback.format_exc())) raise record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)
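The activity lookup at the top of update_bad_request tolerates request attributes arriving either as a dict or as a JSON string, falling back to 'default' when the activity is unset. A stand-alone sketch of that logic, with hypothetical attribute values:

# Sketch of the activity extraction above; assumes the attributes contain an
# 'activity' key, as the original code does.
import json

def extract_activity(attributes):
    activity = 'default'
    if attributes:
        if isinstance(attributes, dict):
            req_attributes = json.loads(json.dumps(attributes))
        else:
            req_attributes = json.loads(str(attributes))
        activity = req_attributes['activity'] if req_attributes['activity'] else 'default'
    return activity

print(extract_activity({'activity': 'Data Consolidation'}))   # Data Consolidation
print(extract_activity('{"activity": null}'))                 # default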
def stager(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, retry_other_fts=False): """ Main loop to submit a new transfer primitive to a transfertool. """ try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = None try: failover_scheme = config_get('conveyor', 'failover_scheme') except NoOptionError: failover_scheme = None try: bring_online = config_get('conveyor', 'bring_online') except NoOptionError: bring_online = 43200 try: max_time_in_queue = {} timelife_conf = config_get('conveyor', 'max_time_in_queue') timelife_confs = timelife_conf.split(",") for conf in timelife_confs: act, timelife = conf.split(":") max_time_in_queue[act.strip()] = int(timelife.strip()) except NoOptionError: max_time_in_queue = {} if 'default' not in max_time_in_queue: max_time_in_queue['default'] = 168 logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue) activity_next_exe_time = defaultdict(time.time) executable = ' '.join(sys.argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info(prepend_str + 'Stager starting with bring_online %s seconds' % (bring_online)) time.sleep( 10 ) # To prevent running on the same partition if all the poller restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info(prepend_str + 'Stager started') while not graceful_stop.is_set(): try: heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) if activities is None: activities = [None] if rses: rse_ids = [rse['id'] for rse in rses] else: rse_ids = None for activity in activities: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue logging.info(prepend_str + 'Starting to get stagein transfers for %s' % (activity)) start_time = time.time() transfers = __get_stagein_transfers( total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, mock=mock, schemes=scheme, bring_online=bring_online, retry_other_fts=retry_other_fts) record_timer( 'daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1)) record_counter('daemons.conveyor.stager.get_stagein_transfers', len(transfers)) record_timer( 'daemons.conveyor.stager.get_stagein_transfers.transfers', len(transfers)) logging.info(prepend_str + 'Got %s stagein transfers for %s' % (len(transfers), activity)) # group transfers logging.info(prepend_str + 'Starting to group transfers for %s' % (activity)) start_time = time.time() grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue) record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1)) logging.info(prepend_str + 'Starting to submit transfers for %s' % (activity)) # submit transfers for external_host in grouped_jobs: for job in grouped_jobs[external_host]: # submit transfers 
submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', logging_prepend_str=prepend_str) if len(transfers) < group_bulk: logging.info( prepend_str + 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (len(transfers), activity, group_bulk, sleep_time)) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time( ) + sleep_time except Exception: logging.critical(prepend_str + '%s' % (traceback.format_exc())) if once: break logging.info(prepend_str + 'Graceful stop requested') heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Graceful stop done')
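The max_time_in_queue mapping used when grouping jobs is parsed once at start-up from a comma-separated 'activity:hours' configuration string, with a 168-hour default when none is given. A sketch of that parsing, using a hypothetical configuration value:

# Sketch of the max_time_in_queue parsing done at stager start-up.
def parse_max_time_in_queue(timelife_conf):
    max_time_in_queue = {}
    for conf in timelife_conf.split(','):
        act, timelife = conf.split(':')
        max_time_in_queue[act.strip()] = int(timelife.strip())
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    return max_time_in_queue

print(parse_max_time_in_queue('User Subscriptions: 48, Data rebalancing: 24'))
# {'User Subscriptions': 48, 'Data rebalancing': 24, 'default': 168}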
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, once=False, dataset_lifetime=None, set_metadata=False): sleep(sleep_time * (total_workers - worker_number) / total_workers) executable = ' '.join(argv) hostname = socket.getfqdn() pid = getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) while not graceful_stop.is_set(): heartbeat.live(executable, hostname, pid, hb_thread) starttime = time() prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers) logging.info(prepend_str + 'Getting data distribution') probabilities, data = get_data_distribution(inputfile) logging.debug(prepend_str + 'Probabilities %s' % (probabilities)) account = 'root' scope = 'tests' totretries = 3 status = False for site in sites: for retry in range(0, totretries): start_time = time() tmpdir = tempfile.mkdtemp() logging.info(prepend_str + 'Running on site %s' % (site)) dic = choose_element(probabilities, data) metadata = dic['metadata'] metadata['version'] = str(random.randint(0, 1000)) metadata['run_number'] = str(random.randint(0, 100000)) metadata['stream_name'] = 'automatix_stream' metadata['campaign'] = 'automatix_campaign' try: nbfiles = dic['nbfiles'] except KeyError: nbfiles = 2 logging.warning(prepend_str + 'No nbfiles defined in the configuration, will use 2') try: filesize = dic['filesize'] except KeyError: filesize = 1000000 logging.warning(prepend_str + 'No filesize defined in the configuration, will use 1M files') dsn = 'tests:%s.%s.%s.%s.%s.%s' % (metadata['project'], metadata['run_number'], metadata['stream_name'], metadata['prod_step'], metadata['datatype'], metadata['version']) fnames = [] lfns = [] for dummy_nbfile in range(nbfiles): fname = '%s.%s' % (metadata['datatype'], generate_uuid()) lfns.append(fname) fname = '%s/%s' % (tmpdir, fname) logging.info(prepend_str + 'Generating file %(fname)s in dataset %(dsn)s' % locals()) generate_file(fname, filesize) fnames.append(fname) logging.info(prepend_str + 'Upload %s to %s' % (dsn, site)) status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, worker_number=worker_number, total_workers=total_workers, dataset_lifetime=dataset_lifetime, did=dsn, set_metadata=set_metadata) for fname in fnames: remove(fname) rmdir(tmpdir) if status: monitor.record_counter(counters='automatix.addnewdataset.done', delta=1) monitor.record_counter(counters='automatix.addnewfile.done', delta=nbfiles) monitor.record_timer('automatix.datasetinjection', (time() - start_time) * 1000) break else: logging.info(prepend_str + 'Failed to upload files. Will retry another time (attempt %s/%s)' % (str(retry + 1), str(totretries))) if once is True: logging.info(prepend_str + 'Run with once mode. Exiting') break tottime = time() - starttime if status: logging.info(prepend_str + 'It took %s seconds to upload one dataset on %s' % (str(tottime), str(sites))) if sleep_time > tottime: logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime))) sleep(sleep_time - tottime) else: logging.info(prepend_str + 'Retrying a new upload') heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Graceful stop requested') logging.info(prepend_str + 'Graceful stop done')
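The dataset name injected by automatix is assembled in the 'tests' scope from the metadata of the chosen template plus randomised version and run numbers. A sketch of that construction; the project, prod_step and datatype values are hypothetical.

# Sketch of the DSN construction used by automatix above.
import random

metadata = {'project': 'tests', 'stream_name': 'automatix_stream',
            'prod_step': 'merge', 'datatype': 'AOD'}
metadata['version'] = str(random.randint(0, 1000))
metadata['run_number'] = str(random.randint(0, 100000))

dsn = 'tests:%s.%s.%s.%s.%s.%s' % (metadata['project'], metadata['run_number'],
                                   metadata['stream_name'], metadata['prod_step'],
                                   metadata['datatype'], metadata['version'])
print(dsn)   # e.g. tests:tests.73219.automatix_stream.merge.AOD.42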
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000): """ Main loop to update the replicas and rules based on finished requests. """ try: conveyor_config = {item[0]: item[1] for item in items('conveyor')} except ConfigNotFound: logging.info('No configuration found for conveyor') conveyor_config = {} # Get suspicious patterns suspicious_patterns = conveyor_config.get('suspicious_pattern', []) if suspicious_patterns: pattern = str(suspicious_patterns) patterns = pattern.split(",") suspicious_patterns = [re.compile(pat.strip()) for pat in patterns] logging.debug("Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns]) retry_protocol_mismatches = conveyor_config.get('retry_protocol_mismatches', False) executable = ' '.join(sys.argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) # Make an initial heartbeat so that all finishers have the correct worker number on the next try heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) logging.info('%s Finisher starting - db_bulk(%i) bulk (%i)', prepend_str, db_bulk, bulk) graceful_stop.wait(10) while not graceful_stop.is_set(): start_time = time.time() try: heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) logging.debug('%s Starting new cycle', prepend_str) if activities is None: activities = [None] for activity in activities: logging.debug('%s Working on activity %s', prepend_str, activity) time1 = time.time() reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT], state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES], limit=db_bulk, older_than=datetime.datetime.utcnow(), total_workers=heart_beat['nr_threads'] - 1, worker_number=heart_beat['assign_thread'], mode_all=True, hash_variable='rule_id') record_timer('daemons.conveyor.finisher.000-get_next', (time.time() - time1) * 1000) time2 = time.time() if reqs: logging.debug('%s Updating %i requests for activity %s', prepend_str, len(reqs), activity) for chunk in chunks(reqs, bulk): try: time3 = time.time() __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, prepend_str) record_timer('daemons.conveyor.finisher.handle_requests', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1)) record_counter('daemons.conveyor.finisher.handle_requests', len(chunk)) except Exception as error: logging.warn('%s %s', prepend_str, str(error)) if reqs: logging.debug('%s Finish to update %s finished requests for activity %s in %s seconds', prepend_str, len(reqs), activity, time.time() - time2) except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]: logging.warn('%s Lock detected when handling request - skipping: %s', prepend_str, str(error)) else: logging.error('%s %s', prepend_str, traceback.format_exc()) except Exception as error: logging.critical('%s %s', prepend_str, str(error)) if once: return end_time = time.time() time_diff = end_time - start_time if time_diff < sleep_time: logging.info('%s Sleeping for a while : 
%s seconds', prepend_str, (sleep_time - time_diff)) graceful_stop.wait(sleep_time - time_diff) logging.info('%s Graceful stop requested', prepend_str) heartbeat.die(executable, hostname, pid, hb_thread) logging.info('%s Graceful stop done', prepend_str)
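The suspicious_pattern option read at start-up is a comma-separated list of regular expressions that are compiled once and reused for every failed request. A sketch of that compilation, with a hypothetical configuration value:

# Sketch of how the finisher compiles the 'suspicious_pattern' option.
import re

suspicious_pattern = '.*No such file.*,.*Connection refused.*'
suspicious_patterns = [re.compile(pat.strip()) for pat in suspicious_pattern.split(',')]

print([pat.pattern for pat in suspicious_patterns])
print(bool(suspicious_patterns[0].match('SOURCE: No such file or directory')))   # True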
def test_monitor_record_timer_message(self): """MONITOR (CORE): Send a timer message to graphite """ monitor.record_timer('test.runtime', 500)
def run_once(bulk, db_bulk, suspicious_patterns, retry_protocol_mismatches, heartbeat_handler, activity): worker_number, total_workers, logger = heartbeat_handler.live() try: logger(logging.DEBUG, 'Working on activity %s', activity) time1 = time.time() reqs = request_core.get_next( request_type=[ RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT ], state=[ RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES, RequestState.MISMATCH_SCHEME ], limit=db_bulk, older_than=datetime.datetime.utcnow(), total_workers=total_workers, worker_number=worker_number, mode_all=True, hash_variable='rule_id') record_timer('daemons.conveyor.finisher.get_next', (time.time() - time1) * 1000) time2 = time.time() if reqs: logger(logging.DEBUG, 'Updating %i requests for activity %s', len(reqs), activity) for chunk in chunks(reqs, bulk): try: worker_number, total_workers, logger = heartbeat_handler.live() time3 = time.time() __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, logger=logger) record_timer('daemons.conveyor.finisher.handle_requests_time', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1)) record_counter('daemons.conveyor.finisher.handle_requests', delta=len(chunk)) except Exception as error: logger(logging.WARNING, '%s', str(error)) if reqs: logger( logging.DEBUG, 'Finish to update %s finished requests for activity %s in %s seconds', len(reqs), activity, time.time() - time2) except (DatabaseException, DatabaseError) as error: if re.match('.*ORA-00054.*', error.args[0]) or re.match( '.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]: logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error)) else: raise return True
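The DatabaseException handler above only swallows transient lock errors (Oracle ORA-00054/ORA-00060 and the MySQL 1205 message) and re-raises everything else. A sketch of that classification, with hypothetical error messages:

# Sketch of the lock-error detection used in the DatabaseException handler above.
import re

def is_lock_error(message):
    return bool(re.match('.*ORA-00054.*', message)
                or re.match('.*ORA-00060.*', message)
                or 'ERROR 1205 (HY000)' in message)

print(is_lock_error('ORA-00054: resource busy and acquire with NOWAIT specified'))   # True
print(is_lock_error('ORA-00001: unique constraint violated'))                        # False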
def request_transfer(once=False, src=None, dst=None): """ Main loop to request a new transfer. """ logging.info('request: starting') site_a = 'RSE%s' % generate_uuid().upper() site_b = 'RSE%s' % generate_uuid().upper() scheme = 'https' impl = 'rucio.rse.protocols.webdav.Default' if not src.startswith('https://'): scheme = 'srm' impl = 'rucio.rse.protocols.srm.Default' srctoken = src.split(':')[0] dsttoken = dst.split(':')[0] tmp_proto = { 'impl': impl, 'scheme': scheme, 'domains': { 'lan': {'read': 1, 'write': 1, 'delete': 1}, 'wan': {'read': 1, 'write': 1, 'delete': 1}}} rse.add_rse(site_a) tmp_proto['hostname'] = src.split(':')[1][2:] tmp_proto['port'] = src.split(':')[2].split('/')[0] tmp_proto['prefix'] = '/'.join([''] + src.split(':')[2].split('/')[1:]) if scheme == 'srm': tmp_proto['extended_attributes'] = {'space_token': srctoken, 'web_service_path': ''} rse.add_protocol(site_a, tmp_proto) tmp_proto = { 'impl': impl, 'scheme': scheme, 'domains': { 'lan': {'read': 1, 'write': 1, 'delete': 1}, 'wan': {'read': 1, 'write': 1, 'delete': 1}}} rse.add_rse(site_b) tmp_proto['hostname'] = dst.split(':')[1][2:] tmp_proto['port'] = dst.split(':')[2].split('/')[0] tmp_proto['prefix'] = '/'.join([''] + dst.split(':')[2].split('/')[1:]) if scheme == 'srm': tmp_proto['extended_attributes'] = {'space_token': dsttoken, 'web_service_path': ''} rse.add_protocol(site_b, tmp_proto) si = rsemanager.get_rse_info(site_a) session = get_session() logging.info('request: started') while not graceful_stop.is_set(): try: ts = time.time() tmp_name = generate_uuid() # add a new dataset did.add_did(scope='mock', name='dataset-%s' % tmp_name, type=DIDType.DATASET, account='root', session=session) # construct PFN pfn = rsemanager.lfns2pfns(si, lfns=[{'scope': 'mock', 'name': 'file-%s' % tmp_name}])['mock:file-%s' % tmp_name] # create the directories if needed p = rsemanager.create_protocol(si, operation='write', scheme=scheme) p.connect() try: p.mkdir(pfn) except: pass # upload the test file try: fp = os.path.dirname(config_get('injector', 'file')) fn = os.path.basename(config_get('injector', 'file')) p.put(fn, pfn, source_dir=fp) except: logging.critical('Could not upload, removing temporary DID: %s' % str(sys.exc_info())) did.delete_dids([{'scope': 'mock', 'name': 'dataset-%s' % tmp_name}], account='root', session=session) break # add the replica replica.add_replica(rse=site_a, scope='mock', name='file-%s' % tmp_name, bytes=config_get_int('injector', 'bytes'), adler32=config_get('injector', 'adler32'), md5=config_get('injector', 'md5'), account='root', session=session) # to the dataset did.attach_dids(scope='mock', name='dataset-%s' % tmp_name, dids=[{'scope': 'mock', 'name': 'file-%s' % tmp_name, 'bytes': config_get('injector', 'bytes')}], account='root', session=session) # add rule for the dataset ts = time.time() rule.add_rule(dids=[{'scope': 'mock', 'name': 'dataset-%s' % tmp_name}], account='root', copies=1, rse_expression=site_b, grouping='ALL', weight=None, lifetime=None, locked=False, subscription_id=None, activity='mock-injector', session=session) logging.info('added rule for %s for DID mock:%s' % (site_b, tmp_name)) record_timer('daemons.mock.conveyorinjector.add_rule', (time.time()-ts)*1000) record_counter('daemons.mock.conveyorinjector.request_transfer') session.commit() except: session.rollback() logging.critical(traceback.format_exc()) if once: return logging.info('request: graceful stop requested') logging.info('request: graceful stop done')
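The injector derives the protocol parameters of the two temporary RSEs by splitting the src/dst endpoint strings on ':' and '/'. Below is a simplified sketch of that decomposition (the space-token handling is omitted); the endpoint value and the assumed 'scheme://host:port/prefix' format are hypothetical examples.

# Sketch of the endpoint decomposition used by request_transfer above.
def endpoint_to_protocol(endpoint):
    scheme = 'https' if endpoint.startswith('https://') else 'srm'
    return {
        'scheme': scheme,
        'hostname': endpoint.split(':')[1][2:],                          # strip the leading '//'
        'port': endpoint.split(':')[2].split('/')[0],
        'prefix': '/'.join([''] + endpoint.split(':')[2].split('/')[1:]),
    }

print(endpoint_to_protocol('srm://se.example.org:8446/dpm/example.org/home/atlas'))
# {'scheme': 'srm', 'hostname': 'se.example.org', 'port': '8446',
#  'prefix': '/dpm/example.org/home/atlas'}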