def revert(self):
    """
    Reverts the changes from the previous tuning. This is to avoid recursively
    tuning with no reference point: the manually configured or default attributes
    stay as the reference point, and before each cycle all tunings are reverted
    to that original reference point.

    :returns: bool indicating if the revert was successful or not.
    """
    try:
        cycle_file = config_get('conveyor', 'fts_throttler_cycle')
    except Exception:
        logging.warning('could not get the cycle file, cannot revert cycle changes, therefore no tuning either')
        return False
    with open(cycle_file) as cycle_info:
        cycle_info_dict = json.load(cycle_info)
        storages = cycle_info_dict['storages']
        for storage in storages:
            t = FTS3Transfertool(storage['fts-host'])
            logging.info('storage information: %s', storage)
            t.set_se_config(storage['storage'],
                            inbound_max_active=storage['inbound_max_active'],
                            outbound_max_active=storage['outbound_max_active'])
            logging.info('on storage ' + storage['storage']
                         + ' outbound_max_active reverted from ' + str(storage['tuned_outbound_max_active'])  # NOQA: W503
                         + ' to ' + str(storage['outbound_max_active'])  # NOQA: W503
                         + ', inbound_max_active reverted from ' + str(storage['tuned_inbound_max_active'])  # NOQA: W503
                         + ' to ' + str(storage['inbound_max_active']))  # NOQA: W503
    logging.info('revert performed')
    return True
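
# A minimal sketch, illustration only: the JSON cycle file consumed by revert()
# has a top-level 'storages' list whose keys mirror what tune() writes and
# revert() reads back. All values below are hypothetical.
EXAMPLE_CYCLE_INFO = {
    'storages': [{
        'storage': 'davs://storage.example.org',      # hypothetical storage endpoint
        'fts-host': 'https://fts.example.org:8446',   # hypothetical FTS host
        'inbound_max_active': 200,                    # reference point restored on revert
        'outbound_max_active': 200,
        'failure_ratio': 30,
        'tuned_inbound_max_active': 153,              # int((100 / (100 + 30)) * 200)
        'tuned_outbound_max_active': 153,
        'time': '2021-01-01 00:00:00.000000',
    }]
}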
def query_latest(external_host, state, last_nhours=1):
    """
    Query the latest transfers in the last n hours with the given state.

    :param external_host: FTS host name as a string.
    :param state: FTS job state as a string or a dictionary.
    :param last_nhours: Latest n hours as an integer.
    :returns: Requests status information as a dictionary.
    """
    record_counter('core.request.query_latest')

    start_time = time.time()
    resps = FTS3Transfertool(external_host=external_host).query_latest(state=state, last_nhours=last_nhours)
    record_timer('core.request.query_latest_fts3.%s.%s_hours' % (external_host, last_nhours), (time.time() - start_time) * 1000)

    if not resps:
        return

    ret_resps = []
    for resp in resps:
        if 'job_metadata' not in resp or resp['job_metadata'] is None or 'issuer' not in resp['job_metadata'] or resp['job_metadata']['issuer'] != 'rucio':
            continue

        if 'request_id' not in resp['job_metadata']:
            # submitted by the new submitter
            try:
                logging.debug("Transfer %s on %s is %s, decrease its updated_at." % (resp['job_id'], external_host, resp['job_state']))
                set_transfer_update_time(external_host, resp['job_id'], datetime.datetime.utcnow() - datetime.timedelta(hours=24))
            except Exception as error:
                logging.debug("Exception happened when updating transfer update time: %s" % str(error).replace('\n', ''))

    return ret_resps
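
def _example_query_latest():
    # Usage sketch with a hypothetical FTS endpoint (illustration only): query_latest()
    # keeps jobs whose job_metadata names 'rucio' as the issuer, and backdates
    # updated_at by 24 hours for jobs submitted without a 'request_id' in their metadata.
    return query_latest(external_host='https://fts.example.org:8446', state='FINISHED', last_nhours=1)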
def submit_bulk_transfers(external_host, files, transfertool='fts3', job_params=None, timeout=None, user_transfer_job=False):
    """
    Submit a transfer request to a transfertool.

    :param external_host: External host name as a string.
    :param files: List of dictionaries containing the request files.
    :param transfertool: Transfertool as a string.
    :param job_params: Metadata key/value pairs for all files as a dictionary.
    :param timeout: Timeout in seconds.
    :param user_transfer_job: Whether to submit as a user transfer job.
    :returns: Transfertool external ID.
    """
    record_counter('core.request.submit_transfer')

    # avoid a mutable default argument
    job_params = job_params if job_params is not None else {}

    transfer_id = None

    if transfertool == 'fts3':
        start_time = time.time()
        job_files = []
        for file in files:
            job_file = {}
            for key in file:
                if key == 'sources':
                    # convert sources from (src_rse, url, src_rse_id, rank) to url
                    job_file[key] = []
                    for source in file[key]:
                        job_file[key].append(source[1])
                else:
                    job_file[key] = file[key]
            job_files.append(job_file)
        if not user_transfer_job:
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        elif USER_TRANSFERS == "cms":
            transfer_id = FTS3MyProxyTransfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        else:
            # if no valid USER TRANSFER case matches --> go with the standard submission
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        record_timer('core.request.submit_transfers_fts3', (time.time() - start_time) * 1000 / len(files))

    return transfer_id
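
def _example_sources_conversion():
    # Illustration of the per-key loop in submit_bulk_transfers(), with hypothetical
    # values: each source tuple (src_rse, url, src_rse_id, rank) is reduced to its URL.
    file = {'sources': [('RSE_A', 'davs://a.example.org/f', 'id-a', 1),
                        ('RSE_B', 'davs://b.example.org/f', 'id-b', 2)]}
    job_file = {key: [source[1] for source in value] if key == 'sources' else value
                for key, value in file.items()}
    assert job_file['sources'] == ['davs://a.example.org/f', 'davs://b.example.org/f']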
def testread(self, tuning_ratio=25):
    """
    Read the failure ratio of storages without tuning.

    :param tuning_ratio: integer lower bound for which failing storages you want to read.
    :returns: filtered JSON response from Elasticsearch.
    """
    result = self.request_timeout_data()
    if result is not None:
        rses = result['aggregations']['rse']['buckets']
        for rse in rses:
            # if an RSE has a failure ratio above the tuning ratio, we read it.
            if rse['failure_ratio'].get('value') > tuning_ratio:
                # rse_info holds the storage name (0) and the FTS host server (1)
                rse_info = rse['key'].split()
                t = FTS3Transfertool(rse_info[1])
                # extract the FTS storage from the dst-url
                tmp = rse['destination']['hits']['hits'][0]['_source']['payload']['dst-url'].split(':', 2)
                url = tmp[0] + ':' + tmp[1]
                logging.info('\033[91m RSE \033[0m' + rse_info[0]
                             + '\033[91m on FTS host \033[0m' + rse_info[1]  # NOQA: W503
                             + '\033[91m has failure ratio \033[0m' + str(rse['failure_ratio'].get('value'))  # NOQA: W503
                             + '\033[91m on storage \033[0m' + url)  # NOQA: W503
                try:
                    se = t.get_se_config(url)
                    self.logger(logging.INFO, 'storage settings: %s', se)
                except KeyError:
                    self.logger(logging.WARNING, 'configuration for storage element was not found')
                except Exception as error:
                    self.logger(logging.WARNING, 'an error occurred when trying to get the storage configuration')
                    self.logger(logging.WARNING, str(error))
                    continue
        return rses
    else:
        self.logger(logging.WARNING, 'Could not retrieve timeout data with Elasticsearch, trying again next cycle')
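
def _example_dst_url_split():
    # Illustration with a hypothetical dst-url: splitting on ':' at most twice keeps
    # scheme and host, which is how testread() and tune() derive the FTS storage name.
    tmp = 'davs://storage.example.org:443/some/path'.split(':', 2)
    assert tmp[0] + ':' + tmp[1] == 'davs://storage.example.org'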
def bulk_query_transfers(request_host, transfer_ids, transfertool='fts3', timeout=None):
    """
    Query the status of transfers.

    :param request_host: Name of the external host.
    :param transfer_ids: List of external IDs, each a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param timeout: Timeout in seconds.
    :returns: Request status information as a dictionary.
    """
    record_counter('core.request.bulk_query_transfers')

    if transfertool == 'fts3':
        start_time = time.time()
        fts_resps = FTS3Transfertool(external_host=request_host).bulk_query(transfer_ids=transfer_ids, timeout=timeout)
        record_timer('core.request.bulk_query_transfers', (time.time() - start_time) * 1000 / len(transfer_ids))

        for transfer_id in transfer_ids:
            if transfer_id not in fts_resps:
                fts_resps[transfer_id] = Exception("Transfer id %s is not returned" % transfer_id)
            if fts_resps[transfer_id] and not isinstance(fts_resps[transfer_id], Exception):
                for request_id in fts_resps[transfer_id]:
                    if fts_resps[transfer_id][request_id]['file_state'] in (str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED)):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.FAILED
                    elif fts_resps[transfer_id][request_id]['file_state'] == str(FTSState.FINISHED):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.DONE
        return fts_resps
    else:
        raise NotImplementedError
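
def _example_consume_bulk_query(fts_resps):
    # Consumer sketch (hypothetical usage): bulk_query_transfers() returns
    # {transfer_id: {request_id: {...}}}, with an Exception object in place of the
    # inner dict for any transfer ID the FTS server did not return.
    for transfer_id, per_request in fts_resps.items():
        if isinstance(per_request, Exception):
            logging.warning('transfer %s could not be queried: %s', transfer_id, per_request)
            continue
        for request_id, resp in per_request.items():
            if resp.get('new_state') == RequestState.FAILED:
                logging.info('request %s of transfer %s failed', request_id, transfer_id)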
def tune(self):
    """
    Tune the configuration settings.
    """
    result = self.request_timeout_data()
    if result is not None:
        try:
            cycle_file = config_get('conveyor', 'fts_throttler_cycle')
        except Exception:
            logging.warning('could not get the cycle file, cannot perform tuning for this cycle without cycle file, returning')
            return
        try:
            tuning_ratio = config_get('conveyor', 'fts_throttler_tuning_ratio')
        except Exception:
            logging.warning('could not get the tuning ratio from config, returning')
            return

        rses = result['aggregations']['rse']['buckets']
        cycle_info_dict = {'storages': []}
        for rse in rses:
            # if an RSE has a failure ratio above the tuning ratio (percentage), we tune it.
            if rse['failure_ratio'].get('value') > int(tuning_ratio):
                # rse_info holds the storage name (0) and the FTS host server (1)
                rse_info = rse['key'].split()

                # Tapes might have other reasons for timeouts which should be treated differently, therefore they are ignored and not tuned for now.
                if rse['storage_type']['hits']['hits'][0]['_source']['payload']['dst-type'] == 'TAPE':
                    logging.info('%s is a tape storage type, it will not be tuned', rse_info[0])
                    continue

                # instantiate the transfertool for access to get_se_config and set_se_config.
                t = FTS3Transfertool(rse_info[1])

                # extract the FTS storage from the dst-url
                tmp = rse['destination']['hits']['hits'][0]['_source']['payload']['dst-url'].split(':', 2)
                url = tmp[0] + ':' + tmp[1]
                n = rse['failure_ratio'].get('value')
                logging.info(' RSE ' + rse_info[0]
                             + ' on FTS host ' + rse_info[1]  # NOQA: W503
                             + ' has failure ratio ' + str(n)  # NOQA: W503
                             + ' on storage ' + url)  # NOQA: W503
                try:
                    se = t.get_se_config(url)
                    logging.info('storage settings: %s', se)
                except KeyError:
                    logging.warning('configuration for storage element was not found, config will be set from default values')
                    # all FTS host servers have a default reference storage named '*' that holds the default values for all storages that aren't listed yet.
                    default_storage = t.get_se_config('*')
                    t.set_se_config(url,
                                    inbound_max_active=int((100 / (100 + n)) * default_storage['se_info']['inbound_max_active']),
                                    outbound_max_active=int((100 / (100 + n)) * default_storage['se_info']['outbound_max_active']))
                    logging.info(url + ' inbound_max_active changed from ' + str(default_storage['se_info']['inbound_max_active'])
                                 + ' to ' + str(int((100 / (100 + n)) * default_storage['se_info']['inbound_max_active']))  # NOQA: W503
                                 + ', outbound_max_active changed from ' + str(default_storage['se_info']['outbound_max_active'])  # NOQA: W503
                                 + ' to ' + str(int((100 / (100 + n)) * default_storage['se_info']['outbound_max_active'])))  # NOQA: W503
                    # cycle_info_dict is used to write the changes down to the cycle file.
                    cycle_info_dict['storages'].append({'storage': url,
                                                        'inbound_max_active': default_storage['se_info']['inbound_max_active'],
                                                        'outbound_max_active': default_storage['se_info']['outbound_max_active'],
                                                        'failure_ratio': n,
                                                        'tuned_inbound_max_active': int((100 / (100 + n)) * default_storage['se_info']['inbound_max_active']),
                                                        'tuned_outbound_max_active': int((100 / (100 + n)) * default_storage['se_info']['outbound_max_active']),
                                                        'fts-host': rse_info[1],
                                                        'time': str(datetime.datetime.now())})
                    continue
                except Exception as error:
                    logging.warning('an error occurred when trying to get the storage configuration')
                    logging.warning(str(error))
                    continue

                # Even though we could read the config, we still need to know if the important attributes are empty.
                if se['se_info']['inbound_max_active'] is None:
                    try:
                        default_storage = t.get_se_config('*')
                    except Exception:
                        raise Exception('Could not retrieve the default storage information')
                    ima = default_storage['se_info']['inbound_max_active']
                else:
                    ima = se['se_info']['inbound_max_active']

                if se['se_info']['outbound_max_active'] is None:
                    try:
                        default_storage = t.get_se_config('*')
                    except Exception:
                        raise Exception('Could not retrieve the default storage information')
                    oma = default_storage['se_info']['outbound_max_active']
                else:
                    oma = se['se_info']['outbound_max_active']

                # append the existing information to the dict and write it to the file.
                cycle_info_dict['storages'].append({'storage': url,
                                                    'inbound_max_active': ima,
                                                    'outbound_max_active': oma,
                                                    'failure_ratio': n,
                                                    'tuned_inbound_max_active': int((100 / (100 + n)) * ima),
                                                    'tuned_outbound_max_active': int((100 / (100 + n)) * oma),
                                                    'fts-host': rse_info[1],
                                                    'time': str(datetime.datetime.now())})

                # tune down the configuration of a storage relative to the failure ratio (n) and the existing configuration.
                t.set_se_config(url,
                                inbound_max_active=int((100 / (100 + n)) * ima),
                                outbound_max_active=int((100 / (100 + n)) * oma))

                logging.info(url + ' inbound_max_active changed from ' + str(ima)
                             + ' to ' + str(int((100 / (100 + n)) * ima))  # NOQA: W503
                             + ', outbound_max_active changed from ' + str(oma)  # NOQA: W503
                             + ' to ' + str(int((100 / (100 + n)) * oma)))  # NOQA: W503

        if cycle_info_dict['storages'] == []:
            logging.info('no storages are failing significantly due to timeout errors, therefore no tuning happened.')
        with open(cycle_file, 'w') as outfile:
            json.dump(cycle_info_dict, outfile)
    else:
        logging.warning('Could not retrieve timeout data with Elasticsearch, trying again next cycle')
    return
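
def _example_tuned_limit(current_max_active, failure_ratio):
    # A minimal sketch (hypothetical helper, not part of the throttler) of the
    # scaling rule tune() applies above: a failure ratio of n percent shrinks an
    # active-transfer limit by a factor of 100 / (100 + n). For example, a limit
    # of 200 with a 25% failure ratio becomes int(0.8 * 200) = 160.
    return int((100 / (100 + failure_ratio)) * current_max_active)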
def run_once(fts_bulk, db_bulk, older_than, activity_shares, multi_vo, timeout, activity, heartbeat_handler):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    logger(logging.DEBUG, 'Start to poll transfers older than %i seconds for activity %s using transfer tool: %s' % (older_than, activity, FILTER_TRANSFERTOOL))
    transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=[RequestState.SUBMITTED],
                                    limit=db_bulk,
                                    older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than) if older_than else None,
                                    total_workers=total_workers,
                                    worker_number=worker_number,
                                    mode_all=True,
                                    hash_variable='id',
                                    activity=activity,
                                    activity_shares=activity_shares,
                                    transfertool=FILTER_TRANSFERTOOL)
    record_timer('daemons.conveyor.poller.get_next', (time.time() - start_time) * 1000)

    if TRANSFER_TOOL and not FILTER_TRANSFERTOOL:
        # only keep transfers which don't have any transfertool set, or have one equal to TRANSFER_TOOL
        transfs_tmp = [t for t in transfs if not t['transfertool'] or t['transfertool'] == TRANSFER_TOOL]
        if len(transfs_tmp) != len(transfs):
            logger(logging.INFO, 'Skipping %i transfers because of mismatched transfertool', len(transfs) - len(transfs_tmp))
        transfs = transfs_tmp

    if transfs:
        logger(logging.DEBUG, 'Polling %i transfers for activity %s' % (len(transfs), activity))

        transfs.sort(key=lambda t: (t['external_host'] or '',
                                    t['scope'].vo if multi_vo else '',
                                    t['external_id'] or '',
                                    t['request_id'] or ''))
        for (external_host, vo), transfers_for_host in groupby(transfs, key=lambda t: (t['external_host'], t['scope'].vo if multi_vo else None)):
            transfers_by_eid = {}
            for external_id, xfers in groupby(transfers_for_host, key=lambda t: t['external_id']):
                transfers_by_eid[external_id] = {t['request_id']: t for t in xfers}

            for chunk in dict_chunks(transfers_by_eid, fts_bulk):
                try:
                    if TRANSFER_TOOL == 'globus':
                        transfertool_obj = GlobusTransferTool(external_host=None)
                    else:
                        transfertool_obj = FTS3Transfertool(external_host=external_host, vo=vo)
                    worker_number, total_workers, logger = heartbeat_handler.live()
                    poll_transfers(transfertool_obj=transfertool_obj, transfers_by_eid=chunk, timeout=timeout, logger=logger)
                except Exception:
                    logger(logging.ERROR, 'Exception', exc_info=True)

    queue_empty = False
    if len(transfs) < fts_bulk / 2:
        logger(logging.INFO, "Only %s transfers for activity %s, which is less than half of the bulk %s" % (len(transfs), activity, fts_bulk))
        queue_empty = True

    return queue_empty
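
def _example_dict_chunks(dictionary, n):
    # dict_chunks() is used by run_once() but defined elsewhere; this is a minimal
    # sketch of the assumed behaviour: yield consecutive sub-dicts of at most n items each.
    from itertools import islice
    it = iter(dictionary)
    for first_key in it:
        keys = [first_key] + list(islice(it, n - 1))
        yield {key: dictionary[key] for key in keys}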