def get(section, option, issuer=None):
    """
    Get an option value for the named section. Values can be auto-coerced to
    int, float, and bool; they are returned as strings otherwise.

    Caveat emptor: strings matching 'on'/'off', 'true'/'false', 'yes'/'no',
    regardless of case, are converted to bool. '0'/'1' are converted to int,
    not to bool.

    :param section: The name of the section.
    :param option: The name of the option.
    :param issuer: The issuer account.
    :returns: The auto-coerced value.
    """
    kwargs = {'issuer': issuer, 'section': section, 'option': option}
    if not permission.has_permission(issuer=issuer, action='config_get', kwargs=kwargs):
        raise exception.AccessDenied('%s cannot retrieve option %s from section %s' % (issuer, option, section))
    return config.get(section, option)
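
# Illustrative sketch (not part of the original module): the auto-coercion
# behaviour documented above, reduced to a standalone helper. The name
# `auto_coerce` is an assumption for illustration only; the real coercion
# happens inside config.get().
def auto_coerce(value):
    """Coerce a raw config string to int, float, or bool; fall back to str."""
    try:
        return int(value)  # '0'/'1' become int, not bool
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        pass
    if value.lower() in ('true', 'yes', 'on'):
        return True
    if value.lower() in ('false', 'no', 'off'):
        return False
    return value

# auto_coerce('1')    -> 1 (int)
# auto_coerce('On')   -> True
# auto_coerce('mock') -> 'mock'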
def __schedule_requests():
    """
    Schedule requests.
    """
    try:
        logging.info("Throttler retrieving requests statistics")
        results = get_stats_by_activity_dest_state(state=[RequestState.QUEUED,
                                                          RequestState.SUBMITTING,
                                                          RequestState.SUBMITTED,
                                                          RequestState.WAITING])
        result_dict = {}
        for activity, dest_rse_id, account, state, rse, counter in results:
            threshold = get_config_limit(activity, dest_rse_id)

            if threshold or (counter and (state == RequestState.WAITING)):
                if dest_rse_id not in result_dict:
                    result_dict[dest_rse_id] = {'waiting': 0,
                                                'transfer': 0,
                                                'threshold': get_config_limit('all_activities', dest_rse_id),
                                                'rse': rse,
                                                'activities': {}}

                if activity not in result_dict[dest_rse_id]['activities']:
                    result_dict[dest_rse_id]['activities'][activity] = {'waiting': 0,
                                                                        'transfer': 0,
                                                                        'threshold': threshold,
                                                                        'accounts': {}}
                if account not in result_dict[dest_rse_id]['activities'][activity]['accounts']:
                    result_dict[dest_rse_id]['activities'][activity]['accounts'][account] = {'waiting': 0, 'transfer': 0}
                if state == RequestState.WAITING:
                    result_dict[dest_rse_id]['activities'][activity]['accounts'][account]['waiting'] += counter
                    result_dict[dest_rse_id]['activities'][activity]['waiting'] += counter
                    result_dict[dest_rse_id]['waiting'] += counter
                else:
                    result_dict[dest_rse_id]['activities'][activity]['accounts'][account]['transfer'] += counter
                    result_dict[dest_rse_id]['activities'][activity]['transfer'] += counter
                    result_dict[dest_rse_id]['transfer'] += counter

        for dest_rse_id in result_dict:
            dest_rse_release_strategy = config_core.get('throttler_release_strategy', 'dest_%s' % dest_rse_id, default='fifo', use_cache=False)
            rse_name = result_dict[dest_rse_id]['rse']
            availability = get_rse(rse_name).availability
            if availability & 2:  # dest_rse is not blacklisted for write
                if dest_rse_release_strategy == 'grouped_fifo':
                    threshold = result_dict[dest_rse_id]['threshold']
                    transfer = result_dict[dest_rse_id]['transfer']
                    waiting = result_dict[dest_rse_id]['waiting']
                    if threshold and transfer + waiting > threshold:
                        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.max_transfers' % rse_name, threshold)
                        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.transfers' % rse_name, transfer)
                        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.waitings' % rse_name, waiting)
                        if transfer < 0.8 * threshold:
                            to_be_released = threshold - transfer
                            release_waiting_requests_grouped_fifo(rse_name, rse_id=dest_rse_id, count=to_be_released)
                        else:
                            logging.debug("Throttler has done nothing on rse %s (transfer > 0.8 * threshold)" % rse_name)
                    elif waiting > 0 or not threshold:
                        logging.debug("Throttler remove limits (threshold: %s) and release all waiting requests, rse %s" % (threshold, rse_name))
                        # Bug fix: the original referenced the leftover loop variable `activity` here;
                        # the grouped_fifo strategy operates on the 'all_activities' limit.
                        delete_rse_transfer_limits(rse=None, activity='all_activities', rse_id=dest_rse_id)
                        release_all_waiting_requests(rse=None, rse_id=dest_rse_id)
                        record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.%s' % rse_name)
                elif dest_rse_release_strategy == 'fifo':
                    for activity in result_dict[dest_rse_id]['activities']:
                        threshold = result_dict[dest_rse_id]['activities'][activity]['threshold']
                        transfer = result_dict[dest_rse_id]['activities'][activity]['transfer']
                        waiting = result_dict[dest_rse_id]['activities'][activity]['waiting']
                        if waiting:
                            logging.debug("Request status for %s at %s: %s" % (activity, rse_name, result_dict[dest_rse_id]['activities'][activity]))
                        if threshold is None:
                            logging.debug("Throttler remove limits (threshold: %s) and release all waiting requests for activity %s, rse_id %s" % (threshold, activity, dest_rse_id))
                            delete_rse_transfer_limits(rse=None, activity=activity, rse_id=dest_rse_id)
                            release_all_waiting_requests(rse=None, activity=activity, rse_id=dest_rse_id)
                            record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s' % (activity, rse_name))
                        elif transfer + waiting > threshold:
                            logging.debug("Throttler set limits for activity %s, rse %s" % (activity, rse_name))
                            set_rse_transfer_limits(rse=None, activity=activity, rse_id=dest_rse_id, max_transfers=threshold, transfers=transfer, waitings=waiting)
                            record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.max_transfers' % (activity, rse_name), threshold)
                            record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.transfers' % (activity, rse_name), transfer)
                            record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.waitings' % (activity, rse_name), waiting)
                            if transfer < 0.8 * threshold:
                                # Release requests per account
                                nr_accounts = len(result_dict[dest_rse_id]['activities'][activity]['accounts'])
                                if nr_accounts < 1:
                                    nr_accounts = 1
                                to_release = threshold - transfer
                                threshold_per_account = math.ceil(threshold / nr_accounts)
                                to_release_per_account = math.ceil(to_release / nr_accounts)
                                accounts = result_dict[dest_rse_id]['activities'][activity]['accounts']
                                for account in accounts:
                                    if nr_accounts == 1:
                                        logging.debug("Throttler release %s waiting requests for activity %s, rse %s, account %s" % (to_release, activity, rse_name, account))
                                        release_waiting_requests_fifo(rse=None, activity=activity, rse_id=dest_rse_id, account=account, count=to_release)
                                        record_gauge('daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s' % (activity, rse_name, account), to_release)
                                    elif accounts[account]['transfer'] > threshold_per_account:
                                        logging.debug("Throttler will not release %s waiting requests for activity %s, rse %s, account %s: it queued more transfers than its share" % (accounts[account]['waiting'], activity, rse_name, account))
                                        nr_accounts -= 1
                                        # Guard against division by zero when this was the last account (bug fix)
                                        to_release_per_account = math.ceil(to_release / nr_accounts) if nr_accounts else 0
                                    elif accounts[account]['waiting'] < to_release_per_account:
                                        logging.debug("Throttler release %s waiting requests for activity %s, rse %s, account %s" % (accounts[account]['waiting'], activity, rse_name, account))
                                        release_waiting_requests_fifo(rse=None, activity=activity, rse_id=dest_rse_id, account=account, count=accounts[account]['waiting'])
                                        record_gauge('daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s' % (activity, rse_name, account), accounts[account]['waiting'])
                                        to_release = to_release - accounts[account]['waiting']
                                        nr_accounts -= 1
                                        # Guard against division by zero when this was the last account (bug fix)
                                        to_release_per_account = math.ceil(to_release / nr_accounts) if nr_accounts else 0
                                    else:
                                        logging.debug("Throttler release %s waiting requests for activity %s, rse %s, account %s" % (to_release_per_account, activity, rse_name, account))
                                        release_waiting_requests_fifo(rse=None, activity=activity, rse_id=dest_rse_id, account=account, count=to_release_per_account)
                                        record_gauge('daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s' % (activity, rse_name, account), to_release_per_account)
                                        to_release = to_release - to_release_per_account
                                        nr_accounts -= 1
                            else:
                                logging.debug("Throttler has done nothing for activity %s on rse %s (transfer > 0.8 * threshold)" % (activity, rse_name))
                        elif waiting > 0:
                            logging.debug("Throttler remove limits (threshold: %s) and release all waiting requests for activity %s, rse %s" % (threshold, activity, rse_name))
                            delete_rse_transfer_limits(rse=None, activity=activity, rse_id=dest_rse_id)
                            release_all_waiting_requests(rse=None, activity=activity, rse_id=dest_rse_id)
                            record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s' % (activity, rse_name))
    except Exception:
        logging.critical("Failed to schedule requests, error: %s" % traceback.format_exc())
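
# Illustrative sketch (not part of the original daemon): the per-account
# fair-share release logic above, reduced to pure arithmetic. The function
# name and the sample numbers are assumptions for illustration only.
import math

def fair_share_release(accounts, threshold, transfer):
    """Return {account: nr_requests_to_release} following the FIFO strategy above."""
    to_release = threshold - transfer
    nr_accounts = max(len(accounts), 1)
    threshold_per_account = math.ceil(threshold / nr_accounts)
    per_account = math.ceil(to_release / nr_accounts)
    released = {}
    for account, stats in accounts.items():
        if nr_accounts == 1:
            released[account] = to_release            # last account gets the remainder
        elif stats['transfer'] > threshold_per_account:
            released[account] = 0                     # over its share: release nothing
            nr_accounts -= 1
            per_account = math.ceil(to_release / max(nr_accounts, 1))
        elif stats['waiting'] < per_account:
            released[account] = stats['waiting']      # release everything it has waiting
            to_release -= stats['waiting']
            nr_accounts -= 1
            per_account = math.ceil(to_release / max(nr_accounts, 1))
        else:
            released[account] = per_account
            to_release -= per_account
            nr_accounts -= 1
    return released

# fair_share_release({'alice': {'waiting': 10, 'transfer': 2},
#                     'bob': {'waiting': 1, 'transfer': 1}},
#                    threshold=10, transfer=4)
# -> {'alice': 3, 'bob': 3}; the actual release call caps at what is waiting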
def reaper(rses, include_rses, exclude_rses, vos=None, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0,
           sleep_time=60, auto_exclude_threshold=100, auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses: RSE expression to include RSEs.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size: The size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds: The delay to query replicas in BEING_DELETED state.
    :param sleep_time: Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout: Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper2'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        # To prevent running on the same partition if all the reapers restart at the same time
        GRACEFUL_STOP.wait(10)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # Try to get the auto-exclude parameters from the config table; otherwise use the CLI parameters.
        try:
            auto_exclude_threshold = get('reaper', 'auto_exclude_threshold', default=auto_exclude_threshold)
            auto_exclude_timeout = get('reaper', 'auto_exclude_timeout', default=auto_exclude_timeout)
        except ConfigNotFound:
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = get('reaper', 'max_evaluator_backlog_count')
        except ConfigNotFound:
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = get('reaper', 'max_evaluator_backlog_duration')
        except ConfigNotFound:
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and backlog[0] \
                    and max_evaluator_backlog_duration and backlog[1] \
                    and backlog[0] > max_evaluator_backlog_count \
                    and backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Reaper: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[0] and backlog[0] > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Reaper: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[1] \
                    and backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Reaper: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses, vos)
        if not rses_to_process:
            logger(logging.ERROR, 'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if the RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blacklisted for delete', rse['rse'])
                    continue
                max_being_deleted_files, needed_free_space, used, free, only_delete_obsolete = __check_rse_usage(rse['rse'], rse['id'], logger)
                # Check if greedy mode
                if greedy:
                    dict_rses[(rse['rse'], rse['id'])] = [1000000000000, max_being_deleted_files, only_delete_obsolete]
                    tot_needed_free_space += 1000000000000
                else:
                    if needed_free_space:
                        dict_rses[(rse['rse'], rse['id'])] = [needed_free_space, max_being_deleted_files, only_delete_obsolete]
                        tot_needed_free_space += needed_free_space
                    elif only_delete_obsolete:
                        dict_rses[(rse['rse'], rse['id'])] = [needed_free_space, max_being_deleted_files, only_delete_obsolete]
                    else:
                        logger(logging.DEBUG, 'Nothing to delete on %s', rse['rse'])

            # Order the RSEs by needed free space, descending
            sorted_dict_rses = OrderedDict(sorted(dict_rses.items(), key=itemgetter(1), reverse=True))
            logger(logging.DEBUG, 'List of RSEs to process ordered by needed space desc: %s', str(sorted_dict_rses))

            # Get the mapping between the RSEs and the hostnames used for deletion.
            # The dictionary has the RSE as key and (hostname, rse_info) as value.
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs (rse_key = (rse, rse_id)) and fill list_rses_mult, which contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversely with the number of workers.
                # The ceil increases the weight of RSEs with a small number of files to delete.
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] / tot_needed_free_space * 1000 / heart_beat['nr_threads'])
                else:
                    max_workers = 1
                list_rses_mult.extend([(rse_name, rse_id, dict_rses[rse_key][0], dict_rses[rse_key][1]) for _ in range(int(max_workers))])
            random.shuffle(list_rses_mult)

            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id, expiration_time=120)
                if result is not NO_VALUE:
                    logger(logging.INFO, 'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while', rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id, expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(logging.WARNING, 'Too many failed attempts for %s in last cycle. RSE is temporarily excluded.', rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)

                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(logging.DEBUG, 'Working on %s. Percentage of the total space needed %.2f', rse_name, percent)
                rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]
                max_deletion_thread = get_max_deletion_threads_by_hostname(rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(logging.DEBUG, 'Too many deletion threads for %s on RSE %s. Back off', rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue
                logger(logging.INFO, 'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s', rse_hostname, tot_threads_for_hostname, max_deletion_thread, rse_name)
                live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i', rse_hostname, tot_threads_for_hostname + 1)

                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][2]
                try:
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(logging.DEBUG, 'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas', rse_name)
                        replicas = list_and_mark_unlocked_replicas(limit=chunk_size,
                                                                   bytes=needed_free_space,
                                                                   rse_id=rse_id,
                                                                   delay_seconds=delay_seconds,
                                                                   only_delete_obsolete=only_delete_obsolete,
                                                                   session=None)
                    logger(logging.DEBUG, 'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', rse_name, needed_free_space, time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(logging.DEBUG, 'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)
                except (DatabaseException, IntegrityError, DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)
                    # Bug fix: without this continue, `replicas` may be undefined below
                    continue

                # Physical deletion will take place there
                try:
                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                           lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                           operation='delete',
                                                                           scheme=scheme).values())[0])
                            except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                logger(logging.WARNING, 'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s', replica['scope'], replica['name'], rse_name, str(error))
                                replica['pfn'] = None
                            except Exception:
                                logger(logging.CRITICAL, 'Exception', exc_info=True)

                        deleted_files = delete_from_storage(file_replicas, prot, rse_info, staging_areas, auto_exclude_threshold, logger)
                        logger(logging.INFO, '%i files processed in %s seconds', len(file_replicas), time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block('reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(logging.DEBUG, 'delete_replicas succeeded on %s : %s replicas in %s seconds', rse_name, len(deleted_files), time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logger(logging.INFO, 'Will sleep for %s seconds', sleep_time - tottime)
                GRACEFUL_STOP.wait(sleep_time - tottime)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper: %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
    return
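
# Illustrative sketch (not part of the original daemon): how the worker
# multiplicity above spreads deletion effort across RSEs proportionally to
# the space each one needs, with a shuffle so concurrent threads do not all
# pick the same RSE. The function name and sample values are assumptions
# for illustration only.
import random
from math import ceil

def weighted_rse_schedule(needed_space_by_rse, nr_threads):
    """Return a shuffled list where each RSE appears ~proportionally to its need."""
    total = sum(needed_space_by_rse.values())
    schedule = []
    for rse, needed in needed_space_by_rse.items():
        # ceil gives even a tiny RSE at least one slot
        multiplicity = ceil(needed / total * 1000 / nr_threads) if total else 1
        schedule.extend([rse] * int(multiplicity))
    random.shuffle(schedule)
    return schedule

# weighted_rse_schedule({'RSE_A': 9e12, 'RSE_B': 1e12}, nr_threads=1000)
# -> ~9 'RSE_A' entries and 1 'RSE_B' entry, shuffled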
def add_exception(dids, account, pattern, comments, expires_at, session=None):
    """
    Add exceptions to the Lifetime Model.

    :param dids: The list of DIDs.
    :param account: The account of the requester.
    :param pattern: The pattern the exception applies to.
    :param comments: The comments associated to the exception.
    :param expires_at: The expiration date of the exception.
    :param session: The database session in use.

    :returns: The id of the exception.
    """
    exception_id = generate_uuid()
    text = 'Account %s requested a lifetime extension for a list of DIDs that can be found below\n' % account
    reason = comments
    volume = None
    lifetime = None
    if comments.find('||||') > -1:
        reason, volume = comments.split('||||')
    text += 'The reason for the extension is "%s"\n' % reason
    text += 'It represents %s datasets\n' % len(dids)
    if volume:
        text += 'The estimated physical volume is %s\n' % volume
    if expires_at and isinstance(expires_at, string_types):
        lifetime = str_to_date(expires_at)
        text += 'The lifetime exception should expire on %s\n' % str(expires_at)
    elif isinstance(expires_at, datetime):
        lifetime = expires_at
        text += 'The lifetime exception should expire on %s\n' % str(expires_at)
    text += 'Link to approve or reject this request can be found at the end of the mail\n'
    text += '\n'
    text += 'DIDTYPE SCOPE NAME\n'
    text += '\n'
    truncated_message = False
    for did in dids:
        did_type = None
        if 'did_type' in did:
            if isinstance(did['did_type'], string_types):
                did_type = DIDType.from_sym(did['did_type'])
            else:
                did_type = did['did_type']
        new_exception = models.LifetimeExceptions(id=exception_id,
                                                  scope=did['scope'],
                                                  name=did['name'],
                                                  did_type=did_type,
                                                  account=account,
                                                  pattern=pattern,
                                                  comments=reason,
                                                  state=LifetimeExceptionsState.WAITING,
                                                  expires_at=lifetime)
        if len(text) < 3000:
            text += '%s %s %s\n' % (str(did_type), did['scope'], did['name'])
        else:
            truncated_message = True
        try:
            new_exception.save(session=session, flush=False)
        except IntegrityError as error:
            if match('.*ORA-00001.*', str(error.args[0])) \
                    or match('.*IntegrityError.*UNIQUE constraint failed.*', str(error.args[0])) \
                    or match('.*1062.*Duplicate entry.*for key.*', str(error.args[0])) \
                    or match('.*sqlite3.IntegrityError.*are not unique.*', str(error.args[0])):
                raise LifetimeExceptionDuplicate()
            raise RucioException(error.args[0])
    if truncated_message:
        text += '...\n'
        text += 'List too long. Truncated\n'
    text += '\n'
    text += 'Approve: https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=approve\n' % str(exception_id)
    text += 'Deny: https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=deny\n' % str(exception_id)
    approvers_email = get('lifetime_model', 'approvers_email', default=[], session=session)
    if approvers_email:
        approvers_email = approvers_email.split(',')  # pylint: disable=no-member

    add_message(event_type='email',
                payload={'body': text,
                         'to': approvers_email,
                         'subject': '[RUCIO] Request to approve lifetime exception %s' % str(exception_id)},
                session=session)
    return exception_id
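
# Illustrative sketch (not part of the original module): the '||||' convention
# used above packs an optional volume estimate into the free-text comment.
# The helper name is an assumption for illustration only.
def split_exception_comment(comments):
    """Return (reason, volume); volume is None when no '||||' separator is present."""
    if '||||' in comments:
        reason, volume = comments.split('||||')
        return reason, volume
    return comments, None

# split_exception_comment('needed for reprocessing||||42 TB')
# -> ('needed for reprocessing', '42 TB')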
def rule_rebalancer(rse_expression, move_subscriptions=False, use_dump=False, sleep_time=300, once=True, dry_run=False):
    """
    Main loop to rebalance rules automatically.
    """
    total_rebalance_volume = 0
    executable = 'rucio-bb8'
    hostname = socket.gethostname()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'bb8[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.DEBUG, 'rse_expression: %s', rse_expression)
    logger(logging.INFO, 'BB8 started')

    while not GRACEFUL_STOP.is_set():
        logger(logging.INFO, 'Starting new cycle')
        heart_beat = live(executable, hostname, pid, hb_thread)
        start_time = time.time()
        total_rebalance_volume = 0
        tolerance = config_core.get('bb8', 'tolerance', default=0.05)
        max_total_rebalance_volume = config_core.get('bb8', 'max_total_rebalance_volume', default=10 * 1E12)
        max_rse_rebalance_volume = config_core.get('bb8', 'max_rse_rebalance_volume', default=500 * 1E9)
        min_total = config_core.get('bb8', 'min_total', default=20 * 1E9)
        payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None)
        if rse_expression in payload_cnt:
            logger(logging.WARNING, 'One BB8 instance already running with the same RSE expression. Stopping')
            break
        else:
            # List the RSEs represented by rse_expression
            try:
                rses = [rse for rse in parse_expression(rse_expression)]
                list_rses2 = [rse['rse'] for rse in rses]
            except InvalidRSEExpression as err:
                logger(logging.ERROR, str(err))
                break
            # List the RSEs represented by all the RSE expressions stored in the heartbeat payloads
            list_rses1 = []
            for rse_exp in payload_cnt:
                if rse_exp:
                    list_rses1 = [rse['rse'] for rse in parse_expression(rse_exp)]
                    for rse in list_rses2:
                        if rse in list_rses1:
                            logger(logging.WARNING, 'Overlapping RSE expressions %s vs %s. Stopping', rse_exp, rse_expression)
                            break

            logger(logging.INFO, 'Will process rebalancing on %s', rse_expression)
            heart_beat = live(executable, hostname, pid, hb_thread, older_than=max(600, sleep_time), hash_executable=None, payload=rse_expression, session=None)
            total_primary = 0
            total_secondary = 0
            total_total = 0
            global_ratio = float(0)
            for rse in rses:
                logger(logging.DEBUG, 'Getting RSE usage on %s', rse['rse'])
                rse_usage = get_rse_usage(rse_id=rse['id'])
                usage_dict = {}
                for item in rse_usage:
                    # TODO: check last update
                    usage_dict[item['source']] = {'used': item['used'], 'free': item['free'], 'total': item['total']}
                try:
                    rse['primary'] = usage_dict['rucio']['used'] - usage_dict['expired']['used']
                    rse['secondary'] = usage_dict['expired']['used']
                    rse['total'] = usage_dict['storage']['total'] - usage_dict['min_free_space']['used']
                    rse['ratio'] = float(rse['primary']) / float(rse['total'])
                except KeyError as err:
                    logger(logging.ERROR, 'Missing source usage %s for RSE %s. Exiting', err, rse['rse'])
                    break
                total_primary += rse['primary']
                total_secondary += rse['secondary']
                total_total += float(rse['total'])
                rse['receive_volume'] = 0  # Volume already rebalanced in this run

            global_ratio = float(total_primary) / float(total_total)
            logger(logging.INFO, 'Global ratio: %f', global_ratio)

            for rse in sorted(rses, key=lambda k: k['ratio']):
                logger(logging.INFO, '%s Sec/Prim local ratio (%f) vs global %s', rse['rse'], rse['ratio'], global_ratio)
            rses_over_ratio = sorted([rse for rse in rses if rse['ratio'] > global_ratio + global_ratio * tolerance], key=lambda k: k['ratio'], reverse=True)
            rses_under_ratio = sorted([rse for rse in rses if rse['ratio'] < global_ratio - global_ratio * tolerance], key=lambda k: k['ratio'], reverse=False)

            # Exclude RSEs. Bug fix: iterate over copies of the lists, since
            # removing from a list while iterating over it skips elements.
            logger(logging.DEBUG, 'Excluding RSEs as destinations which are too small by size:')
            for des in list(rses_under_ratio):
                if des['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(logging.DEBUG, 'Excluding RSEs as sources which are too small by size:')
            for src in list(rses_over_ratio):
                if src['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)
            logger(logging.DEBUG, 'Excluding RSEs as destinations which are not available for write:')
            for des in list(rses_under_ratio):
                if des['availability'] & 2 == 0:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(logging.DEBUG, 'Excluding RSEs as sources which are not available for read:')
            for src in list(rses_over_ratio):
                if src['availability'] & 4 == 0:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)

            # Get the number of active transfers per location
            dict_locks = get_active_locks(session=None)

            # Loop over the RSEs over the ratio
            for index, source_rse in enumerate(rses_over_ratio):
                # The volume that would be rebalanced, not the real availability of the data:
                available_source_rebalance_volume = int((source_rse['primary'] - global_ratio * source_rse['secondary']) / (global_ratio + 1))
                if available_source_rebalance_volume > max_rse_rebalance_volume:
                    available_source_rebalance_volume = max_rse_rebalance_volume
                if available_source_rebalance_volume > max_total_rebalance_volume - total_rebalance_volume:
                    available_source_rebalance_volume = max_total_rebalance_volume - total_rebalance_volume

                # Select a target
                for destination_rse in rses_under_ratio:
                    if available_source_rebalance_volume > 0:
                        vo_str = ' on VO {}'.format(destination_rse['vo']) if destination_rse['vo'] != 'def' else ''
                        if index == 0 and destination_rse['id'] in dict_locks:
                            replicating_volume = dict_locks[destination_rse['id']]['bytes']
                            logger(logging.DEBUG, 'Already %f TB replicating to %s%s', replicating_volume / 1E12, destination_rse['rse'], vo_str)
                            destination_rse['receive_volume'] += replicating_volume
                        if destination_rse['receive_volume'] >= max_rse_rebalance_volume:
                            continue
                        available_target_rebalance_volume = max_rse_rebalance_volume - destination_rse['receive_volume']
                        if available_target_rebalance_volume >= available_source_rebalance_volume:
                            available_target_rebalance_volume = available_source_rebalance_volume

                        logger(logging.INFO, 'Rebalance %d TB from %s(%f) to %s(%f)%s', available_target_rebalance_volume / 1E12, source_rse['rse'], source_rse['ratio'], destination_rse['rse'], destination_rse['ratio'], vo_str)
                        expr = destination_rse['rse']
                        rebalance_rse(rse_id=source_rse['id'],
                                      max_bytes=available_target_rebalance_volume,
                                      dry_run=dry_run,
                                      comment='Background rebalancing',
                                      force_expression=expr,
                                      logger=logger)

                        destination_rse['receive_volume'] += available_target_rebalance_volume
                        total_rebalance_volume += available_target_rebalance_volume
                        available_source_rebalance_volume -= available_target_rebalance_volume

        if once:
            break

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while : %f seconds', sleep_time - time_diff)
            GRACEFUL_STOP.wait(sleep_time - time_diff)

    die(executable='rucio-bb8', hostname=hostname, pid=pid, thread=hb_thread)
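
# Illustrative sketch (not part of the original daemon): the ratio-based
# partitioning above, reduced to pure arithmetic. Names and sample numbers
# are assumptions for illustration only.
def partition_by_ratio(rses, tolerance=0.05):
    """Split RSEs into (over, under) the global primary/total ratio, with a tolerance band."""
    global_ratio = sum(r['primary'] for r in rses) / sum(r['total'] for r in rses)
    over = sorted((r for r in rses if r['ratio'] > global_ratio * (1 + tolerance)),
                  key=lambda r: r['ratio'], reverse=True)  # fullest sources first
    under = sorted((r for r in rses if r['ratio'] < global_ratio * (1 - tolerance)),
                   key=lambda r: r['ratio'])               # emptiest destinations first
    return over, under

# partition_by_ratio([{'rse': 'A', 'primary': 90, 'total': 100, 'ratio': 0.9},
#                     {'rse': 'B', 'primary': 10, 'total': 100, 'ratio': 0.1}])
# -> global ratio 0.5; A becomes a source, B a destination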
def add_subscription(name, account, filter, replication_rules, comments, lifetime, retroactive, dry_run, priority=3, session=None):
    """
    Adds a new subscription which will be verified against every newly added file and dataset.

    :param account: Account identifier
    :type account: String
    :param name: Name of the subscription
    :type name: String
    :param filter: Dictionary of attributes by which the input data should be filtered
                   **Example**: ``{'dsn': 'data11_hi*.express_express.*,data11_hi*physics_MinBiasOverlay*', 'account': 'tzero'}``
    :type filter: Dict
    :param replication_rules: Replication rules to be set : Dictionary with keys copies, rse_expression, weight, grouping
    :type replication_rules: Dict
    :param comments: Comments for the subscription
    :type comments: String
    :param lifetime: Subscription's lifetime (days)
    :type lifetime: Integer or None
    :param retroactive: Flag to know if the subscription should be applied on previous data
    :type retroactive: Boolean
    :param dry_run: Just print the subscription's actions without actually executing them (useful if the retroactive flag is set)
    :type dry_run: Boolean
    :param priority: The priority of the subscription
    :type priority: Integer
    :param session: The database session in use.

    :returns: The subscription id
    """
    try:
        keep_history = get('subscriptions', 'keep_history')
    except ConfigNotFound:
        keep_history = False

    SubscriptionHistory = models.Subscription.__history_mapper__.class_
    retroactive = bool(retroactive)  # Force boolean type, necessary for strict SQL
    state = SubscriptionState.ACTIVE
    if retroactive:
        state = SubscriptionState.NEW
    # Bug fix: the original reset lifetime to None right before this check,
    # silently discarding the parameter.
    if lifetime:
        lifetime = datetime.datetime.utcnow() + datetime.timedelta(days=lifetime)
    new_subscription = models.Subscription(name=name,
                                           filter=filter,
                                           account=account,
                                           replication_rules=replication_rules,
                                           state=state,
                                           lifetime=lifetime,
                                           retroactive=retroactive,
                                           policyid=priority,
                                           comments=comments)
    if keep_history:
        subscription_history = SubscriptionHistory(id=new_subscription.id,
                                                   name=new_subscription.name,
                                                   filter=new_subscription.filter,
                                                   account=new_subscription.account,
                                                   replication_rules=new_subscription.replication_rules,
                                                   state=new_subscription.state,
                                                   lifetime=new_subscription.lifetime,
                                                   retroactive=new_subscription.retroactive,
                                                   policyid=new_subscription.policyid,
                                                   comments=new_subscription.comments)
    try:
        new_subscription.save(session=session)
        if keep_history:
            subscription_history.save(session=session)
    except IntegrityError as error:
        if re.match('.*IntegrityError.*ORA-00001: unique constraint.*SUBSCRIPTIONS_PK.*violated.*', error.args[0]) \
                or re.match(".*IntegrityError.*UNIQUE constraint failed: subscriptions.name, subscriptions.account.*", error.args[0]) \
                or re.match('.*IntegrityError.*columns? name.*account.*not unique.*', error.args[0]) \
                or re.match('.*IntegrityError.*ORA-00001: unique constraint.*SUBSCRIPTIONS_NAME_ACCOUNT_UQ.*violated.*', error.args[0]) \
                or re.match('.*IntegrityError.*1062.*Duplicate entry.*', error.args[0]) \
                or re.match('.*IntegrityError.*duplicate key value violates unique constraint.*', error.args[0]) \
                or re.match('.*UniqueViolation.*duplicate key value violates unique constraint.*', error.args[0]):
            raise SubscriptionDuplicate('Subscription \'%s\' owned by \'%s\' already exists!' % (name, account))
        raise RucioException(error.args)
    return new_subscription.id
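
# Illustrative usage sketch (not part of the original module); the account,
# filter, and rule values below are assumptions for illustration only, and the
# call is commented out because it needs a live database session.
#
# sub_id = add_subscription(name='express_t0',
#                           account='tzero',
#                           filter={'scope': ['data11_hi'], 'account': ['tzero']},
#                           replication_rules={'copies': 2, 'rse_expression': 'tier=1',
#                                              'weight': None, 'grouping': 'DATASET'},
#                           comments='Express stream to Tier-1s',
#                           lifetime=30,            # days; converted to an absolute datetime internally
#                           retroactive=False,
#                           dry_run=False)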
def update_subscription(name, account, metadata=None, session=None):
    """
    Updates a subscription.

    :param name: Name of the subscription
    :type name: String
    :param account: Account identifier
    :type account: String
    :param metadata: Dictionary of metadata to update. Supported keys: filter, replication_rules, comments, lifetime, retroactive, dry_run, priority, last_processed
    :type metadata: Dict
    :param session: The database session in use.

    :raises: SubscriptionNotFound if subscription is not found
    """
    try:
        keep_history = get('subscriptions', 'keep_history')
    except ConfigNotFound:
        keep_history = False
    if metadata is None:
        # Bug fix: the original dereferenced the None default below
        metadata = {}
    values = {'state': SubscriptionState.UPDATED}
    if 'filter' in metadata and metadata['filter']:
        values['filter'] = dumps(metadata['filter'])
    if 'replication_rules' in metadata and metadata['replication_rules']:
        values['replication_rules'] = dumps(metadata['replication_rules'])
    if 'lifetime' in metadata and metadata['lifetime']:
        values['lifetime'] = datetime.datetime.utcnow() + datetime.timedelta(days=float(metadata['lifetime']))
    if 'retroactive' in metadata and metadata['retroactive']:
        values['retroactive'] = metadata['retroactive']
    if 'dry_run' in metadata and metadata['dry_run']:
        values['dry_run'] = metadata['dry_run']
    if 'comments' in metadata and metadata['comments']:
        values['comments'] = metadata['comments']
    if 'priority' in metadata and metadata['priority']:
        values['policyid'] = metadata['priority']
    if 'last_processed' in metadata and metadata['last_processed']:
        values['last_processed'] = metadata['last_processed']
    if 'state' in metadata and metadata['state'] == SubscriptionState.INACTIVE:
        values['state'] = SubscriptionState.INACTIVE
        values['expired_at'] = datetime.datetime.utcnow()

    SubscriptionHistory = models.Subscription.__history_mapper__.class_
    try:
        subscription = session.query(models.Subscription).filter_by(account=account, name=name).one()
        subscription.update(values)
        if keep_history:
            subscription_history = SubscriptionHistory(id=subscription.id,
                                                       name=subscription.name,
                                                       filter=subscription.filter,
                                                       account=subscription.account,
                                                       replication_rules=subscription.replication_rules,
                                                       state=subscription.state,
                                                       lifetime=subscription.lifetime,
                                                       retroactive=subscription.retroactive,
                                                       policyid=subscription.policyid,
                                                       comments=subscription.comments,
                                                       last_processed=subscription.last_processed,
                                                       expired_at=subscription.expired_at,
                                                       updated_at=subscription.updated_at,
                                                       created_at=subscription.created_at)
            subscription_history.save(session=session)
    except NoResultFound:
        raise SubscriptionNotFound("Subscription for account '%(account)s' named '%(name)s' not found" % locals())
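
# Illustrative sketch (not part of the original module): the update pattern
# above, reduced to its essentials — only keys that are present and truthy in
# `metadata` make it into the UPDATE. Names are assumptions for illustration only.
def build_update_values(metadata, allowed=('comments', 'priority', 'dry_run')):
    """Collect the subset of metadata that should be written to the database."""
    values = {}
    for key in allowed:
        if key in metadata and metadata[key]:
            values[key] = metadata[key]
    return values

# build_update_values({'comments': 'new comment', 'priority': None})
# -> {'comments': 'new comment'}  (falsy values are skipped, as in update_subscription)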