def run(once=False, process=0, total_processes=1, threads_per_process=1):
    """
    Starts up the Judge-Clean threads.
    """
    try:
        ntpc = ntplib.NTPClient()
        response = ntpc.request('137.138.16.69', version=3)  # 137.138.16.69 CERN IP-TIME-1 NTP Server (Stratum 2)
        if response.offset > 60 * 60 + 10:  # 1 hour 10 seconds
            logging.critical('Offset between NTP server and system time too big. Stopping Cleaner')
            return
    except:
        return

    for i in xrange(process * threads_per_process, max(0, process * threads_per_process + threads_per_process - 1)):
        record_gauge('rule.judge.cleaner.threads.%d' % i, 0)

    if once:
        logging.info('main: executing one iteration only')
        rule_cleaner(once)
    else:
        logging.info('main: starting threads')
        threads = [threading.Thread(target=rule_cleaner, kwargs={'process': process,
                                                                 'total_processes': total_processes,
                                                                 'once': once,
                                                                 'thread': i,
                                                                 'threads_per_process': threads_per_process}) for i in xrange(0, threads_per_process)]
        [t.start() for t in threads]
        logging.info('main: waiting for interrupts')
        # Interruptible joins require a timeout.
        while threads[0].is_alive():
            [t.join(timeout=3.14) for t in threads]
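# --- Example (not part of the original module) ---
# A minimal standalone sketch of the clock-sanity check performed in run() above,
# assuming ntplib is installed. 'pool.ntp.org' is a placeholder server, not the
# CERN host pinned in the original, and the abs() is a small generalization.
import logging
import ntplib

def system_clock_is_sane(server='pool.ntp.org', max_offset=60 * 60 + 10):
    """Return True if the system clock is within max_offset seconds of the NTP server."""
    try:
        response = ntplib.NTPClient().request(server, version=3)
    except Exception:
        return False  # an unreachable NTP server stops the daemon, like the bare except above
    if abs(response.offset) > max_offset:
        logging.critical('Offset between NTP server and system time too big: %fs', response.offset)
        return False
    return True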
def run(once=False, process=0, total_processes=1, threads_per_process=1):
    """
    Starts up the Judge-Repairer threads.
    """
    for i in xrange(process * threads_per_process, max(0, process * threads_per_process + threads_per_process - 1)):
        record_gauge('rule.judge.repairer.threads.%d' % i, 0)

    if once:
        logging.info('main: executing one iteration only')
        rule_repairer(once)
    else:
        logging.info('main: starting threads')
        threads = [threading.Thread(target=rule_repairer, kwargs={'process': process,
                                                                  'total_processes': total_processes,
                                                                  'once': once,
                                                                  'thread': i,
                                                                  'threads_per_process': threads_per_process}) for i in xrange(0, threads_per_process)]
        [t.start() for t in threads]
        logging.info('main: waiting for interrupts')
        # Interruptible joins require a timeout.
        while threads[0].is_alive():
            [t.join(timeout=3.14) for t in threads]
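# --- Example (not part of the original module) ---
# A sketch of how an entrypoint script might wire the graceful-stop handler to
# the run() functions above; the signal choices and defaults are illustrative,
# not the actual Rucio CLI wrapper.
import signal

if __name__ == '__main__':
    signal.signal(signal.SIGTERM, stop)  # stop() sets graceful_stop, letting the loops drain
    signal.signal(signal.SIGINT, stop)
    run(once=False, process=0, total_processes=1, threads_per_process=1)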
def __release_all_activities(stats, direction, rse_name, rse_id, logger, session):
    """
    Release requests if activities should be ignored.

    :param stats:      Request statistics
    :param direction:  String whether request statistics are based on source or destination RSEs.
    :param rse_name:   RSE name.
    :param rse_id:     RSE id.
    """
    threshold = stats['threshold']
    transfer = stats['transfer']
    waiting = stats['waiting']
    strategy = stats['strategy']
    if threshold is not None and transfer + waiting > threshold:
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', threshold, labels={'activity': 'all_activities', 'rse': rse_name, 'limit_attr': 'max_transfers'})
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', transfer, labels={'activity': 'all_activities', 'rse': rse_name, 'limit_attr': 'transfers'})
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', waiting, labels={'activity': 'all_activities', 'rse': rse_name, 'limit_attr': 'waiting'})
        if transfer < 0.8 * threshold:
            to_be_released = threshold - transfer
            if strategy == 'grouped_fifo':
                deadline = stats.get('deadline')
                volume = stats.get('volume')
                release_waiting_requests_grouped_fifo(rse_id, count=to_be_released, direction=direction, volume=volume, deadline=deadline, session=session)
            elif strategy == 'fifo':
                release_waiting_requests_fifo(rse_id, count=to_be_released, direction=direction, session=session)
        else:
            logger(logging.DEBUG, "Throttler has done nothing on rse %s (transfer > 0.8 * threshold)" % rse_name)
    elif waiting > 0 or not threshold:
        logger(logging.DEBUG, "Throttler remove limits(threshold: %s) and release all waiting requests, rse %s" % (threshold, rse_name))
        delete_rse_transfer_limits(rse_id, activity='all_activities', session=session)
        release_all_waiting_requests(rse_id, direction=direction, session=session)
        record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.{activity}.{rse}', labels={'activity': 'all_activities', 'rse': rse_name})
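# --- Example (not part of the original module) ---
# Illustrative shape of the `stats` argument consumed by __release_all_activities();
# the keys mirror the lookups above, the numbers are made up.
example_stats = {
    'threshold': 100,            # max concurrent transfers allowed on the RSE
    'transfer': 60,              # requests currently in flight
    'waiting': 25,               # requests queued behind the limit
    'strategy': 'grouped_fifo',  # or 'fifo'
    'deadline': 1,               # only read for the grouped_fifo strategy
    'volume': 10,                # only read for the grouped_fifo strategy
}
# Since transfer < 0.8 * threshold here, the function above would release
# threshold - transfer = 40 waiting requests via the grouped_fifo strategy.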
def test_record_gauge_message(self):
    """MONITOR (CORE): Send a gauge message to graphite """
    monitor.record_gauge('test.gauge', 10)
def __release_per_activity(stats, direction, rse_name, rse_id, logger, session):
    """
    Release requests per activity.

    :param stats:      Request statistics
    :param direction:  String whether request statistics are based on source or destination RSEs.
    :param rse_name:   RSE name.
    :param rse_id:     RSE id.
    """
    for activity in stats['activities']:
        threshold = stats['activities'][activity]['threshold']
        transfer = stats['activities'][activity]['transfer']
        waiting = stats['activities'][activity]['waiting']
        if waiting:
            logger(logging.DEBUG, "Request status for %s at %s: %s" % (activity, rse_name, stats['activities'][activity]))
        if threshold is None:
            logger(logging.DEBUG, "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse_id %s" % (threshold, activity, rse_id))
            delete_rse_transfer_limits(rse_id, activity=activity, session=session)
            release_all_waiting_requests(rse_id, activity=activity, direction=direction, session=session)
            record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.{activity}.{rse}', labels={'activity': activity, 'rse': rse_name})
        elif transfer + waiting > threshold:
            logger(logging.DEBUG, "Throttler set limits for activity %s, rse %s" % (activity, rse_name))
            set_rse_transfer_limits(rse_id, activity=activity, max_transfers=threshold, transfers=transfer, waitings=waiting, session=session)
            record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', threshold, labels={'activity': activity, 'rse': rse_name, 'limit_attr': 'max_transfers'})
            record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', transfer, labels={'activity': activity, 'rse': rse_name, 'limit_attr': 'transfers'})
            record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', waiting, labels={'activity': activity, 'rse': rse_name, 'limit_attr': 'waiting'})
            if transfer < 0.8 * threshold:
                # release requests on account
                nr_accounts = len(stats['activities'][activity]['accounts'])
                if nr_accounts < 1:
                    nr_accounts = 1
                to_release = threshold - transfer
                threshold_per_account = math.ceil(threshold / nr_accounts)
                to_release_per_account = math.ceil(to_release / nr_accounts)
                accounts = stats['activities'][activity]['accounts']
                for account in accounts:
                    if nr_accounts == 1:
                        logger(logging.DEBUG, "Throttler release %s waiting requests for activity %s, rse %s, account %s " % (to_release, activity, rse_name, account))
                        release_waiting_requests_fifo(rse_id, activity=activity, account=account, count=to_release, direction=direction, session=session)
                        record_gauge('daemons.conveyor.throttler.release_waiting_requests.{activity}.{rse}.{account}', to_release, labels={'activity': activity, 'rse': rse_name, 'account': account})
                    elif accounts[account]['transfer'] > threshold_per_account:
                        logger(logging.DEBUG, "Throttler will not release %s waiting requests for activity %s, rse %s, account %s: It queued more transfers than its share " % (accounts[account]['waiting'], activity, rse_name, account))
                        nr_accounts -= 1
                        to_release_per_account = math.ceil(to_release / nr_accounts)
                    elif accounts[account]['waiting'] < to_release_per_account:
                        logger(logging.DEBUG, "Throttler release %s waiting requests for activity %s, rse %s, account %s " % (accounts[account]['waiting'], activity, rse_name, account))
                        release_waiting_requests_fifo(rse_id, activity=activity, account=account, count=accounts[account]['waiting'], direction=direction, session=session)
                        record_gauge('daemons.conveyor.throttler.release_waiting_requests.{activity}.{rse}.{account}', accounts[account]['waiting'], labels={'activity': activity, 'rse': rse_name, 'account': account})
                        to_release = to_release - accounts[account]['waiting']
                        nr_accounts -= 1
                        to_release_per_account = math.ceil(to_release / nr_accounts)
                    else:
                        logger(logging.DEBUG, "Throttler release %s waiting requests for activity %s, rse %s, account %s " % (to_release_per_account, activity, rse_name, account))
                        release_waiting_requests_fifo(rse_id, activity=activity, account=account, count=to_release_per_account, direction=direction, session=session)
                        record_gauge('daemons.conveyor.throttler.release_waiting_requests.{activity}.{rse}.{account}', to_release_per_account, labels={'activity': activity, 'rse': rse_name, 'account': account})
                        to_release = to_release - to_release_per_account
                        nr_accounts -= 1
            else:
                logger(logging.DEBUG, "Throttler has done nothing for activity %s on rse %s (transfer > 0.8 * threshold)" % (activity, rse_name))
        elif waiting > 0:
            logger(logging.DEBUG, "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse %s" % (threshold, activity, rse_name))
            delete_rse_transfer_limits(rse_id, activity=activity, session=session)
            release_all_waiting_requests(rse_id, activity=activity, direction=direction, session=session)
            record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.{activity}.{rse}', labels={'activity': activity, 'rse': rse_name})
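# --- Example (not part of the original module) ---
# A self-contained sketch of the per-account fair-share arithmetic used above,
# decoupled from the database calls; the account data is illustrative. By the
# last account, nr_accounts has been decremented to 1, which is what prevents
# the ceil() divisions from ever dividing by zero.
import math

def plan_releases(accounts, threshold, transfer):
    """Return {account: n_to_release}, mirroring the fair-share rules above."""
    to_release = threshold - transfer
    nr_accounts = max(len(accounts), 1)
    threshold_per_account = math.ceil(threshold / nr_accounts)
    to_release_per_account = math.ceil(to_release / nr_accounts)
    plan = {}
    for account, counts in accounts.items():
        if nr_accounts == 1:
            plan[account] = to_release  # last account gets whatever is left
        elif counts['transfer'] > threshold_per_account:
            plan[account] = 0  # already above its share, release nothing
            nr_accounts -= 1
            to_release_per_account = math.ceil(to_release / nr_accounts)
        elif counts['waiting'] < to_release_per_account:
            plan[account] = counts['waiting']  # release everything it has waiting
            to_release -= counts['waiting']
            nr_accounts -= 1
            to_release_per_account = math.ceil(to_release / nr_accounts)
        else:
            plan[account] = to_release_per_account
            to_release -= to_release_per_account
            nr_accounts -= 1
    return plan

# plan_releases({'alice': {'transfer': 5, 'waiting': 3},
#                'bob': {'transfer': 50, 'waiting': 20}},
#               threshold=100, transfer=60)
# -> {'alice': 3, 'bob': 37}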
def test_monitor_record_gauge_message(self):
    """MONITOR (CORE): Send a gauge message to graphite """
    monitor.record_gauge('test.gauge', 10)
def __schedule_requests():
    """
    Schedule requests
    """
    try:
        logging.info("Throttler retrieve requests statistics")
        results = get_stats_by_activity_dest_state(state=[RequestState.QUEUED,
                                                          RequestState.SUBMITTING,
                                                          RequestState.SUBMITTED,
                                                          RequestState.WAITING])
        result_dict = {}
        for activity, dest_rse_id, account, state, rse, counter in results:
            threshold = get_config_limit(activity, dest_rse_id)

            if threshold or (counter and (state == RequestState.WAITING)):
                if activity not in result_dict:
                    result_dict[activity] = {}
                if dest_rse_id not in result_dict[activity]:
                    result_dict[activity][dest_rse_id] = {'waiting': 0,
                                                          'transfer': 0,
                                                          'threshold': threshold,
                                                          'accounts': {},
                                                          'rse': rse}
                if account not in result_dict[activity][dest_rse_id]['accounts']:
                    result_dict[activity][dest_rse_id]['accounts'][account] = {'waiting': 0, 'transfer': 0}
                if state == RequestState.WAITING:
                    result_dict[activity][dest_rse_id]['accounts'][account]['waiting'] += counter
                    result_dict[activity][dest_rse_id]['waiting'] += counter
                else:
                    result_dict[activity][dest_rse_id]['accounts'][account]['transfer'] += counter
                    result_dict[activity][dest_rse_id]['transfer'] += counter

        for activity in result_dict:
            for dest_rse_id in result_dict[activity]:
                threshold = result_dict[activity][dest_rse_id]['threshold']
                transfer = result_dict[activity][dest_rse_id]['transfer']
                waiting = result_dict[activity][dest_rse_id]['waiting']
                rse_name = result_dict[activity][dest_rse_id]['rse']
                if waiting:
                    logging.debug("Request status for %s at %s: %s" % (activity, rse_name, result_dict[activity][dest_rse_id]))

                if threshold is None:
                    logging.debug("Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse_id %s" % (threshold, activity, dest_rse_id))
                    delete_rse_transfer_limits(rse=None, activity=activity, rse_id=dest_rse_id)
                    release_waiting_requests(rse=None, activity=activity, rse_id=dest_rse_id)
                    record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s' % (activity, rse_name))
                elif transfer + waiting > threshold:
                    logging.debug("Throttler set limits for activity %s, rse %s" % (activity, rse_name))
                    set_rse_transfer_limits(rse=None, activity=activity, rse_id=dest_rse_id, max_transfers=threshold, transfers=transfer, waitings=waiting)
                    record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.max_transfers' % (activity, rse_name), threshold)
                    record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.transfers' % (activity, rse_name), transfer)
                    record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.waitings' % (activity, rse_name), waiting)
                    if transfer < 0.8 * threshold:
                        # release requests on account
                        nr_accounts = len(result_dict[activity][dest_rse_id]['accounts'])
                        if nr_accounts < 1:
                            nr_accounts = 1
                        to_release = threshold - transfer
                        threshold_per_account = math.ceil(threshold / nr_accounts)
                        to_release_per_account = math.ceil(to_release / nr_accounts)
                        accounts = result_dict[activity][dest_rse_id]['accounts']
                        for account in accounts:
                            if nr_accounts == 1:
                                logging.debug("Throttler release %s waiting requests for activity %s, rse %s, account %s " % (to_release, activity, rse_name, account))
                                release_waiting_requests(rse=None, activity=activity, rse_id=dest_rse_id, account=account, count=to_release)
                                record_gauge('daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s' % (activity, rse_name, account), to_release)
                            elif accounts[account]['transfer'] > threshold_per_account:
                                logging.debug("Throttler will not release %s waiting requests for activity %s, rse %s, account %s: It queued more transfers than its share " % (accounts[account]['waiting'], activity, rse_name, account))
                                nr_accounts -= 1
                                to_release_per_account = math.ceil(to_release / nr_accounts)
                            elif accounts[account]['waiting'] < to_release_per_account:
                                logging.debug("Throttler release %s waiting requests for activity %s, rse %s, account %s " % (accounts[account]['waiting'], activity, rse_name, account))
                                release_waiting_requests(rse=None, activity=activity, rse_id=dest_rse_id, account=account, count=accounts[account]['waiting'])
                                record_gauge('daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s' % (activity, rse_name, account), accounts[account]['waiting'])
                                to_release = to_release - accounts[account]['waiting']
                                nr_accounts -= 1
                                to_release_per_account = math.ceil(to_release / nr_accounts)
                            else:
                                logging.debug("Throttler release %s waiting requests for activity %s, rse %s, account %s " % (to_release_per_account, activity, rse_name, account))
                                release_waiting_requests(rse=None, activity=activity, rse_id=dest_rse_id, account=account, count=to_release_per_account)
                                record_gauge('daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s' % (activity, rse_name, account), to_release_per_account)
                                to_release = to_release - to_release_per_account
                                nr_accounts -= 1
                    else:
                        logging.debug("Throttler has done nothing for activity %s on rse %s (transfer > 0.8 * threshold)" % (activity, rse_name))
                elif waiting > 0:
                    logging.debug("Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse %s" % (threshold, activity, rse_name))
                    delete_rse_transfer_limits(rse=None, activity=activity, rse_id=dest_rse_id)
                    release_waiting_requests(rse=None, activity=activity, rse_id=dest_rse_id)
                    record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s' % (activity, rse_name))
    except:
        logging.critical("Failed to schedule requests, error: %s" % (traceback.format_exc()))
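# --- Example (not part of the original module) ---
# Illustrative shape of the result_dict built above from the
# (activity, destination RSE, account, state) counters; names and numbers are made up.
example_result_dict = {
    'User Subscriptions': {
        'rse-id-1234': {
            'threshold': 100,   # from get_config_limit()
            'transfer': 60,     # QUEUED/SUBMITTING/SUBMITTED requests
            'waiting': 25,      # WAITING requests
            'rse': 'MOCK_RSE',
            'accounts': {
                'alice': {'transfer': 5, 'waiting': 3},
                'bob': {'transfer': 55, 'waiting': 22},
            },
        },
    },
}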
def rule_cleaner(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for expired replication rules
    """

    logging.info('rule_cleaner: starting')
    logging.info('rule_cleaner: started')

    paused_rules = {}  # {rule_id: datetime}

    while not graceful_stop.is_set():
        try:
            start = time.time()
            rules = get_expired_rules(total_workers=total_processes*threads_per_process-1,
                                      worker_number=process*threads_per_process+thread,
                                      limit=1000)
            logging.debug('rule_cleaner index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Remove paused rules from result set
            rules = [rule for rule in rules if rule[0] not in paused_rules]

            if not rules and not once:
                logging.info('rule_cleaner[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 1)
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError, AccessDenied), e:
                        if isinstance(e.args[0], tuple):
                            if match('.*ORA-00054.*', e.args[0][0]):
                                paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                                record_counter('rule.judge.exceptions.LocksDetected')
                                logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception, e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            return

    logging.info('rule_cleaner: graceful stop requested')
    record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
    logging.info('rule_cleaner: graceful stop done')


def stop(signum=None, frame=None):
    """
    Graceful exit.
    """
    graceful_stop.set()
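# --- Example (not part of the original module) ---
# The pause/backoff bookkeeping used by rule_cleaner (and rule_repairer below),
# isolated from the daemon loop: rules that hit ORA-00054 row locks are parked
# for a random 60-600 s before being retried; the rule tuples are illustrative.
from datetime import datetime, timedelta
from random import randint

paused_rules = {}  # {rule_id: datetime at which the rule may be retried}

def pause(rule_id):
    """Park a rule that could not be locked right now."""
    paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))

def refresh_and_filter(rules):
    """Drop expired pauses, then filter still-paused rules out of the work list."""
    for key in list(paused_rules):
        if datetime.utcnow() > paused_rules[key]:
            del paused_rules[key]
    return [rule for rule in rules if rule[0] not in paused_rules]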
def process_miss_files(path, scope, rse, latest_run, max_miss_fraction, max_files_at_site, old_enough_run, force_proceed):
    """
    Process the Missing Replicas.
    """
    prefix = 'storage-consistency-actions (process_miss_files())'
    logger = formatted_logger(logging.log, prefix + '%s')
    latest_miss = re.sub('_stats.json$', '_M.list', latest_run)
    logger(logging.INFO, 'latest_missing = %s' % latest_miss)

    # Create a cc_miss section in the stats file
    t0 = time.time()
    stats_key = "cc_miss"
    cc_stats = stats = None
    stats = Stats(latest_run)
    cc_stats = {
        "start_time": t0,
        "end_time": None,
        "initial_miss_files": 0,
        "confirmed_miss_files": 0,
        "x-check_run": old_enough_run,
        "status": "started"
    }
    stats[stats_key] = cc_stats

    ###
    #   SAFEGUARD
    #   If a large fraction (larger than 'max_miss_fraction') of the files at a site are reported as
    #   'missing', do NOT proceed with the invalidation.
    #   Instead, put a warning in the _stats.json file, so that an operator can have a look.
    ###

    miss_files = sum(1 for line in open(latest_miss))
    logger(logging.INFO, 'miss_files = %d' % miss_files)
    logger(logging.INFO, 'miss_files/max_files_at_site = %f' % (miss_files / max_files_at_site))
    logger(logging.INFO, 'max_miss_fraction configured for this RSE (in %%): %f' % max_miss_fraction)

    labels = {'rse': rse}
    record_gauge('storage.consistency.actions_miss_files_found', miss_files, labels=labels)

    if miss_files / max_files_at_site < max_miss_fraction or force_proceed is True:
        logger(logging.INFO, 'Can proceed with missing files retransfer')

        invalidated_files = 0
        issuer = InternalAccount('root')
        with open(latest_miss, 'r') as csvfile:
            reader = csv.reader(csvfile)
            reason = "invalidating damaged/missing replica"
            for name, in reader:
                logger(logging.INFO, 'Processing invalid replica:\n RSE: %s Scope: %s Name: %s' % (rse, scope, name))

                rse_id = get_rse_id(rse=rse)
                dids = [{'scope': scope, 'name': name}]
                declare_bad_file_replicas(dids=dids, rse_id=rse_id, reason=reason, issuer=issuer)
                invalidated_files += 1
                record_counter('storage.consistency.actions_miss_files_to_retransfer_counter', delta=1, labels=labels)

        # TODO: The stats updating can be refactored in a future version of the Stats class.
        # See: https://github.com/rucio/rucio/pull/5120#discussion_r792688019
        # Update the stats
        t1 = time.time()
        cc_stats.update({
            "end_time": t1,
            "initial_miss_files": miss_files,
            "confirmed_miss_files": invalidated_files,
            "status": "done"
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_miss_files_to_retransfer', invalidated_files, labels=labels)

    else:
        missperc = 100. * miss_files / max_files_at_site
        logger(logging.WARNING, '\n Too many MISS files (%3.2f%%)!\n\
        Stopping and asking for operators help.' % missperc)

        # Update the stats
        t1 = time.time()
        cc_stats.update({
            "end_time": t1,
            "initial_miss_files": miss_files,
            "confirmed_miss_files": 0,
            "status": "ABORTED",
            "aborted_reason": "%3.2f%% miss" % missperc,
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_miss_files_to_retransfer', 0, labels=labels)
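# --- Example (not part of the original module) ---
# Standalone sketch of the SAFEGUARD used above (and in process_dark_files below):
# refuse to act when the fraction of affected files at a site exceeds a configured
# bound, unless an operator forces it; the helper name and numbers are illustrative.
def safeguard_ok(n_affected, max_files_at_site, max_fraction, force_proceed=False):
    """Return True when it is safe to proceed with invalidation/deletion."""
    return force_proceed or (n_affected / max_files_at_site) < max_fraction

# safeguard_ok(120, max_files_at_site=10000, max_fraction=0.05)  # -> True  (1.2%)
# safeguard_ok(900, max_files_at_site=10000, max_fraction=0.05)  # -> False (9.0%)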
def process_dark_files(path, scope, rse, latest_run, max_dark_fraction, max_files_at_site, old_enough_run, force_proceed):
    """
    Process the Dark Files.
    """
    prefix = 'storage-consistency-actions (process_dark_files())'
    logger = formatted_logger(logging.log, prefix + '%s')

    # Create a cc_dark section in the stats file
    t0 = time.time()
    stats_key = "cc_dark"
    cc_stats = stats = None
    stats = Stats(latest_run)
    cc_stats = {
        "start_time": t0,
        "end_time": None,
        "initial_dark_files": 0,
        "confirmed_dark_files": 0,
        "x-check_run": old_enough_run,
        "status": "started"
    }
    stats[stats_key] = cc_stats

    # Compare the two lists, and take only the dark files that are in both
    latest_dark = re.sub('_stats.json$', '_D.list', latest_run)
    old_enough_dark = re.sub('_stats.json$', '_D.list', old_enough_run)
    logger(logging.INFO, 'latest_dark = %s' % latest_dark)
    logger(logging.INFO, 'old_enough_dark = %s' % old_enough_dark)
    confirmed_dark = re.sub('_stats.json$', '_DeletionList.csv', latest_run)
    cmp2dark(new_list=latest_dark, old_list=old_enough_dark, comm_list=confirmed_dark, stats_file=latest_run)

    ###
    #   SAFEGUARD
    #   If a large fraction (larger than 'max_dark_fraction') of the files at a site
    #   are reported as 'dark', do NOT proceed with the deletion.
    #   Instead, put a warning in the _stats.json file, so that an operator can have a look.
    ###

    # Get the number of files recorded by the scanner
    dark_files = sum(1 for line in open(latest_dark))
    confirmed_dark_files = sum(1 for line in open(confirmed_dark))
    logger(logging.INFO, 'dark_files %d' % dark_files)
    logger(logging.INFO, 'confirmed_dark_files %d' % confirmed_dark_files)
    logger(logging.INFO, 'confirmed_dark_files/max_files_at_site = %f' % (confirmed_dark_files / max_files_at_site))
    logger(logging.INFO, 'max_dark_fraction configured for this RSE: %f' % max_dark_fraction)

    # Labels for the Prometheus counters/gauges
    labels = {'rse': rse}
    record_gauge('storage.consistency.actions_dark_files_found', dark_files, labels=labels)
    record_gauge('storage.consistency.actions_dark_files_confirmed', confirmed_dark_files, labels=labels)

    deleted_files = 0
    if confirmed_dark_files / max_files_at_site < max_dark_fraction or force_proceed is True:
        logger(logging.INFO, 'Can proceed with dark files deletion')

        # Then, do the real deletion (code from DeleteReplicas.py)
        issuer = InternalAccount('root')
        with open(confirmed_dark, 'r') as csvfile:
            reader = csv.reader(csvfile)
            for name, in reader:
                logger(logging.INFO, 'Processing a dark file:\n RSE %s Scope: %s Name: %s' % (rse, scope, name))
                rse_id = get_rse_id(rse=rse)
                Intscope = InternalScope(scope=scope, vo=issuer.vo)
                lfns = [{'scope': scope, 'name': name}]
                attributes = get_rse_info(rse=rse)
                pfns = lfns2pfns(rse_settings=attributes, lfns=lfns, operation='delete')
                pfn_key = scope + ':' + name
                url = pfns[pfn_key]
                urls = [url]
                paths = parse_pfns(attributes, urls, operation='delete')
                replicas = [{'scope': Intscope,
                             'rse_id': rse_id,
                             'name': name,
                             'path': paths[url]['path'] + paths[url]['name']}]
                add_quarantined_replicas(rse_id, replicas, session=None)
                deleted_files += 1
                labels = {'rse': rse}
                record_counter('storage.consistency.actions_dark_files_deleted_counter', delta=1, labels=labels)

        # Update the stats
        t1 = time.time()
        cc_stats.update({
            "end_time": t1,
            "initial_dark_files": dark_files,
            "confirmed_dark_files": deleted_files,
            "status": "done"
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_dark_files_deleted', deleted_files, labels=labels)
    else:
        darkperc = 100. * confirmed_dark_files / max_files_at_site
        logger(logging.WARNING, '\n ATTENTION: Too many DARK files! (%3.2f%%) \n\
        Stopping and asking for operators help.' % darkperc)

        # Update the stats
        t1 = time.time()
        cc_stats.update({
            "end_time": t1,
            "initial_dark_files": dark_files,
            "confirmed_dark_files": 0,
            "status": "ABORTED",
            "aborted_reason": "%3.2f%% dark" % darkperc,
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_dark_files_deleted', 0, labels=labels)
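# --- Example (not part of the original module) ---
# The run-file naming convention used by both functions above, shown on a
# hypothetical run name; only the _stats.json suffix is rewritten.
import re

latest_run = '/var/cache/consistency-dump/MOCK_RSE_2024_01_01_stats.json'  # hypothetical path
latest_dark = re.sub('_stats.json$', '_D.list', latest_run)                # dark-file list
latest_miss = re.sub('_stats.json$', '_M.list', latest_run)                # missing-file list
confirmed_dark = re.sub('_stats.json$', '_DeletionList.csv', latest_run)   # confirmed deletions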
def rule_repairer(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for STUCK replication rules
    """

    logging.info('rule_repairer: starting')
    logging.info('rule_repairer: started')

    paused_rules = {}  # {rule_id: datetime}

    while not graceful_stop.is_set():
        try:
            # Select a bunch of rules for this worker to repair
            start = time.time()
            rules = get_stuck_rules(total_workers=total_processes*threads_per_process-1,
                                    worker_number=process*threads_per_process+thread,
                                    delta=-1 if once else 600)
            logging.debug('rule_repairer index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Remove paused rules from result set
            rules = [rule for rule in rules if rule[0] not in paused_rules]

            if not rules and not once:
                logging.info('rule_repairer[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.repairer.threads.%d' % (process*threads_per_process+thread), 1)
                for rule_id in rules:
                    rule_id = rule_id[0]
                    logging.info('rule_repairer[%s/%s]: Repairing rule %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        repair_rule(rule_id=rule_id)
                        logging.debug('rule_repairer[%s/%s]: repairing of %s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError), e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            logging.warning('rule_repairer[%s/%s]: Locks detected for %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                record_gauge('rule.judge.repairer.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception, e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.repairer.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            return
        else:
            time.sleep(30)
def re_evaluator(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check the re-evaluation of dids.
    """

    logging.info('re_evaluator: starting')
    logging.info('re_evaluator: started')

    paused_dids = {}  # {(scope, name): datetime}

    while not graceful_stop.is_set():
        try:
            # Select a bunch of dids for re evaluation for this worker
            start = time.time()  # NOQA
            dids = get_updated_dids(total_workers=total_processes*threads_per_process-1,
                                    worker_number=process*threads_per_process+thread,
                                    limit=1000)
            logging.debug('Re-Evaluation index query time %f fetch size is %d' % (time.time() - start, len(dids)))

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            # Remove paused dids from result set
            dids = [did for did in dids if (did.scope, did.name) not in paused_dids]

            # If the list is empty, send the worker to sleep
            if not dids and not once:
                logging.info('re_evaluator[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 1)
                done_dids = {}
                for did in dids:
                    if graceful_stop.is_set():
                        break

                    # Try to delete all duplicate dids
                    # delete_duplicate_updated_dids(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action, id=did.id)

                    # Check if this did has already been operated on
                    if '%s:%s' % (did.scope, did.name) in done_dids:
                        if did.rule_evaluation_action in done_dids['%s:%s' % (did.scope, did.name)]:
                            continue
                    else:
                        done_dids['%s:%s' % (did.scope, did.name)] = []
                    done_dids['%s:%s' % (did.scope, did.name)].append(did.rule_evaluation_action)

                    try:
                        start_time = time.time()
                        re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                        logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name, time.time() - start_time))
                        delete_updated_did(id=did.id)
                    except DataIdentifierNotFound, e:
                        delete_updated_did(id=did.id)
                    except (DatabaseException, DatabaseError), e:
                        if isinstance(e.args[0], tuple):
                            if match('.*ORA-00054.*', e.args[0][0]):
                                paused_dids[(did.scope, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                                logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                                record_counter('rule.judge.exceptions.LocksDetected')
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed, e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                    except FlushError, e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.critical('re_evaluator[%s/%s]: Flush error for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                        logging.critical(traceback.format_exc())
                record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception, e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            break

    logging.info('re_evaluator: graceful stop requested')
    record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 0)
    logging.info('re_evaluator: graceful stop done')


def stop(signum=None, frame=None):
    """
    Graceful exit.
    """
    graceful_stop.set()
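# --- Example (not part of the original module) ---
# The per-batch deduplication used by re_evaluator above, isolated: each
# (scope, name) pair is evaluated at most once per rule_evaluation_action.
# The DID objects are illustrative stand-ins for the rows returned by
# get_updated_dids().
def dedupe(dids):
    """Return the subset of dids that still needs evaluation, in order."""
    done_dids = {}
    todo = []
    for did in dids:
        key = '%s:%s' % (did.scope, did.name)
        actions = done_dids.setdefault(key, [])
        if did.rule_evaluation_action in actions:
            continue  # this action was already handled for this DID in this batch
        actions.append(did.rule_evaluation_action)
        todo.append(did)
    return todo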