def rule_cleaner(once=False): """ Main loop to check for expired replication rules """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_rules = {} # {rule_id: datetime} # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try live(executable='rucio-judge-cleaner', hostname=hostname, pid=pid, thread=current_thread) graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable='rucio-judge-cleaner', hostname=hostname, pid=pid, thread=current_thread) start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_expired_rules(total_workers=heartbeat['nr_threads'] - 1, worker_number=heartbeat['assign_thread'], limit=200, blacklisted_rules=[key for key in paused_rules]) logging.debug('rule_cleaner[%s/%s] index query time %f fetch size is %d' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, time.time() - start, len(rules))) if not rules and not once: logging.debug('rule_cleaner[%s/%s] did not get any work (paused_rules=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, str(len(paused_rules)))) graceful_stop.wait(60) else: for rule in rules: rule_id = rule[0] rule_expression = rule[1] logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, rule_expression)) if graceful_stop.is_set(): break try: start = time.time() delete_rule(rule_id=rule_id, nowait=True) logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, time.time() - start)) except (DatabaseException, DatabaseError, UnsupportedOperation), e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400)) record_counter('rule.judge.exceptions.LocksDetected') logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id)) elif match('.*QueuePool.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except RuleNotFound, e: pass
def rule_cleaner(once=False, process=0, total_processes=1, thread=0, threads_per_process=1): """ Main loop to check for expired replication rules """ logging.info('rule_cleaner: starting') logging.info('rule_cleaner: started') paused_rules = {} # {rule_id: datetime} while not graceful_stop.is_set(): try: start = time.time() rules = get_expired_rules(total_workers=total_processes*threads_per_process-1, worker_number=process*threads_per_process+thread, limit=1000) logging.debug('rule_cleaner index query time %f fetch size is %d' % (time.time() - start, len(rules))) # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] # Remove paused rules from result set rules = [rule for rule in rules if rule[0] not in paused_rules] if not rules and not once: logging.info('rule_cleaner[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1)) time.sleep(10) else: record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 1) for rule in rules: rule_id = rule[0] rule_expression = rule[1] logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, rule_expression)) if graceful_stop.is_set(): break try: start = time.time() delete_rule(rule_id=rule_id, nowait=True) logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, time.time() - start)) except (DatabaseException, DatabaseError, AccessDenied), e: if isinstance(e.args[0], tuple): if match('.*ORA-00054.*', e.args[0][0]): paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) record_counter('rule.judge.exceptions.LocksDetected') logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id)) else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0) except Exception, e: record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0) logging.critical(traceback.format_exc()) if once: return
def rule_cleaner(once=False): """ Main loop to check for expired replication rules """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_rules = {} # {rule_id: datetime} # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try executable = 'judge-cleaner' heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread) prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread) prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_expired_rules( total_workers=heartbeat['nr_threads'], worker_number=heartbeat['assign_thread'], limit=200, blocked_rules=[key for key in paused_rules]) logger( logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) if not rules and not once: logger( logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules))) graceful_stop.wait(60) else: for rule in rules: rule_id = rule[0] rule_expression = rule[1] logger( logging.INFO, 'Deleting rule %s with expression %s' % (rule_id, rule_expression)) if graceful_stop.is_set(): break try: start = time.time() delete_rule(rule_id=rule_id, nowait=True) logger( logging.DEBUG, 'deletion of %s took %f' % (rule_id, time.time() - start)) except (DatabaseException, DatabaseError, UnsupportedOperation) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow( ) + timedelta(seconds=randint(600, 2400)) record_counter( 'rule.judge.exceptions.LocksDetected') logger(logging.WARNING, 'Locks detected for %s' % rule_id) elif match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.ERROR, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except RuleNotFound: pass except (DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except Exception as e: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) if once: break die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def run_once(paused_rules, heartbeat_handler, **_kwargs): worker_number, total_workers, logger = heartbeat_handler.live() try: start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_expired_rules(total_workers=total_workers, worker_number=worker_number, limit=200, blocked_rules=[key for key in paused_rules]) logger( logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) if not rules: logger( logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules))) return for rule in rules: _, _, logger = heartbeat_handler.live() rule_id = rule[0] rule_expression = rule[1] logger( logging.INFO, 'Deleting rule %s with expression %s' % (rule_id, rule_expression)) if graceful_stop.is_set(): break try: start = time.time() delete_rule(rule_id=rule_id, nowait=True) logger( logging.DEBUG, 'deletion of %s took %f' % (rule_id, time.time() - start)) except (DatabaseException, DatabaseError, UnsupportedOperation) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow() + timedelta( seconds=randint(600, 2400)) record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'}) logger(logging.WARNING, 'Locks detected for %s' % rule_id) elif match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) else: logger(logging.ERROR, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) except RuleNotFound: pass except (DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) else: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__}) except Exception as e: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})