def update_replication_rule(rule_id, options, issuer, vo='def'):
    """
    Update lock state of a replication rule.

    :param rule_id: The rule_id to lock.
    :param options: Options dictionary.
    :param issuer:  The issuing account of this operation.
    :param vo:      The VO to act on.
    :raises:        RuleNotFound if no Rule can be found.
    """
    kwargs = {'rule_id': rule_id, 'options': options}
    if 'approve' in options:
        if not has_permission(issuer=issuer, vo=vo, action='approve_rule', kwargs=kwargs):
            raise AccessDenied('Account %s can not approve/deny this replication rule.' % (issuer))

        issuer = InternalAccount(issuer, vo=vo)
        if options['approve']:
            rule.approve_rule(rule_id=rule_id, approver=issuer)
        else:
            rule.deny_rule(rule_id=rule_id, approver=issuer, reason=options.get('comment', None))
    else:
        if not has_permission(issuer=issuer, vo=vo, action='update_rule', kwargs=kwargs):
            raise AccessDenied('Account %s can not update this replication rule.' % (issuer))
        rule.update_rule(rule_id=rule_id, options=options)
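# Usage sketch for the gateway function above -- a minimal illustration, not
# taken from the source: the rule id is hypothetical and the issuer is
# assumed to hold the approve_rule / update_rule permission.
rule_id = '1a2b3c4d5e6f47d0aa11bb22cc33dd44'  # hypothetical

# Approve a rule that was created with ask_approval=True...
update_replication_rule(rule_id, options={'approve': True}, issuer='root')
# ...or deny it instead, with a reason:
update_replication_rule(rule_id, options={'approve': False, 'comment': 'quota exceeded'}, issuer='root')

# Any options dictionary without 'approve' falls through to rule.update_rule:
update_replication_rule(rule_id, options={'lifetime': 86400}, issuer='root')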
def test_judge_expire_rule_with_child_rule(self):
    """ JUDGE CLEANER: Test the judge when deleting expired rules with child rules"""
    scope = InternalScope('mock', **self.vo)
    files = create_files(3, scope, self.rse1_id)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.DATASET, self.jdoe)
    attach_dids(scope, dataset, files, self.jdoe)

    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1,
                       rse_expression=self.rse1, grouping='NONE', weight='fakeweight',
                       lifetime=None, locked=False, subscription_id=None)[0]
    child_rule = add_rule(dids=[{'scope': scope, 'name': dataset}], account=self.jdoe, copies=1,
                          rse_expression=self.rse3, grouping='NONE', weight='fakeweight',
                          lifetime=-3, locked=False, subscription_id=None)[0]
    update_rule(rule_id, {'child_rule_id': child_rule})

    rule_cleaner(once=True)
def test_locked_rule(self):
    """ REPLICATION RULE (CORE): Delete a locked replication rule"""
    scope = 'mock'
    files = create_files(3, scope, self.rse1)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, files, 'jdoe')

    rule_id_1 = add_rule(dids=[{'scope': scope, 'name': dataset}], account='jdoe', copies=1,
                         rse_expression=self.rse1, grouping='NONE', weight='fakeweight',
                         lifetime=None, locked=True, subscription_id=None)[0]

    assert_raises(AccessDenied, delete_rule, rule_id_1)
    update_rule(rule_id=rule_id_1, options={'locked': False})
    delete_rule(rule_id=rule_id_1)
def update_replication_rule(rule_id, options, issuer):
    """
    Update lock state of a replication rule.

    :param rule_id: The rule_id to lock.
    :param options: Options dictionary.
    :param issuer:  The issuing account of this operation.
    :raises:        RuleNotFound if no Rule can be found.
    """
    kwargs = {'rule_id': rule_id, 'options': options}
    if not has_permission(issuer=issuer, action='update_rule', kwargs=kwargs):
        raise AccessDenied('Account %s can not update this replication rule.' % (issuer))
    rule.update_rule(rule_id=rule_id, options=options)
def test_account_counter_rule_update(self):
    """ REPLICATION RULE (CORE): Test if the account counter is updated correctly when a rule is updated"""
    scope = 'mock'
    files = create_files(3, scope, self.rse1, bytes=100)
    dataset = 'dataset_' + str(uuid())
    add_did(scope, dataset, DIDType.from_sym('DATASET'), 'jdoe')
    attach_dids(scope, dataset, files, 'jdoe')

    rule_id = add_rule(dids=[{'scope': scope, 'name': dataset}], account='jdoe', copies=1,
                       rse_expression=self.rse1, grouping='ALL', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]

    account_update(once=True)
    account_counter_before_1 = get_account_counter(self.rse1_id, 'jdoe')
    account_counter_before_2 = get_account_counter(self.rse1_id, 'root')

    update_rule(rule_id, {'account': 'root'})
    account_update(once=True)

    # Check if the counter has been updated correctly
    account_counter_after_1 = get_account_counter(self.rse1_id, 'jdoe')
    account_counter_after_2 = get_account_counter(self.rse1_id, 'root')
    assert account_counter_before_1['bytes'] - 3 * 100 == account_counter_after_1['bytes']
    assert account_counter_before_2['bytes'] + 3 * 100 == account_counter_after_2['bytes']
def rebalance_rule(parent_rule_id, activity, rse_expression, priority,
                   source_replica_expression=None, comment=None):
    """
    Rebalance a replication rule to a new RSE

    :param parent_rule_id:            Replication rule to be rebalanced.
    :param activity:                  Activity to be used for the rebalancing.
    :param rse_expression:            RSE expression of the new rule.
    :param priority:                  Priority of the newly created rule.
    :param source_replica_expression: Source replica expression of the new rule.
    :param comment:                   Comment to set on the new rules.
    :returns:                         The new child rule id.
    """
    parent_rule = get_rule(rule_id=parent_rule_id)

    if parent_rule['expires_at'] is None:
        lifetime = None
    else:
        lifetime = (parent_rule['expires_at'] - datetime.utcnow()).days * 24 * 3600 \
                   + (parent_rule['expires_at'] - datetime.utcnow()).seconds

    if parent_rule['grouping'] == RuleGrouping.ALL:
        grouping = 'ALL'
    elif parent_rule['grouping'] == RuleGrouping.NONE:
        grouping = 'NONE'
    else:
        grouping = 'DATASET'

    # Check that a concurrent replica does not already exist at the target RSE
    concurrent_replica = False
    try:
        for lock in get_dataset_locks(parent_rule['scope'], parent_rule['name']):
            if lock['rse'] == rse_expression:
                concurrent_replica = True
    except Exception as error:
        concurrent_replica = True
        print('Exception: get_dataset_locks not feasible for %s %s:' % (parent_rule['scope'], parent_rule['name']))
        raise error

    if concurrent_replica:
        return 'Concurrent replica exists at target rse!'
    print(concurrent_replica)

    child_rule = add_rule(dids=[{'scope': parent_rule['scope'], 'name': parent_rule['name']}],
                          account=parent_rule['account'],
                          copies=parent_rule['copies'],
                          rse_expression=rse_expression,
                          grouping=grouping,
                          weight=parent_rule['weight'],
                          lifetime=lifetime,
                          locked=parent_rule['locked'],
                          subscription_id=parent_rule['subscription_id'],
                          source_replica_expression=source_replica_expression,
                          activity=activity,
                          notify=parent_rule['notification'],
                          purge_replicas=parent_rule['purge_replicas'],
                          ignore_availability=False,
                          comment=parent_rule['comments'] if not comment else comment,
                          ask_approval=False,
                          asynchronous=False,
                          priority=priority)[0]

    update_rule(rule_id=parent_rule_id, options={'child_rule_id': child_rule, 'lifetime': 0})
    return child_rule
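# The remaining-lifetime arithmetic above is easy to misread; this
# self-contained sketch (an illustration, not part of the source) shows the
# same computation in isolation: days * 86400 + seconds of the timedelta,
# i.e. total_seconds() with the sub-second part dropped.
from datetime import datetime, timedelta

def remaining_lifetime(expires_at):
    delta = expires_at - datetime.utcnow()
    return delta.days * 24 * 3600 + delta.seconds  # whole seconds only

print(remaining_lifetime(datetime.utcnow() + timedelta(days=2, minutes=5)))  # ~172800 + 300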
def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400,
            once=True, unlock=False, spread_period=0, purge_replicas=False):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    :param thread:       Thread number at startup.
    :param bulk:         The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once:         Run only once.
    """
    sleep_time = 60
    executable = 'atropos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
    logging.debug(prepend_str + 'Starting worker')

    summary = {}
    lifetime_exceptions = {}
    rand = random.Random(hb['assign_thread'])
    for excep in rucio.core.lifetime_exception.list_exceptions(exception_id=None,
                                                               states=[LifetimeExceptionsState.APPROVED, ],
                                                               session=None):
        key = '{}:{}'.format(excep['scope'].internal, excep['name'])
        if key not in lifetime_exceptions:
            lifetime_exceptions[key] = excep['expires_at']
        elif lifetime_exceptions[key] < excep['expires_at']:
            lifetime_exceptions[key] = excep['expires_at']
    logging.debug(prepend_str + '%s active exceptions' % len(lifetime_exceptions))

    if not dry_run and date_check > now:
        logging.error(prepend_str + 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not GRACEFUL_STOP.is_set():
            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'], session=None)
                logging.info(prepend_str + '%s rules to process' % (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    did_key = '{}:{}'.format(rule.scope.internal, rule.name)
                    logging.debug(prepend_str + 'Working on rule %s on DID %s on %s' % (rule.id, did, rule.rse_expression))

                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    try:
                        rses = parse_expression(rule.rse_expression, filter={'vo': rule.account.vo})
                    except InvalidRSEExpression:
                        logging.warning(prepend_str + 'Rule %s has an RSE expression that results in an empty set: %s' % (rule.id, rule.rse_expression))
                        continue
                    eol_at = rucio.core.lifetime_exception.define_eol(rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logging.warning(prepend_str + 'The computed eol %s differs from the one recorded %s for rule %s on %s at %s' % (eol_at, rule.eol_at, rule.id, did, rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                            continue

                    # Check the exceptions
                    if did_key in lifetime_exceptions:
                        if eol_at > lifetime_exceptions[did_key]:
                            logging.info(prepend_str + 'Rule %s on DID %s on %s has longer expiration date than the one requested : %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did_key]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(prepend_str + 'Updating rule %s on DID %s on %s according to the exception till %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did_key]))
                            eol_at = lifetime_exceptions[did_key]
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[did_key]})
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at < date_check:
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            if lock['rule_id'] == rule[4]:
                                no_locks = False
                                if lock['rse_id'] not in summary:
                                    summary[lock['rse_id']] = {}
                                if did_key not in summary[lock['rse_id']]:
                                    summary[lock['rse_id']][did_key] = {'length': lock['length'] or 0, 'bytes': lock['bytes'] or 0}
                        if no_locks:
                            logging.warning(prepend_str + 'Cannot find a lock for rule %s on DID %s' % (rule.id, did))
                        if not dry_run:
                            lifetime = grace_period + rand.randrange(spread_period + 1)
                            logging.info(prepend_str + 'Setting %s seconds lifetime for rule %s' % (lifetime, rule.id))
                            options = {'lifetime': lifetime}
                            if purge_replicas:
                                options['purge_replicas'] = True
                            if rule.locked and unlock:
                                logging.info(prepend_str + 'Unlocking rule %s', rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

            for rse_id in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse_id]:
                    tot_datasets += 1
                    tot_files += summary[rse_id][did].get('length', 0)
                    tot_size += summary[rse_id][did].get('bytes', 0)
                vo = get_rse_vo(rse_id=rse_id)
                logging.info(prepend_str + 'For RSE %s %s %s datasets will be deleted representing %s files and %s bytes' % (get_rse_name(rse_id=rse_id), '' if vo == 'def' else 'on VO ' + vo, tot_datasets, tot_files, tot_size))

            if once:
                break
            else:
                tottime = time.time() - stime
                if tottime < sleep_time:
                    logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                    time.sleep(sleep_time - tottime)
                    continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
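# Sketch of the deletion-spreading logic in the non-dry-run branch above
# (values illustrative, rule ids hypothetical): every expired rule gets at
# least grace_period seconds of lifetime, plus a random offset in
# [0, spread_period] so deletions do not all fire at the same moment.
# Seeding with the thread number, as atropos does, makes each worker's
# offset sequence reproducible across restarts.
import random

grace_period, spread_period = 86400, 7200
rand = random.Random(0)  # thread number 0
for rule_id in ['rule-a', 'rule-b', 'rule-c']:
    lifetime = grace_period + rand.randrange(spread_period + 1)
    print('%s -> lifetime %s s' % (rule_id, lifetime))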
def test_bb8_full_workflow(vo, root_account, jdoe_account, rse_factory, mock_scope, did_factory):
    """BB8: Test the rebalance rule method"""
    config_core.set(section='bb8', option='allowed_accounts', value='jdoe')
    tot_rses = 4
    rses = [rse_factory.make_posix_rse() for _ in range(tot_rses)]
    rse1, rse1_id = rses[0]
    rse2, rse2_id = rses[1]
    rse3, rse3_id = rses[2]
    rse4, rse4_id = rses[3]

    # Add tags
    # RSE 1 and 2 match expression T1=true
    # RSE 3 and 4 match expression T2=true
    T1 = tag_generator()
    T2 = tag_generator()
    add_rse_attribute(rse1_id, T1, True)
    add_rse_attribute(rse2_id, T1, True)
    add_rse_attribute(rse3_id, T2, True)
    add_rse_attribute(rse4_id, T2, True)

    # Add fake weights
    add_rse_attribute(rse1_id, "fakeweight", 10)
    add_rse_attribute(rse2_id, "fakeweight", 0)
    add_rse_attribute(rse3_id, "fakeweight", 0)
    add_rse_attribute(rse4_id, "fakeweight", 0)
    add_rse_attribute(rse1_id, "freespace", 1)
    add_rse_attribute(rse2_id, "freespace", 1)
    add_rse_attribute(rse3_id, "freespace", 1)
    add_rse_attribute(rse4_id, "freespace", 1)

    # Add quota
    set_local_account_limit(jdoe_account, rse1_id, -1)
    set_local_account_limit(jdoe_account, rse2_id, -1)
    set_local_account_limit(jdoe_account, rse3_id, -1)
    set_local_account_limit(jdoe_account, rse4_id, -1)
    set_local_account_limit(root_account, rse1_id, -1)
    set_local_account_limit(root_account, rse2_id, -1)
    set_local_account_limit(root_account, rse3_id, -1)
    set_local_account_limit(root_account, rse4_id, -1)

    # Invalidate the cache because the result of parse_expression is cached
    REGION.invalidate()

    tot_datasets = 4
    # Create a list of datasets
    datasets = [did_factory.make_dataset() for _ in range(tot_datasets)]
    dsn = [dataset['name'] for dataset in datasets]
    rules = list()

    base_unit = 100000000000
    nb_files1 = 7
    nb_files2 = 5
    nb_files3 = 3
    nb_files4 = 2
    file_size = 1 * base_unit
    rule_to_rebalance = None

    # Add one secondary file
    files = create_files(1, mock_scope, rse1_id, bytes_=1)
    add_rule(dids=[{'scope': mock_scope, 'name': files[0]['name']}], account=jdoe_account, copies=1,
             rse_expression=rse1, grouping='DATASET', weight=None, lifetime=-86400,
             locked=False, subscription_id=None)[0]
    for cnt in range(3, tot_rses):
        add_replicas(rses[cnt][1], files, jdoe_account)
        add_rule(dids=[{'scope': mock_scope, 'name': files[0]['name']}], account=jdoe_account, copies=1,
                 rse_expression=rses[cnt][0], grouping='DATASET', weight=None, lifetime=-86400,
                 locked=False, subscription_id=None)[0]
    rule_cleaner(once=True)

    # Create dataset 1 of 700 GB and create a rule on RSE 1 and RSE 3
    files = create_files(nb_files1, mock_scope, rse1_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[0], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[0]}], account=jdoe_account, copies=1,
                       rse_expression=rse1, grouping='DATASET', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]
    rules.append(rule_id)
    add_replicas(rse3_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[0]}], account=jdoe_account, copies=1,
                       rse_expression=rse3, grouping='DATASET', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]
    rules.append(rule_id)

    # Create dataset 2 of 500 GB and create a rule on RSE 1 and RSE 2
    files = create_files(nb_files2, mock_scope, rse1_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[1], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[1]}], account=jdoe_account, copies=1,
                       rse_expression=rse1, grouping='DATASET', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]
    rules.append(rule_id)
    add_replicas(rse2_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[1]}], account=jdoe_account, copies=1,
                       rse_expression=rse2, grouping='DATASET', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]
    rules.append(rule_id)

    # Create dataset 3 of 300 GB and create a rule on RSE 1. The copy on RSE 3 is secondary
    files = create_files(nb_files3, mock_scope, rse1_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[2], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[2]}], account=jdoe_account, copies=1,
                       rse_expression=rse1, grouping='DATASET', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]
    rule_to_rebalance = rule_id
    rules.append(rule_id)
    add_replicas(rse3_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[2]}], account=jdoe_account, copies=1,
                       rse_expression=rse3, grouping='DATASET', weight=None, lifetime=-86400,
                       locked=False, subscription_id=None)[0]
    rule_cleaner(once=True)
    with pytest.raises(RuleNotFound):
        get_rule(rule_id)

    # Create dataset 4 of 200 GB and create a rule on RSE 3. The copy on RSE 2 is secondary
    files = create_files(nb_files4, mock_scope, rse3_id, bytes_=file_size)
    attach_dids(mock_scope, dsn[3], files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[3]}], account=jdoe_account, copies=1,
                       rse_expression=rse3, grouping='DATASET', weight=None, lifetime=None,
                       locked=False, subscription_id=None)[0]
    rules.append(rule_id)
    add_replicas(rse2_id, files, jdoe_account)
    rule_id = add_rule(dids=[{'scope': mock_scope, 'name': dsn[3]}], account=jdoe_account, copies=1,
                       rse_expression=rse2, grouping='DATASET', weight=None, lifetime=-86400,
                       locked=False, subscription_id=None)[0]
    rule_cleaner(once=True)
    with pytest.raises(RuleNotFound):
        get_rule(rule_id)

    for dataset in dsn:
        set_status(mock_scope, dataset, open=False)

    for rse in rses:
        fill_rse_expired(rse[1])
        set_rse_usage(rse_id=rse[1], source='min_free_space', used=2 * base_unit, free=2 * base_unit, session=None)
        set_rse_usage(rse_id=rse[1], source='storage', used=15 * base_unit, free=2 * base_unit, session=None)
    set_rse_usage(rse_id=rse2_id, source='min_free_space', used=1 * base_unit, free=1 * base_unit, session=None)
    set_rse_usage(rse_id=rse2_id, source='storage', used=6 * base_unit, free=5 * base_unit, session=None)

    run_abacus(once=True, threads=1, fill_history_table=False, sleep_time=10)

    # Summary :
    # RSE 1 : 1500 GB primary + 1 B secondary
    tot_space = [src for src in get_rse_usage(rse1_id) if src['source'] == 'rucio'][0]
    expired = [src for src in get_rse_usage(rse1_id) if src['source'] == 'expired'][0]
    assert tot_space['used'] == (nb_files1 + nb_files2 + nb_files3) * file_size + 1
    assert expired['used'] == 1
    # RSE 2 : 500 GB primary + 200 GB secondary
    tot_space = [src for src in get_rse_usage(rse2_id) if src['source'] == 'rucio'][0]
    expired = [src for src in get_rse_usage(rse2_id) if src['source'] == 'expired'][0]
    assert tot_space['used'] == (nb_files2 + nb_files4) * file_size
    assert expired['used'] == nb_files4 * file_size

    # Total primary on T1=true : 2000 GB
    # Total secondary on T1=true : 200 GB
    # Ratio secondary / primary = 10 %
    # Ratio on RSE 1 : 0 %
    # Ratio on RSE 2 : 40 %

    # Now run BB8
    re_evaluator(once=True, sleep_time=30, did_limit=100)
    bb8_run(once=True, rse_expression='%s=true' % str(T1), move_subscriptions=False,
            use_dump=False, sleep_time=300, threads=1, dry_run=False)

    for rule_id in rules:
        rule = get_rule(rule_id)
        if rule_id != rule_to_rebalance:
            assert rule['child_rule_id'] is None
        else:
            assert rule['child_rule_id'] is not None
            assert rule['expires_at'] <= datetime.utcnow() + timedelta(seconds=1)  # timedelta needed to prevent failure due to rounding effects
            child_rule_id = rule['child_rule_id']
            child_rule = get_rule(child_rule_id)
            assert child_rule['rse_expression'] == rse2
            # For teardown, delete child rule
            update_rule(child_rule_id, {'lifetime': -86400})
    rule_cleaner(once=True)

    for dataset in dsn:
        set_metadata(mock_scope, dataset, 'lifetime', -86400)
    undertaker.run(once=True)
def rule_injector(once=False, sleep_time=60):
    """
    Main loop to check for asynchronous creation of replication rules
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-injectors have the correct worker number on the next try
    executable = 'judge-injector'
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
    prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')

    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
            prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_injected_rules(total_workers=heartbeat['nr_threads'],
                                       worker_number=heartbeat['assign_thread'],
                                       limit=100,
                                       blocked_rules=[key for key in paused_rules])
            logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            if not rules and not once:
                logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    logger(logging.INFO, 'Injecting rule %s' % rule_id)
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        inject_rule(rule_id=rule_id, logger=logger)
                        logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logger(logging.ERROR, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except (RSEWriteBlocked) as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound:
                        pass
                    except InsufficientAccountLimit:
                        # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                        # It should be marked as SUSPENDED
                        logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id)
                        update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
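# Stripped-down sketch of the pause/retry bookkeeping used by the injector
# above (names illustrative, not from the source): rules that hit transient
# errors are parked until a randomized deadline, skipped by the fetch query
# while parked, and released once the deadline passes.
from datetime import datetime, timedelta
from random import randint

paused_rules = {}  # {rule_id: datetime until which the rule is skipped}

def pause(rule_id):
    # 60-600 s of jitter avoids all workers retrying the same rule at once
    paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))

def refresh():
    # drop expired pauses so those rules become eligible again
    for rule_id in list(paused_rules):
        if datetime.utcnow() > paused_rules[rule_id]:
            del paused_rules[rule_id]

pause('rule-a')  # hypothetical id
refresh()
print([key for key in paused_rules])  # still-blocked ids, as passed to get_injected_rules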
def rule_injector(once=False):
    """
    Main loop to check for asynchronous creation of replication rules
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-injectors have the correct worker number on the next try
    live(executable='rucio-judge-injector', hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)

    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable='rucio-judge-injector', hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_injected_rules(total_workers=heartbeat['nr_threads'] - 1,
                                       worker_number=heartbeat['assign_thread'],
                                       limit=100,
                                       blacklisted_rules=[key for key in paused_rules])
            logging.debug('rule_injector[%s/%s] index query time %f fetch size is %d' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, time.time() - start, len(rules)))

            if not rules and not once:
                logging.debug('rule_injector[%s/%s] did not get any work (paused_rules=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, str(len(paused_rules))))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    logging.info('rule_injector[%s/%s]: Injecting rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        inject_rule(rule_id=rule_id)
                        logging.debug('rule_injector[%s/%s]: injection of %s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logging.warning('rule_injector[%s/%s]: Locks detected for %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RSEBlacklisted as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logging.warning('rule_injector[%s/%s]: RSEBlacklisted for rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logging.warning('rule_injector[%s/%s]: ReplicationRuleCreationTemporaryFailed for rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound:
                        pass
                    except InsufficientAccountLimit:
                        # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                        # It should be marked as SUSPENDED
                        logging.info('rule_injector[%s/%s]: Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable='rucio-judge-injector', hostname=hostname, pid=pid, thread=current_thread)
def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400, once=True):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    :param thread:       Thread number at startup.
    :param bulk:         The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once:         Run only once.
    """
    sleep_time = 60
    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    summary = {}
    lifetime_exceptions = get_lifetime_exceptions()
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])

    if not dry_run and date_check > now:
        logging.error(prepend_str + 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not graceful_stop.is_set():
            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])
            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'] - 1)
                logging.info(prepend_str + '%s rules to process' % (len(rules)))
                rule_idx = 0
                for rule in rules:
                    rule_idx += 1
                    logging.debug(prepend_str + 'Working on rule %s on DID %s:%s on %s' % (rule.id, rule.scope, rule.name, rule.rse_expression))
                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    rses = parse_expression(rule.rse_expression)
                    eol_at = define_eol(rule.scope, rule.name, rses)

                    # Check the exceptions
                    if rule.name in lifetime_exceptions:
                        if rule.eol_at > lifetime_exceptions[rule.name]:
                            logging.info(prepend_str + 'Rule %s on DID %s:%s on %s expired. Extension requested till %s' % (rule.id, rule.scope, rule.name, rule.rse_expression, lifetime_exceptions[rule.name]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(prepend_str + 'Updating rule %s on DID %s:%s on %s according to the exception till %s' % (rule.id, rule.scope, rule.name, rule.rse_expression, lifetime_exceptions[rule.name]))
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[rule.name]})
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))
                    elif eol_at != rule.eol_at:
                        logging.warning(prepend_str + 'The computed eol %s differs from the one recorded %s for rule %s on %s:%s at %s' % (eol_at, rule.eol_at, rule.id, rule.scope, rule.name, rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))

                    no_locks = True
                    for lock in get_dataset_locks(rule.scope, rule.name):
                        if lock['rule_id'] == rule[4]:
                            no_locks = False
                            if lock['rse'] not in summary:
                                summary[lock['rse']] = {}
                            if '%s:%s' % (rule.scope, rule.name) not in summary[lock['rse']]:
                                summary[lock['rse']]['%s:%s' % (rule.scope, rule.name)] = {'length': lock['length'] or 0, 'bytes': lock['bytes'] or 0}
                    if no_locks:
                        logging.warning(prepend_str + 'Cannot find a lock for rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))
                    if not dry_run:
                        logging.info(prepend_str + 'Setting %s seconds lifetime for rule %s' % (grace_period, rule.id))
                        try:
                            update_rule(rule.id, options={'lifetime': grace_period})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s:%s' % (rule.id, rule.scope, rule.name))
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

            for rse in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse]:
                    tot_datasets += 1
                    tot_files += summary[rse][did].get('length', 0)
                    tot_size += summary[rse][did].get('bytes', 0)
                logging.info(prepend_str + 'For RSE %s %s datasets will be deleted representing %s files and %s bytes' % (rse, tot_datasets, tot_files, tot_size))

            if once:
                break
            else:
                tottime = time.time() - stime
                if tottime < sleep_time:
                    logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                    time.sleep(sleep_time - tottime)
                    continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def add_files(lfns, account, ignore_availability, vo='def', session=None):
    """
    Bulk add files :
    - Create the file and replica.
    - If it does not exist, create the dataset containing the file as well as a rule on the dataset on ANY sites.
    - Create all the ascendants of the dataset if they do not exist.

    :param lfns:                List of lfn dictionaries ({'lfn': <lfn>, 'rse': <rse>, 'bytes': <bytes>, 'adler32': <adler32>, 'guid': <guid>, 'pfn': <pfn>}).
    :param account:             The issuer account.
    :param ignore_availability: A boolean to ignore blocklisted sites.
    :param vo:                  The VO to act on.
    :param session:             The session used.
    """
    rule_extension_list = []
    attachments = []
    # The list of scopes is necessary for the extract_scope
    filter_ = {'scope': InternalScope(scope='*', vo=vo)}
    scopes = list_scopes(filter_=filter_, session=session)
    scopes = [scope.external for scope in scopes]
    exist_lfn = []
    try:
        lifetime_dict = config_get(section='dirac', option='lifetime', session=session)
        lifetime_dict = loads(lifetime_dict)
    except ConfigNotFound:
        lifetime_dict = {}
    except JSONDecodeError as err:
        raise InvalidType('Problem parsing lifetime option in dirac section : %s' % str(err))
    except Exception as err:
        raise RucioException(str(err))

    for lfn in lfns:
        # First check if the file exists
        filename = lfn['lfn']
        lfn_scope, _ = extract_scope(filename, scopes)
        lfn_scope = InternalScope(lfn_scope, vo=vo)
        exists, did_type = _exists(lfn_scope, filename)
        if exists:
            continue

        # Get all the ascendants of the file
        lfn_split = filename.split('/')
        lpns = ["/".join(lfn_split[:idx]) for idx in range(2, len(lfn_split))]
        lpns.reverse()
        print(lpns)

        # The parent must be a dataset. Register it as well as the rule
        dsn_name = lpns[0]
        dsn_scope, _ = extract_scope(dsn_name, scopes)
        dsn_scope = InternalScope(dsn_scope, vo=vo)

        # Compute lifetime
        lifetime = None
        if dsn_scope in lifetime_dict:
            lifetime = lifetime_dict[dsn_scope]
        else:
            for pattern in lifetime_dict:
                if re.match(pattern, dsn_scope):
                    lifetime = lifetime_dict[pattern]
                    break

        exists, did_type = _exists(dsn_scope, dsn_name)
        if exists and did_type == DIDType.CONTAINER:
            raise UnsupportedOperation('Cannot create %s as dataset' % dsn_name)
        if (dsn_name not in exist_lfn) and not exists:
            print('Will create %s' % dsn_name)
            # to maintain a compatibility between master and LTS-1.26 branches remove keywords for first 3 arguments
            add_did(dsn_scope, dsn_name, DIDType.DATASET,
                    account=InternalAccount(account, vo=vo),
                    statuses=None,
                    meta=None,
                    rules=[{'copies': 1, 'rse_expression': 'ANY=true', 'weight': None,
                            'account': InternalAccount(account, vo=vo), 'lifetime': None, 'grouping': 'NONE'}],
                    lifetime=None,
                    dids=None,
                    rse_id=None,
                    session=session)
            exist_lfn.append(dsn_name)
            parent_name = lpns[1]
            parent_scope, _ = extract_scope(parent_name, scopes)
            parent_scope = InternalScope(parent_scope, vo=vo)
            attachments.append({'scope': parent_scope, 'name': parent_name, 'dids': [{'scope': dsn_scope, 'name': dsn_name}]})
            rule_extension_list.append((dsn_scope, dsn_name))
        if lifetime and (dsn_scope, dsn_name) not in rule_extension_list:
            # Reset the lifetime of the rule to the configured value
            rule = [rul for rul in list_rules({'scope': dsn_scope, 'name': dsn_name, 'account': InternalAccount(account, vo=vo)}, session=session)
                    if rul['rse_expression'] == 'ANY=true']
            if rule:
                update_rule(rule[0]['id'], options={'lifetime': lifetime}, session=session)
            rule_extension_list.append((dsn_scope, dsn_name))

        # Register the file
        rse_id = lfn.get('rse_id', None)
        if not rse_id:
            raise InvalidType('Missing rse_id')
        bytes_ = lfn.get('bytes', None)
        guid = lfn.get('guid', None)
        adler32 = lfn.get('adler32', None)
        pfn = lfn.get('pfn', None)
        files = {'scope': lfn_scope, 'name': filename, 'bytes': bytes_, 'adler32': adler32}
        if pfn:
            files['pfn'] = str(pfn)
        if guid:
            files['meta'] = {'guid': guid}
        add_replicas(rse_id=rse_id, files=[files], dataset_meta=None,
                     account=InternalAccount(account, vo=vo),
                     ignore_availability=ignore_availability, session=session)
        add_rule(dids=[{'scope': lfn_scope, 'name': filename}],
                 account=InternalAccount(account, vo=vo), copies=1, rse_expression=lfn['rse'],
                 grouping=None, weight=None, lifetime=86400, locked=None, subscription_id=None,
                 session=session)
        attachments.append({'scope': dsn_scope, 'name': dsn_name, 'dids': [{'scope': lfn_scope, 'name': filename}]})

        # Now loop over the ascendants of the dataset and create them
        for lpn in lpns[1:]:
            child_scope, _ = extract_scope(lpn, scopes)
            child_scope = InternalScope(child_scope, vo=vo)
            exists, did_type = _exists(child_scope, lpn)
            if exists and did_type == DIDType.DATASET:
                raise UnsupportedOperation('Cannot create %s as container' % lpn)
            if (lpn not in exist_lfn) and not exists:
                print('Will create %s' % lpn)
                add_did(child_scope, lpn, DIDType.CONTAINER,
                        account=InternalAccount(account, vo=vo),
                        statuses=None, meta=None, rules=None, lifetime=None,
                        dids=None, rse_id=None, session=session)
                exist_lfn.append(lpn)
                parent_name = lpns[lpns.index(lpn) + 1]
                parent_scope, _ = extract_scope(parent_name, scopes)
                parent_scope = InternalScope(parent_scope, vo=vo)
                attachments.append({'scope': parent_scope, 'name': parent_name, 'dids': [{'scope': child_scope, 'name': lpn}]})

    # Finally attach everything
    attach_dids_to_dids(attachments, account=InternalAccount(account, vo=vo), ignore_duplicate=True, session=session)
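# Worked example of the ascendant derivation in add_files (the path is
# hypothetical): prefixes are built from two components up to the parent
# directory, then reversed, so lpns[0] is the dataset and the remaining
# entries are its enclosing containers.
filename = '/belle/data/run1/file.root'
lfn_split = filename.split('/')  # ['', 'belle', 'data', 'run1', 'file.root']
lpns = ["/".join(lfn_split[:idx]) for idx in range(2, len(lfn_split))]
lpns.reverse()
print(lpns)  # ['/belle/data/run1', '/belle/data', '/belle']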
def rebalance_rule(parent_rule, activity, rse_expression, priority,
                   source_replica_expression='*\\bb8-enabled=false', comment=None):
    """
    Rebalance a replication rule to a new RSE

    :param parent_rule:               Replication rule to be rebalanced.
    :param activity:                  Activity to be used for the rebalancing.
    :param rse_expression:            RSE expression of the new rule.
    :param priority:                  Priority of the newly created rule.
    :param source_replica_expression: Source replica expression of the new rule.
    :param comment:                   Comment to set on the new rules.
    :returns:                         The new child rule id.
    """
    if parent_rule['expires_at'] is None:
        lifetime = None
    else:
        lifetime = (parent_rule['expires_at'] - datetime.utcnow()).days * 24 * 3600 \
                   + (parent_rule['expires_at'] - datetime.utcnow()).seconds

    if parent_rule['grouping'] == RuleGrouping.ALL:
        grouping = 'ALL'
    elif parent_rule['grouping'] == RuleGrouping.NONE:
        grouping = 'NONE'
    else:
        grouping = 'DATASET'

    # ensure that expressions are for correct vo
    rule_vo = parent_rule['scope'].vo
    if parent_rule['scope'].vo != 'def':
        source_replica_expression = 'vo={}&({})'.format(rule_vo, source_replica_expression)
        rse_expression = 'vo={}&({})'.format(rule_vo, rse_expression)

    # check that a concurrent replica does not already exist at the target RSE
    concurrent_replica = False
    try:
        for lock in get_dataset_locks(parent_rule['scope'], parent_rule['name']):
            lock_rse_expr = lock['rse']
            if rule_vo != 'def':
                lock_rse_expr = 'vo={}&({})'.format(rule_vo, lock_rse_expr)
            if lock_rse_expr == rse_expression:  # may need to evaluate to be sure... could get 'vo=tst&(vo=tst&(MOCK))'
                concurrent_replica = True
    except Exception as error:
        concurrent_replica = True
        print('Exception: get_dataset_locks not feasible for %s %s:' % (parent_rule['scope'], parent_rule['name']))
        raise error

    if concurrent_replica:
        return 'Concurrent replica exists at target rse!'
    print(concurrent_replica)

    child_rule = add_rule(dids=[{'scope': parent_rule['scope'], 'name': parent_rule['name']}],
                          account=parent_rule['account'],
                          copies=parent_rule['copies'],
                          rse_expression=rse_expression,
                          grouping=grouping,
                          weight=parent_rule['weight'],
                          lifetime=lifetime,
                          locked=parent_rule['locked'],
                          subscription_id=parent_rule['subscription_id'],
                          source_replica_expression=source_replica_expression,
                          activity=activity,
                          notify=parent_rule['notification'],
                          purge_replicas=parent_rule['purge_replicas'],
                          ignore_availability=False,
                          comment=parent_rule['comments'] if not comment else comment,
                          ask_approval=False,
                          asynchronous=False,
                          ignore_account_limit=True,
                          priority=priority)[0]

    update_rule(rule_id=parent_rule['id'], options={'child_rule_id': child_rule, 'lifetime': 0})
    return child_rule
def rebalance_rule(
    parent_rule,
    activity,
    rse_expression,
    priority,
    source_replica_expression="*\\bb8-enabled=false",
    comment=None,
    session=None,
):
    """
    Rebalance a replication rule to a new RSE

    :param parent_rule:               Replication rule to be rebalanced.
    :param activity:                  Activity to be used for the rebalancing.
    :param rse_expression:            RSE expression of the new rule.
    :param priority:                  Priority of the newly created rule.
    :param source_replica_expression: Source replica expression of the new rule.
    :param comment:                   Comment to set on the new rules.
    :returns:                         The new child rule id.
    """
    if parent_rule["expires_at"] is None:
        lifetime = None
    else:
        lifetime = (parent_rule["expires_at"] - datetime.utcnow()).days * 24 * 3600 \
                   + (parent_rule["expires_at"] - datetime.utcnow()).seconds

    if parent_rule["grouping"] == RuleGrouping.ALL:
        grouping = "ALL"
    elif parent_rule["grouping"] == RuleGrouping.NONE:
        grouping = "NONE"
    else:
        grouping = "DATASET"

    # check that a concurrent replica does not already exist at the target RSE
    concurrent_replica = False
    for lock in get_dataset_locks(parent_rule["scope"], parent_rule["name"]):
        lock_rse_expr = lock["rse"]
        if lock_rse_expr == rse_expression:
            concurrent_replica = True

    if concurrent_replica:
        return None

    child_rule = add_rule(
        dids=[{"scope": parent_rule["scope"], "name": parent_rule["name"]}],
        account=parent_rule["account"],
        copies=parent_rule["copies"],
        rse_expression=rse_expression,
        grouping=grouping,
        weight=parent_rule["weight"],
        lifetime=lifetime,
        locked=parent_rule["locked"],
        subscription_id=parent_rule["subscription_id"],
        source_replica_expression=source_replica_expression,
        activity=activity,
        notify=parent_rule["notification"],
        purge_replicas=parent_rule["purge_replicas"],
        ignore_availability=False,
        comment=parent_rule["comments"] if not comment else comment,
        ask_approval=False,
        asynchronous=False,
        ignore_account_limit=True,
        priority=priority,
        session=session,
    )[0]

    update_rule(
        rule_id=parent_rule["id"],
        options={"child_rule_id": child_rule, "lifetime": 0},
        session=session,
    )
    return child_rule
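# Hedged usage sketch for rebalance_rule above (the rule id, target RSE and
# activity are hypothetical; assumes get_rule from the same module):
parent_rule = get_rule(rule_id='1a2b3c4d5e6f47d0aa11bb22cc33dd44')
child_id = rebalance_rule(parent_rule,
                          activity='Data rebalancing',
                          rse_expression='SOME_TARGET_RSE',
                          priority=3)
if child_id is None:
    print('Skipped: a lock already exists at the target RSE expression')
else:
    # The parent now carries child_rule_id and lifetime=0; the judge
    # daemons delete it once the child rule is satisfied.
    print('Created child rule %s' % child_id)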
                        logging.warning('rule_injector[%s/%s]: ReplicationRuleCreationTemporaryFailed for rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound:
                        pass
                    except InsufficientAccountLimit:
                        # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                        # It should be marked as SUSPENDED
                        logging.info('rule_injector[%s/%s]: Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % (heartbeat['assign_thread'], heartbeat['nr_threads'] - 1, rule_id))
                        update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
def run_once(paused_rules, heartbeat_handler, **_kwargs):
    worker_number, total_workers, logger = heartbeat_handler.live()
    try:
        start = time.time()

        # Refresh paused rules
        iter_paused_rules = deepcopy(paused_rules)
        for key in iter_paused_rules:
            if datetime.utcnow() > paused_rules[key]:
                del paused_rules[key]

        rules = get_injected_rules(total_workers=total_workers,
                                   worker_number=worker_number,
                                   limit=100,
                                   blocked_rules=[key for key in paused_rules])
        logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

        if not rules:
            logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
            return

        for rule in rules:
            _, _, logger = heartbeat_handler.live()
            rule_id = rule[0]
            logger(logging.INFO, 'Injecting rule %s' % rule_id)
            if graceful_stop.is_set():
                break
            try:
                start = time.time()
                inject_rule(rule_id=rule_id, logger=logger)
                logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start))
            except (DatabaseException, DatabaseError) as e:
                if match('.*ORA-00054.*', str(e.args[0])):
                    paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                    logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                elif match('.*QueuePool.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                elif match('.*ORA-03135.*', str(e.args[0])):
                    logger(logging.WARNING, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
                else:
                    logger(logging.ERROR, 'DatabaseException', exc_info=True)
                    record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            except (RSEWriteBlocked) as e:
                paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id)
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            except ReplicationRuleCreationTemporaryFailed as e:
                paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id)
                record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
            except RuleNotFound:
                pass
            except InsufficientAccountLimit:
                # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                # It should be marked as SUSPENDED
                logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id)
                update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})
    except (DatabaseException, DatabaseError) as e:
        if match('.*QueuePool.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        elif match('.*ORA-03135.*', str(e.args[0])):
            logger(logging.WARNING, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
        else:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
    except Exception as e:
        logger(logging.CRITICAL, 'Exception', exc_info=True)
        record_counter('rule.judge.exceptions.{exception}', labels={'exception': e.__class__.__name__})
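# Hedged sketch of the contract run_once expects (the handler below is a
# stand-in, not Rucio's actual daemon plumbing): heartbeat_handler.live()
# must return (worker_number, total_workers, logger), and paused_rules is
# mutable state shared across invocations so the pause table survives.
import logging

class StubHeartbeatHandler:
    def live(self):
        # logging.log has the same (level, msg, ...) call shape as the
        # formatted_logger used elsewhere in this file
        return 0, 1, logging.log  # worker 0 of 1 total

paused_rules = {}
run_once(paused_rules=paused_rules, heartbeat_handler=StubHeartbeatHandler())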