def test_list_expired_dids_with_locked_rules(self):
    """ UNDERTAKER (CORE): Test that the undertaker does not list expired dids with locked rules

    A dataset is registered with a negative lifetime and a single locked
    replication rule; it must then be absent from ``list_expired_dids``.
    """
    tmp_scope = InternalScope('mock')
    jdoe = InternalAccount('jdoe')
    root = InternalAccount('root')

    # Add quota
    set_local_account_limit(jdoe, get_rse_id('MOCK'), -1)

    locked_rule = {
        'account': jdoe,
        'copies': 1,
        'rse_expression': 'MOCK',
        'locked': True,
        'grouping': 'DATASET',
    }
    dsn = {
        'name': 'dsn_%s' % generate_uuid(),
        'scope': tmp_scope,
        'type': 'DATASET',
        'lifetime': -1,
        'rules': [locked_rule],
    }
    add_dids(dids=[dsn], account=root)

    # The dataset is identified by the (scope, name) pair, so a listed did is
    # acceptable as soon as either component differs. Requiring both to differ
    # would wrongly fail on any unrelated expired did that lives in the same
    # scope.
    for did in list_expired_dids(limit=1000):
        assert did['scope'] != dsn['scope'] or did['name'] != dsn['name']
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.

    Registers a heartbeat, lists the expired dids assigned to this worker and
    deletes them chunk by chunk. When a chunk fails with a lock-related
    database error, every did of that chunk is paused for a random 600-2400
    seconds and skipped until the pause has elapsed.

    :param worker_number: Worker identifier, used in log messages only; the
                          share of work comes from the heartbeat.
    :param total_workers: Number of workers, used in log messages only.
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param once: If True, run a single iteration and then stop.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    executable = 'undertaker'
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)

    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, older_than=6000)
            # The format string reads worker_number, total_workers and
            # heartbeat out of locals(); do not rename those variables.
            logging.info('Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'.format(locals()))

            # Refresh paused dids: iterate over a snapshot of the keys so
            # entries can be deleted while looping.
            for key in list(paused_dids):
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'],
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)
            dids = [did for did in dids if (did['scope'], did['name']) not in paused_dids]

            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete', worker_number, len(chunk))
                    delete_dids(dids=chunk, account=InternalAccount('root'), expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids', worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                    # Lock-related error codes (Oracle ORA-00054, PostgreSQL
                    # 55P03, MySQL 3572): back off instead of retrying at once.
                    message = str(e.args[0])
                    if match('.*ORA-00054.*', message) or match('.*55P03.*', message) or match('.*3572.*', message):
                        for did in chunk:
                            paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                        record_counter('undertaker.delete_dids.exceptions.LocksDetected')
                        logging.warning('undertaker[%s/%s]: Locks detected for chunk', heartbeat['assign_thread'], heartbeat['nr_threads'])
                    else:
                        logging.error('Undertaker(%s): Got database error %s.', worker_number, str(e))
        except:
            # Catch-all kept as in the original: a failed iteration is logged
            # and the loop carries on after a short pause.
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.

    Lists the expired dids for this worker's share and deletes them chunk by
    chunk until a graceful stop is requested.

    :param worker_number: Identifier of this worker, passed to ``list_expired_dids``.
    :param total_workers: Total number of workers, passed to ``list_expired_dids``.
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param once: If True, run a single iteration and then stop.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)

    while not graceful_stop.is_set():
        try:
            dids = list_expired_dids(worker_number=worker_number, total_workers=total_workers, limit=10000)

            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete', worker_number, len(chunk))
                    delete_dids(dids=chunk, account='root')
                    logging.info('Undertaker(%s): Delete %s dids', worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                # "except X as e" is valid on Python 2.6+ and Python 3; the
                # former "except X, e" spelling is a SyntaxError on Python 3.
                except DatabaseException as e:
                    logging.error('Undertaker(%s): Got database error %s.', worker_number, str(e))
        except:
            # Catch-all kept as in the original: a failed iteration is logged
            # and the loop carries on after a short pause.
            logging.error(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.

    Registers a heartbeat under the name ``rucio-undertaker``, lists the
    expired dids for the share given by the heartbeat and deletes them chunk
    by chunk until a graceful stop is requested.

    :param worker_number: Worker identifier, used in log messages only.
    :param total_workers: Number of workers, used in log messages only.
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param once: If True, run a single iteration and then stop.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)

    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()

    # Identity shared by the heartbeat registration and de-registration calls.
    identity = {
        'executable': 'rucio-undertaker',
        'hostname': hostname,
        'pid': pid,
        'thread': thread,
    }

    sanity_check(executable='rucio-undertaker', hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(older_than=6000, **identity)
            # The format string reads worker_number, total_workers and
            # heartbeat out of locals(); those names must stay as they are.
            logging.info('Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'.format(locals()))

            dids = list_expired_dids(
                worker_number=heartbeat['assign_thread'] + 1,
                total_workers=heartbeat['nr_threads'],
                limit=10000,
            )

            if not (dids or once):
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for batch in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete', worker_number, len(batch))
                    delete_dids(dids=batch, account='root', expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids', worker_number, len(batch))
                    record_counter(counters='undertaker.delete_dids', delta=len(batch))
                except RuleNotFound as missing_rule:
                    logging.error(missing_rule)
                except DatabaseException as db_error:
                    logging.error('Undertaker(%s): Got database error %s.', worker_number, str(db_error))
        except:
            # Any failure in the iteration is logged, then the loop continues
            # after a one second pause.
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(**identity)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
def run_once(paused_dids: Dict[Tuple, datetime], chunk_size: int, heartbeat_handler: HeartbeatHandler, **_kwargs):
    """
    Run one undertaker cycle: list expired dids and delete them in chunks.

    Dids whose deletion fails with a lock-related database error are recorded
    in ``paused_dids`` with a random expiry 600-2400 seconds ahead and are
    skipped until that time has passed.

    :param paused_dids: Mapping ``(scope, name) -> datetime`` of dids to skip.
                        Mutated in place, so the caller keeps the state
                        between cycles.
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param heartbeat_handler: Provides ``live()``, which returns the worker
                              number, the total number of workers and a
                              ``logger(level, message, *args)`` callable.
    :param _kwargs: Ignored.
    :returns: None.
    """
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        # Refresh paused dids: iterate over a snapshot of the keys so that
        # entries can be removed from the caller's dict while looping.
        for key in list(paused_dids):
            if datetime.utcnow() > paused_dids[key]:
                del paused_dids[key]

        dids = list_expired_dids(worker_number=worker_number, total_workers=total_workers, limit=10000)
        dids = [did for did in dids if (did['scope'], did['name']) not in paused_dids]

        if not dids:
            logger(logging.INFO, 'did not get any work')
            return

        for chunk in chunks(dids, chunk_size):
            # Renew the heartbeat before each chunk and pick up its logger.
            _, _, logger = heartbeat_handler.live()
            try:
                logger(logging.INFO, 'Receive %s dids to delete', len(chunk))
                delete_dids(dids=chunk, account=InternalAccount('root', vo='def'), expire_rules=True)
                logger(logging.INFO, 'Delete %s dids', len(chunk))
                record_counter(name='undertaker.delete_dids', delta=len(chunk))
            except RuleNotFound as error:
                logger(logging.ERROR, error)
            except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                # Lock-related error codes (Oracle ORA-00054, PostgreSQL
                # 55P03, MySQL 3572): back off instead of retrying at once.
                message = str(e.args[0])
                if match('.*ORA-00054.*', message) or match('.*55P03.*', message) or match('.*3572.*', message):
                    for did in chunk:
                        paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                    record_counter('undertaker.delete_dids.exceptions.{exception}', labels={'exception': 'LocksDetected'})
                    logger(logging.WARNING, 'Locks detected for chunk')
                else:
                    logger(logging.ERROR, 'Got database error %s.', str(e))
    except:
        # Catch-all kept as in the original, but reported through the
        # heartbeat logger like every other message in this function.
        logger(logging.CRITICAL, traceback.format_exc())