def declare_bad_file_replicas(pfns, rse, issuer): """ Declare a list of bad replicas. :param pfns: The list of PFNs. :param rse: The RSE name. :param issuer: The issuer account. """ kwargs = {} if not permission.has_permission(issuer=issuer, action='declare_bad_file_replicas', kwargs=kwargs): raise exception.AccessDenied('Account %s can not declare bad replicas on %s' % (issuer, rse)) return replica.declare_bad_file_replicas(pfns=pfns, rse=rse)
""" REPLICA (CORE): Add bad replicas and list them""" tmp_scope = 'mock' nbfiles = 5 # Adding replicas to deterministic RSE files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in xrange(nbfiles)] rse_info = rsemgr.get_rse_info('MOCK') rse_id1 = rse_info['id'] add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True) # Listing replicas on deterministic RSE replicas = [] list_rep = [] for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']): replicas.extend(replica['rses']['MOCK']) list_rep.append(replica) declare_bad_file_replicas(replicas, 'MOCK') bad_replicas = list_bad_replicas() nbbadrep = 0 for rep in list_rep: for badrep in bad_replicas: if badrep['rse_id'] == rse_id1: if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']: nbbadrep += 1 assert_equal(len(replicas), nbbadrep) # Adding replicas to non-deterministic RSE files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb', 'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), 'meta': {'events': 10}} for i in xrange(nbfiles)] rse_info = rsemgr.get_rse_info('MOCK2') rse_id2 = rse_info['id'] add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
files=files, account='root', ignore_availability=True) # Listing replicas on deterministic RSE replicas = [] list_rep = [] for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): replicas.extend(replica['rses']['MOCK']) list_rep.append(replica) r = declare_bad_file_replicas(replicas, 'This is a good reason', 'root') assert_equal(r, {}) bad_replicas = list_bad_replicas() nbbadrep = 0 for rep in list_rep: for badrep in bad_replicas: if badrep['rse_id'] == rse_id1: if badrep['scope'] == rep['scope'] and badrep[ 'name'] == rep['name']: nbbadrep += 1 assert_equal(len(replicas), nbbadrep) # Adding replicas to non-deterministic RSE files = [{ 'scope': tmp_scope,
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, rse_expression='MOCK', max_replicas_per_rse=100): """ Main loop to check for available replicas which are labeled as suspicious Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table and available on other RSE. Finds surls of these replicas and declares them as bad. :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped. :param younger_than: The number of days since which bad_replicas table will be searched for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'), but 'AVAILABLE' on other RSE(s). :param nattempts: The minimum number of appearances in the bad_replica DB table in order to appear in the resulting list of replicas for recovery. :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'. :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE. If more is found, processing is skipped and warning is printed. :returns: None """ # assembling the worker name identifier ('executable') including the rses from <rse_expression> # in order to have the possibility to detect a start of a second instance with the same set of RSES executable = argv[0] rses = [] for rse in parse_expression(expression=rse_expression): rses.append(rse['rse']) rses.sort() executable += ' --rse-expression ' + str(rses) sanity_check(executable=executable, hostname=socket.gethostname()) # make an initial heartbeat - expected only one replica-recoverer thread on one node # heartbeat mechanism is used in this daemon only for information purposes # (due to expected low load, the actual DB query does not filter the result based on worker number) live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) # wait a moment in case all workers started at the same time GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) total_workers = heartbeat['nr_threads'] worker_number = heartbeat['assign_thread'] + 1 # there is only 1 worker allowed for this daemon if total_workers != 1: logging.error( 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname()) die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) break start = time.time() logging.info( 'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,' + ' reported suspicious in the last %i days at least %i times which are available on other RSEs.', worker_number, total_workers, rse_expression, younger_than, nattempts) getfileskwargs = { 'younger_than': younger_than, 'nattempts': nattempts, 'exclude_states': ['B', 'R', 'D', 'L', 'T'], 'available_elsewhere': True, 'is_suspicious': True } recoverable_replicas = get_suspicious_files( rse_expression, **getfileskwargs) logging.info( 'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.', worker_number, total_workers, time.time() - start, len(recoverable_replicas)) if not recoverable_replicas and not once: logging.info( 'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas. Sleeping for 60 seconds.', worker_number, total_workers, len(recoverable_replicas)) GRACEFUL_STOP.wait(60) else: logging.info( 'replica_recoverer[%i/%i]: looking for replica surls.', worker_number, total_workers) start = time.time() surls_to_recover = { } # dictionary of { rse1: [surl1, surl2, ... ], rse2: ... } cnt_surl_not_found = 0 for replica in recoverable_replicas: scope = replica['scope'] name = replica['name'] rse = replica['rse'] rse_id = replica['rse_id'] if GRACEFUL_STOP.is_set(): break if rse_id not in surls_to_recover: surls_to_recover[rse_id] = [] # for each suspicious replica, we get its surl through the list_replicas function surl_not_found = True for rep in list_replicas([{'scope': scope, 'name': name}]): for site in rep['rses']: if site == rse_id: surls_to_recover[rse_id].append( rep['rses'][site][0]) surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logging.warning( 'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.', worker_number, total_workers, name, rse) logging.info( 'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.', worker_number, total_workers, len(recoverable_replicas) - cnt_surl_not_found, len(recoverable_replicas), time.time() - start) for rse_id in surls_to_recover: logging.info( 'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.', worker_number, total_workers, len(surls_to_recover[rse_id]), rse, str(surls_to_recover[rse_id])) if len(surls_to_recover[rse_id]) > max_replicas_per_rse: logging.warning( 'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. Please investigate.', worker_number, total_workers, max_replicas_per_rse, str(len(surls_to_recover[rse_id])), rse) else: declare_bad_file_replicas( pfns=surls_to_recover[rse_id], reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root'), status=BadFilesStatus.BAD, session=None) logging.info( 'replica_recoverer[%i/%i]: finished declaring bad replicas on %s.', worker_number, total_workers, rse) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) except Exception as err: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) if once: break die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number, total_workers)
def __update_atime(self): """ Bulk update atime. """ replicas = [] rses = [] for report in self.__reports: try: # Identify suspicious files try: if self.__bad_files_patterns and report['eventType'] in [ 'get_sm', 'get_sm_a', 'get' ] and 'clientState' in report and report[ 'clientState'] not in [ 'DONE', 'FOUND_ROOT', 'ALREADY_DONE' ]: for pattern in self.__bad_files_patterns: if 'stateReason' in report and report[ 'stateReason'] and isinstance( report['stateReason'], str) and pattern.match( report['stateReason']): reason = report['stateReason'][:255] if 'url' not in report or not report['url']: logging.error( 'Missing url in the following trace : ' + str(report)) else: try: surl = report['url'] declare_bad_file_replicas( [ surl, ], reason=reason, issuer='root', status=BadFilesStatus.SUSPICIOUS) logging.info( 'Declare suspicious file %s with reason %s' % (report['url'], reason)) except Exception as error: logging.error( 'Failed to declare suspicious file' + str(error)) except Exception as error: logging.error('Problem with bad trace : %s . Error %s' % (str(report), str(error))) # check if scope in report. if not skip this one. if 'scope' not in report: record_counter('daemons.tracer.kronos.missing_scope') if report['eventType'] != 'touch': continue else: record_counter('daemons.tracer.kronos.with_scope') # handle all events starting with get* and download and touch events. if not report['eventType'].startswith('get') and not report[ 'eventType'].startswith('sm_get') and not report[ 'eventType'] == 'download' and not report[ 'eventType'] == 'touch': continue if report['eventType'].endswith('_es'): continue record_counter('daemons.tracer.kronos.total_get') if report['eventType'] == 'get': record_counter('daemons.tracer.kronos.dq2clients') elif report['eventType'] == 'get_sm' or report[ 'eventType'] == 'sm_get': if report['eventVersion'] == 'aCT': record_counter( 'daemons.tracer.kronos.panda_production_act') else: record_counter( 'daemons.tracer.kronos.panda_production') elif report['eventType'] == 'get_sm_a' or report[ 'eventType'] == 'sm_get_a': if report['eventVersion'] == 'aCT': record_counter( 'daemons.tracer.kronos.panda_analysis_act') else: record_counter('daemons.tracer.kronos.panda_analysis') elif report['eventType'] == 'download': record_counter('daemons.tracer.kronos.rucio_download') elif report['eventType'] == 'touch': record_counter('daemons.tracer.kronos.rucio_touch') else: record_counter('daemons.tracer.kronos.other_get') if report['eventType'] == 'download' or report[ 'eventType'] == 'touch': report['usrdn'] = report['account'] if report['usrdn'] in self.__excluded_usrdns: continue # handle touch and non-touch traces differently if report['eventType'] != 'touch': # check if the report has the right state. if 'eventVersion' in report: if report['eventVersion'] != 'aCT': if report['clientState'] in self.__excluded_states: continue if 'remoteSite' not in report: continue if not report['remoteSite']: continue if 'filename' not in report: if 'name' in report: report['filename'] = report['name'] rses = report['remoteSite'].strip().split(',') for rse in rses: replicas.append({ 'name': report['filename'], 'scope': report['scope'], 'rse': rse, 'accessed_at': datetime.utcfromtimestamp( report['traceTimeentryUnix']), 'traceTimeentryUnix': report['traceTimeentryUnix'], 'eventVersion': report['eventVersion'] }) else: # if touch event and if datasetScope is in the report then it means # that there is no file scope/name and therefore only the dataset is # put in the queue to be updated and the rest is skipped. if 'datasetScope' in report: rse = None if 'remoteSite' in report: rse = report['remoteSite'] self.__dataset_queue.put({ 'scope': report['datasetScope'], 'name': report['dataset'], 'rse': rse, 'accessed_at': datetime.utcfromtimestamp( report['traceTimeentryUnix']) }) continue else: if 'remoteSite' not in report: continue replicas.append({ 'name': report['filename'], 'scope': report['scope'], 'rse': report['remoteSite'], 'accessed_at': datetime.utcfromtimestamp( report['traceTimeentryUnix']) }) except (KeyError, AttributeError): logging.error(format_exc()) record_counter('daemons.tracer.kronos.report_error') continue for did in list_parent_dids(report['scope'], report['filename']): if did['type'] != DIDType.DATASET: continue # do not update _dis datasets if did['scope'] == 'panda' and '_dis' in did['name']: continue for rse in rses: self.__dataset_queue.put({ 'scope': did['scope'], 'name': did['name'], 'did_type': did['type'], 'rse': rse, 'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix']) }) logging.debug(replicas) try: start_time = time() for replica in replicas: # if touch replica hits a locked row put the trace back into queue for later retry if not touch_replica(replica): resubmit = { 'filename': replica['name'], 'scope': replica['scope'], 'remoteSite': replica['rse'], 'traceTimeentryUnix': replica['traceTimeentryUnix'], 'eventType': 'get', 'usrdn': 'someuser', 'clientState': 'DONE', 'eventVersion': replica['eventVersion'] } self.__conn.send(body=jdumps(resubmit), destination=self.__queue, headers={ 'appversion': 'rucio', 'resubmitted': '1' }) record_counter('daemons.tracer.kronos.sent_resubmitted') logging.warning( '(kronos_file) hit locked row, resubmitted to queue') record_timer('daemons.tracer.kronos.update_atime', (time() - start_time) * 1000) except Exception: logging.error(format_exc()) record_counter('daemons.tracer.kronos.update_error') logging.info('(kronos_file) updated %d replicas' % len(replicas))
def minos(bulk=1000, once=False, sleep_time=60): """ Creates a Minos Worker that gets a list of bad PFNs, extract the scope, name and rse_id and fill the bad_replicas table. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = 'minos' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Minos starting') time.sleep( 10 ) # To prevent running on the same partition if all the daemons restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') states_mapping = { BadPFNStatus.BAD: BadFilesStatus.BAD, BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS, BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE } logger(logging.INFO, 'Minos started') chunk_size = 10 # The chunk size used for the commits while not GRACEFUL_STOP.is_set(): start_time = time.time() heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') pfns = [] try: bad_replicas = {} temporary_unvailables = {} pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk) # Class the PFNs into bad_replicas and temporary_unavailable for pfn in pfns: path = pfn['pfn'] account = pfn['account'] reason = pfn['reason'] expires_at = pfn['expires_at'] state = states_mapping[pfn['state']] if state in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]: if (account, reason, state) not in bad_replicas: bad_replicas[(account, reason, state)] = [] bad_replicas[(account, reason, state)].append(path) elif state == BadFilesStatus.TEMPORARY_UNAVAILABLE: if (account, reason, expires_at) not in temporary_unvailables: temporary_unvailables[(account, reason, expires_at)] = [] temporary_unvailables[(account, reason, expires_at)].append(path) # Process the bad and suspicious files # The scope, name, rse_id are extracted and filled into the bad_replicas table for account, reason, state in bad_replicas: vo = account.vo pfns = bad_replicas[(account, reason, state)] logger( logging.INFO, 'Declaring %s replicas with state %s and reason %s' % (len(pfns), str(state), reason)) session = get_session() schemes = {} dict_rse = {} unknown_replicas = [] try: # Splitting the PFNs by schemes for pfn in pfns: scheme = pfn.split(':')[0] if scheme not in schemes: schemes[scheme] = [] schemes[scheme].append(pfn) for scheme in schemes: _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse( schemes[scheme], vo=vo) for rse_id in tmp_dict_rse: if rse_id not in dict_rse: dict_rse[rse_id] = [] dict_rse[rse_id].extend(tmp_dict_rse[rse_id]) unknown_replicas.extend( tmp_unknown_replicas.get('unknown', [])) # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns if unknown_replicas: logger( logging.INFO, 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas)) bulk_delete_bad_pfns(pfns=unknown_replicas, session=None) for rse_id in dict_rse: vo_str = '' if vo == 'def' else ' on VO ' + vo logger( logging.DEBUG, 'Running on RSE %s%s with %s replicas' % (get_rse_name(rse_id=rse_id), vo_str, len(dict_rse[rse_id]))) nchunk = 0 tot_chunk = int( math.ceil(len(dict_rse[rse_id]) / chunk_size)) for chunk in chunks(dict_rse[rse_id], chunk_size): nchunk += 1 logger( logging.DEBUG, 'Running on %s chunk out of %s' % (nchunk, tot_chunk)) unknown_replicas = declare_bad_file_replicas( pfns=chunk, reason=reason, issuer=account, status=state, session=session) if unknown_replicas: logger( logging.DEBUG, 'Unknown replicas : %s' % (str(unknown_replicas))) bulk_delete_bad_pfns(pfns=chunk, session=session) session.commit() # pylint: disable=no-member except Exception: session.rollback() # pylint: disable=no-member logger(logging.CRITICAL, 'Exception', exc_info=True) # Now get the temporary unavailable and update the replicas states for account, reason, expires_at in temporary_unvailables: vo = account.vo pfns = temporary_unvailables[(account, reason, expires_at)] logger( logging.INFO, 'Declaring %s replicas temporary available with timeout %s and reason %s' % (len(pfns), str(expires_at), reason)) logger(logging.DEBUG, 'Extracting RSEs') schemes = {} dict_rse = {} unknown_replicas = [] # Splitting the PFNs by schemes for pfn in pfns: scheme = pfn.split(':')[0] if scheme not in schemes: schemes[scheme] = [] schemes[scheme].append(pfn) for scheme in schemes: _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse( schemes[scheme], vo=vo) for rse_id in tmp_dict_rse: if rse_id not in dict_rse: dict_rse[rse_id] = [] dict_rse[rse_id].extend(tmp_dict_rse[rse_id]) unknown_replicas.extend( tmp_unknown_replicas.get('unknown', [])) # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns if unknown_replicas: logger( logging.INFO, 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas)) bulk_delete_bad_pfns(pfns=unknown_replicas, session=None) for rse_id in dict_rse: replicas = [] rse = get_rse_name(rse_id=rse_id, session=None) rse_vo_str = rse if vo == 'def' else '{} on {}'.format( rse, vo) logger(logging.DEBUG, 'Running on RSE %s' % rse_vo_str) for rep in get_did_from_pfns(pfns=dict_rse[rse_id], rse_id=None, vo=vo, session=None): for pfn in rep: scope = rep[pfn]['scope'] name = rep[pfn]['name'] replicas.append({ 'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn }) # The following part needs to be atomic # We update the replicas states to TEMPORARY_UNAVAILABLE # then insert a row in the bad_replicas table. TODO Update the row if it already exists # then delete the corresponding rows into the bad_pfns table logger( logging.DEBUG, 'Running on %s replicas on RSE %s' % (len(replicas), rse_vo_str)) nchunk = 0 tot_chunk = int( math.ceil(len(replicas) / float(chunk_size))) session = get_session() for chunk in chunks(replicas, chunk_size): try: nchunk += 1 logger( logging.DEBUG, 'Running on %s chunk out of %s' % (nchunk, tot_chunk)) update_replicas_states(chunk, nowait=False, session=session) bulk_add_bad_replicas( chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE, reason=reason, expires_at=expires_at, session=session) pfns = [entry['pfn'] for entry in chunk] bulk_delete_bad_pfns(pfns=pfns, session=session) session.commit() # pylint: disable=no-member except (UnsupportedOperation, ReplicaNotFound) as error: session.rollback() # pylint: disable=no-member logger( logging.ERROR, 'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s' % str(error)) for rep in chunk: logger(logging.DEBUG, 'Working on %s' % (str(rep))) try: get_metadata(rep['scope'], rep['name']) unavailable_states = [] rep_state = get_replicas_state( rep['scope'], rep['name']) unavailable_states.extend( rep_state.get( ReplicaState.TEMPORARY_UNAVAILABLE, [])) unavailable_states.extend( rep_state.get( ReplicaState.BEING_DELETED, [])) unavailable_states.extend( rep_state.get(ReplicaState.BAD, [])) # If the replica is already not available, it is removed from the bad PFNs table if rep['rse_id'] in unavailable_states: logger( logging.INFO, '%s is in unavailable state. Will be removed from the list of bad PFNs' % str(rep['pfn'])) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) # If the expiration date of the TEMPORARY_UNAVAILABLE is in the past, it is removed from the bad PFNs table elif expires_at < datetime.now(): logger( logging.INFO, 'PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns' % (str(rep['pfn']), expires_at)) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) # Else update everything in the same transaction else: try: session = get_session() update_replicas_states( [rep], nowait=False, session=session) bulk_add_bad_replicas( [rep], account, state=BadFilesStatus. TEMPORARY_UNAVAILABLE, reason=reason, expires_at=expires_at, session=session) bulk_delete_bad_pfns( pfns=[rep['pfn']], session=session) session.commit() # pylint: disable=no-member except Exception: logger( logging.ERROR, 'Cannot update state of %s' % str(rep['pfn'])) except (DataIdentifierNotFound, ReplicaNotFound): logger( logging.ERROR, 'Will remove %s from the list of bad PFNs' % str(rep['pfn'])) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) session = get_session() except Exception: session.rollback() # pylint: disable=no-member logger(logging.CRITICAL, 'Exception', exc_info=True) session = get_session() except Exception as error: logger(logging.ERROR, '%s' % (str(error))) tottime = time.time() - start_time if once: break if len(pfns) == bulk: logger( logging.INFO, 'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle' ) elif tottime < sleep_time: logger(logging.INFO, 'Will sleep for %s seconds' % (sleep_time - tottime)) time.sleep(sleep_time - tottime) heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done')
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=3600, active_mode=False): """ Main loop to check for available replicas which are labeled as suspicious. Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table and available on other RSE. Finds surls of these replicas and declares them as bad. Replicas that are the last remaining copy of a file have additional checks (checksum comparison, etc.) before being declared bad. :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped. :param younger_than: The number of days since which bad_replicas table will be searched for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'), but 'AVAILABLE' on other RSE(s). :param nattempts: The minimum number of appearances in the bad_replica DB table in order to appear in the resulting list of replicas for recovery. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE is considered problematic and the suspicious replicas on that RSE are labeled as 'TEMPORARY_UNAVAILABLE'. :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then it should sleep until sleep_time is over. :returns: None """ # assembling the worker name identifier ('executable') including the rses from <rse_expression> # in order to have the possibility to detect a start of a second instance with the same set of RSES executable = argv[0] prepend_str = 'replica_recoverer: ' logger = formatted_logger(logging.log, prepend_str + '%s') multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False) if not multi_vo: if vos: logger( logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.' ) vos = ['def'] else: if vos: invalid = set(vos) - set([v['vo'] for v in list_vos()]) if invalid: msg = 'VO{} {} cannot be found'.format( 's' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid])) raise VONotFound(msg) else: vos = [v['vo'] for v in list_vos()] logger( logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos]))) sanity_check(executable=executable, hostname=socket.gethostname()) # make an initial heartbeat - expected only one replica-recoverer thread on one node # heartbeat mechanism is used in this daemon only for information purposes # (due to expected low load, the actual DB query does not filter the result based on worker number) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') # wait a moment in case all workers started at the same time GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) total_workers = heartbeat['nr_threads'] worker_number = heartbeat['assign_thread'] + 1 # there is only 1 worker allowed for this daemon if total_workers != 1: logger( logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname()) die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) break prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers) logger = formatted_logger(logging.log, prepend_str + '%s') start = time.time() try: json_file = open( "/opt/rucio/etc/suspicious_replica_recoverer.json") except: logger( logging.WARNING, "An error occured whilst trying to open the JSON file.") break try: json_data = json.load(json_file) except ValueError: logger(logging.WARNING, "No JSON object could be decoded.") # Checking that the json file is formatedd properly. for i, entry in enumerate(json_data): if "datatype" not in entry or "action" not in entry: logger( logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i) break logger( logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts) getfileskwargs = { 'younger_than': younger_than, 'nattempts': nattempts, 'exclude_states': ['B', 'R', 'D', 'L', 'T'], 'is_suspicious': True } for vo in vos: logger(logging.INFO, 'Start replica recovery for VO: %s', vo) recoverable_replicas = {} if vo not in recoverable_replicas: recoverable_replicas[vo] = {} # rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true', filter={'vo': vo})], key=lambda k: k['rse']) rse_list = sorted([ rse for rse in parse_expression( 'enable_suspicious_file_recovery=true') if rse['vo'] == vo ], key=lambda k: k['rse']) logger( logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:" ) for i in rse_list: logger(logging.DEBUG, '%s', i) for rse in rse_list: time_start_rse = time.time() rse_expr = rse['rse'] cnt_surl_not_found = 0 if rse_expr not in recoverable_replicas[vo]: recoverable_replicas[vo][rse_expr] = {} # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs suspicious_replicas_avail_elsewhere = get_suspicious_files( rse_expr, available_elsewhere=SuspiciousAvailability[ "EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs) # Get the suspicious replicas that are the last remaining copies suspicious_replicas_last_copy = get_suspicious_files( rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"] .value, filter_={'vo': vo}, **getfileskwargs) logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr) logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere)) for i in suspicious_replicas_avail_elsewhere: logger(logging.DEBUG, '%s', i) logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy)) for i in suspicious_replicas_last_copy: logger(logging.DEBUG, '%s', i) # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE. if (rse['availability'] not in {4, 5, 6, 7}) and ( (len(suspicious_replicas_avail_elsewhere) > 0) or (len(suspicious_replicas_last_copy) > 0)): logger( logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n", rse_expr, rse['availability']) continue if suspicious_replicas_avail_elsewhere: for replica in suspicious_replicas_avail_elsewhere: if vo == replica['scope'].vo: scope = replica['scope'] rep_name = replica['name'] rse_id = replica['rse_id'] surl_not_found = True for rep in list_replicas([{ 'scope': scope, 'name': rep_name }]): for rse_ in rep['rses']: if rse_ == rse_id: recoverable_replicas[vo][rse_expr][ rep_name] = { 'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_][0], 'available_elsewhere': True } surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logger( logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr) if suspicious_replicas_last_copy: for replica in suspicious_replicas_last_copy: if vo == replica['scope'].vo: scope = replica['scope'] rep_name = replica['name'] rse_id = replica['rse_id'] surl_not_found = True # Should only return one rse, as there is only one replica remaining for rep in list_replicas([{ 'scope': scope, 'name': rep_name }]): recoverable_replicas[vo][rse_expr][ rep_name] = { 'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False } surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logger( logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr) logger( logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.', time.time() - time_start_rse, rse_expr, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy), len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) - cnt_surl_not_found, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy)) if len(suspicious_replicas_avail_elsewhere) + len( suspicious_replicas_last_copy) != 0: logger( logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr) for i in recoverable_replicas[vo][rse_expr]: logger(logging.DEBUG, '%s', i) # Log file is long and hard to read -> implement some spacing logger( logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start) logger(logging.INFO, 'Begin check for problematic RSEs.') time_start_check_probl = time.time() # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE. # The suspicious files are marked as temporarily unavailable. list_problematic_rses = [] for rse_key in list(recoverable_replicas[vo].keys()): if len(recoverable_replicas[vo] [rse_key].values()) > limit_suspicious_files_on_rse: list_problematic_rses.append(rse_key) surls_list = [] for replica_value in recoverable_replicas[vo][ rse_key].values(): surls_list.append(replica_value['surl']) if active_mode: add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3)) logger( logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse) logger( logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key) for replica_values in recoverable_replicas[vo][ rse_key].values(): logger( logging.INFO, 'Temporarily unavailable: RSE: %s Scope: %s Name: %s PFN: %s', rse_key, replica_values['scope'], replica_values['name'], replica_values['surl']) # Remove the RSE from the dictionary as it has been dealt with. del recoverable_replicas[vo][rse_key] logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses)) for rse in list_problematic_rses: logger(logging.INFO, "%s", rse) auditor = 0 checksum = 0 # Label suspicious replicas as bad if they have oher copies on other RSEs (that aren't also marked as suspicious). # If they are the last remaining copies, deal with them differently. for rse_key in list(recoverable_replicas[vo].keys()): files_to_be_declared_bad = [] files_to_be_ignored = [] # Remove RSEs from dictionary that don't have any suspicious replicas if len(recoverable_replicas[vo][rse_key]) == 0: del recoverable_replicas[vo][rse_key] continue # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there rse_id = list(recoverable_replicas[vo] [rse_key].values())[0]['rse_id'] for replica_key in list( recoverable_replicas[vo][rse_key].keys()): file_metadata = get_metadata( recoverable_replicas[vo][rse_key][replica_key] ["scope"], recoverable_replicas[vo][rse_key] [replica_key]["name"]) recoverable_replicas[vo][rse_key][replica_key][ "datatype"] = str(file_metadata["datatype"]) if recoverable_replicas[vo][rse_key][replica_key][ 'available_elsewhere'] is True: # Replicas with other copies on at least one other RSE can safely be labeled as bad files_to_be_declared_bad.append( recoverable_replicas[vo][rse_key][replica_key]) # Remove replica from dictionary del recoverable_replicas[vo][rse_key][replica_key] elif recoverable_replicas[vo][rse_key][replica_key][ 'available_elsewhere'] is False: if (recoverable_replicas[vo][rse_key][replica_key] ['name'].startswith("log.")) or ( recoverable_replicas[vo][rse_key] [replica_key]['name'].startswith("user")): # Don't keep log files or user files files_to_be_declared_bad.append( recoverable_replicas[vo][rse_key] [replica_key]) del recoverable_replicas[vo][rse_key][ replica_key] else: # Deal with replicas based on their metadata. if file_metadata[ "datatype"] is None: # "None" type has no function "split()" files_to_be_ignored.append( recoverable_replicas[vo][rse_key] [replica_key]) continue for i in json_data: if i["datatype"] == file_metadata[ "datatype"].split("_")[-1]: action = i["action"] if action == "ignore": files_to_be_ignored.append( recoverable_replicas[vo] [rse_key][replica_key]) elif action == "declare bad": suspicious_reason = get_suspicious_reason( recoverable_replicas[vo] [rse_key][replica_key] ["rse_id"], recoverable_replicas[vo] [rse_key][replica_key] ["scope"], recoverable_replicas[vo] [rse_key][replica_key]["name"], nattempts) for reason in suspicious_reason: if "auditor" in reason[ "reason"].lower(): auditor += 1 files_to_be_declared_bad.append( recoverable_replicas[ vo][rse_key] [replica_key]) break elif "checksum" in reason[ "reason"].lower(): checksum += 1 files_to_be_declared_bad.append( recoverable_replicas[ vo][rse_key] [replica_key]) break else: logger( logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)", rse_key, replica_key, recoverable_replicas[vo] [rse_key][replica_key]['surl'], i["datatype"], i["action"]) break else: # If no policy has been set, default to ignoring the file (no action taken). files_to_be_ignored.append( recoverable_replicas[vo][rse_key] [replica_key]) logger( logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key) for i in files_to_be_ignored: logger( logging.INFO, 'Ignore: RSE: %s Scope: %s Name: %s Datatype: %s PFN: %s', rse_key, i["scope"], i["name"], i["datatype"], i["surl"]) logger( logging.INFO, '(%s) Remaining replica (pfns) that will be declared BAD:', rse_key) for i in files_to_be_declared_bad: logger( logging.INFO, 'Declare bad: RSE: %s Scope: %s Name: %s Datatype: %s PFN: %s', rse_key, i["scope"], i["name"], i["datatype"], i["surl"]) if files_to_be_declared_bad: logger( logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id)) logger( logging.INFO, 'Number of replicas with checksum problems: %i', checksum) logger( logging.INFO, 'Number of replicas that were declared suspicious by the auditor: %i', auditor) if active_mode: declare_bad_file_replicas( pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None) logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key) else: logger(logging.INFO, 'No files were declared bad on %s.\n', rse_key) logger( logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl) time_passed = time.time() - start logger(logging.INFO, 'Total time: %s seconds', time_passed) daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logger(logging.WARNING, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logger(logging.WARNING, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) else: logger(logging.CRITICAL, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) except Exception as err: logger(logging.CRITICAL, traceback.format_exc()) record_counter('replica.recoverer.exceptions.' + err.__class__.__name__) if once: break die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) logger(logging.INFO, 'Graceful stop done.')
def test_add_and_delete_bad_replicas(rse_factory, mock_scope, root_account, did_client, vo): """ REPLICA (CORE): Add bad replicas and delete them""" # Adding replicas to deterministic RSE nbfiles = 5 rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True, vo=vo) files = [{ 'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': { 'events': 10 } } for _ in range(nbfiles)] client_files = [{ 'scope': file_['scope'].external, 'name': file_['name'] } for file_ in files] add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True) tmp_dsn = 'dataset_%s' % generate_uuid() did_client.add_dataset(scope=mock_scope.external, name=tmp_dsn) did_client.add_files_to_dataset(mock_scope.external, name=tmp_dsn, files=client_files, rse=rse1) # Declare replica bad replicas = [] for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): replicas.extend(replica['rses'][rse1_id]) r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account) assert r == {} # Check state of bad replicas list_bad_rep = [{ 'scope': rep['scope'].external, 'name': rep['name'] } for rep in list_bad_replicas_status( state=BadFilesStatus.BAD, rse_id=rse1_id, vo=vo)] for rep in client_files: assert rep in list_bad_rep assert [ rep for rep in list_bad_replicas_status( state=BadFilesStatus.DELETED, rse_id=rse1_id, vo=vo) ] == [] # Now delete the dataset delete_dids([{ 'scope': mock_scope, 'name': tmp_dsn, 'did_type': DIDType.DATASET, 'purge_replicas': True }], account=root_account) assert [ rep for rep in list_bad_replicas_status( state=BadFilesStatus.BAD, rse_id=rse1_id, vo=vo) ] == [] list_deleted_rep = [{ 'scope': rep['scope'].external, 'name': rep['name'] } for rep in list_bad_replicas_status( state=BadFilesStatus.DELETED, rse_id=rse1_id, vo=vo)] for rep in client_files: assert rep in list_deleted_rep
def test_add_list_bad_replicas(rse_factory, mock_scope, root_account): """ REPLICA (CORE): Add bad replicas and list them""" nbfiles = 5 # Adding replicas to deterministic RSE _, rse1_id = rse_factory.make_srm_rse(deterministic=True) files = [{ 'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': { 'events': 10 } } for _ in range(nbfiles)] add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True) # Listing replicas on deterministic RSE replicas = [] list_rep = [] for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): replicas.extend(replica['rses'][rse1_id]) list_rep.append(replica) r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account) assert r == {} bad_replicas = list_bad_replicas() nbbadrep = 0 for rep in list_rep: for badrep in bad_replicas: if badrep['rse_id'] == rse1_id: if badrep['scope'] == rep['scope'] and badrep['name'] == rep[ 'name']: nbbadrep += 1 assert len(replicas) == nbbadrep # Adding replicas to non-deterministic RSE _, rse2_id = rse_factory.make_srm_rse(deterministic=False) files = [{ 'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'pfn': 'srm://%s.cern.ch/srm/managerv2?SFN=/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()), 'meta': { 'events': 10 } } for _ in range(nbfiles)] add_replicas(rse_id=rse2_id, files=files, account=root_account, ignore_availability=True) # Listing replicas on non-deterministic RSE replicas = [] list_rep = [] for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): replicas.extend(replica['rses'][rse2_id]) list_rep.append(replica) r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account) assert r == {} bad_replicas = list_bad_replicas() nbbadrep = 0 for rep in list_rep: for badrep in bad_replicas: if badrep['rse_id'] == rse2_id: if badrep['scope'] == rep['scope'] and badrep['name'] == rep[ 'name']: nbbadrep += 1 assert len(replicas) == nbbadrep # Try adding replicas already declared bad r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account) output = ['%s Unknown replica' % rep for rep in replicas] assert list(r.keys()) == [rse2_id] list1 = r[rse2_id] list1.sort() list2 = ['%s Already declared' % clean_surls([rep])[0] for rep in replicas] list2.sort() assert list1 == list2 # Now adding non-existing bad replicas files = [ 'srm://%s.cern.ch/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()), ] r = declare_bad_file_replicas(files, 'This is a good reason', root_account) output = ['%s Unknown replica' % rep for rep in files] assert r == {rse2_id: output}
def test_get_bad_replicas_backlog(rse_factory, mock_scope, root_account, file_config_mock): """ REPLICA (CORE): Check the behaviour of the necromancer in case of backlog on an RSE""" # Run necromancer once necromancer_run(threads=1, bulk=10000, once=True) nbfiles1 = 100 nbfiles2 = 20 # Adding replicas to deterministic RSE rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True) _, rse2_id = rse_factory.make_srm_rse(deterministic=True) # Create bad replicas on rse1 files = [{ 'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': { 'events': 10 } } for _ in range(nbfiles1)] add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True) replicas = [] list_rep = [] for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): replicas.extend(replica['rses'][rse1_id]) list_rep.append({ 'scope': replica['scope'], 'name': replica['name'], 'rse': rse1, 'rse_id': rse1_id }) res = declare_bad_file_replicas(replicas, 'This is a good reason', root_account) assert res == {} result = get_bad_replicas_backlog() assert rse1_id in result assert result[rse1_id] == nbfiles1 # Create more bad replicas on rse2 files = [{ 'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': { 'events': 10 } } for _ in range(nbfiles2)] add_replicas(rse_id=rse2_id, files=files, account=root_account, ignore_availability=True) repl = [] for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): repl.extend(replica['rses'][rse2_id]) res = declare_bad_file_replicas(repl, 'This is a good reason', root_account) assert res == {} # List bad replicas on rse1 bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}]) assert len(bad_replicas) == nbfiles1 for rep in bad_replicas: assert rep in list_rep # Run necromancer once, all the files on RSE2 should be gone, 80 files should stay on RSE1 REGION.invalidate() get_bad_replicas_backlog() necromancer_run(threads=1, bulk=20, once=True) bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}, {'id': rse2_id}]) assert len(bad_replicas) == 80 for rep in bad_replicas: assert rep['rse_id'] == rse1_id
def process_output(output, sanity_check=True, compress=True): """Perform post-consistency-check actions. DARK files are put in the quarantined-replica table so that they may be deleted by the Dark Reaper. LOST files are reported as suspicious so that they may be further checked by the cloud squads. ``output`` should be an ``str`` with the absolute path to the file produced by ``consistency()``. It must maintain its naming convention. If ``sanity_check`` is ``True`` (default) and the number of entries in the output file is deemed excessive, the actions are aborted. If ``compress`` is ``True`` (default), the file is compressed with bzip2 after the actions are successfully performed. """ logger = logging.getLogger('auditor-worker') dark_replicas = [] lost_replicas = [] try: with open(output) as f: for line in f: label, path = line.rstrip().split(',', 1) scope, name = guess_replica_info(path) if label == 'DARK': dark_replicas.append({ 'path': path, 'scope': scope, 'name': name }) elif label == 'LOST': lost_replicas.append({'scope': scope, 'name': name}) else: raise ValueError('unexpected label') # Since the file is read immediately after its creation, any error # exposes a bug in the Auditor. except Exception as error: logger.critical('Error processing "%s"', output, exc_info=True) raise error rse = os.path.basename(output[:output.rfind('_')]) usage = get_rse_usage(rse, source='rucio')[0] threshold = config.config_get('auditor', 'threshold', False, 0.2) # Perform a basic sanity check by comparing the number of entries # with the total number of files on the RSE. If the percentage is # significant, there is most likely an issue with the site dump. found_error = False if len(dark_replicas) > threshold * usage['files']: logger.warning('Number of DARK files is exceeding threshold: "%s"', output) found_error = True if len(lost_replicas) > threshold * usage['files']: logger.warning('Number of LOST files is exceeding threshold: "%s"', output) found_error = True if found_error and sanity_check: raise AssertionError('sanity check failed') # While converting LOST replicas to PFNs, entries that do not # correspond to a replica registered in Rucio are silently dropped. lost_pfns = [ r['rses'][rse][0] for r in list_replicas(lost_replicas) if rse in r['rses'] ] add_quarantined_replicas(rse, dark_replicas) logger.debug('Processed %d DARK files from "%s"', len(dark_replicas), output) declare_bad_file_replicas(lost_pfns, reason='Reported by Auditor', issuer='root', status=BadFilesStatus.SUSPICIOUS) logger.debug('Processed %d LOST files from "%s"', len(lost_replicas), output) if compress: destination = bz2_compress_file(output) logger.debug('Compressed "%s"', destination)