def __update_bulk_replicas(replicas, session=None, logger=logging.log):
    """
    Used by finisher to handle available and unavailable replicas belonging to the same rule in bulk.

    :param replicas: List of replicas.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        replica_core.update_replicas_states(replicas, nowait=True, session=session)
    except ReplicaNotFound as error:
        logger(logging.WARNING, 'Failed to bulk update replicas, will do it one by one: %s', str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logger(logging.INFO, "HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
               replica['request_id'], replica['scope'], replica['name'],
               replica['rse_id'], str(replica['state']))
    return True
def __update_bulk_replicas(replicas, session=None):
    """
    Used by finisher to handle available and unavailable replicas belonging to the same rule in bulk.

    :param replicas: List of replicas.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        # TODO: add_tombstone needs to be changed to True once set_tombstone is fixed
        replica_core.update_replicas_states(replicas, nowait=True,
                                            add_tombstone=False, session=session)
    except ReplicaNotFound as error:
        logging.warning('Failed to bulk update replicas, will do it one by one: %s', str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                     replica['request_id'], replica['scope'], replica['name'],
                     replica['rse_id'], str(replica['state']))
    return True
def __update_replica(replica, session=None):
    """
    Used by finisher to update a replica to a finished state.

    :param replica: Replica as a dictionary.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        replica_core.update_replicas_states([replica], nowait=True,
                                            add_tombstone=True, session=session)
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                     replica['request_id'], replica['scope'], replica['name'],
                     replica['rse_id'], str(replica['state']))
    except (UnsupportedOperation, ReplicaNotFound) as error:
        logging.warning("ERROR WHEN HANDLING REQUEST %s DID %s:%s AT RSE %s STATE %s: %s",
                        replica['request_id'], replica['scope'], replica['name'],
                        replica['rse_id'], str(replica['state']), str(error))
        # Replica cannot be found. Register it and schedule it for deletion.
        try:
            if replica['state'] == ReplicaState.AVAILABLE and replica['request_type'] != RequestType.STAGEIN:
                logging.info("Replica cannot be found. Adding a replica %s:%s AT RSE %s with tombstone=utcnow",
                             replica['scope'], replica['name'], replica['rse_id'])
                replica_core.add_replica(replica['rse_id'],
                                         replica['scope'],
                                         replica['name'],
                                         replica['bytes'],
                                         pfn=replica['pfn'] if 'pfn' in replica else None,
                                         # It will be deleted immediately; do we need to get the accurate account from the rule?
                                         account=InternalAccount('root', vo=replica['scope'].vo),
                                         adler32=replica['adler32'],
                                         tombstone=datetime.datetime.utcnow(),
                                         session=session)
            if not replica['archived']:
                request_core.archive_request(replica['request_id'], session=session)
            logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                         replica['request_id'], replica['scope'], replica['name'],
                         replica['rse_id'], str(replica['state']))
        except Exception as error:
            logging.error('Cannot register replica for DID %s:%s at RSE %s - potential dark data - %s',
                          replica['scope'], replica['name'], replica['rse_id'], str(error))
            raise
    return True
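# --- Illustrative sketch (not part of the original source) ---
# How a finisher might combine the two helpers above: try the bulk path first and
# fall back to per-replica handling when it raises ReplicaNotFound. The grouping
# variable `replicas_by_rule` and the bare session handling are assumptions made
# for this example only.
def __handle_terminated_replicas_example(replicas_by_rule):
    for rule_id, replicas in replicas_by_rule.items():
        try:
            __update_bulk_replicas(replicas, session=None)
        except ReplicaNotFound:
            # A replica vanished mid-bulk; retry one by one so a single missing
            # replica does not block the whole rule.
            for replica in replicas:
                try:
                    __update_replica(replica, session=None)
                except (UnsupportedOperation, ReplicaNotFound):
                    continue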
def remove_extra_replicas(self, to_remove):
    """
    :param to_remove: replicas to remove from Rucio
    :return:
    """
    scope = InternalScope(self.scope)
    with monitor.record_timer_block('cms_sync.time_remove_replica'):
        if to_remove and self.dry_run:
            logging.info('Dry run: Removing replicas %s from rse %s.', str(to_remove), self.rse)
        elif to_remove:
            logging.debug('Removing %s replicas from rse %s.', len(to_remove), self.rse)
            for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                replicas = [{'scope': scope, 'name': lfn,
                             "rse_id": self.rse_id, "state": "U"}
                            for lfn in to_remove_chunk]
                # transactional_session here?
                # While the lock is set stuck, judge-repairer might make transfer
                # requests before the rule is gone, but does it matter?
                update_replicas_states(
                    replicas=replicas,
                    add_tombstone=False,
                )
                # delete_replicas(rse=self.rse, issuer=self.account,
                #                 files=[{'scope': self.scope, 'name': lfn} for lfn in to_remove_chunk])

    return len(to_remove)
def __update_bulk_replicas(replicas, req_type, rule_id, session=None):
    """
    Used by finisher to handle available and unavailable replicas belonging to the same rule in bulk.

    :param replicas: List of replicas.
    :param req_type: Request type: STAGEIN, STAGEOUT, TRANSFER.
    :param rule_id: RULE id.
    :param session: The database session to use.
    :returns commit_or_rollback: Boolean.
    """
    try:
        replica_core.update_replicas_states(replicas, nowait=True, session=session)
    except ReplicaNotFound as error:
        logging.warning('Failed to bulk update replicas, will do it one by one: %s' % str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'], session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s"
                     % (replica['request_id'], replica['scope'], replica['name'],
                        replica['rse_id'], str(replica['state'])))
    return True
def update_replicas_states(rse, files, issuer):
    """
    Update File replica information and state.

    :param rse: The RSE name.
    :param files: The list of files.
    :param issuer: The issuer account.
    """
    for v_file in files:
        v_file.update({"type": "FILE"})  # Make sure DIDs are identified as files for checking
    validate_schema(name='dids', obj=files)

    rse_id = get_rse_id(rse=rse)

    kwargs = {'rse': rse, 'rse_id': rse_id}
    if not permission.has_permission(issuer=issuer, action='update_replicas_states', kwargs=kwargs):
        raise exception.AccessDenied('Account %s can not update file replicas state on %s' % (issuer, rse))

    replicas = []
    for file in files:
        rep = file
        rep['rse_id'] = rse_id
        rep['scope'] = InternalScope(rep['scope'])
        replicas.append(rep)
    replica.update_replicas_states(replicas=replicas)
def add_sync_rule(rse, scope, block, account, session=None):
    rse_id = get_rse_id(rse=rse, session=session)
    scope = InternalScope(scope)
    account = InternalAccount(account)

    files = []
    for file in list_files(scope, block, long=False, session=session):
        try:
            update_replicas_states(
                replicas=[{
                    "scope": scope,
                    "name": file["name"],
                    "rse_id": rse_id,
                    "state": "A",
                }],
                add_tombstone=False,
                session=session,
            )
        except ReplicaNotFound:
            add_replicas(
                rse_id=rse_id,
                files=[{
                    "scope": scope,
                    "name": file["name"],
                    "bytes": file["bytes"],
                    "adler32": file["adler32"],
                }],
                account=account,
                ignore_availability=True,
                session=session,
            )

    rules = add_rule(
        dids=[{"scope": scope, "name": block}],
        copies=1,
        rse_expression=rse,
        weight=None,
        lifetime=None,
        grouping="DATASET",
        account=account,
        locked=False,
        subscription_id=None,
        source_replica_expression=None,
        activity="Data Consolidation",
        notify=None,
        purge_replicas=False,
        ignore_availability=True,
        comment=None,
        ask_approval=False,
        asynchronous=False,
        priority=3,
        split_container=False,
        meta=json.dumps({"phedex_group": "DataOps", "phedex_custodial": True}),
        session=session,
    )

    return rules[0]
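# Hypothetical invocation of add_sync_rule (all values below are illustrative and
# not taken from the original source): register every file of a block as AVAILABLE
# at the RSE, then pin the block there with a single-copy DATASET rule.
#
# rule_id = add_sync_rule(rse='T2_US_Example',
#                         scope='cms',
#                         block='/Primary/Processed-v1/AOD#1234abcd',
#                         account='sync_account')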
def update_replicas_states(rse, files, issuer, vo='def'):
    """
    Update File replica information and state.

    :param rse: The RSE name.
    :param files: The list of files.
    :param issuer: The issuer account.
    :param vo: The VO to act on.
    """
    validate_schema(name='dids', obj=files)

    rse_id = get_rse_id(rse=rse, vo=vo)

    kwargs = {'rse': rse, 'rse_id': rse_id}
    if not permission.has_permission(issuer=issuer, vo=vo, action='update_replicas_states', kwargs=kwargs):
        raise exception.AccessDenied('Account %s can not update file replicas state on %s' % (issuer, rse))

    replicas = []
    for file in files:
        rep = file
        rep['rse_id'] = rse_id
        rep['scope'] = InternalScope(rep['scope'], vo=vo)
        replicas.append(rep)
    replica.update_replicas_states(replicas=replicas)
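# Example payload shape for update_replicas_states (illustrative values only).
# Each entry must validate against the 'dids' schema; 'state' uses the one-letter
# codes seen elsewhere in this code ('A' = AVAILABLE, 'U' = UNAVAILABLE).
#
# files = [{'scope': 'cms', 'name': '/store/data/example.root', 'state': 'A'}]
# update_replicas_states(rse='T1_US_Example_Disk', files=files, issuer='root', vo='def')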
def add_missing_replicas(self, missing):
    """
    :param missing: possible missing lfns
    :return:
    """
    with monitor.record_timer_block('cms_sync.time_add_replica'):
        if missing and self.dry_run:
            logging.info('Dry run: Adding replicas %s to rse %s.', str(missing), self.rse)
        elif missing:
            logging.info('Adding %s replicas to rse %s.', len(missing), self.rse)
            replicas_to_add = [self.replicas[lfn] for lfn in missing]
            files = replica_file_list(replicas=replicas_to_add, scope=self.scope)
            for rucio_file in files:
                try:
                    update_file = copy.deepcopy(rucio_file)
                    update_file.update({'scope': InternalScope(self.scope), "rse_id": self.rse_id, "state": "A"})
                    update_replicas_states(replicas=[update_file], add_tombstone=False)
                except ReplicaNotFound:
                    resurrect_file = copy.deepcopy(rucio_file)
                    resurrect_file.update({'scope': 'cms', 'type': 'FILE'})
                    try:
                        add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account,
                                     ignore_availability=True)
                    except RucioException:
                        logging.critical('Could not add %s to %s. Constraint violated?', resurrect_file, self.rse)
                        resurrect_file.update({'scope': 'cms', 'type': 'FILE'})  # Reset to Internal scope by call
                        resurrect([resurrect_file], issuer=self.account)
                        resurrect_file.update({'scope': 'cms', 'type': 'FILE'})  # Reset to Internal scope by call
                        add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account,
                                     ignore_availability=True)
                        logging.critical('Resurrected %s at %s', resurrect_file, self.rse)

            # add_replicas(rse=self.rse, files=files, issuer=self.account)

        lfns = [item['name'] for item in list_files(scope=self.scope, name=self.block_name, long=False)]

        missing_lfns = list(set(missing) - set(lfns))
        if missing_lfns:
            logging.debug('Attaching %s lfns to %s at %s', len(missing_lfns), self.block_name, self.rse)
            dids = [{'scope': self.scope, 'name': lfn} for lfn in missing_lfns]
            try:
                attach_dids(scope=self.scope, name=self.block_name, attachment={'dids': dids}, issuer=self.account)
            except FileAlreadyExists:
                logging.warning('Trying to attach already existing files to %s', self.block_name)
            except DataIdentifierNotFound:
                logging.critical('Could not attach to %s at %s. Constraint violated?', self.block_name, self.rse)
        return len(missing_lfns)
def update_replicas_states(rse, files, issuer):
    """
    Update File replica information and state.

    :param rse: The RSE name.
    :param files: The list of files.
    :param issuer: The issuer account.
    """
    validate_schema(name='dids', obj=files)

    kwargs = {'rse': rse}
    if not permission.has_permission(issuer=issuer, action='update_replicas_states', kwargs=kwargs):
        raise exception.AccessDenied('Account %s can not update file replicas state on %s' % (issuer, rse))

    replicas = []
    for file in files:
        rep = file
        rep['rse'] = rse
        replicas.append(rep)
    replica.update_replicas_states(replicas=replicas)
def delete_sync_rule(rule_id, session=None):
    rule = get_rule(rule_id, session=session)
    if rule["did_type"] != DIDType.DATASET:
        raise RuntimeError("Rule applies to did with wrong type")
    block = rule["name"]
    try:
        rse_id = get_rse_id(rse=rule["rse_expression"], session=session)
    except RSENotFound:
        raise RuntimeError("Rule does not apply to a specific RSE")
    scope = rule["scope"]
    account = rule["account"]

    files = []
    for file in list_files(scope, block, long=False, session=session):
        files.append({"scope": scope, "name": file["name"], "rse_id": rse_id, "state": "U"})
    update_replicas_states(replicas=files, add_tombstone=False, session=session)
    delete_rule(rule_id=rule_id, purge_replicas=True, soft=False, session=session)
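# Hypothetical counterpart to add_sync_rule above (the rule id is made up): mark
# every file replica of the block 'U' (unavailable), then delete the rule with
# purge_replicas=True so the reaper can physically clean the replicas up.
#
# delete_sync_rule('b0fcde3348e64f7ab3571b2be6ad6c21')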
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs, extracts the scope, name
    and rse_id and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        pfns = []
        try:
            bad_replicas = {}
            temporary_unavailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason, expires_at)] = []
                    temporary_unavailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logging.info(prepend_str + 'Declaring %s replicas with state %s and reason %s' % (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str + 'Running on RSE %s%s with %s replicas' % (get_rse_name(rse_id=rse_id), vo_str, len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(pfns=chunk, reason=reason, issuer=account, status=state, session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str + 'Unknown replicas : %s' % (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unavailables:
                vo = account.vo
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logging.info(prepend_str + 'Declaring %s replicas temporary available with timeout %s and reason %s' % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                    unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(rse, vo)
                    logging.debug(prepend_str + 'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id], rse_id=None, vo=vo, session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id,
                                             'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn})

                    # The following part needs to be atomic:
                    # we update the replicas states to TEMPORARY_UNAVAILABLE,
                    # then insert a row in the bad_replicas table (TODO: update the row if it already exists),
                    # then delete the corresponding rows from the bad_pfns table.
                    logging.debug(prepend_str + 'Running on %s replicas on RSE %s' % (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            update_replicas_states(chunk, nowait=False, session=session)
                            bulk_add_bad_replicas(chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                                  reason=None, expires_at=expires_at, session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation, ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(prepend_str + 'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s' % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' % (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(rep['scope'], rep['name'])
                                    unavailable_states.extend(rep_state.get(ReplicaState.TEMPORARY_UNAVAILABLE, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(prepend_str + '%s is in unavailable state. Will be removed from the list of bad PFNs' % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                                    elif expires_at < datetime.now():
                                        logging.info('%s PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns', prepend_str, str(rep['pfn']), expires_at)
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                                except (DataIdentifierNotFound, ReplicaNotFound) as error:
                                    logging.error(prepend_str + 'Will remove %s from the list of bad PFNs' % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()
        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logging.info(prepend_str + 'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle')
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
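# The chunks() helper used throughout these daemons is assumed to behave like this
# minimal sketch (the real implementation lives in rucio.common.utils):
def chunks_sketch(items, size):
    """Yield successive slices of at most `size` elements from a list."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# e.g. list(chunks_sketch([1, 2, 3, 4, 5], 2)) -> [[1, 2], [3, 4], [5]]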
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that gets the list
    of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' % len(expired_replicas))
            logging.debug(prepend_str + 'List of expired replicas returned %s' % str(expired_replicas))
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({'scope': replica[0], 'name': replica[1], 'rse_id': replica[2],
                                 'state': ReplicaState.AVAILABLE})
                bad_replicas.append({'scope': replica[0], 'name': replica[1], 'rse_id': replica[2],
                                     'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
            nchunk = 0
            tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                # Process and update the replicas in chunks
                replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2],
                             'state': ReplicaState.AVAILABLE} for replica in chunk]
                # Remove the replicas from bad_replicas table in chunks
                bad_replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2],
                                 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE} for replica in chunk]
                try:
                    nchunk += 1
                    logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(prepend_str + 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s' % str(error))
                    for idx in range(len(chunk)):
                        logging.debug(prepend_str + 'Working on %s' % (str(replicas[idx])))
                        try:
                            get_metadata(replicas[idx]['scope'], replicas[idx]['name'])
                            update_replicas_states([replicas[idx], ], nowait=True, session=session)
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(prepend_str + 'DID %s:%s does not exist anymore. ' % (bad_replicas[idx]['scope'], bad_replicas[idx]['name']))
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(prepend_str + '%s:%s on RSEID %s does not exist anymore. ' % (replicas[idx]['scope'], replicas[idx]['name'], replicas[idx]['rse_id']))
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
                    session = get_session()
        except Exception as error:
            logging.critical(traceback.format_exc())

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        activity = 'default'
        if req['attributes']:
            if type(req['attributes']) is dict:
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            activity = req_attributes['activity'] if req_attributes['activity'] else 'default'

        tss = time.time()
        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'], req['name'], req['request_id'], detail))
        try:
            replica.update_replicas_states([{'rse': dest_rse,
                                             'scope': req['scope'],
                                             'name': req['name'],
                                             'state': ReplicaState.UNAVAILABLE}],
                                           session=session)
        except:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)"
                             % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise

        tss = time.time()
        try:
            lock.failed_transfer(req['scope'], req['name'], req['dest_rse_id'], session=session)
        except:
            logging.warning('Could not update lock for failed transfer %s:%s at %s (%s)'
                            % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time() - tss) * 1000)
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that gets the list
    of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 500  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'] - 1,
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({'scope': replica[0], 'name': replica[1], 'rse_id': replica[2],
                                 'state': ReplicaState.AVAILABLE})
                bad_replicas.append({'scope': replica[0], 'name': replica[1], 'rse_id': replica[2],
                                     'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
            session = get_session()
            for chunk in chunks(replicas, chunk_size):
                # Process and update the replicas in chunks
                update_replicas_states(chunk, nowait=False, session=session)
            for chunk in chunks(bad_replicas, chunk_size):
                # Remove the replicas from bad_replicas table in chunks
                bulk_delete_bad_replicas(chunk, session=session)
            session.commit()  # pylint: disable=no-member
        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    """
    logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals())
    while not graceful_stop.is_set():
        try:
            max_deleting_rate = 0
            for rse in rses:
                deleting_rate = 0
                rse_info = rsemgr.get_rse_info(rse['rse'])
                rse_protocol = rse_core.get_rse_protocols(rse['rse'])

                if not rse_protocol['availability_delete']:
                    logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse']))
                    continue

                # Temporary hack to force gfal for deletion
                for protocol in rse_info['protocols']:
                    if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                        protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse']))
                try:
                    needed_free_space, max_being_deleted_files = None, 10000
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space = 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse']))

                    s = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse=rse['rse'],
                                                          bytes=needed_free_space,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas)))

                    if not replicas:
                        logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse']))
                        continue

                    p = rsemgr.create_protocol(rse_info, 'delete', scheme=None)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files)))
                        try:
                            s = time.time()
                            update_replicas_states(replicas=[dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files])

                            for replica in files:
                                try:
                                    replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                          lfns=[{'scope': replica['scope'], 'name': replica['name']}, ],
                                                                          operation='delete').values()[0])
                                except ReplicaUnAvailable as e:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e))
                                    logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg))
                                    replica['pfn'] = None
                                add_message('deletion-planned', {'scope': replica['scope'],
                                                                 'name': replica['name'],
                                                                 'file-size': replica['bytes'],
                                                                 'url': replica['pfn'],
                                                                 'rse': rse_info['rse']})

                            # logging.debug('update_replicas_states %s' % (time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            if not scheme:
                                try:
                                    deleted_files = []
                                    p.connect()
                                    for replica in files:
                                        try:
                                            logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            s = time.time()
                                            if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                                logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            else:
                                                if replica['pfn']:
                                                    p.delete(replica['pfn'])
                                                else:
                                                    logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time() - s) * 1000)
                                            duration = time.time() - s
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-done', {'scope': replica['scope'],
                                                                          'name': replica['name'],
                                                                          'rse': rse_info['rse'],
                                                                          'file-size': replica['bytes'],
                                                                          'url': replica['pfn'],
                                                                          'duration': duration})
                                            logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                        except SourceNotFound:
                                            err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                            logging.warning(err_msg)
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': err_msg})
                                        except (ServiceUnavailable, RSEAccessDenied) as e:
                                            logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except Exception as e:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                except (ServiceUnavailable, RSEAccessDenied) as e:
                                    for replica in files:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                        add_message('deletion-failed', {'scope': replica['scope'],
                                                                        'name': replica['name'],
                                                                        'rse': rse_info['rse'],
                                                                        'file-size': replica['bytes'],
                                                                        'url': replica['pfn'],
                                                                        'reason': str(e)})
                                finally:
                                    p.close()

                            s = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse=rse['rse'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))
                            deleting_rate += len(deleted_files)
                        except:
                            logging.critical(traceback.format_exc())

                    deleting_rate = deleting_rate * 1.0 / max_being_deleted_files
                    if deleting_rate > max_deleting_rate:
                        max_deleting_rate = deleting_rate
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate))
            sleep_time = int((1 - max_deleting_rate) * 60 + 1)
            time.sleep(sleep_time)
        except:
            logging.critical(traceback.format_exc())

    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs, extracts the scope, name
    and rse_id and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])

    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')

    chunk_size = 500  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
        try:
            bad_replicas = {}
            temporary_unavailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason, expires_at)] = []
                    temporary_unavailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                pfns = bad_replicas[(account, reason, state)]
                logging.debug(prepend_str + 'Declaring %s replicas with state %s and reason %s' % (len(pfns), str(state), reason))
                session = get_session()
                try:
                    for chunk in chunks(pfns, chunk_size):
                        unknown_replicas = declare_bad_file_replicas(pfns=chunk, reason=reason, issuer=account, status=state, session=session)
                        bulk_delete_bad_pfns(pfns=chunk, session=session)
                        session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unavailables:
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logging.debug(prepend_str + 'Declaring %s replicas temporary available with timeout %s and reason %s' % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                _, dict_rse, unknown_replicas = get_pfn_to_rse(pfns)

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse in dict_rse:
                    replicas = []
                    rse_id = get_rse_id(rse=rse, session=None)
                    logging.debug(prepend_str + 'Running on RSE %s' % rse)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse], rse=rse, session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id,
                                             'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn})

                    # The following part needs to be atomic:
                    # we update the replicas states to TEMPORARY_UNAVAILABLE,
                    # then insert a row in the bad_replicas table (TODO: update the row if it already exists),
                    # then delete the corresponding rows from the bad_pfns table.
                    session = get_session()
                    try:
                        for chunk in chunks(replicas, chunk_size):
                            update_replicas_states(chunk, nowait=False, session=session)
                            bulk_add_bad_replicas(chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                                  reason=None, expires_at=expires_at, session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                    except Exception:
                        session.rollback()  # pylint: disable=no-member
                        logging.critical(traceback.format_exc())
        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that gets the list
    of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')

    logger(logging.INFO, 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        try:
            # Get list of expired TU replicas
            logger(logging.INFO, 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logger(logging.INFO, '%s expired replicas returned', len(expired_replicas))
            logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas))

            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check that the replica is not declared bad.
                    # If already declared bad, don't update the replica state, but remove it from bad_pfns.
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                    else:
                        logger(logging.ERROR, 'Exception', exc_info=True)
                    session.rollback()
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, str(traceback.format_exc()))
                    session = get_session()
        except Exception:
            logger(logging.CRITICAL, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number,
                                 rse['rse'], nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse_id=rse['id'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])
                    if not rse_protocol['availability_delete']:
                        logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse_id=rse['id'],
                                                          bytes=needed_free_space_per_child,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s', worker_number, child_number, rse['rse'], nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files))
                        try:
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True)

                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                               operation='delete',
                                                                               scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        deletion_dict = {'scope': replica['scope'].external,
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'protocol': prot.attributes['scheme']}
                                        if replica['scope'].vo != 'def':
                                            deletion_dict['vo'] = replica['scope'].vo
                                        logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000)
                                        duration = time.time() - start
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        deletion_dict['duration'] = duration
                                        add_message('deletion-done', deletion_dict)
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            deletion_dict['reason'] = str(err_msg)
                                            add_message('deletion-failed', deletion_dict)
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                    payload = {'scope': replica['scope'].external,
                                               'name': replica['name'],
                                               'rse': rse_info['rse'],
                                               'rse_id': rse_info['id'],
                                               'file-size': replica['bytes'],
                                               'bytes': replica['bytes'],
                                               'url': replica['pfn'],
                                               'reason': str(error),
                                               'protocol': prot.attributes['scheme']}
                                    if replica['scope'].vo != 'def':
                                        payload['vo'] = replica['scope'].vo
                                    add_message('deletion-failed', payload)
                                break
                            finally:
                                prot.close()

                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))
                        except DatabaseException as error:
                            logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())
                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error))
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break
            time.sleep(1)
        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
rse_update_name = rse_name  # default to the destination RSE; overridden below for staging requests
req = request.get_request(response['request_id'], session=session)
if req['request_type'] == RequestType.STAGEIN:
    rse_update_name = rse_core.list_rse_attributes(response['dst_rse'], session=session)['staging_buffer']
    logging.debug('OVERRIDE REPLICA DID %s:%s RSE %s TO %s' % (response['scope'], response['name'], response['dst_rse'], rse_update_name))

try:
    tss = time.time()
    logging.debug('UPDATE REPLICA STATE DID %s:%s RSE %s' % (response['scope'], response['name'], rse_update_name))

    # make sure we do not leave the transaction
    try:
        # try the fast path first
        replica.update_replicas_states([{'rse': rse_update_name,
                                         'scope': response['scope'],
                                         'name': response['name'],
                                         'state': ReplicaState.AVAILABLE}],
                                       nowait=False,
                                       session=session)
    except UnsupportedOperation:
        # Replica not found, but it has been transferred because the cancel came too late,
        # so we need to register the replica and schedule it for deletion again.
        record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time() - tss) * 1000)
        logging.warning('DID %s:%s AT RSE %s NOT FOUND - Registering replica and scheduling for immediate deletion' % (response['scope'], response['name'], rse_name))
        did_meta = None
        try:
            did_meta = did.get_metadata(response['scope'], response['name'], session=session)
        except:
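The fragment above hinges on an optimistic fast path: attempt the cheap state update, and only if UnsupportedOperation signals a missing replica row fall back to re-registering the file with an immediate tombstone so the reaper removes it. Below is a self-contained sketch of that control flow under assumed names; fast_update and register_with_tombstone are hypothetical stand-ins for replica.update_replicas_states and the re-registration branch.

class UnsupportedOperation(Exception):
    """Raised by the fast path when the replica row is missing."""

CATALOG = set()  # toy catalog of (scope, name) pairs

def fast_update(scope, name):
    # Stand-in for replica.update_replicas_states on the happy path.
    if (scope, name) not in CATALOG:
        raise UnsupportedOperation('replica not found')
    print('state -> AVAILABLE for %s:%s' % (scope, name))

def register_with_tombstone(scope, name):
    # Stand-in for re-registering the replica with tombstone=utcnow so the
    # reaper deletes it on its next pass.
    CATALOG.add((scope, name))
    print('registered %s:%s for immediate deletion' % (scope, name))

def mark_available(scope, name):
    try:
        fast_update(scope, name)            # common case: the replica row exists
    except UnsupportedOperation:
        # The cancel arrived after the transfer finished: the file is on
        # storage but absent from the catalog.
        register_with_tombstone(scope, name)

mark_available('mock', 'file_1')  # falls back and registers the replica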
def declare_bad_file_replicas(dids, rse_id, reason, issuer, status=BadFilesStatus.BAD, scheme=None, session=None):
    """
    Declare a list of bad replicas.

    :param dids: The list of DIDs.
    :param rse_id: The RSE id.
    :param reason: The reason of the loss.
    :param issuer: The issuer account.
    :param status: Either BAD or SUSPICIOUS.
    :param scheme: The scheme of the PFNs.
    :param session: The database session in use.
    """
    unknown_replicas = []
    replicas = []
    for did in dids:
        scope = InternalScope(did['scope'], vo=issuer.vo)
        name = did['name']
        path = None
        scope, name, path, exists, already_declared, size = __exist_replicas(rse_id, [(scope, name, path)], session=session)[0]
        if exists and ((str(status) == str(BadFilesStatus.BAD) and not already_declared) or str(status) == str(BadFilesStatus.SUSPICIOUS)):
            replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.BAD})
            new_bad_replica = models.BadReplicas(scope=scope, name=name, rse_id=rse_id, reason=reason,
                                                 state=status, account=issuer, bytes=size)
            new_bad_replica.save(session=session, flush=False)
            session.query(models.Source).filter_by(scope=scope, name=name, rse_id=rse_id).delete(synchronize_session=False)
        else:
            if already_declared:
                unknown_replicas.append('%s:%s %s' % (did['scope'], did['name'], 'Already declared'))
            else:
                unknown_replicas.append('%s:%s %s' % (did['scope'], did['name'], 'Unknown replica'))

    if str(status) == str(BadFilesStatus.BAD):
        # For BAD files we also flip the replica state; SUSPICIOUS leaves it untouched.
        try:
            # There shouldn't be any exceptions since all replicas exist.
            update_replicas_states(replicas, session=session)
        except exception.UnsupportedOperation:
            raise exception.ReplicaNotFound("One or several replicas don't exist.")

    try:
        session.flush()
    except (IntegrityError, DatabaseError, FlushError) as error:
        raise exception.RucioException(error.args)

    return unknown_replicas
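A hypothetical invocation of the function above, assuming a resolved rse_id and a root InternalAccount as the issuer; the DID dictionaries and the reason string are made up for illustration.

# Hypothetical call; rse_id, InternalAccount and BadFilesStatus come from the
# surrounding Rucio code, the DIDs and the reason are illustrative.
unknown = declare_bad_file_replicas(dids=[{'scope': 'mock', 'name': 'file_1'},
                                          {'scope': 'mock', 'name': 'file_2'}],
                                    rse_id=rse_id,
                                    reason='lost during storage migration',
                                    issuer=InternalAccount('root'),
                                    status=BadFilesStatus.BAD)
# Replicas that could not be declared come back annotated, e.g.
# 'mock:file_2 Unknown replica'; the rest are now marked BAD and their
# source entries are removed.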
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get the list of expired TU replicas.
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks.
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check that the replica is not declared bad. If it is already
                    # declared bad, don't update the replica state, but still remove
                    # it from the bad_pfns.
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from the bad_replicas table in chunks.
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error: %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check that the DID exists.
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info('%s Will sleep for %s seconds', prepend_str, sleep_time - tottime)
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
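The daemon above follows the usual worker skeleton: heartbeat registration, a loop guarded by graceful_stop, a catch-all so that one bad cycle cannot kill the thread, and sleeping only for what remains of the cycle budget. A stripped-down, standard-library-only sketch of that skeleton follows; do_one_cycle is a hypothetical stand-in for the expiration pass.

import logging
import threading
import time
import traceback

graceful_stop = threading.Event()

def do_one_cycle():
    # Hypothetical stand-in for one expiration pass.
    logging.info('processing expired replicas')

def daemon_loop(once=False, sleep_time=60):
    while not graceful_stop.is_set():
        start_time = time.time()
        try:
            do_one_cycle()
        except Exception:
            # One failing cycle must not kill the daemon: log and try again.
            logging.critical(traceback.format_exc())
        if once:
            break
        # Sleep only for what is left of the cycle budget.
        elapsed = time.time() - start_time
        if elapsed < sleep_time:
            time.sleep(sleep_time - elapsed)

# Usage: threading.Thread(target=daemon_loop).start(); call graceful_stop.set() to stop.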