def test_pop_messages(self):
    """ MESSAGE (CORE): Test retrieve and delete messages """
    truncate_messages()
    # Queue ten test messages.
    for i in range(10):
        add_message(event_type='TEST', payload={'foo': True,
                                                'monty': 'python',
                                                'number': i})
    tmp = retrieve_messages(10)
    to_delete = []
    # Renamed loop variable: the original reused 'i' for both the producer
    # loop and the retrieved-message loop, shadowing the counter.
    for message in tmp:
        assert isinstance(message['payload'], dict)
        assert message['payload']['foo'] is True
        assert message['payload']['monty'] == 'python'
        # range() supports O(1) membership tests; no need to materialize
        # it into a list first (was: in list(range(100))).
        assert message['payload']['number'] in range(100)
        to_delete.append({'id': message['id'],
                          'created_at': message['created_at'],
                          # NOTE(review): 'updated_at' is deliberately left filled
                          # from created_at as in the original — presumably
                          # retrieve_messages() does not return an 'updated_at'
                          # field; confirm against the core message API before
                          # changing this.
                          'updated_at': message['created_at'],
                          'payload': str(message['payload']),
                          'event_type': message['event_type']})
    delete_messages(to_delete)
    # Everything retrieved was deleted, so the queue must be empty.
    assert retrieve_messages() == []
def delete_dids(dids, account, session=None):
    """
    Delete data identifiers

    Builds, in one pass over ``dids``, four filter-clause lists (the DIDs
    themselves, their parent attachments, their replication rules and their
    content), emits an 'ERASE' message per DID, then deletes in dependency
    order: rules first, then detach from parents, then content rows, and
    finally the DID rows themselves.

    :param dids: The list of dids to delete.
    :param account: The account.
    :param session: The database session in use.
    """
    rule_id_clause = []
    content_clause = []
    parent_content_clause = []
    did_clause = []
    for did in dids:
        logging.info('Removing did %s:%s' % (did['scope'], did['name']))
        # One OR-term per DID for each of the four tables touched below.
        did_clause.append(and_(models.DataIdentifier.scope == did['scope'],
                               models.DataIdentifier.name == did['name']))
        parent_content_clause.append(and_(models.DataIdentifierAssociation.child_scope == did['scope'],
                                          models.DataIdentifierAssociation.child_name == did['name']))
        rule_id_clause.append(and_(models.ReplicationRule.scope == did['scope'],
                                   models.ReplicationRule.name == did['name']))
        content_clause.append(and_(models.DataIdentifierAssociation.scope == did['scope'],
                                   models.DataIdentifierAssociation.name == did['name']))

        # Send message for AMI
        add_message('ERASE', {'account': account,
                              'scope': did['scope'],
                              'name': did['name']},
                    session=session)

    # Delete rules on did
    if rule_id_clause:
        with record_timer_block('undertaker.rules'):
            for (rule_id, scope, name, rse_expression, ) in session.query(models.ReplicationRule.id,
                                                                          models.ReplicationRule.scope,
                                                                          models.ReplicationRule.name,
                                                                          models.ReplicationRule.rse_expression).filter(or_(*rule_id_clause)):
                logging.debug('Removing rule %s for did %s:%s on RSE-Expression %s' % (str(rule_id), scope, name, rse_expression))
                # nowait=True: do not block on row locks held by other workers.
                rucio.core.rule.delete_rule(rule_id=rule_id, nowait=True, session=session)

    # Detach from parent dids:
    existing_parent_dids = False
    if parent_content_clause:
        with record_timer_block('undertaker.parent_content'):
            for parent_did in session.query(models.DataIdentifierAssociation).filter(or_(*parent_content_clause)):
                existing_parent_dids = True
                detach_dids(scope=parent_did.scope,
                            name=parent_did.name,
                            dids=[{'scope': parent_did.child_scope,
                                   'name': parent_did.child_name}],
                            session=session)

    # Remove content
    if content_clause:
        with record_timer_block('undertaker.content'):
            # Bulk delete; synchronize_session=False skips in-session
            # object reconciliation for speed.
            rowcount = session.query(models.DataIdentifierAssociation).filter(or_(*content_clause)).\
                delete(synchronize_session=False)
        record_counter(counters='undertaker.content.rowcount', delta=rowcount)

    # remove data identifier
    if existing_parent_dids:
        # Exit method early to give Judge time to remove locks (Otherwise, due to foreign keys, did removal does not work
        logging.debug('Leaving delete_dids early for Judge-Evaluator checks')
        return

    with record_timer_block('undertaker.dids'):
        # Only collections are removed here; FILE-type DIDs are excluded
        # by the did_type filter.
        rowcount = session.query(models.DataIdentifier).filter(or_(*did_clause)).\
            filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER,
                       models.DataIdentifier.did_type == DIDType.DATASET)).\
            delete(synchronize_session=False)
def add_dids(dids, account, session=None):
    """
    Bulk add data identifiers.

    Only collections (datasets/containers) may be registered; FILE-type DIDs
    are rejected. For each DID this resolves the symbolic type, computes an
    optional expiry from 'lifetime', inserts the row (flush deferred), applies
    metadata, optionally attaches rules, and emits a REGISTER_* message.

    :param dids: A list of dids.
    :param account: The account owner.
    :param session: The database session in use.
    """
    # NOTE(review): Python 2 code ('unicode', 'except E, e' syntax).
    # NOTE(review): the outer 'try' below has no matching 'except'/'finally'
    # visible in this chunk — its handler was presumably lost in extraction;
    # confirm against the full file before editing.
    try:
        for did in dids:
            try:
                # Accept symbolic type names ('DATASET', 'CONTAINER', ...).
                if isinstance(did['type'], str) or isinstance(did['type'], unicode):
                    did['type'] = DIDType.from_sym(did['type'])

                if did['type'] == DIDType.FILE:
                    # NOTE(review): the message has no format placeholders,
                    # so '% locals()' is a no-op here.
                    raise exception.UnsupportedOperation("Only collection (dataset/container) can be registered." % locals())

                # Lifetime
                expired_at = None
                if did.get('lifetime'):
                    expired_at = datetime.utcnow() + timedelta(seconds=did['lifetime'])

                # Insert new data identifier
                new_did = models.DataIdentifier(scope=did['scope'], name=did['name'],
                                                account=did.get('account') or account,
                                                did_type=did['type'],
                                                monotonic=did.get('statuses', {}).get('monotonic', False),
                                                is_open=True,
                                                expired_at=expired_at)
                # Add metadata
                # ToDo: metadata validation
                # validate_meta(did.get('meta', {}))
                for key in did.get('meta', {}):
                    new_did.update({key: did['meta'][key]})

                # flush=False: defer the INSERT until the batch flush below.
                new_did.save(session=session, flush=False)

                if did.get('rules', None):
                    rucio.core.rule.add_rules(dids=[did, ], rules=did['rules'], session=session)

                event_type = None
                if did['type'] == DIDType.CONTAINER:
                    event_type = 'REGISTER_CNT'
                if did['type'] == DIDType.DATASET:
                    event_type = 'REGISTER_DTS'
                if event_type:
                    add_message(event_type, {'account': account,
                                             'scope': did['scope'],
                                             'name': did['name'],
                                             'expires': str(expired_at)},
                                session=session)

            except KeyError, e:
                # ToDo
                raise
        session.flush()
def add_monitor_message(response, session=None):
    """
    Queue a 'transfer-*' monitoring message for a finished transfer request.

    :param response: Dict describing the request outcome (new_state,
                     request_id, scope, name, transfer metadata, ...).
    :param session: The database session in use.
    """
    # Map the new request state onto a message event type.
    # FIX: previously an unexpected state left 'transfer_status' unbound and
    # the add_message() call below raised NameError; fall back to a generic
    # name derived from the state instead.
    if response['new_state'] == RequestState.DONE:
        transfer_status = 'transfer-done'
    elif response['new_state'] == RequestState.FAILED:
        transfer_status = 'transfer-failed'
    elif response['new_state'] == RequestState.LOST:
        transfer_status = 'transfer-lost'
    else:
        transfer_status = 'transfer-%s' % str(response['new_state']).lower()

    activity = response.get('activity', None)
    src_rse = response.get('src_rse', None)
    src_url = response.get('src_url', None)
    dst_rse = response.get('dst_rse', None)
    dst_url = response.get('dst_url', None)
    dst_protocol = dst_url.split(':')[0] if dst_url else None
    reason = response.get('reason', None)
    duration = response.get('duration', -1)
    filesize = response.get('filesize', None)
    md5 = response.get('md5', None)
    adler32 = response.get('adler32', None)
    scope = response.get('scope', None)
    name = response.get('name', None)
    job_m_replica = response.get('job_m_replica', None)

    # When FTS picked the source replica itself ("job_m_replica"), try to
    # resolve the real source RSE from the source URL.
    if job_m_replica and str(job_m_replica) == 'true':
        try:
            rse_name = get_source_rse(scope, name, src_url)
        except Exception:
            # Best effort only: keep the reported src_rse on failure.
            logging.warn('Cannot get correct RSE for source url: %s(%s)' % (src_url, sys.exc_info()[1]))
            rse_name = None
        if rse_name and rse_name != src_rse:
            src_rse = rse_name
            logging.info('find RSE: %s for source surl: %s' % (src_rse, src_url))

    add_message(transfer_status, {'activity': activity,
                                  'request-id': response['request_id'],
                                  'duration': duration,
                                  'checksum-adler': adler32,
                                  'checksum-md5': md5,
                                  'file-size': filesize,
                                  'guid': None,
                                  'previous-request-id': response['previous_attempt_id'],
                                  'protocol': dst_protocol,
                                  'scope': response['scope'],
                                  'name': response['name'],
                                  'src-rse': src_rse,
                                  'src-url': src_url,
                                  'dst-rse': dst_rse,
                                  'dst-url': dst_url,
                                  'reason': reason,
                                  'transfer-endpoint': response['external_host'],
                                  'transfer-id': response['transfer_id'],
                                  'transfer-link': 'https://%s:8449/fts3/ftsmon/#/job/%s' % (response['external_host'], response['transfer_id']),
                                  'tool-id': 'rucio-conveyor'},
                session=session)
def test_add_message(self):
    """ MESSAGE (CORE): Test valid and invalid message """
    truncate_messages()
    # A JSON-serialisable payload must be accepted.
    good_payload = {'name': 'name', 'name_Y': 'scope_X', 'type': 'file'}
    add_message(event_type='NEW_DID', payload=good_payload)
    # A payload holding a non-serialisable value (the 'int' type object)
    # must be rejected with InvalidObject.
    bad_payload = {'name': 'name', 'name_Y': 'scope_X', 'type': int}
    with assert_raises(InvalidObject):
        add_message(event_type='NEW_DID', payload=bad_payload)
def set_transfers_state(transfers, submitted_at, session=None):
    """
    Update the transfer info of a request.

    For each request id, atomically moves the row from SUBMITTING to the
    submitted state (recording external id/host, source RSE and timestamp),
    then queues a '<request-type>-<state>' monitoring message.

    :param transfers: Dictionary containing request transfer info.
    :param submitted_at: Submission timestamp recorded on each request.
    :param session: Database session to use.
    :raises RucioException: if a request no longer exists or is not in
                            SUBMITTING state, or on integrity errors.
    """
    try:
        for request_id in transfers:
            # Guard on state == SUBMITTING so a concurrent state change
            # makes the UPDATE a no-op (rowcount 0) instead of clobbering.
            rowcount = session.query(models.Request)\
                .filter_by(id=request_id)\
                .filter(models.Request.state == RequestState.SUBMITTING)\
                .update({'state': transfers[request_id]['state'],
                         'external_id': transfers[request_id]['external_id'],
                         'external_host': transfers[request_id]['external_host'],
                         'source_rse_id': transfers[request_id]['src_rse_id'],
                         'submitted_at': submitted_at},
                        synchronize_session=False)
            if rowcount == 0:
                # FIX: corrected typo in the error message ("tansfer").
                raise RucioException("Failed to set requests %s transfer %s: request doesn't exist or is not in SUBMITTING state" % (request_id, transfers[request_id]))

            request_type = transfers[request_id].get('request_type', None)
            msg = {'request-id': request_id,
                   'request-type': str(request_type).lower() if request_type else request_type,
                   'scope': transfers[request_id]['scope'],
                   'name': transfers[request_id]['name'],
                   'src-rse-id': transfers[request_id]['metadata'].get('src_rse_id', None),
                   'src-rse': transfers[request_id]['metadata'].get('src_rse', None),
                   'dst-rse-id': transfers[request_id]['metadata'].get('dst_rse_id', None),
                   'dst-rse': transfers[request_id]['metadata'].get('dst_rse', None),
                   'state': str(transfers[request_id]['state']),
                   'activity': transfers[request_id]['metadata'].get('activity', None),
                   'file-size': transfers[request_id]['metadata'].get('filesize', None),
                   'bytes': transfers[request_id]['metadata'].get('filesize', None),
                   'checksum-md5': transfers[request_id]['metadata'].get('md5', None),
                   'checksum-adler': transfers[request_id]['metadata'].get('adler32', None),
                   'external-id': transfers[request_id]['external_id'],
                   'external-host': transfers[request_id]['external_host'],
                   'queued_at': str(submitted_at)}

            # Event type: '<request-type>-<state>', or 'transfer-<state>'
            # when the request type is unknown.
            if msg['request-type']:
                transfer_status = '%s-%s' % (msg['request-type'], msg['state'])
            else:
                transfer_status = 'transfer-%s' % msg['state']
            transfer_status = transfer_status.lower()
            message_core.add_message(transfer_status, msg, session=session)

    except IntegrityError as error:
        raise RucioException(error.args)
def test_pop_messages(self):
    """ MESSAGE (CORE): Test retrieve and delete messages """
    truncate_messages()
    # Queue ten test messages.
    for number in xrange(10):
        payload = {'foo': True, 'monty': 'python', 'number': number}
        add_message(event_type='TEST', payload=payload)
    fetched = retrieve_messages(10)
    # Validate each retrieved message and collect its id for deletion.
    to_delete = []
    for message in fetched:
        payload = message['payload']
        assert_is_instance(payload, dict)
        assert_equal(payload['foo'], True)
        assert_equal(payload['monty'], 'python')
        assert_in(payload['number'], xrange(100))
        to_delete.append(message['id'])
    delete_messages(to_delete)
    # The queue must now be empty.
    assert_equal(retrieve_messages(), [])
def test_pop_messages(self):
    """ MESSAGE (CORE): Test retrieve and delete messages """
    truncate_messages()
    for num in xrange(10):
        add_message(event_type='TEST',
                    payload={'foo': True, 'monty': 'python', 'number': num})
    messages = retrieve_messages(10)
    # Every retrieved payload must round-trip as the dict we queued.
    for msg in messages:
        assert_is_instance(msg['payload'], dict)
        assert_equal(msg['payload']['foo'], True)
        assert_equal(msg['payload']['monty'], 'python')
        assert_in(msg['payload']['number'], xrange(100))
    # Delete everything we retrieved; the queue must end up empty.
    delete_messages([msg['id'] for msg in messages])
    assert_equal(retrieve_messages(), [])
def test_hermes(self):
    ''' HERMES (DAEMON): Test the messaging daemon. '''
    # Queue one generic message and one e-mail message per iteration,
    # then run hermes once with outgoing e-mail disabled.
    for counter in range(1, 4):
        add_message('test-type_%i' % counter, {'test': counter})
        email_payload = {
            'to': config_get('messaging-hermes', 'email_test').split(','),
            'subject': 'Half-Life %i' % counter,
            'body': '''
                          Good morning, and welcome to the Black Mesa Transit System.

                          This automated train is provided for the security and convenience of
                          the Black Mesa Research Facility personnel. The time is eight-forty
                          seven A.M... Current outside temperature is ninety three degrees with
                          an estimated high of one hundred and five. Before exiting the train,
                          be sure to check your area for personal belongings.

                          Thank you, and have a very safe, and productive day.'''}
        add_message('email', email_payload)
    hermes.run(once=True, send_email=False)
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds: Passed through to list_unlocked_replicas.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    # Per-RSE back-off: RSE id -> earliest datetime of the next attempt.
    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            for rse in sort_rses(rses):
                try:
                    # Refresh the heartbeat at most once a minute while
                    # iterating over a long RSE list.
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    # Skip RSEs still in back-off.
                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number, rse['rse'], nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse_id=rse['id'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse_id=rse['id'],
                                                          bytes=needed_free_space_per_child,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas))

                    if not replicas:
                        # Nothing eligible: back off this RSE for 30 minutes.
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s', worker_number, child_number, rse['rse'], nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files))
                        try:
                            # Mark the chunk BEING_DELETED before touching storage.
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True)

                            # Resolve PFNs; a replica without a resolvable PFN is
                            # kept with pfn=None and skipped at deletion time.
                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                               operation='delete',
                                                                               scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        deletion_dict = {'scope': replica['scope'].external,
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'protocol': prot.attributes['scheme']}
                                        if replica['scope'].vo != 'def':
                                            deletion_dict['vo'] = replica['scope'].vo
                                        logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            # Staging areas: catalog-only deletion, no physical removal.
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000)
                                        duration = time.time() - start
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        deletion_dict['duration'] = duration
                                        add_message('deletion-done', deletion_dict)
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        # Already gone on storage: still remove it from the catalog.
                                        err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            deletion_dict['reason'] = str(err_msg)
                                            add_message('deletion-failed', deletion_dict)
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except:
                                        # BaseException (e.g. KeyboardInterrupt): log and continue.
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                # Whole-chunk failure (e.g. connect() failed): emit one
                                # failure message per replica, then stop this RSE.
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                    payload = {'scope': replica['scope'].external,
                                               'name': replica['name'],
                                               'rse': rse_info['rse'],
                                               'rse_id': rse_info['id'],
                                               'file-size': replica['bytes'],
                                               'bytes': replica['bytes'],
                                               'url': replica['pfn'],
                                               'reason': str(error),
                                               'protocol': prot.attributes['scheme']}
                                    if replica['scope'].vo != 'def':
                                        # FIX: was "deletion_dict['vo'] = ..." which tagged a
                                        # stale dict from the inner loop instead of the payload
                                        # actually being sent for this replica.
                                        payload['vo'] = replica['scope'].vo
                                    add_message('deletion-failed', payload)
                                break
                            finally:
                                prot.close()

                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error))
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break
            time.sleep(1)
        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
def add_monitor_message(request, response, session=None):
    """
    Queue a transfer monitoring message built from a request and its response.

    The event type is '<request-type>-<new-state>' (lower-cased), falling back
    to 'transfer-<new-state>' when the request type is unknown.

    :param request: Dict with the originating request (request_type, ...).
    :param response: Dict with the transfer outcome and metadata.
    :param session: The database session in use.
    """
    new_state = response['new_state']
    if request['request_type']:
        transfer_status = ('%s-%s' % (request['request_type'], new_state)).lower()
    else:
        transfer_status = ('transfer-%s' % new_state).lower()

    dst_url = response.get('dst_url', None)
    created_at = response.get('created_at', None)
    submitted_at = response.get('submitted_at', None)
    started_at = response.get('started_at', None)
    transferred_at = response.get('transferred_at', None)

    # for LOST request, response['external_host'] maybe is None
    if response['external_host']:
        transfer_link = '%s/fts3/ftsmon/#/job/%s' % (response['external_host'].replace('8446', '8449'),
                                                     response['transfer_id'])
    else:
        transfer_link = None

    payload = {'activity': response.get('activity', None),
               'request-id': response['request_id'],
               'duration': response.get('duration', -1),
               'checksum-adler': response.get('adler32', None),
               'checksum-md5': response.get('md5', None),
               'file-size': response.get('filesize', None),
               'bytes': response.get('filesize', None),
               'guid': None,
               'previous-request-id': response['previous_attempt_id'],
               'protocol': dst_url.split(':')[0] if dst_url else None,
               'scope': response['scope'],
               'name': response['name'],
               'src-type': response.get('src_type', None),
               'src-rse': response.get('src_rse', None),
               'src-url': response.get('src_url', None),
               'dst-type': response.get('dst_type', None),
               'dst-rse': response.get('dst_rse', None),
               'dst-url': dst_url,
               'reason': response.get('reason', None),
               'transfer-endpoint': response['external_host'],
               'transfer-id': response['transfer_id'],
               'transfer-link': transfer_link,
               # Timestamps are stringified; absent ones stay None.
               'created_at': str(created_at) if created_at else None,
               'submitted_at': str(submitted_at) if submitted_at else None,
               'started_at': str(started_at) if started_at else None,
               'transferred_at': str(transferred_at) if transferred_at else None,
               'tool-id': 'rucio-conveyor',
               'account': response.get('account', None)}

    add_message(transfer_status, payload, session=session)
def delete_from_storage(replicas, prot, rse_info, staging_areas, prepend_str):
    """
    Physically delete a list of replicas through an already-created protocol.

    Emits 'deletion-done' / 'deletion-failed' messages per replica and returns
    the list of {'scope', 'name'} dicts that should be removed from the
    catalog (successful deletions, NOTFOUND replicas, and staging-area
    replicas which are never physically deleted).

    :param replicas: Replicas (dicts with scope/name/pfn/bytes/state) to delete.
    :param prot: Protocol object used for deletion (connected here).
    :param rse_info: RSE settings dict ('rse', 'id', 'sign_url', ...).
    :param staging_areas: Collection of RSE ids treated as staging areas.
    :param prepend_str: Prefix for all log lines.
    :returns: List of catalog entries to delete.
    """
    deleted_files = []
    rse_name = rse_info['rse']
    rse_id = rse_info['id']
    try:
        prot.connect()
        for replica in replicas:
            # Physical deletion
            try:
                deletion_dict = {'scope': replica['scope'].external,
                                 'name': replica['name'],
                                 'rse': rse_name,
                                 'file-size': replica['bytes'],
                                 'bytes': replica['bytes'],
                                 'url': replica['pfn'],
                                 'protocol': prot.attributes['scheme']}
                logging.info('%s Deletion ATTEMPT of %s:%s as %s on %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name)
                start = time.time()
                # For STAGING RSEs, no physical deletion
                if rse_id in staging_areas:
                    logging.warning('%s Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name)
                    deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                    continue

                if replica['pfn']:
                    pfn = replica['pfn']
                    # sign the URL if necessary
                    if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                        pfn = get_signed_url(rse_info['sign_url'], 'delete', pfn)
                    prot.delete(pfn)
                else:
                    # No resolvable PFN: nothing to delete physically; the
                    # replica is NOT added to deleted_files in this case.
                    logging.warning('%s Deletion UNAVAILABLE of %s:%s as %s on %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name)
                monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse_name), (time.time() - start) * 1000)
                duration = time.time() - start
                deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                deletion_dict['duration'] = duration
                add_message('deletion-done', deletion_dict)
                logging.info('%s Deletion SUCCESS of %s:%s as %s on %s in %s seconds', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, duration)
            except SourceNotFound:
                # Already gone on storage: still drop it from the catalog;
                # only report a failure if the catalog thought it was AVAILABLE.
                err_msg = '%s Deletion NOTFOUND of %s:%s as %s on %s' % (prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name)
                logging.warning(err_msg)
                deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                if replica['state'] == ReplicaState.AVAILABLE:
                    deletion_dict['reason'] = str(err_msg)
                    add_message('deletion-failed', deletion_dict)
            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                logging.warning('%s Deletion NOACCESS of %s:%s as %s on %s: %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, str(error))
                deletion_dict['reason'] = str(error)
                add_message('deletion-failed', deletion_dict)
            except Exception as error:
                logging.critical('%s Deletion CRITICAL of %s:%s as %s on %s: %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, str(traceback.format_exc()))
                deletion_dict['reason'] = str(error)
                add_message('deletion-failed', deletion_dict)
    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
        # Whole-batch failure (e.g. connect() raised): report every replica
        # as failed. Nothing is appended to deleted_files here.
        for replica in replicas:
            logging.warning('%s Deletion NOACCESS of %s:%s as %s on %s: %s', prepend_str, replica['scope'], replica['name'], replica['pfn'], rse_name, str(error))
            add_message('deletion-failed', {'scope': replica['scope'].external,
                                            'name': replica['name'],
                                            'rse': rse_name,
                                            'file-size': replica['bytes'],
                                            'bytes': replica['bytes'],
                                            'url': replica['pfn'],
                                            'reason': str(error),
                                            'protocol': prot.attributes['scheme']})
    finally:
        prot.close()
    return deleted_files
def reaper(rses=None, worker_number=0, total_workers=1, chunk_size=100, once=False, scheme=None, sleep_time=60):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param total_workers: The total number of workers.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    # FIX: the previous signature used a mutable default (rses=[]) which
    # random.shuffle() below mutated in place — shuffling the shared default
    # and any caller-supplied list. Work on a private copy instead.
    rses = list(rses) if rses else []

    logging.info('Starting Light Reaper %s-%s: Will work on RSEs: %s', worker_number, total_workers, ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # FIX: sha256 requires bytes on Python 3; .encode() was missing.
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join([rse['rse'] for rse in rses])).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            prepend_str = 'light-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            # Randomize the RSE order so workers do not all hit the same RSE.
            random.shuffle(rses)
            for rse in rses:
                rse_id = rse['id']
                rse = rse['rse']
                replicas = list_expired_temporary_dids(rse_id=rse_id,
                                                       limit=chunk_size,
                                                       worker_number=worker_number,
                                                       total_workers=total_workers)
                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse_protocol = rse_core.get_rse_protocols(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        try:
                            # pfn = str(rsemgr.lfns2pfns(rse_settings=rse_info,
                            #                            lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                            #                            operation='delete', scheme=scheme).values()[0])
                            pfn = 's3://%s%s%s' % (prot.attributes['hostname'], prot.attributes['prefix'], replica['name'])
                            # logging.debug('Light Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, total_workers, replica['scope'], replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(logging.INFO, 'Deletion SUCCESS of %s:%s as %s on %s in %s seconds', replica['scope'], replica['name'], pfn, rse, duration)
                            payload = {'scope': replica['scope'].external,
                                       'name': replica['name'],
                                       'rse': rse,
                                       'rse_id': rse_id,
                                       'file-size': replica.get('bytes') or 0,
                                       'bytes': replica.get('bytes') or 0,
                                       'url': pfn,
                                       'duration': duration,
                                       'protocol': prot.attributes['scheme']}
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            # Already gone on storage: drop it from the catalog anyway.
                            err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], pfn, rse)
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                            err_msg = 'Deletion NOACCESS of %s:%s as %s on %s: %s' % (replica['scope'], replica['name'], pfn, rse, str(error))
                            logger(logging.WARNING, err_msg)
                            payload = {'scope': replica['scope'].external,
                                       'name': replica['name'],
                                       'rse': rse,
                                       'rse_id': rse_id,
                                       'file-size': replica['bytes'] or 0,
                                       'bytes': replica['bytes'] or 0,
                                       'url': pfn,
                                       'reason': str(error),
                                       'protocol': prot.attributes['scheme']}
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)
                        except:
                            logger(logging.CRITICAL, traceback.format_exc())
                finally:
                    prot.close()

                delete_temporary_dids(dids=deleted_replicas)
                if once:
                    break

            if once:
                break

            if nothing_to_do:
                # FIX: the message hard-coded "60s" regardless of sleep_time.
                logger(logging.INFO, 'Nothing to do. I will sleep for %s seconds', sleep_time)
                daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
def add_exception(dids, account, pattern, comments, expires_at, session=None):
    """
    Add exceptions to the Lifetime Model and notify the approvers by email.

    Builds one LifetimeExceptions row per DID (all sharing a single generated
    exception id), accumulates a human-readable summary of the request, and
    queues an 'email' message to the configured approvers with approve/deny
    links.

    :param dids: The list of dids (each a dict with 'scope', 'name' and
                 optionally 'did_type').
    :param account: The account of the requester.
    :param pattern: The pattern stored on each exception row.
    :param comments: The comments associated to the exception. May embed an
                     estimated volume after a '||||' separator.
    :param expires_at: The expiration date of the exception (string or datetime).
    :param session: The database session in use.

    :raises LifetimeExceptionDuplicate: if an identical exception already exists.
    :raises RucioException: on any other integrity error.

    returns: The id of the exception.
    """
    # One id is shared by every row created in this call, so the whole request
    # can be approved/denied as a unit.
    exception_id = generate_uuid()
    text = 'Account %s requested a lifetime extension for a list of DIDs that can be found below\n' % account
    reason = comments
    volume = None
    lifetime = None
    # Convention: comments may carry "<reason>||||<volume>".
    if comments.find('||||') > -1:
        reason, volume = comments.split('||||')
    text += 'The reason for the extension is "%s"\n' % reason
    text += 'It represents %s datasets\n' % len(dids)
    if volume:
        text += 'The estimated physical volume is %s\n' % volume
    # expires_at may arrive as a string (parsed here) or a datetime.
    if expires_at and isinstance(expires_at, string_types):
        lifetime = str_to_date(expires_at)
        text += 'The lifetime exception should expires on %s\n' % str(expires_at)
    elif isinstance(expires_at, datetime):
        lifetime = expires_at
        text += 'The lifetime exception should expires on %s\n' % str(expires_at)
    text += 'Link to approve or reject this request can be found at the end of the mail\n'
    text += '\n'
    text += 'DIDTYPE SCOPE NAME\n'
    text += '\n'
    truncated_message = False
    for did in dids:
        did_type = None
        if 'did_type' in did:
            # Accept either a symbolic string or an already-constructed DIDType.
            if isinstance(did['did_type'], string_types):
                did_type = DIDType.from_sym(did['did_type'])
            else:
                did_type = did['did_type']
        new_exception = models.LifetimeExceptions(
            id=exception_id, scope=did['scope'], name=did['name'],
            did_type=did_type, account=account, pattern=pattern,
            comments=reason, state=LifetimeExceptionsState.WAITING,
            expires_at=lifetime)
        # Cap the email body at roughly 3000 characters; remaining DIDs are
        # still inserted but omitted from the mail.
        if len(text) < 3000:
            text += '%s %s %s\n' % (str(did_type), did['scope'], did['name'])
        else:
            truncated_message = True
        try:
            # flush=False: defer the actual flush so the caller's session
            # controls when constraint violations surface where possible.
            new_exception.save(session=session, flush=False)
        except IntegrityError as error:
            # Map backend-specific duplicate-key errors (Oracle, SQLite,
            # MySQL) onto the Rucio-level duplicate exception.
            if match('.*ORA-00001.*', str(error.args[0])) \
               or match('.*IntegrityError.*UNIQUE constraint failed.*', str(error.args[0])) \
               or match('.*1062.*Duplicate entry.*for key.*', str(error.args[0])) \
               or match('.*IntegrityError.*columns? .*not unique.*', error.args[0]):
                raise LifetimeExceptionDuplicate()
            raise RucioException(error.args[0])
    if truncated_message:
        text += '...\n'
        text += 'List too long. Truncated\n'
    text += '\n'
    text += 'Approve: https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=approve\n' % str(exception_id)
    text += 'Deny: https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=deny\n' % str(exception_id)
    # Comma-separated approver list from config; empty list if unset.
    approvers_email = get('lifetime_model', 'approvers_email', default=[], session=session)
    if approvers_email:
        approvers_email = approvers_email.split(',')  # pylint: disable=no-member
    add_message(event_type='email',
                payload={'body': text,
                         'to': approvers_email,
                         'subject': '[RUCIO] Request to approve lifetime exception %s' % str(exception_id)},
                session=session)
    return exception_id
def delete_from_storage(replicas, prot, rse_info, staging_areas, auto_exclude_threshold, logger=logging.log):
    """
    Physically delete a list of replicas from an RSE and emit a message per outcome.

    :param replicas: List of replica dicts ('scope', 'name', 'bytes', 'pfn').
    :param prot: The protocol object used for deletion (connect/delete/bulk_delete/close).
    :param rse_info: RSE settings dict ('rse', 'id', 'sign_url', ...).
    :param staging_areas: Collection of RSE ids for which only the catalog entry
                          is removed, never the physical file.
    :param auto_exclude_threshold: Number of NOACCESS errors after which the RSE
                                   is temporarily excluded from deletion.
    :param logger: Optional logger callable (defaults to logging.log).

    :returns: List of {'scope', 'name'} dicts for replicas considered deleted
              (successes, not-found, and staging-area skips).
    """
    deleted_files = []
    rse_name = rse_info['rse']
    rse_id = rse_info['id']
    noaccess_attempts = 0
    pfns_to_bulk_delete = []
    try:
        prot.connect()
        for replica in replicas:
            # Physical deletion
            try:
                # Base payload reused for all outcome messages of this replica.
                deletion_dict = {'scope': replica['scope'].external,
                                 'name': replica['name'],
                                 'rse': rse_name,
                                 'file-size': replica['bytes'],
                                 'bytes': replica['bytes'],
                                 'url': replica['pfn'],
                                 'protocol': prot.attributes['scheme']}
                if replica['scope'].vo != 'def':
                    deletion_dict['vo'] = replica['scope'].vo
                logger(logging.DEBUG, 'Deletion ATTEMPT of %s:%s as %s on %s',
                       replica['scope'], replica['name'], replica['pfn'], rse_name)
                # NOTE(review): if anything above this line raises, `start` is
                # unbound in the except handlers below — TODO confirm intended.
                start = time.time()
                # For STAGING RSEs, no physical deletion
                if rse_id in staging_areas:
                    logger(logging.WARNING,
                           'Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                           replica['scope'], replica['name'], replica['pfn'], rse_name)
                    deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                    continue
                if replica['pfn']:
                    pfn = replica['pfn']
                    # sign the URL if necessary
                    if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                        pfn = get_signed_url(rse_id, rse_info['sign_url'], 'delete', pfn)
                    # Globus deletions are batched and executed after the loop.
                    if prot.attributes['scheme'] == 'globus':
                        pfns_to_bulk_delete.append(replica['pfn'])
                    else:
                        prot.delete(pfn)
                else:
                    # Replica without a PFN: still counted as deleted below.
                    logger(logging.WARNING, 'Deletion UNAVAILABLE of %s:%s as %s on %s',
                           replica['scope'], replica['name'], replica['pfn'], rse_name)
                monitor.record_timer('daemons.reaper.delete.{scheme}.{rse}',
                                     (time.time() - start) * 1000,
                                     labels={'scheme': prot.attributes['scheme'], 'rse': rse_name})
                duration = time.time() - start
                deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                deletion_dict['duration'] = duration
                add_message('deletion-done', deletion_dict)
                logger(logging.INFO, 'Deletion SUCCESS of %s:%s as %s on %s in %.2f seconds',
                       replica['scope'], replica['name'], replica['pfn'], rse_name, duration)
            except SourceNotFound:
                # Already gone from storage: treat as deleted so the catalog is cleaned.
                duration = time.time() - start
                err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s in %.2f seconds' % (
                    replica['scope'], replica['name'], replica['pfn'], rse_name, duration)
                logger(logging.WARNING, '%s', err_msg)
                deletion_dict['reason'] = 'File Not Found'
                deletion_dict['duration'] = duration
                add_message('deletion-not-found', deletion_dict)
                deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                duration = time.time() - start
                logger(logging.WARNING, 'Deletion NOACCESS of %s:%s as %s on %s: %s in %.2f',
                       replica['scope'], replica['name'], replica['pfn'], rse_name, str(error), duration)
                deletion_dict['reason'] = str(error)
                deletion_dict['duration'] = duration
                add_message('deletion-failed', deletion_dict)
                noaccess_attempts += 1
                # Too many access failures: temporarily exclude the RSE and stop.
                if noaccess_attempts >= auto_exclude_threshold:
                    logger(logging.INFO,
                           'Too many (%d) NOACCESS attempts for %s. RSE will be temporarly excluded.',
                           noaccess_attempts, rse_name)
                    REGION.set('temporary_exclude_%s' % rse_id, True)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    break
            except Exception as error:
                # Unexpected failure: report it but keep processing other replicas.
                duration = time.time() - start
                logger(logging.CRITICAL,
                       'Deletion CRITICAL of %s:%s as %s on %s in %.2f seconds : %s',
                       replica['scope'], replica['name'], replica['pfn'], rse_name,
                       duration, str(traceback.format_exc()))
                deletion_dict['reason'] = str(error)
                deletion_dict['duration'] = duration
                add_message('deletion-failed', deletion_dict)
        # Flush the batched globus deletions in one call.
        if pfns_to_bulk_delete and prot.attributes['scheme'] == 'globus':
            logger(logging.DEBUG, 'Attempting bulk delete on RSE %s for scheme %s',
                   rse_name, prot.attributes['scheme'])
            prot.bulk_delete(pfns_to_bulk_delete)
    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
        # Connection-level failure: mark every replica of the batch as failed
        # and temporarily exclude the RSE.
        for replica in replicas:
            logger(logging.WARNING, 'Deletion NOACCESS of %s:%s as %s on %s: %s',
                   replica['scope'], replica['name'], replica['pfn'], rse_name, str(error))
            payload = {'scope': replica['scope'].external,
                       'name': replica['name'],
                       'rse': rse_name,
                       'file-size': replica['bytes'],
                       'bytes': replica['bytes'],
                       'url': replica['pfn'],
                       'reason': str(error),
                       'protocol': prot.attributes['scheme']}
            if replica['scope'].vo != 'def':
                payload['vo'] = replica['scope'].vo
            add_message('deletion-failed', payload)
        logger(logging.INFO, 'Cannot connect to %s. RSE will be temporarly excluded.', rse_name)
        REGION.set('temporary_exclude_%s' % rse_id, True)
        labels = {'rse': rse_name}
        EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
    finally:
        prot.close()
    return deleted_files
def reaper(rses, chunk_size=100, once=False, scheme=None, sleep_time=300):
    """
    Main loop to select and delete files.

    Dark-reaper daemon body: iterates over the RSEs that have quarantined
    replicas, physically deletes the dark replicas found there, and removes
    the corresponding rows from the quarantine table.

    :param rses: List of RSEs the reaper should work against.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Hash over executable + RSE list partitions the heartbeat namespace per configuration.
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    # heartbeat
    heartbeat = live(executable=executable, hostname=hostname, pid=pid,
                     thread=thread, hash_executable=hash_executable)
    prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Starting Dark Reaper on RSEs: %s', ', '.join(rses))

    if not once:
        logger(logging.INFO, 'Waiting for heartbeat synchonization')
        GRACEFUL_STOP.wait(10)  # To prevent running on the same partition if all the reapers restart at the same time

    while not GRACEFUL_STOP.is_set():
        try:
            # Refresh heartbeat each iteration; thread assignment may change.
            heartbeat = live(executable=executable, hostname=hostname, pid=pid,
                             thread=thread, hash_executable=hash_executable)
            prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            # Only the configured RSEs that actually have quarantined replicas;
            # shuffled so concurrent workers spread over different RSEs.
            rses_to_process = list(set(rses) & set(list_rses_with_quarantined_replicas()))
            random.shuffle(rses_to_process)
            for rse_id in rses_to_process:
                # The following query returns the list of real replicas (deleted_replicas) and list of dark replicas (dark_replicas)
                # Real replicas can be directly removed from the quarantine table
                deleted_replicas, dark_replicas = list_quarantined_replicas(
                    rse_id=rse_id, limit=chunk_size,
                    worker_number=heartbeat['assign_thread'],
                    total_workers=heartbeat['nr_threads'])
                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse = rse_info['rse']
                prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                heartbeat = live(executable=executable, hostname=hostname, pid=pid,
                                 thread=thread, hash_executable=hash_executable)
                prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
                logger = formatted_logger(logging.log, prepend_str + '%s')
                try:
                    prot.connect()
                    for replica in dark_replicas:
                        nothing_to_do = False
                        scope = ''
                        if replica['scope']:
                            scope = replica['scope'].external
                        try:
                            pfn = str(list(rsemgr.lfns2pfns(
                                rse_settings=rse_info,
                                lfns=[{'scope': scope,
                                       'name': replica['name'],
                                       'path': replica['path']}],
                                operation='delete', scheme=scheme).values())[0])
                            logger(logging.INFO, 'Deletion ATTEMPT of %s:%s as %s on %s',
                                   scope, replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(logging.INFO,
                                   'Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                   scope, replica['name'], pfn, rse, duration)
                            payload = {'scope': scope,
                                       'name': replica['name'],
                                       'rse': rse,
                                       'rse_id': rse_id,
                                       'file-size': replica.get('bytes') or 0,
                                       'bytes': replica.get('bytes') or 0,
                                       'url': pfn,
                                       'duration': duration,
                                       'protocol': prot.attributes['scheme']}
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            # Already absent: still remove it from the quarantine table.
                            err_msg = ('Deletion NOTFOUND of %s:%s as %s on %s'
                                       % (scope, replica['name'], pfn, rse))
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            # Not appended to deleted_replicas: will be retried later.
                            err_msg = ('Deletion NOACCESS of %s:%s as %s on %s: %s'
                                       % (scope, replica['name'], pfn, rse, str(error)))
                            logger(logging.WARNING, err_msg)
                            payload = {'scope': scope,
                                       'name': replica['name'],
                                       'rse': rse,
                                       'rse_id': rse_id,
                                       'file-size': replica['bytes'] or 0,
                                       'bytes': replica['bytes'] or 0,
                                       'url': pfn,
                                       'reason': str(error),
                                       'protocol': prot.attributes['scheme']}
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)
                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                # Clear everything physically removed (plus the real replicas
                # returned by the query) from the quarantine table.
                delete_quarantined_replicas(rse_id=rse_id, replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logger(logging.INFO, 'Nothing to do')
                daemon_sleep(start_time=start_time, sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            # Transient DB problems: log and retry on the next iteration.
            logging.warning('Reaper: %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    """
    # NOTE(review): legacy Python-2-era implementation — see the py2-only
    # constructs flagged below before running under Python 3.
    logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals())
    while not graceful_stop.is_set():
        try:
            max_deleting_rate = 0
            for rse in rses:
                deleting_rate = 0
                rse_info = rsemgr.get_rse_info(rse['rse'])
                rse_protocol = rse_core.get_rse_protocols(rse['rse'])

                if not rse_protocol['availability_delete']:
                    logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse']))
                    continue

                # Temporary hack to force gfal for deletion
                for protocol in rse_info['protocols']:
                    if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                        protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse']))
                try:
                    # In greedy mode, skip the space check and delete up to 10000 files.
                    needed_free_space, max_being_deleted_files = None, 10000
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space = 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse']))

                    s = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas)))

                    if not replicas:
                        logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse']))
                        continue

                    p = rsemgr.create_protocol(rse_info, 'delete', scheme=None)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files)))
                        try:
                            s = time.time()
                            # NOTE(review): dict.items() + list concatenation is
                            # Python-2 only (py3 items() returns a view).
                            update_replicas_states(replicas=[dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files])

                            for replica in files:
                                try:
                                    # NOTE(review): .values()[0] is Python-2 only
                                    # (py3 needs list(...)[0]).
                                    replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values()[0])
                                except ReplicaUnAvailable as e:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e))
                                    logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg))
                                    replica['pfn'] = None
                                add_message('deletion-planned', {'scope': replica['scope'], 'name': replica['name'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'rse': rse_info['rse']})

                            # logging.debug('update_replicas_states %s' % (time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            # scheme set means "forced protocol": skip physical deletion.
                            if not scheme:
                                try:
                                    deleted_files = []
                                    p.connect()
                                    for replica in files:
                                        try:
                                            logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            s = time.time()
                                            # Staging areas: catalog-only deletion.
                                            if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                                logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            else:
                                                if replica['pfn']:
                                                    p.delete(replica['pfn'])
                                                else:
                                                    logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000)
                                            duration = time.time() - s
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-done', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'duration': duration})
                                            logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                        except SourceNotFound:
                                            # Already gone: still clean the catalog.
                                            err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                            logging.warning(err_msg)
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': err_msg})
                                        except (ServiceUnavailable, RSEAccessDenied) as e:
                                            logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                            add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)})
                                        except Exception as e:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                            add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)})
                                        except:
                                            # NOTE(review): unreachable after the
                                            # Exception handler above; also a bare except.
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                except (ServiceUnavailable, RSEAccessDenied) as e:
                                    # Connection-level failure: fail the whole chunk.
                                    for replica in files:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                        add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)})
                                finally:
                                    p.close()
                            s = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse=rse['rse'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))
                            deleting_rate += len(deleted_files)
                        except:
                            logging.critical(traceback.format_exc())
                    # Fraction of the allowed quota actually deleted on this RSE.
                    deleting_rate = deleting_rate * 1.0 / max_being_deleted_files
                    if deleting_rate > max_deleting_rate:
                        max_deleting_rate = deleting_rate
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate))
            # Sleep less the busier the last pass was (1..61 seconds).
            sleep_time = int((1 - max_deleting_rate) * 60 + 1)
            time.sleep(sleep_time)
        except:
            logging.critical(traceback.format_exc())

    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
def reaper(rses=[], worker_number=1, total_workers=1, chunk_size=100, once=False, scheme=None):
    """
    Main loop to select and delete files.

    Dark-reaper daemon body: iterates over the given RSE ids, physically
    deletes the quarantined replicas found there and removes them from the
    quarantine table.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
                 The list is never mutated by this function.
    :param worker_number: The worker number.
    :param total_workers: The total number of workers.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    logging.info('Starting Dark Reaper %s-%s: Will work on RSEs: %s',
                 worker_number, total_workers, str(rses))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # FIX: hashlib requires bytes — encode, as the modern dark reaper does.
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    # FIX: work on a private copy — shuffling the caller's list (or the shared
    # mutable default) in place was an observable side effect.
    rses = list(rses)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid,
                             thread=thread, hash_executable=hash_executable)
            logging.info('Dark Reaper({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True

            # Shuffle so concurrent workers spread over different RSEs.
            random.shuffle(rses)
            for rse_id in rses:
                rse = rse_core.get_rse_name(rse_id=rse_id)
                replicas = list_quarantined_replicas(rse_id=rse_id,
                                                     limit=chunk_size,
                                                     worker_number=worker_number,
                                                     total_workers=total_workers)
                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        try:
                            # FIX: dict views are not subscriptable on Python 3 —
                            # materialize with list() before indexing.
                            pfn = str(list(rsemgr.lfns2pfns(
                                rse_settings=rse_info,
                                lfns=[{'scope': replica['scope'].external,
                                       'name': replica['name'],
                                       'path': replica['path']}],
                                operation='delete', scheme=scheme).values())[0])
                            logging.info('Dark Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s',
                                         worker_number, total_workers, replica['scope'],
                                         replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logging.info('Dark Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                         worker_number, total_workers, replica['scope'],
                                         replica['name'], pfn, rse, duration)
                            add_message('deletion-done',
                                        {'scope': replica['scope'].external,
                                         'name': replica['name'],
                                         'rse': rse,
                                         'rse_id': rse_id,
                                         'file-size': replica.get('bytes') or 0,
                                         'bytes': replica.get('bytes') or 0,
                                         'url': pfn,
                                         'duration': duration,
                                         'protocol': prot.attributes['scheme']})
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            # Already absent: still remove it from the quarantine table.
                            err_msg = 'Dark Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse)
                            logging.warning(err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            # Not appended to deleted_replicas: retried on a later pass.
                            err_msg = 'Dark Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse, str(error))
                            logging.warning(err_msg)
                            add_message('deletion-failed',
                                        {'scope': replica['scope'].external,
                                         'name': replica['name'],
                                         'rse': rse,
                                         'rse_id': rse_id,
                                         'file-size': replica['bytes'] or 0,
                                         'bytes': replica['bytes'] or 0,
                                         'url': pfn,
                                         'reason': str(error),
                                         'protocol': prot.attributes['scheme']})
                        # FIX: narrowed from a bare except so SystemExit /
                        # KeyboardInterrupt are no longer swallowed.
                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id, replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logging.info('Dark Reaper %s-%s: Nothing to do. I will sleep for 60s',
                             worker_number, total_workers)
                time.sleep(60)

        except DatabaseException as error:
            # Transient DB problems: log and retry on the next iteration.
            logging.warning('Reaper: %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    """
    Mark a transfer request as failed and propagate the failure.

    When new_state is FAILED: sets the request state, emits a
    'transfer-failed' message, archives the request, marks the destination
    replica UNAVAILABLE and fails the transfer lock. Any other new_state is
    a no-op.

    :param req: The request dict ('request_id', 'scope', 'name',
                'dest_rse_id', 'attributes', ...).
    :param dest_rse: Name of the destination RSE.
    :param new_state: The new RequestState.
    :param detail: Human-readable failure reason.
    :param session: The database session in use.

    :raises: re-raises any error from the replica-state or lock update
             after logging it.
    """
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        activity = 'default'
        if req['attributes']:
            # FIX: isinstance instead of `type(...) is dict`.
            if isinstance(req['attributes'], dict):
                # Round-trip through JSON to get a plain, detached copy.
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            # FIX: .get() — a missing 'activity' key previously raised KeyError
            # instead of falling back to 'default'.
            activity = req_attributes.get('activity') or 'default'

        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'], req['name'], req['request_id'], detail))
        try:
            replica.update_replicas_states([{'rse': dest_rse,
                                             'scope': req['scope'],
                                             'name': req['name'],
                                             'state': ReplicaState.UNAVAILABLE}],
                                           session=session)
        # FIX: narrowed from a bare except; still re-raised after logging.
        except Exception:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise

        tss = time.time()
        try:
            lock.failed_transfer(req['scope'], req['name'], req['dest_rse_id'], session=session)
        # FIX: narrowed from a bare except; logging.warn is deprecated.
        except Exception:
            logging.warning('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'], req['name'], dest_rse, traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time() - tss) * 1000)