Пример #1
0
    def test_pop_messages(self):
        """ MESSAGE (CORE): Test retrieve and delete messages """

        truncate_messages()
        for i in range(10):
            add_message(event_type='TEST',
                        payload={
                            'foo': True,
                            'monty': 'python',
                            'number': i
                        })

        tmp = retrieve_messages(10)
        to_delete = []
        for i in tmp:
            assert isinstance(i['payload'], dict)
            assert i['payload']['foo'] is True
            assert i['payload']['monty'] == 'python'
            assert i['payload']['number'] in list(range(100))
            to_delete.append({
                'id': i['id'],
                'created_at': i['created_at'],
                'updated_at': i['created_at'],
                'payload': str(i['payload']),
                'event_type': i['event_type']
            })

        delete_messages(to_delete)

        assert retrieve_messages() == []
Пример #2
0
def delete_dids(dids, account, session=None):
    """
    Delete data identifiers

    :param dids: The list of dids to delete.
    :param account: The account.
    :param session: The database session in use.
    """

    rule_id_clause = []
    content_clause = []
    parent_content_clause = []
    did_clause = []
    for did in dids:
        logging.info('Removing did %s:%s' % (did['scope'], did['name']))
        did_clause.append(and_(models.DataIdentifier.scope == did['scope'], models.DataIdentifier.name == did['name']))
        parent_content_clause.append(and_(models.DataIdentifierAssociation.child_scope == did['scope'], models.DataIdentifierAssociation.child_name == did['name']))
        rule_id_clause.append(and_(models.ReplicationRule.scope == did['scope'], models.ReplicationRule.name == did['name']))
        content_clause.append(and_(models.DataIdentifierAssociation.scope == did['scope'], models.DataIdentifierAssociation.name == did['name']))

        # Send message for AMI
        add_message('ERASE', {'account': account,
                              'scope': did['scope'],
                              'name': did['name']},
                    session=session)

    # Delete rules on did
    if rule_id_clause:
        with record_timer_block('undertaker.rules'):
            for (rule_id, scope, name, rse_expression, ) in session.query(models.ReplicationRule.id,
                                                                          models.ReplicationRule.scope,
                                                                          models.ReplicationRule.name,
                                                                          models.ReplicationRule.rse_expression).filter(or_(*rule_id_clause)):
                logging.debug('Removing rule %s for did %s:%s on RSE-Expression %s' % (str(rule_id), scope, name, rse_expression))
                rucio.core.rule.delete_rule(rule_id=rule_id, nowait=True, session=session)

    # Detach from parent dids:
    existing_parent_dids = False
    if parent_content_clause:
        with record_timer_block('undertaker.parent_content'):
            for parent_did in session.query(models.DataIdentifierAssociation).filter(or_(*parent_content_clause)):
                existing_parent_dids = True
                detach_dids(scope=parent_did.scope, name=parent_did.name, dids=[{'scope': parent_did.child_scope, 'name': parent_did.child_name}], session=session)

    # Remove content
    if content_clause:
        with record_timer_block('undertaker.content'):
            rowcount = session.query(models.DataIdentifierAssociation).filter(or_(*content_clause)).\
                delete(synchronize_session=False)
        record_counter(counters='undertaker.content.rowcount',  delta=rowcount)

    # remove data identifier
    if existing_parent_dids:
        # Exit method early to give Judge time to remove locks (Otherwise, due to foreign keys, did removal does not work
        logging.debug('Leaving delete_dids early for Judge-Evaluator checks')
        return
    with record_timer_block('undertaker.dids'):
        rowcount = session.query(models.DataIdentifier).filter(or_(*did_clause)).\
            filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER, models.DataIdentifier.did_type == DIDType.DATASET)).\
            delete(synchronize_session=False)
Пример #3
0
def add_dids(dids, account, session=None):
    """
    Bulk add data identifiers.

    :param dids: A list of dids.
    :param account: The account owner.
    :param session: The database session in use.
    """
    try:

        for did in dids:
            try:

                if isinstance(did['type'], str) or isinstance(did['type'], unicode):
                    did['type'] = DIDType.from_sym(did['type'])

                if did['type'] == DIDType.FILE:
                    raise exception.UnsupportedOperation("Only collection (dataset/container) can be registered." % locals())

                # Lifetime
                expired_at = None
                if did.get('lifetime'):
                    expired_at = datetime.utcnow() + timedelta(seconds=did['lifetime'])

                # Insert new data identifier
                new_did = models.DataIdentifier(scope=did['scope'], name=did['name'], account=did.get('account') or account,
                                                did_type=did['type'], monotonic=did.get('statuses', {}).get('monotonic', False),
                                                is_open=True, expired_at=expired_at)
                # Add metadata
                # ToDo: metadata validation
                # validate_meta(did.get('meta', {}))
                for key in did.get('meta', {}):
                    new_did.update({key: did['meta'][key]})

                new_did.save(session=session, flush=False)

                if did.get('rules', None):
                    rucio.core.rule.add_rules(dids=[did, ], rules=did['rules'], session=session)

                event_type = None
                if did['type'] == DIDType.CONTAINER:
                    event_type = 'REGISTER_CNT'
                if did['type'] == DIDType.DATASET:
                    event_type = 'REGISTER_DTS'
                if event_type:
                    add_message(event_type, {'account': account,
                                             'scope': did['scope'],
                                             'name': did['name'],
                                             'expires': str(expired_at)},
                                session=session)

            except KeyError, e:
                # ToDo
                raise

        session.flush()
Пример #4
0
def add_monitor_message(response, session=None):
    if response['new_state'] == RequestState.DONE:
        transfer_status = 'transfer-done'
    elif response['new_state'] == RequestState.FAILED:
        transfer_status = 'transfer-failed'
    elif response['new_state'] == RequestState.LOST:
        transfer_status = 'transfer-lost'

    activity = response.get('activity', None)
    src_rse = response.get('src_rse', None)
    src_url = response.get('src_url', None)
    dst_rse = response.get('dst_rse', None)
    dst_url = response.get('dst_url', None)
    dst_protocol = dst_url.split(':')[0] if dst_url else None
    reason = response.get('reason', None)
    duration = response.get('duration', -1)
    filesize = response.get('filesize', None)
    md5 = response.get('md5', None)
    adler32 = response.get('adler32', None)
    scope = response.get('scope', None)
    name = response.get('name', None)
    job_m_replica = response.get('job_m_replica', None)
    if job_m_replica and str(job_m_replica) == str('true'):
        try:
            rse_name = get_source_rse(scope, name, src_url)
        except:
            logging.warn('Cannot get correct RSE for source url: %s(%s)' % (src_url, sys.exc_info()[1]))
            rse_name = None
        if rse_name and rse_name != src_rse:
            src_rse = rse_name
            logging.info('find RSE: %s for source surl: %s' % (src_rse, src_url))

    add_message(transfer_status, {'activity': activity,
                                  'request-id': response['request_id'],
                                  'duration': duration,
                                  'checksum-adler': adler32,
                                  'checksum-md5': md5,
                                  'file-size': filesize,
                                  'guid': None,
                                  'previous-request-id': response['previous_attempt_id'],
                                  'protocol': dst_protocol,
                                  'scope': response['scope'],
                                  'name': response['name'],
                                  'src-rse': src_rse,
                                  'src-url': src_url,
                                  'dst-rse': dst_rse,
                                  'dst-url': dst_url,
                                  'reason': reason,
                                  'transfer-endpoint': response['external_host'],
                                  'transfer-id': response['transfer_id'],
                                  'transfer-link': 'https://%s:8449/fts3/ftsmon/#/job/%s' % (response['external_host'],
                                                                                             response['transfer_id']),
                                  'tool-id': 'rucio-conveyor'},
                session=session)
Пример #5
0
    def test_add_message(self):
        """ MESSAGE (CORE): Test valid and invalid message """

        truncate_messages()

        add_message(event_type='NEW_DID', payload={'name': 'name',
                                                   'name_Y': 'scope_X',
                                                   'type': 'file'})

        with assert_raises(InvalidObject):
            add_message(event_type='NEW_DID', payload={'name': 'name',
                                                       'name_Y': 'scope_X',
                                                       'type': int})
Пример #6
0
    def test_add_message(self):
        """ MESSAGE (CORE): Test valid and invalid message """

        truncate_messages()

        add_message(event_type='NEW_DID', payload={'name': 'name',
                                                   'name_Y': 'scope_X',
                                                   'type': 'file'})

        with assert_raises(InvalidObject):
                add_message(event_type='NEW_DID', payload={'name': 'name',
                                                           'name_Y': 'scope_X',
                                                           'type':  int})
Пример #7
0
def set_transfers_state(transfers, submitted_at, session=None):
    """
    Update the transfer info of a request.

    :param transfers:  Dictionary containing request transfer info.
    :param session:    Database session to use.
    """

    try:
        for request_id in transfers:
            rowcount = session.query(models.Request)\
                              .filter_by(id=request_id)\
                              .filter(models.Request.state == RequestState.SUBMITTING)\
                              .update({'state': transfers[request_id]['state'],
                                       'external_id': transfers[request_id]['external_id'],
                                       'external_host': transfers[request_id]['external_host'],
                                       'source_rse_id': transfers[request_id]['src_rse_id'],
                                       'submitted_at': submitted_at},
                                      synchronize_session=False)
            if rowcount == 0:
                raise RucioException("Failed to set requests %s tansfer %s: request doesn't exist or is not in SUBMITTING state" % (request_id, transfers[request_id]))

            request_type = transfers[request_id].get('request_type', None)
            msg = {'request-id': request_id,
                   'request-type': str(request_type).lower() if request_type else request_type,
                   'scope': transfers[request_id]['scope'],
                   'name': transfers[request_id]['name'],
                   'src-rse-id': transfers[request_id]['metadata'].get('src_rse_id', None),
                   'src-rse': transfers[request_id]['metadata'].get('src_rse', None),
                   'dst-rse-id': transfers[request_id]['metadata'].get('dst_rse_id', None),
                   'dst-rse': transfers[request_id]['metadata'].get('dst_rse', None),
                   'state': str(transfers[request_id]['state']),
                   'activity': transfers[request_id]['metadata'].get('activity', None),
                   'file-size': transfers[request_id]['metadata'].get('filesize', None),
                   'bytes': transfers[request_id]['metadata'].get('filesize', None),
                   'checksum-md5': transfers[request_id]['metadata'].get('md5', None),
                   'checksum-adler': transfers[request_id]['metadata'].get('adler32', None),
                   'external-id': transfers[request_id]['external_id'],
                   'external-host': transfers[request_id]['external_host'],
                   'queued_at': str(submitted_at)}

            if msg['request-type']:
                transfer_status = '%s-%s' % (msg['request-type'], msg['state'])
            else:
                transfer_status = 'transfer-%s' % msg['state']
            transfer_status = transfer_status.lower()
            message_core.add_message(transfer_status, msg, session=session)

    except IntegrityError as error:
        raise RucioException(error.args)
Пример #8
0
    def test_pop_messages(self):
        """ MESSAGE (CORE): Test retrieve and delete messages """

        truncate_messages()
        for i in xrange(10):
            add_message(event_type='TEST', payload={'foo': True,
                                                    'monty': 'python',
                                                    'number': i})

        tmp = retrieve_messages(10)
        to_delete = []
        for i in tmp:
            assert_is_instance(i['payload'], dict)
            assert_equal(i['payload']['foo'], True)
            assert_equal(i['payload']['monty'], 'python')
            assert_in(i['payload']['number'], xrange(100))
            to_delete.append(i['id'])
        delete_messages(to_delete)

        assert_equal(retrieve_messages(), [])
Пример #9
0
    def test_pop_messages(self):
        """ MESSAGE (CORE): Test retrieve and delete messages """

        truncate_messages()
        for i in xrange(10):
            add_message(event_type='TEST',
                        payload={
                            'foo': True,
                            'monty': 'python',
                            'number': i
                        })

        tmp = retrieve_messages(10)
        to_delete = []
        for i in tmp:
            assert_is_instance(i['payload'], dict)
            assert_equal(i['payload']['foo'], True)
            assert_equal(i['payload']['monty'], 'python')
            assert_in(i['payload']['number'], xrange(100))
            to_delete.append(i['id'])
        delete_messages(to_delete)

        assert_equal(retrieve_messages(), [])
Пример #10
0
    def test_hermes(self):
        ''' HERMES (DAEMON): Test the messaging daemon. '''
        for i in range(1, 4):
            add_message('test-type_%i' % i, {'test': i})
            add_message(
                'email', {
                    'to':
                    config_get('messaging-hermes', 'email_test').split(','),
                    'subject':
                    'Half-Life %i' % i,
                    'body':
                    '''
                                  Good morning, and welcome to the Black Mesa Transit System.

                                  This automated train is provided for the security and convenience of
                                  the Black Mesa Research Facility personnel. The time is eight-forty
                                  seven A.M... Current outside temperature is ninety three degrees with
                                  an estimated high of one hundred and five. Before exiting the train,
                                  be sure to check your area for personal belongings.

                                  Thank you, and have a very safe, and productive day.'''
                })

        hermes.run(once=True, send_email=False)
Пример #11
0
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number,
                                 rse['rse'], nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse_id=rse['id'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse_id=rse['id'],
                                                          bytes=needed_free_space_per_child,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s',
                                     worker_number, child_number, rse['rse'],
                                     nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files))
                        try:
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True)
                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                               operation='delete', scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        deletion_dict = {'scope': replica['scope'].external,
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'protocol': prot.attributes['scheme']}
                                        if replica['scope'].vo != 'def':
                                            deletion_dict['vo'] = replica['scope'].vo
                                        logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                                                            worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000)
                                        duration = time.time() - start

                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                        deletion_dict['duration'] = duration
                                        add_message('deletion-done', deletion_dict)
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            deletion_dict['reason'] = str(err_msg)
                                            add_message('deletion-failed', deletion_dict)
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                    payload = {'scope': replica['scope'].external,
                                               'name': replica['name'],
                                               'rse': rse_info['rse'],
                                               'rse_id': rse_info['id'],
                                               'file-size': replica['bytes'],
                                               'bytes': replica['bytes'],
                                               'url': replica['pfn'],
                                               'reason': str(error),
                                               'protocol': prot.attributes['scheme']}
                                    if replica['scope'].vo != 'def':
                                        deletion_dict['vo'] = replica['scope'].vo
                                    add_message('deletion-failed', payload)
                                    break
                            finally:
                                prot.close()
                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error))

                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            time.sleep(1)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
Пример #12
0
def add_monitor_message(request, response, session=None):
    if request['request_type']:
        transfer_status = '%s-%s' % (request['request_type'], response['new_state'])
    else:
        transfer_status = 'transfer-%s' % (response['new_state'])
    transfer_status = transfer_status.lower()

    activity = response.get('activity', None)
    src_type = response.get('src_type', None)
    src_rse = response.get('src_rse', None)
    src_url = response.get('src_url', None)
    dst_type = response.get('dst_type', None)
    dst_rse = response.get('dst_rse', None)
    dst_url = response.get('dst_url', None)
    dst_protocol = dst_url.split(':')[0] if dst_url else None
    reason = response.get('reason', None)
    duration = response.get('duration', -1)
    filesize = response.get('filesize', None)
    md5 = response.get('md5', None)
    adler32 = response.get('adler32', None)
    created_at = response.get('created_at', None)
    submitted_at = response.get('submitted_at', None)
    started_at = response.get('started_at', None)
    transferred_at = response.get('transferred_at', None)
    account = response.get('account', None)

    if response['external_host']:
        transfer_link = '%s/fts3/ftsmon/#/job/%s' % (response['external_host'].replace('8446', '8449'), response['transfer_id'])
    else:
        # for LOST request, response['external_host'] maybe is None
        transfer_link = None

    add_message(transfer_status, {'activity': activity,
                                  'request-id': response['request_id'],
                                  'duration': duration,
                                  'checksum-adler': adler32,
                                  'checksum-md5': md5,
                                  'file-size': filesize,
                                  'bytes': filesize,
                                  'guid': None,
                                  'previous-request-id': response['previous_attempt_id'],
                                  'protocol': dst_protocol,
                                  'scope': response['scope'],
                                  'name': response['name'],
                                  'src-type': src_type,
                                  'src-rse': src_rse,
                                  'src-url': src_url,
                                  'dst-type': dst_type,
                                  'dst-rse': dst_rse,
                                  'dst-url': dst_url,
                                  'reason': reason,
                                  'transfer-endpoint': response['external_host'],
                                  'transfer-id': response['transfer_id'],
                                  'transfer-link': transfer_link,
                                  'created_at': str(created_at) if created_at else None,
                                  'submitted_at': str(submitted_at) if submitted_at else None,
                                  'started_at': str(started_at) if started_at else None,
                                  'transferred_at': str(transferred_at) if transferred_at else None,
                                  'tool-id': 'rucio-conveyor',
                                  'account': account},
                session=session)
Пример #13
0
def delete_from_storage(replicas, prot, rse_info, staging_areas, prepend_str):
    deleted_files = []
    rse_name = rse_info['rse']
    rse_id = rse_info['id']
    try:
        prot.connect()
        for replica in replicas:
            # Physical deletion
            try:
                deletion_dict = {
                    'scope': replica['scope'].external,
                    'name': replica['name'],
                    'rse': rse_name,
                    'file-size': replica['bytes'],
                    'bytes': replica['bytes'],
                    'url': replica['pfn'],
                    'protocol': prot.attributes['scheme']
                }
                logging.info('%s Deletion ATTEMPT of %s:%s as %s on %s',
                             prepend_str, replica['scope'], replica['name'],
                             replica['pfn'], rse_name)
                start = time.time()
                # For STAGING RSEs, no physical deletion
                if rse_id in staging_areas:
                    logging.warning(
                        '%s Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                        prepend_str, replica['scope'], replica['name'],
                        replica['pfn'], rse_name)
                    deleted_files.append({
                        'scope': replica['scope'],
                        'name': replica['name']
                    })
                    continue

                if replica['pfn']:
                    pfn = replica['pfn']
                    # sign the URL if necessary
                    if prot.attributes['scheme'] == 'https' and rse_info[
                            'sign_url'] is not None:
                        pfn = get_signed_url(rse_info['sign_url'], 'delete',
                                             pfn)
                    prot.delete(pfn)
                else:
                    logging.warning(
                        '%s Deletion UNAVAILABLE of %s:%s as %s on %s',
                        prepend_str, replica['scope'], replica['name'],
                        replica['pfn'], rse_name)

                monitor.record_timer(
                    'daemons.reaper.delete.%s.%s' %
                    (prot.attributes['scheme'], rse_name),
                    (time.time() - start) * 1000)
                duration = time.time() - start

                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })

                deletion_dict['duration'] = duration
                add_message('deletion-done', deletion_dict)
                logging.info(
                    '%s Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name, duration)

            except SourceNotFound:
                err_msg = '%s Deletion NOTFOUND of %s:%s as %s on %s' % (
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name)
                logging.warning(err_msg)
                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })
                if replica['state'] == ReplicaState.AVAILABLE:
                    deletion_dict['reason'] = str(err_msg)
                    add_message('deletion-failed', deletion_dict)

            except (ServiceUnavailable, RSEAccessDenied,
                    ResourceTemporaryUnavailable) as error:
                logging.warning(
                    '%s Deletion NOACCESS of %s:%s as %s on %s: %s',
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name, str(error))
                deletion_dict['reason'] = str(error)
                add_message('deletion-failed', deletion_dict)

            except Exception as error:
                logging.critical(
                    '%s Deletion CRITICAL of %s:%s as %s on %s: %s',
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name, str(traceback.format_exc()))
                deletion_dict['reason'] = str(error)
                add_message('deletion-failed', deletion_dict)

    except (ServiceUnavailable, RSEAccessDenied,
            ResourceTemporaryUnavailable) as error:
        for replica in replicas:
            logging.warning('%s Deletion NOACCESS of %s:%s as %s on %s: %s',
                            prepend_str, replica['scope'], replica['name'],
                            replica['pfn'], rse_name, str(error))
            add_message(
                'deletion-failed', {
                    'scope': replica['scope'].external,
                    'name': replica['name'],
                    'rse': rse_name,
                    'file-size': replica['bytes'],
                    'bytes': replica['bytes'],
                    'url': replica['pfn'],
                    'reason': str(error),
                    'protocol': prot.attributes['scheme']
                })

    finally:
        prot.close()
    return deleted_files
Пример #14
0
def reaper(rses=[],
           worker_number=0,
           total_workers=1,
           chunk_size=100,
           once=False,
           scheme=None,
           sleep_time=60):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param total_workers:  The total number of workers.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    logging.info('Starting Light Reaper %s-%s: Will work on RSEs: %s',
                 worker_number, total_workers,
                 ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(sys.argv[0] +
                                     ''.join([rse['rse']
                                              for rse in rses])).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            prepend_str = 'light-reaper [%i/%i] : ' % (
                heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            random.shuffle(rses)
            for rse in rses:
                rse_id = rse['id']
                rse = rse['rse']
                replicas = list_expired_temporary_dids(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=worker_number,
                    total_workers=total_workers)

                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse_protocol = rse_core.get_rse_protocols(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        try:
                            # pfn = str(rsemgr.lfns2pfns(rse_settings=rse_info,
                            #                            lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                            #                            operation='delete', scheme=scheme).values()[0])
                            pfn = 's3://%s%s%s' % (prot.attributes['hostname'],
                                                   prot.attributes['prefix'],
                                                   replica['name'])
                            # logging.debug('Light Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, total_workers, replica['scope'], replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(
                                logging.INFO,
                                'Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                replica['scope'], replica['name'], pfn, rse,
                                duration)
                            payload = {
                                'scope': replica['scope'].external,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica.get('bytes') or 0,
                                'bytes': replica.get('bytes') or 0,
                                'url': pfn,
                                'duration': duration,
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (
                                replica['scope'], replica['name'], pfn, rse)
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = 'Deletion NOACCESS of %s:%s as %s on %s: %s' % (
                                replica['scope'], replica['name'], pfn, rse,
                                str(error))
                            logger(logging.WARNING, err_msg)
                            payload = {
                                'scope': replica['scope'].external,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica['bytes'] or 0,
                                'bytes': replica['bytes'] or 0,
                                'url': pfn,
                                'reason': str(error),
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)

                        except:
                            logger(logging.CRITICAL, traceback.format_exc())
                finally:
                    prot.close()

                delete_temporary_dids(dids=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logger(logging.INFO, 'Nothing to do. I will sleep for 60s')
                daemon_sleep(start_time=start_time,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
Пример #15
0
def add_exception(dids, account, pattern, comments, expires_at, session=None):
    """
    Add exceptions to Lifetime Model.

    :param dids:        The list of dids
    :param account:     The account of the requester.
    :param pattern:     The account.
    :param comments:    The comments associated to the exception.
    :param expires_at:  The expiration date of the exception.
    :param session:     The database session in use.

    returns:            The id of the exception.
    """
    exception_id = generate_uuid()
    text = 'Account %s requested a lifetime extension for a list of DIDs that can be found below\n' % account
    reason = comments
    volume = None
    lifetime = None
    if comments.find('||||') > -1:
        reason, volume = comments.split('||||')
    text += 'The reason for the extension is "%s"\n' % reason
    text += 'It represents %s datasets\n' % len(dids)
    if volume:
        text += 'The estimated physical volume is %s\n' % volume
    if expires_at and isinstance(expires_at, string_types):
        lifetime = str_to_date(expires_at)
        text += 'The lifetime exception should expires on %s\n' % str(
            expires_at)
    elif isinstance(expires_at, datetime):
        lifetime = expires_at
        text += 'The lifetime exception should expires on %s\n' % str(
            expires_at)
    text += 'Link to approve or reject this request can be found at the end of the mail\n'
    text += '\n'
    text += 'DIDTYPE SCOPE NAME\n'
    text += '\n'
    truncated_message = False
    for did in dids:
        did_type = None
        if 'did_type' in did:
            if isinstance(did['did_type'], string_types):
                did_type = DIDType.from_sym(did['did_type'])
            else:
                did_type = did['did_type']
        new_exception = models.LifetimeExceptions(
            id=exception_id,
            scope=did['scope'],
            name=did['name'],
            did_type=did_type,
            account=account,
            pattern=pattern,
            comments=reason,
            state=LifetimeExceptionsState.WAITING,
            expires_at=lifetime)
        if len(text) < 3000:
            text += '%s %s %s\n' % (str(did_type), did['scope'], did['name'])
        else:
            truncated_message = True
        try:
            new_exception.save(session=session, flush=False)
        except IntegrityError as error:
            if match('.*ORA-00001.*', str(error.args[0])) \
                    or match('.*IntegrityError.*UNIQUE constraint failed.*', str(error.args[0])) \
                    or match('.*1062.*Duplicate entry.*for key.*', str(error.args[0])) \
                    or match('.*IntegrityError.*columns? .*not unique.*', error.args[0]):
                raise LifetimeExceptionDuplicate()
            raise RucioException(error.args[0])
    if truncated_message:
        text += '...\n'
        text += 'List too long. Truncated\n'
    text += '\n'
    text += 'Approve:   https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=approve\n' % str(
        exception_id)
    text += 'Deny:      https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=deny\n' % str(
        exception_id)
    approvers_email = get('lifetime_model',
                          'approvers_email',
                          default=[],
                          session=session)
    if approvers_email:
        approvers_email = approvers_email.split(',')  # pylint: disable=no-member

    add_message(event_type='email',
                payload={
                    'body':
                    text,
                    'to':
                    approvers_email,
                    'subject':
                    '[RUCIO] Request to approve lifetime exception %s' %
                    str(exception_id)
                },
                session=session)
    return exception_id
Пример #16
0
def delete_from_storage(replicas,
                        prot,
                        rse_info,
                        staging_areas,
                        auto_exclude_threshold,
                        logger=logging.log):
    deleted_files = []
    rse_name = rse_info['rse']
    rse_id = rse_info['id']
    noaccess_attempts = 0
    pfns_to_bulk_delete = []
    try:
        prot.connect()
        for replica in replicas:
            # Physical deletion
            try:
                deletion_dict = {
                    'scope': replica['scope'].external,
                    'name': replica['name'],
                    'rse': rse_name,
                    'file-size': replica['bytes'],
                    'bytes': replica['bytes'],
                    'url': replica['pfn'],
                    'protocol': prot.attributes['scheme']
                }
                if replica['scope'].vo != 'def':
                    deletion_dict['vo'] = replica['scope'].vo
                logger(logging.DEBUG, 'Deletion ATTEMPT of %s:%s as %s on %s',
                       replica['scope'], replica['name'], replica['pfn'],
                       rse_name)
                start = time.time()
                # For STAGING RSEs, no physical deletion
                if rse_id in staging_areas:
                    logger(
                        logging.WARNING,
                        'Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                        replica['scope'], replica['name'], replica['pfn'],
                        rse_name)
                    deleted_files.append({
                        'scope': replica['scope'],
                        'name': replica['name']
                    })
                    continue

                if replica['pfn']:
                    pfn = replica['pfn']
                    # sign the URL if necessary
                    if prot.attributes['scheme'] == 'https' and rse_info[
                            'sign_url'] is not None:
                        pfn = get_signed_url(rse_id, rse_info['sign_url'],
                                             'delete', pfn)
                    if prot.attributes['scheme'] == 'globus':
                        pfns_to_bulk_delete.append(replica['pfn'])
                    else:
                        prot.delete(pfn)
                else:
                    logger(logging.WARNING,
                           'Deletion UNAVAILABLE of %s:%s as %s on %s',
                           replica['scope'], replica['name'], replica['pfn'],
                           rse_name)

                monitor.record_timer('daemons.reaper.delete.{scheme}.{rse}',
                                     (time.time() - start) * 1000,
                                     labels={
                                         'scheme': prot.attributes['scheme'],
                                         'rse': rse_name
                                     })
                duration = time.time() - start

                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })

                deletion_dict['duration'] = duration
                add_message('deletion-done', deletion_dict)
                logger(
                    logging.INFO,
                    'Deletion SUCCESS of %s:%s as %s on %s in %.2f seconds',
                    replica['scope'], replica['name'], replica['pfn'],
                    rse_name, duration)

            except SourceNotFound:
                duration = time.time() - start
                err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s in %.2f seconds' % (
                    replica['scope'], replica['name'], replica['pfn'],
                    rse_name, duration)
                logger(logging.WARNING, '%s', err_msg)
                deletion_dict['reason'] = 'File Not Found'
                deletion_dict['duration'] = duration
                add_message('deletion-not-found', deletion_dict)
                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })

            except (ServiceUnavailable, RSEAccessDenied,
                    ResourceTemporaryUnavailable) as error:
                duration = time.time() - start
                logger(logging.WARNING,
                       'Deletion NOACCESS of %s:%s as %s on %s: %s in %.2f',
                       replica['scope'], replica['name'], replica['pfn'],
                       rse_name, str(error), duration)
                deletion_dict['reason'] = str(error)
                deletion_dict['duration'] = duration
                add_message('deletion-failed', deletion_dict)
                noaccess_attempts += 1
                if noaccess_attempts >= auto_exclude_threshold:
                    logger(
                        logging.INFO,
                        'Too many (%d) NOACCESS attempts for %s. RSE will be temporarly excluded.',
                        noaccess_attempts, rse_name)
                    REGION.set('temporary_exclude_%s' % rse_id, True)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    break

            except Exception as error:
                duration = time.time() - start
                logger(
                    logging.CRITICAL,
                    'Deletion CRITICAL of %s:%s as %s on %s in %.2f seconds : %s',
                    replica['scope'], replica['name'], replica['pfn'],
                    rse_name, duration, str(traceback.format_exc()))
                deletion_dict['reason'] = str(error)
                deletion_dict['duration'] = duration
                add_message('deletion-failed', deletion_dict)

        if pfns_to_bulk_delete and prot.attributes['scheme'] == 'globus':
            logger(logging.DEBUG,
                   'Attempting bulk delete on RSE %s for scheme %s', rse_name,
                   prot.attributes['scheme'])
            prot.bulk_delete(pfns_to_bulk_delete)

    except (ServiceUnavailable, RSEAccessDenied,
            ResourceTemporaryUnavailable) as error:
        for replica in replicas:
            logger(logging.WARNING,
                   'Deletion NOACCESS of %s:%s as %s on %s: %s',
                   replica['scope'], replica['name'], replica['pfn'], rse_name,
                   str(error))
            payload = {
                'scope': replica['scope'].external,
                'name': replica['name'],
                'rse': rse_name,
                'file-size': replica['bytes'],
                'bytes': replica['bytes'],
                'url': replica['pfn'],
                'reason': str(error),
                'protocol': prot.attributes['scheme']
            }
            if replica['scope'].vo != 'def':
                payload['vo'] = replica['scope'].vo
            add_message('deletion-failed', payload)
        logger(logging.INFO,
               'Cannot connect to %s. RSE will be temporarly excluded.',
               rse_name)
        REGION.set('temporary_exclude_%s' % rse_id, True)
        labels = {'rse': rse_name}
        EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
    finally:
        prot.close()
    return deleted_files
Пример #17
0
def reaper(rses, chunk_size=100, once=False, scheme=None, sleep_time=300):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(
        (sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    # heartbeat
    heartbeat = live(executable=executable,
                     hostname=hostname,
                     pid=pid,
                     thread=thread,
                     hash_executable=hash_executable)
    prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'],
                                              heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Starting Dark Reaper on RSEs: %s', ', '.join(rses))

    if not once:
        logger(logging.INFO, 'Waiting for heartbeat synchonization')
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            prepend_str = 'dark-reaper [%i/%i] : ' % (
                heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            rses_to_process = list(
                set(rses) & set(list_rses_with_quarantined_replicas()))
            random.shuffle(rses_to_process)
            for rse_id in rses_to_process:
                # The following query returns the list of real replicas (deleted_replicas) and list of dark replicas (dark_replicas)
                # Real replicas can be directly removed from the quarantine table
                deleted_replicas, dark_replicas = list_quarantined_replicas(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=heartbeat['assign_thread'],
                    total_workers=heartbeat['nr_threads'])

                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse = rse_info['rse']
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)

                heartbeat = live(executable=executable,
                                 hostname=hostname,
                                 pid=pid,
                                 thread=thread,
                                 hash_executable=hash_executable)
                prepend_str = 'dark-reaper [%i/%i] : ' % (
                    heartbeat['assign_thread'], heartbeat['nr_threads'])
                logger = formatted_logger(logging.log, prepend_str + '%s')
                try:
                    prot.connect()
                    for replica in dark_replicas:
                        nothing_to_do = False
                        scope = ''
                        if replica['scope']:
                            scope = replica['scope'].external
                        try:
                            pfn = str(
                                list(
                                    rsemgr.lfns2pfns(
                                        rse_settings=rse_info,
                                        lfns=[{
                                            'scope': scope,
                                            'name': replica['name'],
                                            'path': replica['path']
                                        }],
                                        operation='delete',
                                        scheme=scheme).values())[0])
                            logger(logging.INFO,
                                   'Deletion ATTEMPT of %s:%s as %s on %s',
                                   scope, replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(
                                logging.INFO,
                                'Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                scope, replica['name'], pfn, rse, duration)
                            payload = {
                                'scope': scope,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica.get('bytes') or 0,
                                'bytes': replica.get('bytes') or 0,
                                'url': pfn,
                                'duration': duration,
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = (
                                'Deletion NOTFOUND of %s:%s as %s on %s' %
                                (scope, replica['name'], pfn, rse))
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = (
                                'Deletion NOACCESS of %s:%s as %s on %s: %s' %
                                (scope, replica['name'], pfn, rse, str(error)))
                            logger(logging.WARNING, err_msg)
                            payload = {
                                'scope': scope,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica['bytes'] or 0,
                                'bytes': replica['bytes'] or 0,
                                'url': pfn,
                                'reason': str(error),
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)

                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id,
                                            replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logger(logging.INFO, 'Nothing to do')
                daemon_sleep(start_time=start_time,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
Пример #18
0
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    """
    logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals())
    while not graceful_stop.is_set():
        try:
            max_deleting_rate = 0

            for rse in rses:
                deleting_rate = 0
                rse_info = rsemgr.get_rse_info(rse['rse'])
                rse_protocol = rse_core.get_rse_protocols(rse['rse'])

                if not rse_protocol['availability_delete']:
                    logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse']))
                    continue

                # Temporary hack to force gfal for deletion
                for protocol in rse_info['protocols']:
                    if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                        protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse']))
                try:
                    needed_free_space, max_being_deleted_files = None, 10000
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space = 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse']))

                    s = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas)))

                    if not replicas:
                        logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse']))
                        continue

                    p = rsemgr.create_protocol(rse_info, 'delete', scheme=None)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files)))
                        try:
                            s = time.time()
                            update_replicas_states(replicas=[dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files])

                            for replica in files:
                                try:
                                    replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values()[0])
                                except ReplicaUnAvailable as e:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e))
                                    logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg))
                                    replica['pfn'] = None

                                add_message('deletion-planned', {'scope': replica['scope'],
                                                                 'name': replica['name'],
                                                                 'file-size': replica['bytes'],
                                                                 'url': replica['pfn'],
                                                                 'rse': rse_info['rse']})

                            # logging.debug('update_replicas_states %s' % (time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.being_deleted',  delta=len(files))

                            if not scheme:
                                try:
                                    deleted_files = []
                                    p.connect()
                                    for replica in files:
                                        try:
                                            logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            s = time.time()
                                            if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                                logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number,
                                                                                                                                                                                    child_number,
                                                                                                                                                                                    replica['scope'],
                                                                                                                                                                                    replica['name'],
                                                                                                                                                                                    replica['pfn'],
                                                                                                                                                                                    rse['rse']))
                                            else:
                                                if replica['pfn']:
                                                    p.delete(replica['pfn'])
                                                else:
                                                    logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000)
                                            duration = time.time() - s

                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                            add_message('deletion-done', {'scope': replica['scope'],
                                                                          'name': replica['name'],
                                                                          'rse': rse_info['rse'],
                                                                          'file-size': replica['bytes'],
                                                                          'url': replica['pfn'],
                                                                          'duration': duration})
                                            logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                        except SourceNotFound:
                                            err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                            logging.warning(err_msg)
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': err_msg})
                                        except (ServiceUnavailable, RSEAccessDenied) as e:
                                            logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except Exception as e:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                except (ServiceUnavailable, RSEAccessDenied) as e:
                                    for replica in files:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                        add_message('deletion-failed', {'scope': replica['scope'],
                                                                        'name': replica['name'],
                                                                        'rse': rse_info['rse'],
                                                                        'file-size': replica['bytes'],
                                                                        'url': replica['pfn'],
                                                                        'reason': str(e)})
                                finally:
                                    p.close()
                            s = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse=rse['rse'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.done',  delta=len(deleted_files))
                            deleting_rate += len(deleted_files)
                        except:
                            logging.critical(traceback.format_exc())
                    deleting_rate = deleting_rate * 1.0 / max_being_deleted_files
                    if deleting_rate > max_deleting_rate:
                        max_deleting_rate = deleting_rate
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate))
            sleep_time = int((1 - max_deleting_rate) * 60 + 1)
            time.sleep(sleep_time)

        except:
            logging.critical(traceback.format_exc())

    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
Пример #19
0
def reaper(rses=[],
           worker_number=1,
           total_workers=1,
           chunk_size=100,
           once=False,
           scheme=None):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param total_workers:  The total number of workers.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    logging.info('Starting Dark Reaper %s-%s: Will work on RSEs: %s',
                 worker_number, total_workers, str(rses))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(sys.argv[0] + ''.join(rses)).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            logging.info(
                'Dark Reaper({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'
                .format(locals()))
            nothing_to_do = True

            random.shuffle(rses)
            for rse_id in rses:
                rse = rse_core.get_rse_name(rse_id=rse_id)
                replicas = list_quarantined_replicas(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=worker_number,
                    total_workers=total_workers)

                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        try:
                            pfn = str(
                                rsemgr.lfns2pfns(rse_settings=rse_info,
                                                 lfns=[{
                                                     'scope':
                                                     replica['scope'].external,
                                                     'name':
                                                     replica['name'],
                                                     'path':
                                                     replica['path']
                                                 }],
                                                 operation='delete',
                                                 scheme=scheme).values()[0])
                            logging.info(
                                'Dark Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s',
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logging.info(
                                'Dark Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse, duration)
                            add_message(
                                'deletion-done', {
                                    'scope': replica['scope'].external,
                                    'name': replica['name'],
                                    'rse': rse,
                                    'rse_id': rse_id,
                                    'file-size': replica.get('bytes') or 0,
                                    'bytes': replica.get('bytes') or 0,
                                    'url': pfn,
                                    'duration': duration,
                                    'protocol': prot.attributes['scheme']
                                })
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse)
                            logging.warning(err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse, str(error))
                            logging.warning(err_msg)
                            add_message(
                                'deletion-failed', {
                                    'scope': replica['scope'].external,
                                    'name': replica['name'],
                                    'rse': rse,
                                    'rse_id': rse_id,
                                    'file-size': replica['bytes'] or 0,
                                    'bytes': replica['bytes'] or 0,
                                    'url': pfn,
                                    'reason': str(error),
                                    'protocol': prot.attributes['scheme']
                                })

                        except:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id,
                                            replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logging.info(
                    'Dark Reaper %s-%s: Nothing to do. I will sleep for 60s',
                    worker_number, total_workers)
                time.sleep(60)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
Пример #20
0
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        activity = 'default'
        if req['attributes']:
            if type(req['attributes']) is dict:
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            activity = req_attributes['activity'] if req_attributes['activity'] else 'default'

        tss = time.time()
        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'],
                                                                req['name'],
                                                                req['request_id'],
                                                                detail))
        try:
            replica.update_replicas_states([{'rse': dest_rse,
                                             'scope': req['scope'],
                                             'name': req['name'],
                                             'state': ReplicaState.UNAVAILABLE}],
                                           session=session)
        except:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'],
                                                                                                      req['name'],
                                                                                                      dest_rse,
                                                                                                      traceback.format_exc()))
            raise

        tss = time.time()
        try:
            lock.failed_transfer(req['scope'],
                                 req['name'],
                                 req['dest_rse_id'],
                                 session=session)
        except:
            logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'],
                                                                                         req['name'],
                                                                                         dest_rse,
                                                                                         traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)