Example No. 1
    def remove_extra_replicas(self, to_remove):
        """
        :param to_remove: replicas to remove from Rucio
        :return:
        """
        scope = InternalScope(self.scope)
        with monitor.record_timer_block('cms_sync.time_remove_replica'):
            if to_remove and self.dry_run:
                logging.info('Dry run: Removing replicas %s from rse %s.',
                             str(to_remove), self.rse)
            elif to_remove:
                logging.debug('Removing %s replicas from rse %s.',
                              len(to_remove), self.rse)
                for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                    replicas = [{
                        'scope': scope,
                        'name': lfn,
                        "rse_id": self.rse_id,
                        "state": "U"
                    } for lfn in to_remove_chunk]
                    # transactional_session here?
                    # while lock is set stuck, judge-repairer might make transfer requests before rule is gone but does it matter?
                    update_replicas_states(
                        replicas=replicas,
                        add_tombstone=False,
                    )

                # delete_replicas(rse=self.rse, issuer=self.account,
                #                     files=[{'scope': self.scope, 'name': lfn} for lfn in to_remove_chunk])
                return len(to_remove)
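
Every example on this page hands its work list to a chunks helper before touching the database. The helper itself is not shown anywhere on the page, so the following is only a minimal sketch of what such a slicer typically looks like (a plain generator over list slices, assumed here for illustration; the real Rucio utility may differ in details).

def chunks(sequence, chunk_size):
    """Yield successive chunk_size-sized slices of a sequence (illustrative sketch)."""
    for index in range(0, len(sequence), chunk_size):
        yield sequence[index:index + chunk_size]

# Example: list(chunks(list(range(5)), 2)) -> [[0, 1], [2, 3], [4]]
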
Example No. 2
def list_quarantined_replicas(rse_id, limit, worker_number=None, total_workers=None, session=None):
    """
    List RSE Quarantined File replicas.

    :param rse_id: the rse id.
    :param limit: The maximum number of replicas returned.
    :param worker_number:      id of the executing worker.
    :param total_workers:      Number of total workers.
    :param session: The database session in use.

    :returns: two lists :
              - The first one contains quarantine replicas actually registered in the replicas tables
              - The second one contains real "dark" files
    """

    replicas_clause = []
    quarantined_replicas = {}
    real_replicas = []
    dark_replicas = []
    query = session.query(models.QuarantinedReplica.path,
                          models.QuarantinedReplica.bytes,
                          models.QuarantinedReplica.scope,
                          models.QuarantinedReplica.name,
                          models.QuarantinedReplica.created_at).\
        filter(models.QuarantinedReplica.rse_id == rse_id)
    query = filter_thread_work(session=session, query=query, total_threads=total_workers, thread_id=worker_number, hash_variable='path')

    for path, bytes_, scope, name, created_at in query.limit(limit):
        if (scope, name) not in quarantined_replicas:
            quarantined_replicas[(scope, name)] = []
            replicas_clause.append(and_(models.RSEFileAssociation.scope == scope,
                                        models.RSEFileAssociation.name == name))
        quarantined_replicas[(scope, name)].append((path, bytes_, created_at))

    for chunk in chunks(replicas_clause, 20):
        query = session.query(models.RSEFileAssociation.scope,
                              models.RSEFileAssociation.name).\
            filter(models.RSEFileAssociation.rse_id == rse_id).\
            filter(or_(*chunk))

        for scope, name in query.all():
            reps = quarantined_replicas.pop((scope, name))
            real_replicas.extend([{'scope': scope,
                                   'name': name,
                                   'rse_id': rse_id,
                                   'path': rep[0],
                                   'bytes': rep[1],
                                   'created_at': rep[2]}
                                  for rep in reps])

    for key, value in quarantined_replicas.items():
        dark_replicas.extend([{'scope': key[0],
                               'name': key[1],
                               'rse_id': rse_id,
                               'path': rep[0],
                               'bytes': rep[1],
                               'created_at': rep[2]}
                              for rep in value])

    return real_replicas, dark_replicas
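
The interleaved queries above can obscure the simple bookkeeping being done; the toy snippet below (plain dicts and sets, all values invented, no SQLAlchemy) reproduces just the split: keys found by the catalogue query are popped into the "real" list, and whatever is left over is reported as dark.

quarantined = {('cms', 'file_a'): [('/store/a', 10, None)],
               ('cms', 'file_b'): [('/store/b', 20, None)]}
catalogue_hits = {('cms', 'file_a')}  # stand-in for the RSEFileAssociation query result

real, dark = [], []
for key in list(quarantined):
    bucket = real if key in catalogue_hits else dark
    for path, size, created_at in quarantined.pop(key):
        bucket.append({'scope': key[0], 'name': key[1], 'path': path, 'bytes': size})
print(len(real), len(dark))  # -> 1 1
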
Example No. 3
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting' % worker_number)
    logging.info('Undertaker(%s): started' % worker_number)
    while not graceful_stop.is_set():
        try:
            dids = list_expired_dids(worker_number=worker_number, total_workers=total_workers, limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.' % worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete' % (worker_number, len(chunk)))
                    delete_dids(dids=chunk, account='root')
                    logging.info('Undertaker(%s): Delete %s dids' % (worker_number, len(chunk)))
                    record_counter(counters='undertaker.delete_dids',  delta=len(chunk))
                except DatabaseException as e:
                    logging.error('Undertaker(%s): Got database error %s.' % (worker_number, str(e)))
        except:
            logging.error(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    logging.info('Undertaker(%s): graceful stop requested' % worker_number)
    logging.info('Undertaker(%s): graceful stop done' % worker_number)
Example No. 4
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    executable = 'undertaker'
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)

    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, older_than=6000)
            logging.info('Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'.format(locals()))

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'], total_workers=heartbeat['nr_threads'], limit=10000)

            dids = [did for did in dids if (did['scope'], did['name']) not in paused_dids]

            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete', worker_number, len(chunk))
                    delete_dids(dids=chunk, account=InternalAccount('root'), expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids', worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                    if match('.*ORA-00054.*', str(e.args[0])) or match('.*55P03.*', str(e.args[0])) or match('.*3572.*', str(e.args[0])):
                        for did in chunk:
                            paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                        record_counter('undertaker.delete_dids.exceptions.LocksDetected')
                        logging.warning('undertaker[%s/%s]: Locks detected for chunk', heartbeat['assign_thread'], heartbeat['nr_threads'])
                    else:
                        logging.error('Undertaker(%s): Got database error %s.', worker_number, str(e))
        except:
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
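
The paused_dids bookkeeping is the interesting part of this variant: DIDs whose deletion hit a database lock are parked for a random 10 to 40 minutes and skipped on later cycles. The standalone sketch below (standard library only, names invented for illustration) isolates that pattern.

from datetime import datetime, timedelta
from random import randint

paused_dids = {}  # {(scope, name): datetime at which the pause expires}

def pause(did):
    # Called when a chunk hits a lock error (ORA-00054 and friends above).
    paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))

def refresh_and_filter(dids):
    # Drop expired pauses, then remove still-parked DIDs from the work list.
    for key in list(paused_dids):
        if datetime.utcnow() > paused_dids[key]:
            del paused_dids[key]
    return [did for did in dids if (did['scope'], did['name']) not in paused_dids]
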
Example No. 5
def add_quarantined_replicas(rse, replicas, session=None):
    """
    Bulk add quarantined file replicas.

    :param rse:      The rse name.
    :param replicas: A list of dicts with the replica information.
    :param session:  The database session in use.
    """
    rse_id = get_rse_id(rse, session=session)

    for chunk in chunks(replicas, 100):
        # Exclude files that have a registered replica. This is a
        # safeguard against potential issues in the Auditor.
        file_clause = []
        for replica in chunk:
            file_clause.append(
                and_(
                    models.RSEFileAssociation.scope == replica.get(
                        'scope',
                        None), models.RSEFileAssociation.name == replica.get(
                            'name', None),
                    models.RSEFileAssociation.rse_id == rse_id))
        file_query = session.query(models.RSEFileAssociation.scope,
                                   models.RSEFileAssociation.name,
                                   models.RSEFileAssociation.rse_id).\
            with_hint(models.RSEFileAssociation, "index(REPLICAS REPLICAS_PK)", 'oracle').\
            filter(or_(*file_clause))
        existing_replicas = [(scope, name, rseid)
                             for scope, name, rseid in file_query]
        chunk = [
            replica for replica in chunk
            if (replica.get('scope', None), replica.get('name', None),
                rse_id) not in existing_replicas
        ]

        # Exclude files that have already been added to the quarantined
        # replica table.
        quarantine_clause = []
        for replica in chunk:
            quarantine_clause.append(
                and_(models.QuarantinedReplica.path == replica['path'],
                     models.QuarantinedReplica.rse_id == rse_id))
        quarantine_query = session.query(models.QuarantinedReplica.path,
                                         models.QuarantinedReplica.rse_id).\
            filter(or_(*quarantine_clause))
        quarantine_replicas = [(path, rseid)
                               for path, rseid in quarantine_query]
        chunk = [
            replica for replica in chunk
            if (replica['path'], rse_id) not in quarantine_replicas
        ]

        session.bulk_insert_mappings(models.QuarantinedReplica,
                                     [{
                                         'rse_id': rse_id,
                                         'path': file['path'],
                                         'scope': file.get('scope'),
                                         'name': file.get('name'),
                                         'bytes': file.get('bytes')
                                     } for file in chunk])
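
The two exclusion passes are the heart of the safeguard; the toy version below swaps both database queries for hard-coded sets (every value is invented) so only the filtering logic remains visible.

candidates = [{'scope': 'cms', 'name': 'f1', 'path': '/q/f1'},
              {'scope': 'cms', 'name': 'f2', 'path': '/q/f2'},
              {'scope': None, 'name': None, 'path': '/q/dark'}]
registered = {('cms', 'f1')}     # stand-in for the replicas-table query
quarantined = {'/q/f2'}          # stand-in for the quarantined-replica query

chunk = [c for c in candidates if (c['scope'], c['name']) not in registered]
chunk = [c for c in chunk if c['path'] not in quarantined]
print([c['path'] for c in chunk])  # -> ['/q/dark']
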
Example No. 6
def poller(worker_number=1, total_workers=1, chunk_size=100):

    print('%(worker_number)s / %(total_workers)s: Poller started' % locals())
    while not graceful_stop.is_set():
        try:
            s = time()
            transfer_requests = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                      state=RequestState.SUBMITTED,
                                                      thread=worker_number,
                                                      total_threads=total_workers,
                                                      limit=10000)
            n = len(transfer_requests)
            print('%(worker_number)s / %(total_workers)s: get_next %(n)s requests' % locals())

            if not transfer_requests:
                sleep(0.01)
                continue

            for chunk in chunks(transfer_requests, chunk_size):
                try:
                    s = time()
                    fts3.query_all(transfer_requests=chunk)
                    print('fts3.query_all', time() - s)
                    s = time()
                    common.update_requests_states(chunk)
                    print('update_requests_states', time() - s)
                except:
                    print(traceback.format_exc())

        except:
            print(traceback.format_exc())
Example No. 7
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable='rucio-undertaker', hostname=hostname)
    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable='rucio-undertaker',
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             older_than=6000)
            logging.info(
                'Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'
                .format(locals()))

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'] +
                                     1,
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.',
                             worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete',
                                 worker_number, len(chunk))
                    delete_dids(dids=chunk, account='root', expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids',
                                 worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids',
                                   delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except DatabaseException as error:
                    logging.error('Undertaker(%s): Got database error %s.',
                                  worker_number, str(error))
        except:
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable='rucio-undertaker',
        hostname=hostname,
        pid=pid,
        thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
Example No. 8
def run_once(paused_dids: Dict[Tuple, datetime], chunk_size: int,
             heartbeat_handler: HeartbeatHandler, **_kwargs):
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        # Refresh paused dids
        iter_paused_dids = deepcopy(paused_dids)
        for key in iter_paused_dids:
            if datetime.utcnow() > paused_dids[key]:
                del paused_dids[key]

        dids = list_expired_dids(worker_number=worker_number,
                                 total_workers=total_workers,
                                 limit=10000)

        dids = [
            did for did in dids
            if (did['scope'], did['name']) not in paused_dids
        ]

        if not dids:
            logger(logging.INFO, 'did not get any work')
            return

        for chunk in chunks(dids, chunk_size):
            _, _, logger = heartbeat_handler.live()
            try:
                logger(logging.INFO, 'Receive %s dids to delete', len(chunk))
                delete_dids(dids=chunk,
                            account=InternalAccount('root', vo='def'),
                            expire_rules=True)
                logger(logging.INFO, 'Delete %s dids', len(chunk))
                record_counter(name='undertaker.delete_dids', delta=len(chunk))
            except RuleNotFound as error:
                logger(logging.ERROR, error)
            except (DatabaseException, DatabaseError,
                    UnsupportedOperation) as e:
                if match('.*ORA-00054.*', str(e.args[0])) or match(
                        '.*55P03.*', str(e.args[0])) or match(
                            '.*3572.*', str(e.args[0])):
                    for did in chunk:
                        paused_dids[(
                            did['scope'],
                            did['name'])] = datetime.utcnow() + timedelta(
                                seconds=randint(600, 2400))
                    record_counter(
                        'undertaker.delete_dids.exceptions.{exception}',
                        labels={'exception': 'LocksDetected'})
                    logger(logging.WARNING, 'Locks detected for chunk')
                else:
                    logger(logging.ERROR, 'Got database error %s.', str(e))
    except:
        logging.critical(traceback.format_exc())
Example No. 9
def add_quarantined_replicas(rse, replicas, session=None):
    """
    Bulk add quarantined file replicas.

    :param rse:      The rse name.
    :param replicas: A list of dicts with the replica information.
    :param session:  The database session in use.

    :returns: True if successful.
    """
    rse_id = get_rse_id(rse, session=session)

    file_clause = []
    for replica in replicas:
        file_clause.append(
            and_(models.RSEFileAssociation.scope == replica.get('scope', None),
                 models.RSEFileAssociation.name == replica.get('name', None),
                 models.RSEFileAssociation.rse_id == rse_id))

    if file_clause:
        file_query = session.query(models.RSEFileAssociation.scope,
                                   models.RSEFileAssociation.name,
                                   models.RSEFileAssociation.rse_id).\
            with_hint(models.RSEFileAssociation, "index(REPLICAS REPLICAS_PK)", 'oracle').\
            filter(or_(*file_clause))

        existing_replicas = [(scope, name, rseid)
                             for scope, name, rseid in file_query]

        replicas = [
            replica for replica in replicas
            if (replica.get('scope', None), replica.get('name', None),
                rse_id) not in existing_replicas
        ]

        for files in chunks(replicas, 1000):
            session.bulk_insert_mappings(models.QuarantinedReplica,
                                         [{
                                             'rse_id': rse_id,
                                             'path': file['path'],
                                             'scope': file.get('scope'),
                                             'name': file.get('name'),
                                             'bytes': file.get('bytes')
                                         } for file in files])
Example No. 10
def bulk_group_transfers_for_globus(transfers, policy, group_bulk=200):
    """
    Group transfers in bulk based on certain criteria

    :param transfers:  List of transfers to group.
    :param policy:     Policy to use to group.
    :param group_bulk: Bulk sizes.
    :return:           Dict of grouped jobs, keyed by external host.
    """

    grouped_transfers = {}
    for transfer_path in transfers.values():
        # Globus doesn't support multihop. Get the first hop only.
        transfer = transfer_path[0]
        external_host = transfer['external_host']

        # Some dict elements are not needed by globus transfertool, but are accessed by further common fts/globus code
        t_file = {
            'sources': transfer['sources'],
            'destinations': transfer['dest_urls'],
            'metadata': transfer['file_metadata'],
            'filesize': int(transfer['file_metadata']['filesize']),
            'request_type':
            transfer['file_metadata'].get('request_type', None),
            'activity': str(transfer['file_metadata']['activity'])
        }

        grouped_transfers.setdefault(external_host, []).append(t_file)

    if policy == 'single':
        group_bulk = 1

    grouped_jobs = {}
    for external_host in grouped_transfers:
        for xfers_files in chunks(grouped_transfers[external_host],
                                  group_bulk):
            # Job params are not used by globus transfertool, but are needed for further common fts/globus code
            grouped_jobs.setdefault(external_host, []).append({
                'files': xfers_files,
                'job_params': {}
            })
    return grouped_jobs
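
A hypothetical call, assuming the function above and a chunks helper such as the sketch after Example No. 1 are importable. The toy transfer dicts carry only the fields the function reads (real Rucio transfer objects have many more): both requests target the same external host, so they share one job unless policy='single' forces one job per file.

toy_transfers = {
    'req-1': [{'external_host': 'globus-ep', 'sources': ['srcA'], 'dest_urls': ['dstA'],
               'file_metadata': {'filesize': 1, 'activity': 'User Subscriptions'}}],
    'req-2': [{'external_host': 'globus-ep', 'sources': ['srcB'], 'dest_urls': ['dstB'],
               'file_metadata': {'filesize': 2, 'activity': 'User Subscriptions'}}],
}
jobs = bulk_group_transfers_for_globus(toy_transfers, policy='rule', group_bulk=200)
print(len(jobs['globus-ep']))  # -> 1, both files grouped into a single job
jobs = bulk_group_transfers_for_globus(toy_transfers, policy='single')
print(len(jobs['globus-ep']))  # -> 2, one job per file
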
Example No. 11
def __delete_expired_tokens_account(account, session=None):
    """"
    Deletes expired tokens from the database.

    :param account: Account to delete expired tokens.
    :param session: The database session in use.
    """
    stmt_select = select(models.Token) \
        .where(and_(models.Token.expired_at < datetime.datetime.utcnow(),
                    models.Token.account == account)) \
        .with_for_update(skip_locked=True)
    result_select = session.execute(stmt_select)

    tokens = []
    for t in result_select.columns('token'):
        tokens.append(t.token)

    for t in chunks(tokens, 100):
        stmt_delete = delete(models.Token) \
            .where(models.Token.token.in_(t)) \
            .prefix_with("/*+ INDEX(TOKENS_ACCOUNT_EXPIRED_AT_IDX) */")
        session.execute(stmt_delete)
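
The point of chunking the token list into groups of 100 is to keep the IN clause bounded. The self-contained sketch below (standard-library sqlite3 and a toy table, not Rucio code) demonstrates the same batching without SQLAlchemy.

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE tokens (token TEXT PRIMARY KEY)')
conn.executemany('INSERT INTO tokens VALUES (?)', [('tok-%04d' % i,) for i in range(250)])

expired = [row[0] for row in conn.execute('SELECT token FROM tokens')]
for start in range(0, len(expired), 100):  # same batching as chunks(tokens, 100)
    batch = expired[start:start + 100]
    placeholders = ','.join('?' * len(batch))
    conn.execute('DELETE FROM tokens WHERE token IN (%s)' % placeholders, batch)
conn.commit()
print(conn.execute('SELECT COUNT(*) FROM tokens').fetchone()[0])  # -> 0
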
Example No. 12
    def remove_extra_replicas(self, to_remove):
        """
        :param to_remove: replicas to remove from Rucio
        :return:
        """

        with monitor.record_timer_block('cms_sync.time_remove_replica'):
            if to_remove and self.dry_run:
                logging.info('Dry run: Removing replicas %s from rse %s.',
                             str(to_remove), self.rse)
            elif to_remove:
                logging.debug('Removing %s replicas from rse %s.',
                              len(to_remove), self.rse)
                for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                    delete_replicas(rse=self.rse,
                                    issuer=self.account,
                                    files=[{
                                        'scope': self.scope,
                                        'name': lfn
                                    } for lfn in to_remove_chunk])
                    # FIXME: Do we need a retry here on DatabaseException? If so, use the retry module
                return len(to_remove)
Example No. 13
def bulk_group_transfers(transfer_paths, policy='single', group_bulk=200):
    """
    Group transfers in bulk based on certain criteria

    :param transfer_paths:  List of (potentially multihop) transfer paths to group. Each path is a list of single-hop transfers.
    :param policy:          Policy to use to group.
    :param group_bulk:      Bulk sizes.
    :return:                List of transfer groups
    """
    if policy == 'single':
        group_bulk = 1

    grouped_jobs = []
    for chunk in chunks(transfer_paths, group_bulk):
        # Globus doesn't support multihop. Get the first hop only.
        transfers = [transfer_path[0] for transfer_path in chunk]

        grouped_jobs.append({
            'transfers': transfers,
            # Job params are not used by globus transfertool, but are needed for further common fts/globus code
            'job_params': {}
        })

    return grouped_jobs
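
Toy usage, with plain strings standing in for transfer objects and a chunks helper like the sketch after Example No. 1 assumed: four single-hop paths become two jobs with policy='rule' and group_bulk=2, and four jobs with policy='single'.

paths = [['hop-a'], ['hop-b'], ['hop-c'], ['hop-d']]
print([job['transfers'] for job in bulk_group_transfers(paths, policy='rule', group_bulk=2)])
# -> [['hop-a', 'hop-b'], ['hop-c', 'hop-d']]
print(len(bulk_group_transfers(paths, policy='single')))  # -> 4
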
Example No. 14
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        try:
            # Get list of expired TU replicas
            logger(logging.INFO, 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logger(logging.INFO, '%s expired replicas returned', len(expired_replicas))
            logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                    else:
                        logger(logging.ERROR, 'Exception', exc_info=True)
                    session.rollback()
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logger(logging.CRITICAL, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
Example No. 15
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """

    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

        stime = time.time()
        replicas = []
        try:
            replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                list_replicas = get_replicas_state(scope=scope, name=name)
                if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                    logging.info(prepend_str + 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                    unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000,
                                                         thread=heart_beat['assign_thread'],
                                                         total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logging.debug(prepend_str + 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logging.debug(prepend_str + ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            tottime = time.time() - stime
            if len(replicas) == bulk:
                logging.info(prepend_str + 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            elif tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
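
The nchunk / tot_chunk bookkeeping in the last two examples is ceiling division over the chunk size; a quick standalone check (assuming Python 3 true division):

import math

items, chunk_size = list(range(2500)), 1000
print(math.ceil(len(items) / chunk_size))  # -> 3 expected chunks
print(len([items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]))  # -> 3 actual chunks
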
Example No. 16
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.info('No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.debug("Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get('retry_protocol_mismatches', False)

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info('%s Finisher starting - db_bulk(%i) bulk (%i)', prepend_str, db_bulk, bulk)

    graceful_stop.wait(10)
    while not graceful_stop.is_set():

        start_time = time.time()
        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
            logging.debug('%s Starting new cycle', prepend_str)
            if activities is None:
                activities = [None]

            for activity in activities:
                logging.debug('%s Working on activity %s', prepend_str, activity)
                time1 = time.time()
                reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                             state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                                                    RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES],
                                             limit=db_bulk,
                                             older_than=datetime.datetime.utcnow(),
                                             total_workers=heart_beat['nr_threads'] - 1,
                                             worker_number=heart_beat['assign_thread'],
                                             mode_all=True,
                                             hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next', (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logging.debug('%s Updating %i requests for activity %s', prepend_str, len(reqs), activity)

                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, prepend_str)
                        record_timer('daemons.conveyor.finisher.handle_requests', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1))
                        record_counter('daemons.conveyor.finisher.handle_requests', len(chunk))
                    except Exception as error:
                        logging.warn('%s %s', prepend_str, str(error))
                if reqs:
                    logging.debug('%s Finish to update %s finished requests for activity %s in %s seconds', prepend_str, len(reqs), activity, time.time() - time2)

        except (DatabaseException, DatabaseError) as error:
            if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logging.warn('%s Lock detected when handling request - skipping: %s', prepend_str, str(error))
            else:
                logging.error('%s %s', prepend_str, traceback.format_exc())
        except Exception as error:
            logging.critical('%s %s', prepend_str, str(error))

        if once:
            return

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logging.info('%s Sleeping for a while :  %s seconds', prepend_str, (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

    logging.info('%s Graceful stop requested', prepend_str)

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s Graceful stop done', prepend_str)
Example No. 17
def bulk_group_transfer(transfers, policy='rule', group_bulk=200, source_strategy=None, max_time_in_queue=None, session=None, logger=logging.log, group_by_scope=False, archive_timeout_override=None):
    """
    Group transfers in bulk based on certain criteria

    :param transfers:                List of transfers to group.
    :param policy:                   Policy to use to group.
    :param group_bulk:               Bulk sizes.
    :param source_strategy:          Strategy to group sources
    :param max_time_in_queue:        Maximum time in queue
    :param archive_timeout_override: Override the archive_timeout parameter for any transfers with it set (0 to unset)
    :param logger:                   Optional decorated logger that can be passed from the calling daemons or servers.
    :return:                         Dict of grouped jobs, keyed by external host.
    """

    grouped_transfers = {}
    grouped_jobs = {}

    # Use empty string, but any string is OK, it is internal to this function only
    _catch_all_scopes_str = ''

    try:
        default_source_strategy = get(section='conveyor', option='default-source-strategy')
    except ConfigNotFound:
        default_source_strategy = 'orderly'

    try:
        activity_source_strategy = get(section='conveyor', option='activity-source-strategy')
        activity_source_strategy = loads(activity_source_strategy)
    except ConfigNotFound:
        activity_source_strategy = {}
    except ValueError:
        logger(logging.WARNING, 'activity_source_strategy not properly defined')
        activity_source_strategy = {}

    for request_id in transfers:
        transfer = transfers[request_id]

        verify_checksum, checksums_to_use = transfer_core.checksum_validation_strategy(transfer.src.rse.attributes, transfer.dst.rse.attributes, logger=logger)
        t_file = {'sources': transfer['sources'],
                  'destinations': transfer['dest_urls'],
                  'metadata': transfer['file_metadata'],
                  'filesize': int(transfer['file_metadata']['filesize']),
                  'checksum': None,
                  'verify_checksum': verify_checksum,
                  'selection_strategy': source_strategy if source_strategy else activity_source_strategy.get(str(transfer['file_metadata']['activity']), default_source_strategy),
                  'request_type': transfer['file_metadata'].get('request_type', None),
                  'activity': str(transfer['file_metadata']['activity'])}

        if verify_checksum != 'none':
            set_checksum_value(t_file, checksums_to_use)

        multihop = transfer.get('multihop', False)
        strict_copy = transfer.get('strict_copy', False)
        use_ipv4 = transfer.get('use_ipv4', False)

        external_host = transfer['external_host']
        scope = t_file['metadata']['scope']
        activity = t_file['activity']
        if group_by_scope:
            scope_str = scope.internal
        else:
            # Use a catch-all scope which will be removed at the end
            scope_str = _catch_all_scopes_str

        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            grouped_jobs[external_host] = {}
            if scope_str not in grouped_transfers[external_host]:
                grouped_transfers[external_host][scope_str] = {}
                grouped_jobs[external_host][scope_str] = []

        current_transfers_group = grouped_transfers[external_host][scope_str]
        current_jobs_group = grouped_jobs[external_host][scope_str]

        job_params = {'account': transfer['account'],
                      'use_oidc': transfer_core.oidc_supported(transfer),
                      'verify_checksum': verify_checksum,
                      'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
                      'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
                      'job_metadata': {'issuer': 'rucio'},  # eventually job_metadata will look like this; for now it equals file_metadata so that it includes request_id, etc.
                      'overwrite': transfer['overwrite'],
                      'priority': 3}
        if transfer.get('archive_timeout', None):
            if archive_timeout_override is None:
                job_params['archive_timeout'] = transfer['archive_timeout']
            elif archive_timeout_override != 0:
                job_params['archive_timeout'] = archive_timeout_override
            # else don't set the value
        if multihop:
            job_params['multihop'] = True
        if strict_copy:
            job_params['strict_copy'] = True
        if use_ipv4:
            job_params['ipv4'] = True
            job_params['ipv6'] = False

        # Don't put optional & missing keys in the parameters
        if transfer['dest_spacetoken']:
            job_params.update({'spacetoken': transfer['dest_spacetoken']})
        if transfer['src_spacetoken']:
            job_params.update({'source_spacetoken': transfer['src_spacetoken']})

        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            current_jobs_group.append({'files': [t_file], 'job_params': job_params})
        else:
            job_params['job_metadata']['multi_sources'] = False
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (job_params['verify_checksum'], job_params.get('spacetoken', None),
                                                   job_params['copy_pin_lifetime'],
                                                   job_params['bring_online'], job_params['job_metadata'],
                                                   job_params.get('source_spacetoken', None),
                                                   job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            if multihop:
                job_key = 'multihop_%s' % (transfer['initial_request_id'])

            if job_key not in current_transfers_group:
                current_transfers_group[job_key] = {}

            if multihop:
                policy_key = 'multihop_%s' % (transfer['initial_request_id'])
            else:
                if policy == 'rule':
                    policy_key = '%s' % (transfer['rule_id'])
                if policy == 'dest':
                    policy_key = '%s' % (t_file['metadata']['dst_rse'])
                if policy == 'src_dest':
                    policy_key = '%s,%s' % (t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'rule_src_dest':
                    policy_key = '%s,%s,%s' % (transfer['rule_id'], t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'activity_dest':
                    policy_key = '%s %s' % (activity, t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                if policy == 'activity_src_dest':
                    policy_key = '%s %s %s' % (activity, t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                    # maybe here we need to hash the key if it's too long

            if policy_key not in current_transfers_group[job_key]:
                current_transfers_group[job_key][policy_key] = {'files': [], 'job_params': job_params}
            current_transfers_policy = current_transfers_group[job_key][policy_key]
            if multihop:
                # The parent transfer should be the first of the list
                # TODO : Only work for a single hop now, need to be able to handle multiple hops
                if transfer['parent_request']:  # This is the child
                    current_transfers_policy['files'].append(t_file)
                else:
                    current_transfers_policy['files'].insert(0, t_file)
            else:
                current_transfers_policy['files'].append(t_file)

    # for jobs with different job_key, we cannot put in one job.
    for external_host in grouped_transfers:
        for scope_key in grouped_transfers[external_host]:
            for job_key in grouped_transfers[external_host][scope_key]:
                # for all policy groups in job_key, the job_params is the same.
                for policy_key in grouped_transfers[external_host][scope_key][job_key]:
                    job_params = grouped_transfers[external_host][scope_key][job_key][policy_key]['job_params']
                    for xfers_files in chunks(grouped_transfers[external_host][scope_key][job_key][policy_key]['files'], group_bulk):
                        # for the last small piece, just submit it.
                        grouped_jobs[external_host][scope_key].append({'files': xfers_files, 'job_params': job_params})

    if not group_by_scope:
        for external_host in grouped_jobs:
            grouped_jobs[external_host] = grouped_jobs[external_host][_catch_all_scopes_str]

    return grouped_jobs
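
The policy_key values above are plain strings used only as grouping keys; as a small worked illustration of the 'activity_src_dest' branch (site and activity names invented):

activity, src_rse, dst_rse = 'Data Consolidation', 'SITE_A', 'SITE_B'
policy_key = '%s %s %s' % (activity, src_rse, dst_rse)
policy_key = '_'.join(policy_key.split(' '))
print(policy_key)  # -> Data_Consolidation_SITE_A_SITE_B
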
Example No. 18
                             "Thread %i :Rule inserted in %f seconds" % (worker_number, time.time() - stime)
                         )
             except DataIdentifierNotFound as e:
                 logging.warning(e)
         if did["did_type"] == str(DIDType.FILE):
             monitor.record_counter(counters="transmogrifier.did.file.processed", delta=1)
         elif did["did_type"] == str(DIDType.DATASET):
             monitor.record_counter(counters="transmogrifier.did.dataset.processed", delta=1)
         elif did["did_type"] == str(DIDType.CONTAINER):
             monitor.record_counter(counters="transmogrifier.did.container.processed", delta=1)
         monitor.record_counter(counters="transmogrifier.did.processed", delta=1)
         identifiers.append(
             {"scope": did["scope"], "name": did["name"], "did_type": DIDType.from_sym(did["did_type"])}
         )
     time1 = time.time()
     for id in chunks(identifiers, 100):
         _retrial(set_new_dids, id, None)
     logging.info("Thread %i : Time to set the new flag : %f" % (worker_number, time.time() - time1))
     tottime = time.time() - start_time
     logging.info("Thread %i : It took %f seconds to process %i DIDs" % (worker_number, tottime, len(dids)))
     logging.debug("Thread %i : DIDs processed : %s" % (worker_number, str(dids)))
     monitor.record_counter(counters="transmogrifier.job.done", delta=1)
     monitor.record_timer(stat="transmogrifier.job.duration", time=1000 * tottime)
 except:
     exc_type, exc_value, exc_traceback = exc_info()
     logging.critical("".join(format_exception(exc_type, exc_value, exc_traceback)).strip())
     monitor.record_counter(counters="transmogrifier.job.error", delta=1)
     monitor.record_counter(counters="transmogrifier.addnewrule.error", delta=1)
 logging.info(once)
 if once is True:
     break
Example No. 19
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(
        10
    )  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'],
                worker_number=heart_beat['assign_thread'],
                limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' %
                         len(expired_replicas))
            logging.debug(prepend_str +
                          'List of expired replicas returned %s' %
                          str(expired_replicas))
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                })
                bad_replicas.append({
                    'scope':
                    replica[0],
                    'name':
                    replica[1],
                    'rse_id':
                    replica[2],
                    'state':
                    BadFilesStatus.TEMPORARY_UNAVAILABLE
                })
            session = get_session()

            nchunk = 0
            tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
            for chunk in chunks(expired_replicas, chunk_size):
                # Process and update the replicas in chunks
                replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                } for replica in chunk]
                # Remove the replicas from bad_replicas table in chunks
                bad_replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                } for replica in chunk]
                try:
                    nchunk += 1
                    logging.debug(prepend_str +
                                  'Running on %s chunk out of %s' %
                                  (nchunk, tot_chunk))
                    update_replicas_states(replicas,
                                           nowait=True,
                                           session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(
                        prepend_str +
                        'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s'
                        % str(error))
                    for idx in range(len(chunk)):
                        logging.debug(prepend_str + 'Working on %s' %
                                      (str(replicas[idx])))
                        try:
                            get_metadata(replicas[idx]['scope'],
                                         replicas[idx]['name'])
                            update_replicas_states([
                                replicas[idx],
                            ],
                                                   nowait=True,
                                                   session=session)
                            bulk_delete_bad_replicas([
                                bad_replicas[idx],
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                'DID %s:%s does not exist anymore. ' %
                                (bad_replicas[idx]['scope'],
                                 bad_replicas[idx]['name']))
                            bulk_delete_bad_replicas([
                                bad_replicas[idx],
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                '%s:%s on RSEID %s does not exist anymore. ' %
                                (replicas[idx]['scope'], replicas[idx]['name'],
                                 replicas[idx]['rse_id']))
                            bulk_delete_bad_replicas([
                                bad_replicas[idx],
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
                    session = get_session()

        except Exception as error:
            logging.critical(traceback.format_exc())

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
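All of these examples iterate over their work lists with a chunks() helper whose definition is not included in this listing. A minimal sketch, assuming it simply yields fixed-size slices of a sequence:

def chunks(sequence, size):
    """Yield successive slices of at most `size` items (sketch of the helper assumed by these examples)."""
    for index in range(0, len(sequence), size):
        yield sequence[index:index + size]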
Exemplo n.º 20
0
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                 thread, total_threads,
                                                                                 bulk))

    logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                thread, total_threads,
                                                                                bulk))

    while not graceful_stop.is_set():

        try:
            ts = time.time()

            logging.debug('%i:%i - start to poll requests older than %i seconds' % (process, thread, older_than))
            reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=RequestState.SUBMITTED,
                                    limit=bulk,
                                    older_than=datetime.datetime.utcnow()-datetime.timedelta(seconds=older_than),
                                    process=process, total_processes=total_processes,
                                    thread=thread, total_threads=total_threads)
            record_timer('daemons.conveyor.poller.000-get_next', (time.time()-ts)*1000)

            if reqs:
                logging.debug('%i:%i - polling %i requests' % (process, thread, len(reqs)))

            if not reqs:
                if once:
                    break
                logging.debug("no requests found. will sleep 60 seconds")
                time.sleep(60)  # Only sleep if there is nothing to do
                continue

            for xfers in chunks(reqs, bulk):
                try:
                    req_ids = {}
                    for req in xfers:
                        record_counter('daemons.conveyor.poller.query_request')
                        if not req['external_host'] in req_ids:
                            req_ids[req['external_host']] = []
                        req_ids[req['external_host']].append((req['request_id'], req['external_id']))

                    responses = {}
                    for external_host in req_ids:
                        ts = time.time()
                        resps = request.bulk_query_requests(external_host, req_ids[external_host], 'fts3')
                        record_timer('daemons.conveyor.poller.001-bulk_query_requests', (time.time()-ts)*1000/len(req_ids[external_host]))
                        responses.update(resps)

                    for external_host in req_ids:
                        for request_id, external_id in req_ids[external_host]:
                            response = responses[request_id]
                            if isinstance(response, Exception):
                                logging.critical("Failed to poll request(%s) with FTS(%s) job (%s): %s" % (request_id, external_host, external_id, responses[request_id]))
                                record_counter('daemons.conveyor.poller.query_request_exception')
                                response = {'new_state': None, 'request_id': request_id, 'transfer_id': external_id, 'job_state': None}
                            common.update_request_state(response)
                            record_counter('daemons.conveyor.poller.update_request_state')
                            if response['new_state'] == RequestState.LOST:
                                record_counter('daemons.conveyor.poller.request_lost')
                except:
                    logging.critical(traceback.format_exc())

        except:
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.debug('%i:%i - graceful stop requested' % (process, thread))

    logging.debug('%i:%i - graceful stop done' % (process, thread))
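The grouping step in the poller above collects (request_id, external_id) pairs per external host before bulk-querying each transfertool. A sketch of the same bookkeeping with collections.defaultdict, assuming each request carries the 'external_host', 'request_id' and 'external_id' keys used in the loop:

from collections import defaultdict

def group_requests_by_host(reqs):
    """Group (request_id, external_id) pairs by the external host that handles them."""
    req_ids = defaultdict(list)
    for req in reqs:
        req_ids[req['external_host']].append((req['request_id'], req['external_id']))
    return dict(req_ids)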
Exemplo n.º 21
0
def delete_replicas(rse, files, ignore_availability=True, session=None):
    """
    Delete file replicas.

    :param rse: the rse name.
    :param files: the list of files to delete.
    :param ignore_availability: Ignore the RSE blacklisting.
    :param session: The database session in use.
    """
    replica_rse = get_rse(rse=rse, session=session)

    if (not (replica_rse.availability & 1)) and not ignore_availability:
        raise exception.RessourceTemporaryUnavailable('%s is temporary unavailable for deleting' % rse)

    replica_condition, parent_condition, did_condition = list(), list(), list()
    for file in files:
        replica_condition.append(and_(models.RSEFileAssociation.scope == file['scope'], models.RSEFileAssociation.name == file['name']))
        parent_condition.append(and_(models.DataIdentifierAssociation.child_scope == file['scope'], models.DataIdentifierAssociation.child_name == file['name'],
                                     ~exists(select([1]).prefix_with("/*+ INDEX(REPLICAS REPLICAS_PK) */", dialect='oracle')).where(and_(models.RSEFileAssociation.scope == file['scope'], models.RSEFileAssociation.name == file['name']))))
        did_condition.append(and_(models.DataIdentifier.scope == file['scope'], models.DataIdentifier.name == file['name'],
                                  ~exists(select([1]).prefix_with("/*+ INDEX(REPLICAS REPLICAS_PK) */", dialect='oracle')).where(and_(models.RSEFileAssociation.scope == file['scope'], models.RSEFileAssociation.name == file['name']))))

    delta, bytes, rowcount = 0, 0, 0
    for c in chunks(replica_condition, 10):
        for (replica_bytes, ) in session.query(models.RSEFileAssociation.bytes).with_hint(models.RSEFileAssociation, "INDEX(REPLICAS REPLICAS_PK)", 'oracle').filter(models.RSEFileAssociation.rse_id == replica_rse.id).filter(or_(*c)):
            bytes += replica_bytes
            delta += 1

        rowcount += session.query(models.RSEFileAssociation).filter(models.RSEFileAssociation.rse_id == replica_rse.id).filter(or_(*c)).delete(synchronize_session=False)

    if rowcount != len(files):
        raise exception.ReplicaNotFound("One or several replicas don't exist.")

    # Delete did from the content for the last did
    while parent_condition:
        child_did_condition = list()
        tmp_parent_condition = list()
        for c in chunks(parent_condition, 10):

            query = session.query(models.DataIdentifierAssociation.scope, models.DataIdentifierAssociation.name,
                                  models.DataIdentifierAssociation.child_scope, models.DataIdentifierAssociation.child_name).\
                filter(or_(*c))
            for parent_scope, parent_name, child_scope, child_name in query:
                child_did_condition.append(and_(models.DataIdentifierAssociation.scope == parent_scope, models.DataIdentifierAssociation.name == parent_name,
                                                models.DataIdentifierAssociation.child_scope == child_scope, models.DataIdentifierAssociation.child_name == child_name))
                tmp_parent_condition.append(and_(models.DataIdentifierAssociation.child_scope == parent_scope, models.DataIdentifierAssociation.child_name == parent_name,
                                                 ~exists(select([1]).prefix_with("/*+ INDEX(CONTENTS CONTENTS_PK) */", dialect='oracle')).where(and_(models.DataIdentifierAssociation.scope == parent_scope,
                                                                                                                                                     models.DataIdentifierAssociation.name == parent_name))))
                did_condition.append(and_(models.DataIdentifier.scope == parent_scope, models.DataIdentifier.name == parent_name, models.DataIdentifier.is_open == False,
                                          ~exists([1]).where(and_(models.DataIdentifierAssociation.scope == parent_scope, models.DataIdentifierAssociation.name == parent_name))))  # NOQA

        if child_did_condition:
            for c in chunks(child_did_condition, 10):
                rowcount = session.query(models.DataIdentifierAssociation).filter(or_(*c)).delete(synchronize_session=False)
            # update parent counters

        parent_condition = tmp_parent_condition

    for c in chunks(did_condition, 10):
        rowcount = session.query(models.DataIdentifier).with_hint(models.DataIdentifier, "INDEX(DIDS DIDS_PK)", 'oracle').filter(or_(*c)).delete(synchronize_session=False)

    # Decrease RSE counter
    decrease(rse_id=replica_rse.id, files=delta, bytes=bytes, session=session)
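A minimal call sketch for delete_replicas, assuming the usual Rucio pattern where a transactional-session decorator (not shown above) supplies the session when none is passed; the RSE name 'MOCK' and the DIDs are placeholders:

files_to_delete = [{'scope': 'user.jdoe', 'name': 'file_%04i.root' % num} for num in range(3)]
delete_replicas(rse='MOCK', files=files_to_delete, ignore_availability=True)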
Exemplo n.º 22
0
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extracts the scope, name and rse_id, and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    states_mapping = {
        BadPFNStatus.BAD: BadFilesStatus.BAD,
        BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
        BadPFNStatus.TEMPORARY_UNAVAILABLE:
        BadFilesStatus.TEMPORARY_UNAVAILABLE
    }
    logging.info(prepend_str + 'Minos started')

    chunk_size = 500  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])
        try:
            bad_replicas = {}
            temporary_unvailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Class the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unvailables:
                        temporary_unvailables[(account, reason, expires_at)] = []
                    temporary_unvailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                pfns = bad_replicas[(account, reason, state)]
                logging.debug(
                    prepend_str +
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                try:
                    for chunk in chunks(pfns, chunk_size):
                        unknown_replicas = declare_bad_file_replicas(
                            pfns=chunk,
                            reason=reason,
                            issuer=account,
                            status=state,
                            session=session)
                        bulk_delete_bad_pfns(pfns=chunk, session=session)
                        session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unvailables:
                pfns = temporary_unvailables[(account, reason, expires_at)]
                logging.debug(
                    prepend_str +
                    'Declaring %s replicas temporarily unavailable with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                _, dict_rse, unknown_replicas = get_pfn_to_rse(pfns)
                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(
                        prepend_str +
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse in dict_rse:
                    replicas = []
                    rse_id = get_rse_id(rse=rse, session=None)
                    logging.debug(prepend_str + 'Running on RSE %s' % rse)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse],
                                                 rse=rse,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    session = get_session()
                    try:
                        for chunk in chunks(replicas, chunk_size):
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=None,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                    except Exception:
                        session.rollback()  # pylint: disable=no-member
                        logging.critical(traceback.format_exc())

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
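The classification loop in minos keys bad PFNs by (account, reason, state) or (account, reason, expires_at). The same bucketing can be written more compactly with dict.setdefault; a sketch, assuming the pfn dictionaries and the BadFilesStatus enumeration used above:

def classify_pfns(pfns, states_mapping):
    """Split bad PFNs into BAD/SUSPICIOUS buckets and TEMPORARY_UNAVAILABLE buckets."""
    bad, temporary = {}, {}
    for pfn in pfns:
        status = states_mapping[pfn['state']]
        if status in (BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS):
            bad.setdefault((pfn['account'], pfn['reason'], pfn['state']), []).append(pfn['pfn'])
        elif status == BadFilesStatus.TEMPORARY_UNAVAILABLE:
            temporary.setdefault((pfn['account'], pfn['reason'], pfn['expires_at']), []).append(pfn['pfn'])
    return bad, temporary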
Exemplo n.º 23
0
def list_replicas(dids, schemes=None, unavailable=False, request_id=None, ignore_availability=True, all_states=False, session=None):
    """
    List file replicas for a list of data identifiers (DIDs).

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param ignore_availability: Ignore the RSE blacklisting.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param session: The database session in use.
    """
    # Get the list of files
    rseinfo = {}
    replicas = {}
    replica_conditions, did_conditions = [], []
    # remove duplicate did from the list
    for did in [dict(tupleized) for tupleized in set(tuple(item.items()) for item in dids)]:
        if 'type' in did and did['type'] in (DIDType.FILE, DIDType.FILE.value) or 'did_type' in did and did['did_type'] in (DIDType.FILE, DIDType.FILE.value):
            if all_states:
                condition = and_(models.RSEFileAssociation.scope == did['scope'],
                                 models.RSEFileAssociation.name == did['name'])
            else:
                if not unavailable:
                    condition = and_(models.RSEFileAssociation.scope == did['scope'],
                                     models.RSEFileAssociation.name == did['name'],
                                     models.RSEFileAssociation.state == ReplicaState.AVAILABLE)
                else:
                    condition = and_(models.RSEFileAssociation.scope == did['scope'],
                                     models.RSEFileAssociation.name == did['name'],
                                     or_(models.RSEFileAssociation.state == ReplicaState.AVAILABLE,
                                         models.RSEFileAssociation.state == ReplicaState.UNAVAILABLE,
                                         models.RSEFileAssociation.state == ReplicaState.COPYING))
            replicas['%s:%s' % (did['scope'], did['name'])] = {'scope': did['scope'], 'name': did['name'], 'rses': {}}
            replica_conditions.append(condition)
        else:
            did_conditions.append(and_(models.DataIdentifier.scope == did['scope'],
                                       models.DataIdentifier.name == did['name']))

    if did_conditions:
        # Get files
        for scope, name, did_type in session.query(models.DataIdentifier.scope,
                                                   models.DataIdentifier.name,
                                                   models.DataIdentifier.did_type).filter(or_(*did_conditions)):
            if did_type == DIDType.FILE:
                replicas['%s:%s' % (scope, name)] = {'scope': scope, 'name': name, 'rses': {}}
                if all_states:
                    condition = and_(models.RSEFileAssociation.scope == scope,
                                     models.RSEFileAssociation.name == name)
                else:
                    if not unavailable:
                        condition = and_(models.RSEFileAssociation.scope == scope,
                                         models.RSEFileAssociation.name == name,
                                         models.RSEFileAssociation.state == ReplicaState.AVAILABLE)
                    else:
                        condition = and_(models.RSEFileAssociation.scope == scope,
                                         models.RSEFileAssociation.name == name,
                                         or_(models.RSEFileAssociation.state == ReplicaState.AVAILABLE,
                                             models.RSEFileAssociation.state == ReplicaState.UNAVAILABLE,
                                             models.RSEFileAssociation.state == ReplicaState.COPYING))
                replica_conditions.append(condition)
            else:
                # for dataset/container
                content_query = session.query(models.DataIdentifierAssociation.child_scope,
                                              models.DataIdentifierAssociation.child_name,
                                              models.DataIdentifierAssociation.child_type)
                content_query = content_query.with_hint(models.DataIdentifierAssociation, "INDEX(CONTENTS CONTENTS_PK)", 'oracle')
                child_dids = [(scope, name)]
                while child_dids:
                    s, n = child_dids.pop()
                    for tmp_did in content_query.filter_by(scope=s, name=n):
                        if tmp_did.child_type == DIDType.FILE:
                            replicas['%s:%s' % (tmp_did.child_scope, tmp_did.child_name)] = {'scope': tmp_did.child_scope,
                                                                                             'name': tmp_did.child_name,
                                                                                             'rses': {}}
                            if all_states:
                                condition = and_(models.RSEFileAssociation.scope == tmp_did.child_scope,
                                                 models.RSEFileAssociation.name == tmp_did.child_name)
                            else:
                                if not unavailable:
                                    condition = and_(models.RSEFileAssociation.scope == tmp_did.child_scope,
                                                     models.RSEFileAssociation.name == tmp_did.child_name,
                                                     models.RSEFileAssociation.state == ReplicaState.AVAILABLE)
                                else:
                                    condition = and_(models.RSEFileAssociation.scope == tmp_did.child_scope,
                                                     models.RSEFileAssociation.name == tmp_did.child_name,
                                                     or_(models.RSEFileAssociation.state == ReplicaState.AVAILABLE,
                                                         models.RSEFileAssociation.state == ReplicaState.UNAVAILABLE,
                                                         models.RSEFileAssociation.state == ReplicaState.COPYING))
                            replica_conditions.append(condition)
                        else:
                            child_dids.append((tmp_did.child_scope, tmp_did.child_name))

    # Get the list of replicas
    is_false = False
    tmp_protocols = {}
    key = None
    for replica_condition in chunks(replica_conditions, 50):

        replica_query = select(columns=(models.RSEFileAssociation.scope,
                                        models.RSEFileAssociation.name,
                                        models.RSEFileAssociation.bytes,
                                        models.RSEFileAssociation.md5,
                                        models.RSEFileAssociation.adler32,
                                        models.RSEFileAssociation.path,
                                        models.RSEFileAssociation.state,
                                        models.RSE.rse),
                               whereclause=and_(models.RSEFileAssociation.rse_id == models.RSE.id,
                                                models.RSE.deleted == is_false,
                                                or_(*replica_condition)),
                               order_by=(models.RSEFileAssociation.scope,
                                         models.RSEFileAssociation.name)).\
            with_hint(models.RSEFileAssociation.scope, text="INDEX(REPLICAS REPLICAS_PK)", dialect_name='oracle').\
            compile()
        # models.RSE.availability.op(avail_op)(0x100) != 0
        for scope, name, bytes, md5, adler32, path, state, rse in session.execute(replica_query.statement, replica_query.params).fetchall():
            if rse not in rseinfo:
                rseinfo[rse] = rsemgr.get_rse_info(rse, session=session)
            if not rseinfo[rse]['staging_area']:
                if not key:
                    key = '%s:%s' % (scope, name)
                elif key != '%s:%s' % (scope, name):
                    yield replicas[key]
                    del replicas[key]
                    key = '%s:%s' % (scope, name)

                if 'bytes' not in replicas[key]:
                    replicas[key]['bytes'] = bytes
                    replicas[key]['md5'] = md5
                    replicas[key]['adler32'] = adler32

                if rse not in replicas[key]['rses']:
                    replicas[key]['rses'][rse] = []

                if all_states:
                    if 'states' not in replicas[key]:
                        replicas[key]['states'] = {}
                    replicas[key]['states'][rse] = state
                # get protocols
                if rse not in tmp_protocols:
                    protocols = list()
                    if not schemes:
                        try:
                            protocols.append(rsemgr.create_protocol(rseinfo[rse], 'read'))
                        except exception.RSEProtocolNotSupported:
                            pass  # no need to be verbose
                        except:
                            print(format_exc())
                    else:
                        for s in schemes:
                            try:
                                protocols.append(rsemgr.create_protocol(rse_settings=rseinfo[rse], operation='read', scheme=s))
                            except exception.RSEProtocolNotSupported:
                                pass  # no need to be verbose
                            except:
                                print(format_exc())
                    tmp_protocols[rse] = protocols

                # get pfns
                pfns_cache = dict()
                for protocol in tmp_protocols[rse]:
                    if 'determinism_type' in protocol.attributes:  # PFN is cachable
                        try:
                            path = pfns_cache['%s:%s:%s' % (protocol.attributes['determinism_type'], scope, name)]
                        except KeyError:  # No cache entry scope:name found for this protocol
                            path = protocol._get_path(scope, name)
                            pfns_cache['%s:%s:%s' % (protocol.attributes['determinism_type'], scope, name)] = path
                    if not schemes or protocol.attributes['scheme'] in schemes:
                        try:
                            replicas[key]['rses'][rse].append(list(protocol.lfns2pfns(lfns={'scope': scope, 'name': name, 'path': path}).values())[0])
                        except:
                            # temporary protection
                            print(format_exc())
                        if protocol.attributes['scheme'] == 'srm':
                            try:
                                replicas[key]['space_token'] = protocol.attributes['extended_attributes']['space_token']
                            except KeyError:
                                replicas[key]['space_token'] = None
    if key:
        yield replicas[key]

    # Files with no replicas
    for replica in replicas:
        if not replicas[replica]['rses']:
            yield replicas[replica]
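list_replicas is a generator that yields one dictionary per file once all replicas of that file have been seen. A usage sketch, assuming an open database session and placeholder DIDs:

dids = [{'scope': 'user.jdoe', 'name': 'dataset_A'}]  # placeholder DIDs
for replica in list_replicas(dids=dids, schemes=['srm'], session=session):
    print('%s:%s available on %s' % (replica['scope'], replica['name'], list(replica['rses'].keys())))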
Exemplo n.º 24
0
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 500  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'] - 1,
                worker_number=heart_beat['assign_thread'],
                limit=1000)

            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                })
                bad_replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                })
            session = get_session()

            for chunk in chunks(replicas, chunk_size):
                # Process and update the replicas in chunks
                update_replicas_states(chunk, nowait=False, session=session)
            for chunk in chunks(bad_replicas, chunk_size):
                # Remove the replicas from bad_replicas table in chunks
                bulk_delete_bad_replicas(chunk, session=session)

            session.commit()  # pylint: disable=no-member

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
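All of these daemons end each cycle with the same pacing logic: measure the cycle duration and sleep only for the unused remainder of sleep_time. A sketch of that pattern in isolation, where daemon_cycle stands in for the body of the while loop and graceful_stop is any object with an is_set() method, such as a threading.Event:

import time

def run_paced(daemon_cycle, graceful_stop, sleep_time=60, once=False):
    """Call daemon_cycle repeatedly, sleeping only for the unused remainder of each cycle."""
    while not graceful_stop.is_set():
        start_time = time.time()
        daemon_cycle()
        if once:
            break
        elapsed = time.time() - start_time
        if elapsed < sleep_time:
            time.sleep(sleep_time - elapsed)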
Exemplo n.º 25
0
def bulk_group_transfer(transfers,
                        policy='rule',
                        group_bulk=200,
                        source_strategy=None,
                        max_time_in_queue=None):
    """
    Group transfers in bulk based on certain criteria.

    :param transfers:             List of transfers to group.
    :param policy:                Policy to use to group.
    :param group_bulk:            Bulk size.
    :param source_strategy:       Strategy to group sources.
    :param max_time_in_queue:     Maximum time in queue.
    :return:                      List of grouped transfers.
    """

    grouped_transfers = {}
    grouped_jobs = {}

    try:
        default_source_strategy = get(section='conveyor',
                                      option='default-source-strategy')
    except ConfigNotFound:
        default_source_strategy = 'orderly'

    try:
        activity_source_strategy = get(section='conveyor',
                                       option='activity-source-strategy')
        activity_source_strategy = loads(activity_source_strategy)
    except ConfigNotFound:
        activity_source_strategy = {}
    except ValueError:
        logging.warning('activity_source_strategy not properly defined')
        activity_source_strategy = {}

    for request_id in transfers:
        transfer = transfers[request_id]

        verify_checksum = transfer['file_metadata'].get(
            'verify_checksum', 'both')
        t_file = {
            'sources': transfer['sources'],
            'destinations': transfer['dest_urls'],
            'metadata': transfer['file_metadata'],
            'filesize': int(transfer['file_metadata']['filesize']),
            'checksum': None,
            'verify_checksum': verify_checksum,
            'selection_strategy': source_strategy if source_strategy else activity_source_strategy.get(str(transfer['file_metadata']['activity']), default_source_strategy),
            'request_type': transfer['file_metadata'].get('request_type', None),
            'activity': str(transfer['file_metadata']['activity'])
        }
        if verify_checksum != 'none':
            if 'md5' in t_file['metadata'] and t_file['metadata']['md5']:
                t_file['checksum'] = 'MD5:%s' % str(t_file['metadata']['md5'])
            if 'adler32' in t_file['metadata'] and t_file['metadata']['adler32']:
                t_file['checksum'] = 'ADLER32:%s' % str(t_file['metadata']['adler32'])

        external_host = transfer['external_host']
        scope = t_file['metadata']['scope']
        activity = t_file['activity']

        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host] = []
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host] = {}
                if scope not in grouped_transfers[external_host]:
                    grouped_transfers[external_host][scope] = {}
                    grouped_jobs[external_host][scope] = []

        job_params = {
            'verify_checksum': verify_checksum,
            'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
            'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
            # Eventually job_metadata will only carry this; for now it mirrors file_metadata so it also includes request_id etc.
            'job_metadata': {'issuer': 'rucio'},
            'overwrite': transfer['overwrite'],
            'priority': 3,
            's3alternate': True
        }

        # Don't put optional & missing keys in the parameters
        if transfer['dest_spacetoken']:
            job_params.update({'spacetoken': transfer['dest_spacetoken']})
        if transfer['src_spacetoken']:
            job_params.update(
                {'source_spacetoken': transfer['src_spacetoken']})

        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[
                    transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host].append({
                    'files': [t_file],
                    'job_params': job_params
                })
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host][scope].append({
                    'files': [t_file],
                    'job_params': job_params
                })
        else:
            job_params['job_metadata']['multi_sources'] = False
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (
                job_params['verify_checksum'], job_params.get('spacetoken', None),
                job_params['copy_pin_lifetime'], job_params['bring_online'],
                job_params['job_metadata'], job_params.get('source_spacetoken', None),
                job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            if job_key not in grouped_transfers[external_host]:
                if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                    grouped_transfers[external_host][job_key] = {}
                elif activity in USER_ACTIVITY:
                    grouped_transfers[external_host][scope][job_key] = {}

            if policy == 'rule':
                policy_key = '%s' % (transfer['rule_id'])
            if policy == 'dest':
                policy_key = '%s' % (t_file['metadata']['dst_rse'])
            if policy == 'src_dest':
                policy_key = '%s,%s' % (t_file['metadata']['src_rse'],
                                        t_file['metadata']['dst_rse'])
            if policy == 'rule_src_dest':
                policy_key = '%s,%s,%s' % (transfer['rule_id'],
                                           t_file['metadata']['src_rse'],
                                           t_file['metadata']['dst_rse'])
            if policy == 'activity_dest':
                policy_key = '%s %s' % (activity,
                                        t_file['metadata']['dst_rse'])
                policy_key = "_".join(policy_key.split(' '))
            if policy == 'activity_src_dest':
                policy_key = '%s %s %s' % (activity,
                                           t_file['metadata']['src_rse'],
                                           t_file['metadata']['dst_rse'])
                policy_key = "_".join(policy_key.split(' '))
            # maybe here we need to hash the key if it's too long

            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][job_key]:
                    grouped_transfers[external_host][job_key][policy_key] = {
                        'files': [t_file],
                        'job_params': job_params
                    }
                else:
                    grouped_transfers[external_host][job_key][policy_key]['files'].append(t_file)
            elif activity in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][scope][job_key]:
                    grouped_transfers[external_host][scope][job_key][policy_key] = {
                        'files': [t_file],
                        'job_params': job_params
                    }
                else:
                    grouped_transfers[external_host][scope][job_key][policy_key]['files'].append(t_file)

    # for jobs with different job_key, we cannot put in one job.
    for external_host in grouped_transfers:
        if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
            for job_key in grouped_transfers[external_host]:
                # for all policy groups in job_key, the job_params is the same.
                for policy_key in grouped_transfers[external_host][job_key]:
                    job_params = grouped_transfers[external_host][job_key][policy_key]['job_params']
                    for xfers_files in chunks(grouped_transfers[external_host][job_key][policy_key]['files'], group_bulk):
                        # for the last small piece, just submit it.
                        grouped_jobs[external_host].append({'files': xfers_files, 'job_params': job_params})
        elif activity in USER_ACTIVITY:
            for scope_key in grouped_transfers[external_host]:
                for job_key in grouped_transfers[external_host][scope_key]:
                    # for all policy groups in job_key, the job_params is the same.
                    for policy_key in grouped_transfers[external_host][scope_key][job_key]:
                        job_params = grouped_transfers[external_host][scope_key][job_key][policy_key]['job_params']
                        for xfers_files in chunks(grouped_transfers[external_host][scope_key][job_key][policy_key]['files'], group_bulk):
                            # for the last small piece, just submit it.
                            grouped_jobs[external_host][scope_key].append({'files': xfers_files, 'job_params': job_params})

    return grouped_jobs
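bulk_group_transfer returns a dictionary keyed by external host (and additionally by scope for user activities), where each job is a {'files': [...], 'job_params': {...}} entry ready for submission. A consumption sketch for the plain layout; submit_to_fts is a hypothetical stand-in for whatever submission call is used downstream:

grouped_jobs = bulk_group_transfer(transfers, policy='rule', group_bulk=200)
for external_host, jobs in grouped_jobs.items():
    for job in jobs:
        submit_to_fts(external_host, files=job['files'], job_params=job['job_params'])  # hypothetical submitter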
Exemplo n.º 26
0
def reaper(rses,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60):
    """
    Main loop to select and delete files.

    :param rses:           List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param chunk_size:     The size of chunk for deletion.
    :param once:           If True, only runs one iteration of the main loop.
    :param greedy:         If True, delete right away replicas with tombstone.
    :param scheme:         Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:  The delay to query replicas in BEING_DELETED state.
    :param sleep_time:     Time between two cycles.
    """

    try:
        max_deletion_thread = get('reaper', 'nb_workers_by_hostname')
    except ConfigNotFound as error:
        max_deletion_thread = 5
    hostname = socket.getfqdn()
    executable = sys.argv[0]
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info('%s Reaper starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info('%s Reaper started', prepend_str)

    while not GRACEFUL_STOP.is_set():

        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] +
                                                 1, heart_beat['nr_threads'])
            tot_needed_free_space = 0
            for rse in rses:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logging.debug('%s RSE %s is blacklisted for delete',
                                  prepend_str, rse['rse'])
                    continue
                max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(
                    rse['rse'], rse['id'], prepend_str)
                # Check if greedy mode
                if greedy:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        1000000000000, max_being_deleted_files
                    ]
                    tot_needed_free_space += 1000000000000
                else:
                    if needed_free_space:
                        dict_rses[(rse['rse'], rse['id'])] = [
                            needed_free_space, max_being_deleted_files
                        ]
                        tot_needed_free_space += needed_free_space
                    else:
                        logging.debug('%s Nothing to delete on %s',
                                      prepend_str, rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=itemgetter(1), reverse=True))
            logging.debug(
                '%s List of RSEs to process ordered by needed space desc : %s',
                prepend_str, str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has the RSE as key and (hostname, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logging.debug('%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversely with the number of workers
                # The ceil increases the weight of RSEs with a small amount of files to delete
                max_workers = ceil(dict_rses[rse_key][0] /
                                   tot_needed_free_space * 1000 /
                                   heart_beat['nr_threads'])
                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            skip_until_next_run = []
            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                if rse_id in skip_until_next_run:
                    continue
                logging.debug(
                    '%s Working on %s. Percentage of the total space needed %.2f',
                    prepend_str, rse_name,
                    needed_free_space / tot_needed_free_space * 100)
                rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logging.debug('%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]

                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logging.debug(
                        '%s Too many deletion threads for %s on RSE %s. Back off',
                        prepend_str, rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue

                logging.info(
                    '%s Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    prepend_str, rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logging.debug('%s Total deletion workers for %s : %i',
                              prepend_str, rse_hostname,
                              tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            session=None)
                    logging.debug(
                        '%s list_and_mark_unlocked_replicas  on %s for %s bytes in %s seconds: %s replicas',
                        prepend_str, rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logging.info(
                            '%s Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            prepend_str, rse_name, chunk_size, len(replicas))
                        skip_until_next_run.append(rse_id)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logging.error('%s %s', prepend_str, str(error))
                    continue
                except Exception:
                    logging.critical('%s %s', prepend_str,
                                     str(traceback.format_exc()))

                # Physical deletion takes place here
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(list(rsemgr.lfns2pfns(
                                    rse_settings=rse_info,
                                    lfns=[{'scope': replica['scope'],
                                           'name': replica['name'],
                                           'path': replica['path']}],
                                    operation='delete',
                                    scheme=scheme).values())[0])
                                time.sleep(random.uniform(0, 0.01))
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logging.warning(
                                    '%s Failed to get PFN for unavailable replica %s:%s on %s with error %s',
                                    prepend_str, replica['scope'],
                                    replica['name'], rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logging.critical('%s %s', prepend_str,
                                                 str(traceback.format_exc()))

                        deleted_files = delete_from_storage(
                            file_replicas, prot, rse_info, staging_areas,
                            prepend_str)
                        logging.info('%s %i files processed in %s seconds',
                                     prepend_str, len(file_replicas),
                                     time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logging.debug(
                            '%s delete_replicas succeeded on %s: %s replicas in %s seconds',
                            prepend_str, rse_name, len(deleted_files),
                            time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done',
                                               delta=len(deleted_files))

                except Exception:
                    logging.critical('%s %s', prepend_str,
                                     str(traceback.format_exc()))

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logging.info('%s Will sleep for %s seconds', prepend_str,
                             sleep_time - tottime)
                time.sleep(sleep_time - tottime)

        except DatabaseException as error:
            logging.warning('%s Reaper: %s', prepend_str, str(error))
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
    return
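The deletion loop above always follows the same pattern: take the replicas marked BEING_DELETED in chunks, resolve a PFN for each of them, remove the files from storage, and only then drop the catalogue entries for the files that were actually deleted. The standalone sketch below illustrates that control flow under simplifying assumptions; the helper names passed in (resolve_pfn, remove_from_storage, deregister) are invented stand-ins for the Rucio calls used above, not real APIs.

import logging


def _chunks(items, size):
    """Yield successive slices of at most `size` elements."""
    for index in range(0, len(items), size):
        yield items[index:index + size]


def purge_replicas(replicas, resolve_pfn, remove_from_storage, deregister, chunk_size=100):
    """Delete replicas in chunks: resolve each PFN, wipe the file from storage,
    then deregister only the files that were actually removed."""
    total_deleted = 0
    for chunk in _chunks(replicas, chunk_size):
        for replica in chunk:
            try:
                replica['pfn'] = resolve_pfn(replica)
            except Exception as error:
                # Mirror the daemon: a replica whose PFN cannot be resolved stays in the
                # chunk with pfn=None and is simply skipped at deletion time.
                logging.warning('Cannot resolve PFN for %s: %s', replica['name'], error)
                replica['pfn'] = None
        deleted = [rep for rep in chunk if rep['pfn'] and remove_from_storage(rep['pfn'])]
        deregister(deleted)  # only physically removed files leave the catalogue
        total_deleted += len(deleted)
    return total_deleted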
Example No. 27
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extracts the scope, name and rse_id, and fills the bad_replicas table.
    (A standalone sketch of the PFN classification step follows this example.)

    :param bulk: The maximum number of bad PFNs fetched and processed per cycle.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    states_mapping = {
        BadPFNStatus.BAD: BadFilesStatus.BAD,
        BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
        BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE
    }
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        pfns = []
        try:
            bad_replicas = {}
            temporary_unavailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason, expires_at)] = []
                    temporary_unavailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        # Collect unknown PFNs once per scheme, even when no RSE could be resolved for them
                        unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(
                            prepend_str +
                            'The following replicas are unknown and will be removed : %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str +
                                      'Running on RSE %s%s with %s replicas' %
                                      (get_rse_name(rse_id=rse_id), vo_str,
                                       len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(math.ceil(len(dict_rse[rse_id]) / float(chunk_size)))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str +
                                              'Unknown replicas : %s' %
                                              (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unavailables:
                vo = account.vo
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas temporarily unavailable with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                    # Collect unknown PFNs once per scheme, even when no RSE could be resolved for them
                    unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(
                        prepend_str +
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logging.debug(prepend_str +
                                  'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 vo=vo,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic:
                    # - update the replicas states to TEMPORARY_UNAVAILABLE
                    # - insert a row in the bad_replicas table (TODO: update the row if it already exists)
                    # - delete the corresponding rows from the bad_pfns table
                    logging.debug(prepend_str +
                                  'Running on %s replicas on RSE %s' %
                                  (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=None,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation,
                                ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(
                                prepend_str +
                                'Failed to bulk-update PFNs. PFNs will be updated individually. Error: %s'
                                % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' %
                                              (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(
                                            prepend_str +
                                            '%s is in unavailable state. Will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    elif expires_at < datetime.now():
                                        logging.info(
                                            '%s PFN %s expiration time (%s) has already passed and the replica is not in an unavailable state. Removing the PFN from bad_pfns',
                                            prepend_str, str(rep['pfn']),
                                            expires_at)
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                except (DataIdentifierNotFound,
                                        ReplicaNotFound) as error:
                                    logging.error(
                                        prepend_str +
                                        'Will remove %s from the list of bad PFNs'
                                        % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logging.info(
                prepend_str +
                'Processed the maximum number of PFNs allowed by the bulk size. Restarting the next cycle immediately')
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
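Before any declaration happens, the minos worker above buckets the incoming bad PFNs by (account, reason, state) for BAD/SUSPICIOUS entries and by (account, reason, expires_at) for the temporarily unavailable ones, so that each bucket can be committed in a single pass. The sketch below reproduces just that classification step under simplified assumptions; PFNState and the record layout are stand-ins, not the Rucio BadPFNStatus/BadFilesStatus classes used above.

from collections import defaultdict
from enum import Enum


class PFNState(Enum):
    # Simplified stand-in for the bad-PFN states handled above
    BAD = 'BAD'
    SUSPICIOUS = 'SUSPICIOUS'
    TEMPORARY_UNAVAILABLE = 'TEMPORARY_UNAVAILABLE'


def classify_bad_pfns(records):
    """Split raw bad-PFN records into 'declare bad' and 'temporarily unavailable' buckets."""
    bad = defaultdict(list)
    temporary_unavailable = defaultdict(list)
    for rec in records:
        if rec['state'] in (PFNState.BAD, PFNState.SUSPICIOUS):
            bad[(rec['account'], rec['reason'], rec['state'])].append(rec['pfn'])
        elif rec['state'] is PFNState.TEMPORARY_UNAVAILABLE:
            temporary_unavailable[(rec['account'], rec['reason'], rec['expires_at'])].append(rec['pfn'])
    return bad, temporary_unavailable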
Example No. 28
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash,
    identifies the subscriptions matching the DIDs and
    submits a replication rule for each DID matching a subscription.
    (A standalone sketch of the subscription priority ordering follows this example.)

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'transmogrifier'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)

        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])

        try:
            #  Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'],
                                     total_threads=heart_beat['nr_threads'],
                                     chunk_size=bulk,
                                     did_type=None):
                dids.append({
                    'scope': did['scope'],
                    'did_type': str(did['did_type']),
                    'name': did['name']
                })

            sub_dict = {3: []}
            #  Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
            #  The priority is defined as 'policyid'
            for sub in list_subscriptions(None, None):
                if sub['state'] != SubscriptionState.INACTIVE and sub['lifetime'] and (datetime.now() > sub['lifetime']):
                    update_subscription(
                        name=sub['name'],
                        account=sub['account'],
                        metadata={'state': SubscriptionState.INACTIVE})

                elif sub['state'] in [
                        SubscriptionState.ACTIVE, SubscriptionState.UPDATED
                ]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            priorities = list(sub_dict.keys())
            priorities.sort()
            #  Order the subscriptions according to their priority
            for priority in priorities:
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logging.warning(prepend_str + 'No subscriptions defined: %s' %
                            (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            logging.error(
                prepend_str +
                'Failed to get list of new DIDs or subscriptions: %s' %
                (str(error)))

        try:
            results = {}
            start_time = time.time()
            blacklisted_rse_id = [
                rse['id'] for rse in list_rses({'availability_write': False})
            ]
            logging.debug(prepend_str + 'In transmogrifier worker')
            identifiers = []
            #  Loop over all the new dids
            for did in dids:
                did_success = True
                if did['did_type'] in (str(DIDType.DATASET), str(DIDType.CONTAINER)):
                    did_tag = '%s:%s' % (did['scope'].internal, did['name'])
                    results[did_tag] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            #  Check if the DID match the subscription
                            if is_matching_subscription(
                                    subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get(
                                    'split_rule', False)
                                stime = time.time()
                                results[did_tag].append(subscription['id'])
                                logging.info(prepend_str +
                                             '%s:%s matches subscription %s' %
                                             (did['scope'], did['name'],
                                              subscription['name']))
                                rules = loads(
                                    subscription['replication_rules'])
                                created_rules = {}
                                cnt = 0
                                for rule_dict in rules:
                                    cnt += 1
                                    created_rules[cnt] = []
                                    # Get all the rule and subscription parameters
                                    grouping = rule_dict.get(
                                        'grouping', 'DATASET')
                                    lifetime = rule_dict.get('lifetime', None)
                                    ignore_availability = rule_dict.get(
                                        'ignore_availability', None)
                                    weight = rule_dict.get('weight', None)
                                    source_replica_expression = rule_dict.get(
                                        'source_replica_expression', None)
                                    locked = rule_dict.get('locked', None)
                                    if locked == 'True':
                                        locked = True
                                    else:
                                        locked = False
                                    purge_replicas = rule_dict.get(
                                        'purge_replicas', False)
                                    if purge_replicas == 'True':
                                        purge_replicas = True
                                    else:
                                        purge_replicas = False
                                    rse_expression = str(
                                        rule_dict['rse_expression'])
                                    comment = str(subscription['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_dict['copies'])
                                    activity = rule_dict.get(
                                        'activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity',
                                                        obj=activity)
                                    except InputValidationError as error:
                                        logging.error(
                                            prepend_str +
                                            'Error validating the activity %s'
                                            % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)

                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False

                                    selected_rses = []
                                    chained_idx = rule_dict.get(
                                        'chained_idx', None)
                                    if chained_idx:
                                        params = {}
                                        if rule_dict.get('associated_site_idx', None):
                                            params['associated_site_idx'] = rule_dict.get('associated_site_idx', None)
                                        logging.debug(
                                            '%s Chained subscription identified. Will use %s',
                                            prepend_str,
                                            str(created_rules[chained_idx]))
                                        algorithm = rule_dict.get(
                                            'algorithm', None)
                                        selected_rses = select_algorithm(
                                            algorithm,
                                            created_rules[chained_idx], params)
                                    else:
                                        # Non-chained subscription: pick the target RSEs with the RSESelector
                                        # (the chained case above reuses the RSEs returned by its algorithm)
                                        if split_rule:
                                            vo = account.vo
                                            rses = parse_expression(
                                                rse_expression,
                                                filter={'vo': vo})
                                            list_of_rses = [
                                                rse['id'] for rse in rses
                                            ]
                                            # Check that some rule doesn't already exist for this DID and subscription
                                            preferred_rse_ids = []
                                            for rule in list_rules(
                                                    filters={
                                                        'subscription_id':
                                                        subscription_id,
                                                        'scope': did['scope'],
                                                        'name': did['name']
                                                    }):
                                                already_existing_rses = [
                                                    (rse['rse'], rse['id']) for
                                                    rse in parse_expression(
                                                        rule['rse_expression'],
                                                        filter={'vo': vo})
                                                ]
                                                for rse, rse_id in already_existing_rses:
                                                    if rse_id in list_of_rses and rse_id not in preferred_rse_ids:
                                                        preferred_rse_ids.append(rse_id)
                                            if len(preferred_rse_ids) >= copies:
                                                skip_rule_creation = True
                                            rse_id_dict = {rse['id']: rse['rse'] for rse in rses}
                                            try:
                                                rseselector = RSESelector(
                                                    account=account,
                                                    rses=rses,
                                                    weight=weight,
                                                    copies=copies -
                                                    len(preferred_rse_ids))
                                                selected_rses = [
                                                    rse_id_dict[rse_id]
                                                    for rse_id, _, _ in
                                                    rseselector.select_rse(
                                                        0,
                                                        preferred_rse_ids=
                                                        preferred_rse_ids,
                                                        copies=copies,
                                                        blacklist=
                                                        blacklisted_rse_id)
                                                ]
                                            except (InsufficientTargetRSEs,
                                                    InsufficientAccountLimit,
                                                    InvalidRuleWeight,
                                                    RSEOverQuota) as error:
                                                logging.warning(
                                                    prepend_str +
                                                    'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites'
                                                    % (subscription['name'],
                                                       account, str(error)))
                                                # Now including the blacklisted sites
                                                try:
                                                    rseselector = RSESelector(
                                                        account=account,
                                                        rses=rses,
                                                        weight=weight,
                                                        copies=copies -
                                                        len(preferred_rse_ids))
                                                    selected_rses = [
                                                        rse_id_dict[rse_id]
                                                        for rse_id, _, _ in
                                                        rseselector.select_rse(
                                                            0,
                                                            preferred_rse_ids=
                                                            preferred_rse_ids,
                                                            copies=copies,
                                                            blacklist=[])
                                                    ]
                                                    ignore_availability = True
                                                except (InsufficientTargetRSEs,
                                                        InsufficientAccountLimit,
                                                        InvalidRuleWeight,
                                                        RSEOverQuota) as error:
                                                    logging.error(
                                                        prepend_str +
                                                        'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.'
                                                        %
                                                        (subscription['name'],
                                                         account, str(error)))
                                                    monitor.record_counter(
                                                        counters='transmogrifier.addnewrule.errortype.%s' % str(error.__class__.__name__),
                                                        delta=1)
                                                    # The DID won't be reevaluated at the next cycle
                                                    did_success = did_success and True
                                                    continue

                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        #  Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    for rse in selected_rses:
                                                        if isinstance(selected_rses, dict):
                                                            source_replica_expression = selected_rses[rse].get('source_replica_expression', None)
                                                            weight = selected_rses[rse].get('weight', None)
                                                        logging.info(
                                                            prepend_str +
                                                            'Will insert one rule for %s:%s on %s'
                                                            %
                                                            (did['scope'],
                                                             did['name'], rse))
                                                        rule_ids = add_rule(
                                                            dids=[{'scope': did['scope'], 'name': did['name']}],
                                                            account=account,
                                                            copies=1,
                                                            rse_expression=rse,
                                                            grouping=grouping,
                                                            weight=weight,
                                                            lifetime=lifetime,
                                                            locked=locked,
                                                            subscription_id=subscription_id,
                                                            source_replica_expression=source_replica_expression,
                                                            activity=activity,
                                                            purge_replicas=purge_replicas,
                                                            ignore_availability=ignore_availability,
                                                            comment=comment)
                                                        created_rules[cnt].append(rule_ids[0])
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                rule_ids = add_rule(
                                                    dids=[{'scope': did['scope'], 'name': did['name']}],
                                                    account=account,
                                                    copies=copies,
                                                    rse_expression=rse_expression,
                                                    grouping=grouping,
                                                    weight=weight,
                                                    lifetime=lifetime,
                                                    locked=locked,
                                                    subscription_id=subscription['id'],
                                                    source_replica_expression=source_replica_expression,
                                                    activity=activity,
                                                    purge_replicas=purge_replicas,
                                                    ignore_availability=ignore_availability,
                                                    comment=comment)
                                                created_rules[cnt].append(rule_ids[0])
                                                nb_rule += 1
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.done',
                                                delta=nb_rule)
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.activity.%s'
                                                % str_activity,
                                                delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule,
                                                InvalidRuleWeight,
                                                InvalidRSEExpression,
                                                StagingAreaRuleRequiresLifetime,
                                                DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logging.error(prepend_str + '%s' %
                                                          (str(error)))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed,
                                                InsufficientTargetRSEs,
                                                InsufficientAccountLimit,
                                                DatabaseException,
                                                RSEBlacklisted,
                                                RSEWriteBlocked) as error:
                                            # Errors to be retried
                                            logging.error(
                                                prepend_str +
                                                '%s Will perform another attempt %i/%i'
                                                % (str(error), attempt + 1, nattempt))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.unknown',
                                                delta=1)
                                            exc_type, exc_value, exc_traceback = exc_info()
                                            logging.critical(
                                                prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

                                    did_success = (did_success and success)
                                    if (attemptnr +
                                            1) == nattempt and not success:
                                        logging.error(
                                            prepend_str +
                                            'Rule for %s:%s on %s cannot be inserted'
                                            % (did['scope'], did['name'],
                                               rse_expression))
                                    else:
                                        logging.info(
                                            prepend_str +
                                            '%s rule(s) inserted in %f seconds'
                                            % (str(nb_rule),
                                               time.time() - stime))
                    except DataIdentifierNotFound as error:
                        logging.warning(prepend_str + str(error))

                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(
                            counters='transmogrifier.did.file.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(
                            counters='transmogrifier.did.dataset.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(
                            counters='transmogrifier.did.container.processed',
                            delta=1)
                    monitor.record_counter(
                        counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({
                        'scope': did['scope'],
                        'name': did['name'],
                        'did_type': DIDType.from_sym(did['did_type'])
                    })

            time1 = time.time()

            #  Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)

            logging.info(prepend_str + 'Time to set the new flag : %f' %
                         (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(
                    name=sub['name'],
                    account=sub['account'],
                    metadata={'last_processed': datetime.now()})
            logging.info(prepend_str +
                         'It took %f seconds to process %i DIDs' %
                         (tottime, len(dids)))
            logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration',
                                 time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(
                format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters='transmogrifier.job.error',
                                   delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error',
                                   delta=1)
        if once is True:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
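The ordering of subscriptions in the example above is driven entirely by the 'policyid' field: 0 is the highest priority, 5 the lowest, and 3 is assumed when the field is missing. The minimal sketch below isolates that ordering step; the input dictionaries are simplified stand-ins for the records returned by list_subscriptions, assumed to be already filtered to active subscriptions.

def order_subscriptions(subscriptions):
    """Return the subscriptions sorted by 'policyid' priority (0 first, 5 last, default 3)."""
    buckets = {3: []}  # the default-priority bucket always exists
    for sub in subscriptions:
        priority = int(sub.get('policyid', 3))
        buckets.setdefault(priority, []).append(sub)
    ordered = []
    for priority in sorted(buckets):
        ordered.extend(buckets[priority])
    return ordered


# A subscription without a policyid lands in the default bucket (3):
print(order_subscriptions([{'name': 'late', 'policyid': 5},
                           {'name': 'default'},
                           {'name': 'urgent', 'policyid': 0}]))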
Example No. 29
def finisher(once=False,
             process=0,
             total_processes=1,
             thread=0,
             total_threads=1,
             sleep_time=60,
             activities=None,
             bulk=100,
             db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.
    """

    logging.info(
        'finisher starting - process (%i/%i) thread (%i/%i) db_bulk(%i) bulk (%i)'
        % (process, total_processes, thread, total_threads, db_bulk, bulk))
    try:
        suspicious_patterns = []
        pattern = get(section='conveyor',
                      option='suspicious_pattern',
                      session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            suspicious_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        suspicious_patterns = []
    logging.debug("Suspicious patterns: %s" %
                  [pat.pattern for pat in suspicious_patterns])

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1,
                                         hb['nr_threads'])
    graceful_stop.wait(10)
    while not graceful_stop.is_set():

        start_time = time.time()
        try:
            hb = heartbeat.live(executable,
                                hostname,
                                pid,
                                hb_thread,
                                older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1,
                                                 hb['nr_threads'])
            logging.debug(prepend_str + 'Starting new cycle')
            if activities is None:
                activities = [None]

            for activity in activities:
                logging.debug(prepend_str +
                              'Working on activity %s' % activity)
                ts = time.time()
                reqs = request_core.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[
                        RequestState.DONE, RequestState.FAILED,
                        RequestState.LOST, RequestState.SUBMITTING,
                        RequestState.SUBMISSION_FAILED,
                        RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES
                    ],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow(),
                    activity=activity,
                    process=process,
                    total_processes=total_processes,
                    thread=hb['assign_thread'],
                    total_threads=hb['nr_threads'])
                record_timer('daemons.conveyor.finisher.000-get_next',
                             (time.time() - ts) * 1000)
                stime1 = time.time()
                if reqs:
                    logging.debug(prepend_str +
                                  'Updating %i requests for activity %s' %
                                  (len(reqs), activity))

                for chunk in chunks(reqs, bulk):
                    try:
                        ts = time.time()
                        __handle_requests(chunk, suspicious_patterns)
                        record_timer(
                            'daemons.conveyor.finisher.handle_requests',
                            (time.time() - ts) * 1000 /
                            (len(chunk) if len(chunk) else 1))
                        record_counter(
                            'daemons.conveyor.finisher.handle_requests',
                            len(chunk))
                    except Exception:
                        logging.warning(str(traceback.format_exc()))
                if reqs:
                    logging.debug(
                        prepend_str +
                        'Finished updating %s requests for activity %s in %s seconds'
                        % (len(reqs), activity, time.time() - stime1))

        except (DatabaseException, DatabaseError) as error:
            if isinstance(
                    error.args[0],
                    tuple) and (re.match('.*ORA-00054.*', error.args[0][0]) or
                                ('ERROR 1205 (HY000)' in error.args[0][0])):
                logging.warning(
                    prepend_str +
                    'Lock detected when handling request - skipping: %s' %
                    (str(error)))
            else:
                logging.error(prepend_str + '%s' % (traceback.format_exc()))
        except Exception:
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))
        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logging.info(prepend_str + 'Sleeping for %s seconds' %
                         (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

        if once:
            return

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
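The finisher above reads a comma-separated list of regular expressions from the conveyor/suspicious_pattern option and compiles them once at startup, so that failed transfers can later be checked against them. The sketch below isolates that mechanism; the option value and the error string are invented for illustration and the helper names are not Rucio APIs.

import re


def compile_suspicious_patterns(option_value):
    """Turn a comma-separated string of regexes into a list of compiled patterns."""
    if not option_value:
        return []
    return [re.compile(pat.strip()) for pat in option_value.split(',') if pat.strip()]


def is_suspicious(error_message, patterns):
    """Return True if any configured pattern matches the transfer error message."""
    return any(pat.search(error_message) for pat in patterns)


patterns = compile_suspicious_patterns('.*CHECKSUM MISMATCH.*, .*timed out.*')
print(is_suspicious('DESTINATION CHECKSUM MISMATCH', patterns))  # True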
Example No. 30
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: The size of the chunks used for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds: Delay, in seconds, applied when selecting replicas eligible for deletion.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number,
                                 rse['rse'], nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse_id=rse['id'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse_id=rse['id'],
                                                          bytes=needed_free_space_per_child,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete on %s. The next check will occur at %s',
                                     worker_number, child_number, rse['rse'],
                                     nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files))
                        try:
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True)
                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                               operation='delete', scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        deletion_dict = {'scope': replica['scope'].external,
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'protocol': prot.attributes['scheme']}
                                        if replica['scope'].vo != 'def':
                                            deletion_dict['vo'] = replica['scope'].vo
                                        logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                                                            worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000)
                                        duration = time.time() - start

                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                        deletion_dict['duration'] = duration
                                        add_message('deletion-done', deletion_dict)
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            deletion_dict['reason'] = str(err_msg)
                                            add_message('deletion-failed', deletion_dict)
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                    payload = {'scope': replica['scope'].external,
                                               'name': replica['name'],
                                               'rse': rse_info['rse'],
                                               'rse_id': rse_info['id'],
                                               'file-size': replica['bytes'],
                                               'bytes': replica['bytes'],
                                               'url': replica['pfn'],
                                               'reason': str(error),
                                               'protocol': prot.attributes['scheme']}
                                    if replica['scope'].vo != 'def':
                                        payload['vo'] = replica['scope'].vo
                                    add_message('deletion-failed', payload)
                                    break
                            finally:
                                prot.close()
                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas succeeded on %s: %s replicas in %s seconds', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error))

                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            time.sleep(1)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
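A hedged usage sketch for this reaper variant; the RSE dictionary below only carries the keys the loop above reads ('rse', 'id', 'staging_area') and every value is a hypothetical placeholder, not a real identifier.

sample_rses = [{'rse': 'SITE_SCRATCHDISK',   # hypothetical RSE name
                'id': '0123456789abcdef',    # hypothetical RSE id
                'staging_area': False}]
# Single greedy pass over the assumed RSE, deleting tombstoned replicas right away.
reaper(rses=sample_rses, worker_number=0, child_number=0, total_children=1,
       chunk_size=50, once=True, greedy=True)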
Exemplo n.º 31
0
def bulk_group_transfer(transfers,
                        policy='rule',
                        group_bulk=200,
                        fts_source_strategy='auto',
                        max_time_in_queue=None):
    """
    Group transfers in bulk based on certain criteria

    :param transfers:             List of transfers to group.
    :param policy:                Policy to use to group.
    :param group_bulk:            Bulk sizes.
    :param fts_source_strategy:   Strategy to group fts sources
    :param max_time_in_queue:     Maximum time in queue
    :return:                      List of grouped transfers.
    """

    grouped_transfers = {}
    grouped_jobs = {}
    for request_id in transfers:
        transfer = transfers[request_id]
        external_host = transfer['external_host']
        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            grouped_jobs[external_host] = []

        file = {
            'sources': transfer['sources'],
            'destinations': transfer['dest_urls'],
            'metadata': transfer['file_metadata'],
            'filesize': int(transfer['file_metadata']['filesize']),
            'checksum': None,
            'selection_strategy': fts_source_strategy,
            'request_type':
            transfer['file_metadata'].get('request_type', None),
            'activity': str(transfer['file_metadata']['activity'])
        }
        if file['metadata'].get('verify_checksum', True):
            if 'md5' in file['metadata'].keys() and file['metadata']['md5']:
                file['checksum'] = 'MD5:%s' % str(file['metadata']['md5'])
            if 'adler32' in file['metadata'].keys(
            ) and file['metadata']['adler32']:
                file['checksum'] = 'ADLER32:%s' % str(
                    file['metadata']['adler32'])

        job_params = {
            'verify_checksum':
            True if file['checksum']
            and file['metadata'].get('verify_checksum', True) else False,
            'spacetoken':
            transfer['dest_spacetoken']
            if transfer['dest_spacetoken'] else 'null',
            'copy_pin_lifetime':
            transfer['copy_pin_lifetime']
            if transfer['copy_pin_lifetime'] else -1,
            'bring_online':
            transfer['bring_online'] if transfer['bring_online'] else None,
            'job_metadata': {
                'issuer': 'rucio'
            },  # eventually job_metadata will look like this; for now it mirrors file_metadata so that request_id etc. are included
            'source_spacetoken':
            transfer['src_spacetoken'] if transfer['src_spacetoken'] else None,
            'overwrite':
            transfer['overwrite'],
            'priority':
            3
        }

        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[
                    transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            grouped_jobs[external_host].append({
                'files': [file],
                'job_params': job_params
            })
        else:
            job_params['job_metadata']['multi_sources'] = False
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (
                job_params['verify_checksum'], job_params['spacetoken'],
                job_params['copy_pin_lifetime'], job_params['bring_online'],
                job_params['job_metadata'], job_params['source_spacetoken'],
                job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            if job_key not in grouped_transfers[external_host]:
                grouped_transfers[external_host][job_key] = {}

            if policy == 'rule':
                policy_key = '%s' % (transfer['rule_id'])
            if policy == 'dest':
                policy_key = '%s' % (file['metadata']['dst_rse'])
            if policy == 'src_dest':
                policy_key = '%s,%s' % (file['metadata']['src_rse'],
                                        file['metadata']['dst_rse'])
            if policy == 'rule_src_dest':
                policy_key = '%s,%s,%s' % (transfer['rule_id'],
                                           file['metadata']['src_rse'],
                                           file['metadata']['dst_rse'])
            # maybe here we need to hash the key if it's too long

            if policy_key not in grouped_transfers[external_host][job_key]:
                grouped_transfers[external_host][job_key][policy_key] = {
                    'files': [file],
                    'job_params': job_params
                }
            else:
                grouped_transfers[external_host][job_key][policy_key][
                    'files'].append(file)

    # jobs with different job_key values cannot be merged into a single job.
    for external_host in grouped_transfers:
        for job_key in grouped_transfers[external_host]:
            # for all policy groups under a job_key, the job_params are the same.
            for policy_key in grouped_transfers[external_host][job_key]:
                job_params = grouped_transfers[external_host][job_key][
                    policy_key]['job_params']
                for xfers_files in chunks(
                        grouped_transfers[external_host][job_key][policy_key]
                    ['files'], group_bulk):
                    # for the last small piece, just submit it.
                    grouped_jobs[external_host].append({
                        'files': xfers_files,
                        'job_params': job_params
                    })

    return grouped_jobs
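A hedged call sketch for bulk_group_transfer(); the single transfer below carries only the keys the function reads, and every value (request id, FTS host, URLs, RSE names, rule id, source tuple format) is a hypothetical placeholder.

sample_transfers = {
    'req-0001': {                                          # hypothetical request id
        'external_host': 'https://fts.example.org:8446',   # assumed FTS endpoint
        'sources': [('SRC_RSE', 'gsiftp://src.example.org/path/file', 1, 0)],  # assumed source tuple layout
        'dest_urls': ['gsiftp://dst.example.org/path/file'],
        'file_metadata': {'filesize': 1048576,
                          'adler32': 'deadbeef',
                          'activity': 'User Subscriptions',
                          'src_rse': 'SRC_RSE',
                          'dst_rse': 'DST_RSE'},
        'dest_spacetoken': None,
        'src_spacetoken': None,
        'copy_pin_lifetime': None,
        'bring_online': None,
        'overwrite': True,
        'rule_id': 'rule-42',
    }
}
# One list of jobs per FTS host is expected back, with files grouped by rule id (policy='rule').
jobs = bulk_group_transfer(sample_transfers, policy='rule', group_bulk=100)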
Exemplo n.º 32
0
def poller(once=False, activities=None, sleep_time=60,
           fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    executable = 'conveyor-poller'
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)
    if activity_shares:
        executable += ' --activity_shares ' + str(activity_shares)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' % (db_bulk, fts_bulk, timeout))

    time.sleep(10)  # To prevent running on the same partition if all the pollers restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info(prepend_str + 'Poller started')

    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logging.debug(prepend_str + 'Start to poll transfers older than %i seconds for activity %s' % (older_than, activity))
                transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                state=[RequestState.SUBMITTED],
                                                limit=db_bulk,
                                                older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                                total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'],
                                                mode_all=False, hash_variable='id',
                                                activity=activity,
                                                activity_shares=activity_shares)

                record_timer('daemons.conveyor.poller.000-get_next', (time.time() - start_time) * 1000)

                if transfs:
                    logging.debug(prepend_str + 'Polling %i transfers for activity %s' % (len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    if not transf['external_host'] in xfers_ids:
                        xfers_ids[transf['external_host']] = []
                    xfers_ids[transf['external_host']].append((transf['external_id'], transf['request_id']))

                for external_host in xfers_ids:
                    external_ids = list({trf[0] for trf in xfers_ids[external_host]})
                    request_ids = [trf[1] for trf in xfers_ids[external_host]]
                    for xfers in chunks(external_ids, fts_bulk):
                        # poll transfers
                        poll_transfers(external_host=external_host, xfers=xfers, prepend_str=prepend_str, request_ids=request_ids, timeout=timeout)

                if len(transfs) < fts_bulk / 2:
                    logging.info(prepend_str + "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + "%s" % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
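A hedged sketch of driving the poller for a single pass; the activity name and bulk sizes are assumptions chosen only to exercise the parameters above.

poller(once=True,
       activities=['User Subscriptions'],   # hypothetical activity name
       fts_bulk=50, db_bulk=500,
       older_than=120, sleep_time=30)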
Exemplo n.º 33
0
def bulk_group_transfer(transfers, policy='rule', group_bulk=200, source_strategy=None, max_time_in_queue=None, session=None):
    """
    Group transfers in bulk based on certain criteria

    :param transfers:             List of transfers to group.
    :param policy:                Policy to use to group.
    :param group_bulk:            Bulk sizes.
    :param source_strategy:       Strategy to group sources
    :param max_time_in_queue:     Maximum time in queue
    :param session:               The database session in use.
    :return:                      List of grouped transfers.
    """

    grouped_transfers = {}
    grouped_jobs = {}

    try:
        default_source_strategy = get(section='conveyor', option='default-source-strategy')
    except ConfigNotFound:
        default_source_strategy = 'orderly'

    try:
        activity_source_strategy = get(section='conveyor', option='activity-source-strategy')
        activity_source_strategy = loads(activity_source_strategy)
    except ConfigNotFound:
        activity_source_strategy = {}
    except ValueError:
        logging.warning('activity_source_strategy not properly defined')
        activity_source_strategy = {}

    for request_id in transfers:
        transfer = transfers[request_id]
        verify_checksum = transfer['file_metadata'].get('verify_checksum', 'both')

        dest_rse_id = transfer['file_metadata']['dest_rse_id']
        source_rse_id = transfer['file_metadata']['src_rse_id']

        dest_supported_checksums = get_rse_supported_checksums(rse_id=dest_rse_id, session=session)
        source_supported_checksums = get_rse_supported_checksums(rse_id=source_rse_id, session=session)
        common_checksum_names = set(source_supported_checksums).intersection(dest_supported_checksums)

        if source_supported_checksums == ['none']:
            if dest_supported_checksums == ['none']:
                # both endpoints support none
                verify_checksum = 'none'
            else:
                # source supports none but destination supports some
                verify_checksum = 'destination'
        else:
            if dest_supported_checksums == ['none']:
                # source supports some but destination does not
                verify_checksum = 'source'
            else:
                if len(common_checksum_names) == 0:
                    # source and destination each support some checksums but none in common (destination takes priority)
                    verify_checksum = 'destination'
                else:
                    # Don't override the value in the file_metadata
                    pass

        t_file = {'sources': transfer['sources'],
                  'destinations': transfer['dest_urls'],
                  'metadata': transfer['file_metadata'],
                  'filesize': int(transfer['file_metadata']['filesize']),
                  'checksum': None,
                  'verify_checksum': verify_checksum,
                  'selection_strategy': source_strategy if source_strategy else activity_source_strategy.get(str(transfer['file_metadata']['activity']), default_source_strategy),
                  'request_type': transfer['file_metadata'].get('request_type', None),
                  'activity': str(transfer['file_metadata']['activity'])}

        if verify_checksum != 'none':
            if verify_checksum == 'both':
                set_checksum_value(t_file, common_checksum_names)
            if verify_checksum == 'source':
                set_checksum_value(t_file, source_supported_checksums)
            if verify_checksum == 'destination':
                set_checksum_value(t_file, dest_supported_checksums)

        multihop = transfer.get('multihop', False)
        strict_copy = transfer.get('strict_copy', False)

        external_host = transfer['external_host']
        scope = t_file['metadata']['scope']
        scope_str = scope.internal
        activity = t_file['activity']

        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host] = []
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host] = {}
                if scope_str not in grouped_transfers[external_host]:
                    grouped_transfers[external_host][scope_str] = {}
                    grouped_jobs[external_host][scope_str] = []

        job_params = {'account': transfer['account'],
                      'use_oidc': transfer.get('use_oidc', False),
                      'verify_checksum': verify_checksum,
                      'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
                      'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
                      'job_metadata': {'issuer': 'rucio'},  # eventually job_metadata will look like this; for now it mirrors file_metadata so that request_id etc. are included
                      'overwrite': transfer['overwrite'],
                      'priority': 3,
                      's3alternate': True}
        if multihop:
            job_params['multihop'] = True
        if strict_copy:
            job_params['strict_copy'] = True

        # Don't put optional & missing keys in the parameters
        if transfer['dest_spacetoken']:
            job_params.update({'spacetoken': transfer['dest_spacetoken']})
        if transfer['src_spacetoken']:
            job_params.update({'source_spacetoken': transfer['src_spacetoken']})

        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host].append({'files': [t_file], 'job_params': job_params})
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host][scope_str].append({'files': [t_file], 'job_params': job_params})
        else:
            job_params['job_metadata']['multi_sources'] = False
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (job_params['verify_checksum'], job_params.get('spacetoken', None),
                                                   job_params['copy_pin_lifetime'],
                                                   job_params['bring_online'], job_params['job_metadata'],
                                                   job_params.get('source_spacetoken', None),
                                                   job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            if multihop:
                job_key = 'multihop_%s' % (transfer['initial_request_id'])

            if job_key not in grouped_transfers[external_host]:
                if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                    grouped_transfers[external_host][job_key] = {}
                elif activity in USER_ACTIVITY:
                    grouped_transfers[external_host][scope_str][job_key] = {}

            if multihop:
                policy_key = 'multihop_%s' % (transfer['initial_request_id'])
            else:
                if policy == 'rule':
                    policy_key = '%s' % (transfer['rule_id'])
                if policy == 'dest':
                    policy_key = '%s' % (t_file['metadata']['dst_rse'])
                if policy == 'src_dest':
                    policy_key = '%s,%s' % (t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'rule_src_dest':
                    policy_key = '%s,%s,%s' % (transfer['rule_id'], t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'activity_dest':
                    policy_key = '%s %s' % (activity, t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                if policy == 'activity_src_dest':
                    policy_key = '%s %s %s' % (activity, t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                    # maybe here we need to hash the key if it's too long

            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][job_key]:
                    grouped_transfers[external_host][job_key][policy_key] = {'files': [t_file], 'job_params': job_params}
                else:
                    if multihop:
                        # The parent transfer should be the first of the list
                        # TODO : Only work for a single hop now, need to be able to handle multiple hops
                        if transfer['parent_request']:  # This is the child
                            grouped_transfers[external_host][job_key][policy_key]['files'].append(t_file)
                        else:
                            grouped_transfers[external_host][job_key][policy_key]['files'].insert(0, t_file)
                    else:
                        grouped_transfers[external_host][job_key][policy_key]['files'].append(t_file)
            elif activity in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][scope_str][job_key]:
                    grouped_transfers[external_host][scope_str][job_key][policy_key] = {'files': [t_file], 'job_params': job_params}
                else:
                    if multihop:
                        # The parent transfer should be the first of the list
                        # TODO : Only work for a single hop now, need to be able to handle multiple hops
                        if transfer['parent_request']:  # This is the child
                            grouped_transfers[external_host][scope_str][job_key][policy_key]['files'].append(t_file)
                        else:
                            grouped_transfers[external_host][scope_str][job_key][policy_key]['files'].insert(0, t_file)

    # jobs with different job_key values cannot be merged into a single job.
    for external_host in grouped_transfers:
        if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
            for job_key in grouped_transfers[external_host]:
                # for all policy groups under a job_key, the job_params are the same.
                for policy_key in grouped_transfers[external_host][job_key]:
                    job_params = grouped_transfers[external_host][job_key][policy_key]['job_params']
                    for xfers_files in chunks(grouped_transfers[external_host][job_key][policy_key]['files'], group_bulk):
                        # for the last small piece, just submit it.
                        grouped_jobs[external_host].append({'files': xfers_files, 'job_params': job_params})
        elif activity in USER_ACTIVITY:
            for scope_key in grouped_transfers[external_host]:
                for job_key in grouped_transfers[external_host][scope_key]:
                    # for all policy groups under a job_key, the job_params are the same.
                    for policy_key in grouped_transfers[external_host][scope_key][job_key]:
                        job_params = grouped_transfers[external_host][scope_key][job_key][policy_key]['job_params']
                        for xfers_files in chunks(grouped_transfers[external_host][scope_key][job_key][policy_key]['files'], group_bulk):
                            # for the last small piece, just submit it.
                            grouped_jobs[external_host][scope_key].append({'files': xfers_files, 'job_params': job_params})

    return grouped_jobs
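A minimal standalone sketch of the checksum negotiation performed above, assuming ['none'] is the sentinel returned for endpoints without checksum support; set_checksum_value() and the RSE lookups are deliberately left out.

def negotiate_verify_checksum(source_checksums, dest_checksums, requested='both'):
    """Mirror the source/destination/both/none decision used in bulk_group_transfer."""
    common = set(source_checksums).intersection(dest_checksums)
    if source_checksums == ['none']:
        # either nobody verifies, or only the destination can
        return 'none' if dest_checksums == ['none'] else 'destination'
    if dest_checksums == ['none']:
        return 'source'        # only the source can verify
    if not common:
        return 'destination'   # no shared algorithm: destination takes priority
    return requested           # a shared algorithm exists: keep the requested mode

# e.g. negotiate_verify_checksum(['adler32'], ['md5']) -> 'destination'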
Exemplo n.º 34
0
def reaper(rses,
           include_rses,
           exclude_rses,
           vos=None,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60,
           auto_exclude_threshold=100,
           auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses:                   List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses:           RSE expression to include RSEs.
    :param exclude_rses:           RSE expression to exclude RSEs from the Reaper.
    :param vos:                    VOs on which to look for RSEs. Only used in multi-VO mode.
                                   If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size:             The size of chunk for deletion.
    :param once:                   If True, only runs one iteration of the main loop.
    :param greedy:                 If True, delete right away replicas with tombstone.
    :param scheme:                 Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:          The delay to query replicas in BEING_DELETED state.
    :param sleep_time:             Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout:   Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # try to get auto exclude parameters from the config table. Otherwise use CLI parameters.
        try:
            auto_exclude_threshold = config_get('reaper',
                                                'auto_exclude_threshold',
                                                default=auto_exclude_threshold)
            auto_exclude_timeout = config_get('reaper',
                                              'auto_exclude_timeout',
                                              default=auto_exclude_timeout)
        except (NoOptionError, NoSectionError, RuntimeError):
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get(
                'reaper', 'max_evaluator_backlog_count')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get(
                'reaper', 'max_evaluator_backlog_duration')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses,
                                              vos)
        if not rses_to_process:
            logger(logging.ERROR,
                   'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                              heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blocklisted
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blocklisted for delete',
                           rse['rse'])
                    continue
                needed_free_space, only_delete_obsolete = __check_rse_usage(
                    rse['rse'], rse['id'], greedy=greedy, logger=logger)
                if needed_free_space:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                    tot_needed_free_space += needed_free_space
                elif only_delete_obsolete:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                else:
                    logger(logging.DEBUG, 'Nothing to delete on %s',
                           rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True))
            logger(logging.DEBUG,
                   'List of RSEs to process ordered by needed space desc: %s',
                   str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has the RSE as key and (hostname, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversely with the number of workers
                # The ceil increases the weight of RSEs with a small number of files to delete
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] /
                                       tot_needed_free_space * 1000 /
                                       heart_beat['nr_threads'])
                else:
                    max_workers = 1

                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            paused_rses = []
            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id,
                                    expiration_time=120)
                if result is not NO_VALUE:
                    paused_rses.append(rse_name)
                    logger(
                        logging.DEBUG,
                        'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                        rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id,
                                    expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(
                        logging.WARNING,
                        'Too many failed attempts for %s in last cycle. RSE is temporarily excluded.',
                        rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)
                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(
                    logging.DEBUG,
                    'Working on %s. Percentage of the total space needed %.2f',
                    rse_name, percent)
                try:
                    rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                except KeyError:
                    logger(logging.DEBUG, "Hostname lookup for %s failed.",
                           rse_name)
                    REGION.set('pause_deletion_%s' % rse_id, True)
                    continue
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]
                max_deletion_thread = get_max_deletion_threads_by_hostname(
                    rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(
                        logging.DEBUG,
                        'Too many deletion threads for %s on RSE %s. Back off',
                        rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue
                logger(
                    logging.INFO,
                    'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i',
                       rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][1]
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(
                                logging.DEBUG,
                                'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                                rse_name)
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes_=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            only_delete_obsolete=only_delete_obsolete,
                            session=None)
                    logger(
                        logging.DEBUG,
                        'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(
                            logging.DEBUG,
                            'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)
                    continue
                # Physical deletion will take place below
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme,
                                                  logger=logger)
                    for file_replicas in chunks(replicas, chunk_size):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope':
                                                replica['scope'].external,
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values())[0])
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logger(
                                    logging.WARNING,
                                    'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s',
                                    replica['scope'], replica['name'],
                                    rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logger(logging.CRITICAL,
                                       'Exception',
                                       exc_info=True)

                        deleted_files = delete_from_storage(
                            file_replicas,
                            prot,
                            rse_info,
                            staging_areas,
                            auto_exclude_threshold,
                            logger=logger)
                        logger(logging.INFO,
                               '%i files processed in %s seconds',
                               len(file_replicas),
                               time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(
                            logging.DEBUG,
                            'delete_replicas succeeded on %s: %s replicas in %s seconds',
                            rse_name, len(deleted_files),
                            time.time() - del_start)
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if paused_rses:
                logger(logging.INFO,
                       'Deletion paused for a while for following RSEs: %s',
                       ', '.join(paused_rses))

            if once:
                break

            daemon_sleep(start_time=start_time,
                         sleep_time=sleep_time,
                         graceful_stop=GRACEFUL_STOP,
                         logger=logger)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper:  %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
    return
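A hedged invocation sketch for this reaper variant; the RSE expression and thresholds are arbitrary placeholders chosen only to illustrate the parameters documented in the docstring.

reaper(rses=[],                      # empty list: consider all RSEs
       include_rses='tier=2',        # hypothetical RSE expression
       exclude_rses=None,
       chunk_size=200, once=True, greedy=False,
       delay_seconds=0, sleep_time=60,
       auto_exclude_threshold=100, auto_exclude_timeout=600)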
Exemplo n.º 35
0
    def update_replicas(self, dry=False):
        """
        Adds or removes replicas for the dataset at the RSE.
        :param dry: Dry run. Default False.
        """

        logging.notice('Updating replicas for %s:%s at %s' %
                       (self.scope, self.dataset, self.rse))

        replicas = self.rcli.list_replicas([{
            'scope': self.scope,
            'name': self.dataset
        }],
                                           rse_expression='rse=%s' % self.rse)

        rrepl = [repl['name'] for repl in replicas]

        prepl = list(self.replicas.keys())

        missing = list(set(prepl) - set(rrepl))

        to_remove = list(set(rrepl) - set(prepl))

        if missing and dry:
            logging.dry('Adding replicas %s to rse %s.', str(missing),
                        self.rse)

        elif missing:
            logging.verbose('Adding replicas %s to rse %s.', str(missing),
                            self.rse)

            self.rcli.add_replicas(rse=self.rse,
                                   files=[{
                                       'scope':
                                       self.scope,
                                       'name':
                                       self.replicas[lfn]['name'],
                                       'adler32':
                                       self.replicas[lfn]['checksum'],
                                       'bytes':
                                       self.replicas[lfn]['size'],
                                   } for lfn in missing])

            # missing files that are not in the list of dataset files
            # are to be attached.
            lfns = [
                item['name']
                for item in self.rcli.list_files(scope=self.scope,
                                                 name=self.dataset)
            ]

            missing_lfns = list(set(missing) - set(lfns))
            if missing_lfns:
                logging.verbose('Attaching lfns %s to dataset %s.',
                                str(missing_lfns), self.dataset)

                try:
                    self.rcli.attach_dids(
                        scope=self.scope,
                        name=self.dataset,
                        dids=[{
                            'scope': self.scope,
                            'name': lfn
                        } for lfn in missing_lfns])

                except FileAlreadyExists:
                    logging.warning('Trying to attach already existing files.')

        if to_remove and dry:
            logging.dry('Removing replicas %s from rse %s.', str(to_remove),
                        self.rse)

        elif to_remove:
            logging.verbose('Removing replicas %s from rse %s.',
                            str(to_remove), self.rse)
            for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                attempt = 0
                while True:
                    attempt += 1
                    try:
                        self.rcli.delete_replicas(rse=self.rse,
                                                  files=[{
                                                      'scope': self.scope,
                                                      'name': lfn,
                                                  } for lfn in to_remove_chunk
                                                         ])
                        break
                    except DatabaseException:
                        logging.warning(
                            'DatabaseException raised, retrying...')
                        if attempt > 3:
                            raise
                        time.sleep(randint(1, 5))

        return {'added': missing, 'removed': to_remove}
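The removal branch above retries delete_replicas when the database is busy. A small, generic sketch of that retry-with-random-backoff pattern follows; the DatabaseException placeholder, helper name and attempt limit are assumptions chosen for illustration, not part of the original code.

import time
from random import randint


class DatabaseException(Exception):
    """Placeholder standing in for rucio.common.exception.DatabaseException."""


def call_with_retries(operation, max_attempts=4):
    """Run operation(), retrying on DatabaseException with a 1-5 second random pause."""
    attempt = 0
    while True:
        attempt += 1
        try:
            return operation()
        except DatabaseException:
            if attempt >= max_attempts:
                raise
            time.sleep(randint(1, 5))

Sleeping for a random interval, as the example does, spreads competing workers out so they do not all retry against the same locked rows at the same moment.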
Exemplo n.º 36
0
def run_once(bulk, db_bulk, suspicious_patterns, retry_protocol_mismatches,
             heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        logger(logging.DEBUG, 'Working on activity %s', activity)
        time1 = time.time()
        reqs = request_core.get_next(
            request_type=[
                RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT
            ],
            state=[
                RequestState.DONE, RequestState.FAILED, RequestState.LOST,
                RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED,
                RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES,
                RequestState.MISMATCH_SCHEME
            ],
            limit=db_bulk,
            older_than=datetime.datetime.utcnow(),
            total_workers=total_workers,
            worker_number=worker_number,
            mode_all=True,
            hash_variable='rule_id')
        record_timer('daemons.conveyor.finisher.get_next',
                     (time.time() - time1) * 1000)
        time2 = time.time()
        if reqs:
            logger(logging.DEBUG, 'Updating %i requests for activity %s',
                   len(reqs), activity)

        for chunk in chunks(reqs, bulk):
            try:
                worker_number, total_workers, logger = heartbeat_handler.live()
                time3 = time.time()
                __handle_requests(chunk,
                                  suspicious_patterns,
                                  retry_protocol_mismatches,
                                  logger=logger)
                record_timer('daemons.conveyor.finisher.handle_requests_time',
                             (time.time() - time3) * 1000 /
                             (len(chunk) if chunk else 1))
                record_counter('daemons.conveyor.finisher.handle_requests',
                               delta=len(chunk))
            except Exception as error:
                logger(logging.WARNING, '%s', str(error))
        if reqs:
            logger(
                logging.DEBUG,
                'Finished updating %s requests for activity %s in %s seconds',
                len(reqs), activity,
                time.time() - time2)

    except (DatabaseException, DatabaseError) as error:
        if re.match('.*ORA-00054.*', error.args[0]) or re.match(
                '.*ORA-00060.*',
                error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
            logger(logging.WARNING,
                   'Lock detected when handling request - skipping: %s',
                   str(error))
        else:
            raise
    return True
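The DatabaseException handler above treats Oracle row-lock (ORA-00054), deadlock (ORA-00060) and MySQL lock-wait-timeout (ERROR 1205) messages as transient and skips the batch instead of failing. A self-contained sketch of that string-matching check is given below; the helper name is made up for illustration.

import re


def is_transient_lock_error(message):
    """Return True if the DB error message looks like a lock or deadlock condition."""
    # Oracle: ORA-00054 (resource busy), ORA-00060 (deadlock detected).
    if re.search(r'ORA-00054|ORA-00060', message):
        return True
    # MySQL/MariaDB: lock wait timeout exceeded.
    return 'ERROR 1205 (HY000)' in message


# Usage sketch:
print(is_transient_lock_error('ORA-00060: deadlock detected while waiting for resource'))  # True
print(is_transient_lock_error('ORA-00942: table or view does not exist'))                  # False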
Exemplo n.º 37
0
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    :param delay_seconds: The delay to query replicas in BEING_DELETED state.
    """
    logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals())
    while not graceful_stop.is_set():
        try:
            max_deleting_rate = 0

            for rse in rses:
                deleting_rate = 0
                rse_info = rsemgr.get_rse_info(rse['rse'])
                rse_protocol = rse_core.get_rse_protocols(rse['rse'])

                if not rse_protocol['availability_delete']:
                    logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse']))
                    continue

                # Temporary hack to force gfal for deletion
                for protocol in rse_info['protocols']:
                    if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                        protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse']))
                try:
                    needed_free_space, max_being_deleted_files = None, 10000
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space = 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse']))

                    s = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas)))

                    if not replicas:
                        logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse']))
                        continue

                    p = rsemgr.create_protocol(rse_info, 'delete', scheme=None)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files)))
                        try:
                            s = time.time()
                            update_replicas_states(replicas=[dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files])

                            for replica in files:
                                try:
                                    replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values()[0])
                                except ReplicaUnAvailable as e:
                                    err_msg = 'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e))
                                    logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg))
                                    replica['pfn'] = None

                                add_message('deletion-planned', {'scope': replica['scope'],
                                                                 'name': replica['name'],
                                                                 'file-size': replica['bytes'],
                                                                 'url': replica['pfn'],
                                                                 'rse': rse_info['rse']})

                            # logging.debug('update_replicas_states %s' % (time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.being_deleted',  delta=len(files))

                            if not scheme:
                                try:
                                    deleted_files = []
                                    p.connect()
                                    for replica in files:
                                        try:
                                            logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            s = time.time()
                                            if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                                logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number,
                                                                                                                                                                                    child_number,
                                                                                                                                                                                    replica['scope'],
                                                                                                                                                                                    replica['name'],
                                                                                                                                                                                    replica['pfn'],
                                                                                                                                                                                    rse['rse']))
                                            else:
                                                if replica['pfn']:
                                                    p.delete(replica['pfn'])
                                                else:
                                                    logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000)
                                            duration = time.time() - s

                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                            add_message('deletion-done', {'scope': replica['scope'],
                                                                          'name': replica['name'],
                                                                          'rse': rse_info['rse'],
                                                                          'file-size': replica['bytes'],
                                                                          'url': replica['pfn'],
                                                                          'duration': duration})
                                            logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                        except SourceNotFound:
                                            err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                            logging.warning(err_msg)
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': err_msg})
                                        except (ServiceUnavailable, RSEAccessDenied) as e:
                                            logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except Exception as e:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                except (ServiceUnavailable, RSEAccessDenied) as e:
                                    for replica in files:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                        add_message('deletion-failed', {'scope': replica['scope'],
                                                                        'name': replica['name'],
                                                                        'rse': rse_info['rse'],
                                                                        'file-size': replica['bytes'],
                                                                        'url': replica['pfn'],
                                                                        'reason': str(e)})
                                finally:
                                    p.close()
                            s = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse=rse['rse'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas succeeded on %s: %s replicas in %s seconds' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.done',  delta=len(deleted_files))
                            deleting_rate += len(deleted_files)
                        except:
                            logging.critical(traceback.format_exc())
                    deleting_rate = deleting_rate * 1.0 / max_being_deleted_files
                    if deleting_rate > max_deleting_rate:
                        max_deleting_rate = deleting_rate
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate))
            sleep_time = int((1 - max_deleting_rate) * 60 + 1)
            time.sleep(sleep_time)

        except:
            logging.critical(traceback.format_exc())

    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
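The loop above ends by sleeping longer when little was deleted: the sleep time is derived from the best per-RSE deletion rate seen in the pass. A compact sketch of that back-off formula follows; the function name and the clamp to [0, 1] are additions for illustration, the original uses the raw ratio.

def reaper_sleep_time(max_deleting_rate):
    """Sleep ~1 s when the worker deleted close to its quota, ~61 s when it deleted nothing."""
    rate = min(max(max_deleting_rate, 0.0), 1.0)  # clamp added here; the loop above does not clamp
    return int((1 - rate) * 60 + 1)


assert reaper_sleep_time(0.0) == 61
assert reaper_sleep_time(1.0) == 1
assert reaper_sleep_time(0.5) == 31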