Example #1
def test_client_add_temporary_unavailable_pfns(rse_factory, mock_scope, replica_client):
    """ REPLICA (CLIENT): Add temporary unavailable PFNs"""
    rse, rse_id = rse_factory.make_posix_rse()
    nbfiles = 5
    # Adding replicas to deterministic RSE
    files = [{'scope': mock_scope.external, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    replica_client.add_replicas(rse=rse, files=files)

    # Listing replicas on deterministic RSE
    list_rep = []
    for replica in replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files], schemes=['file'], all_states=True):
        pfn = list(replica['pfns'].keys())[0]
        list_rep.append(pfn)

    # Submit bad PFNs
    now = datetime.utcnow()
    reason_str = generate_uuid()
    replica_client.add_bad_pfns(pfns=list_rep, reason=str(reason_str), state='TEMPORARY_UNAVAILABLE', expires_at=now.isoformat())
    result = get_bad_pfns(limit=10000, thread=None, total_threads=None, session=None)
    bad_pfns = {}
    for res in result:
        bad_pfns[res['pfn']] = (res['state'], res['reason'], res['expires_at'])

    for pfn in list_rep:
        pfn = str(clean_surls([pfn])[0])
        assert pfn in bad_pfns
        assert bad_pfns[pfn][0] == BadPFNStatus.TEMPORARY_UNAVAILABLE
        assert bad_pfns[pfn][1] == reason_str

    # Submit with wrong state
    with pytest.raises(RucioException):
        replica_client.add_bad_pfns(pfns=list_rep, reason=str(reason_str), state='BADSTATE', expires_at=now.isoformat())

    # Run minos once
    minos_run(threads=1, bulk=10000, once=True)
    result = get_bad_pfns(limit=10000, thread=None, total_threads=None, session=None)
    pfns = [res['pfn'] for res in result if res['pfn'] in bad_pfns]
    res_pfns = []
    for replica in list_rep:
        if replica in pfns:
            res_pfns.append(replica)
    assert res_pfns == []

    # Check the state in the replica table
    for did in files:
        rep = get_replicas_state(scope=mock_scope, name=did['name'])
        assert list(rep.keys())[0] == ReplicaState.TEMPORARY_UNAVAILABLE

    rep = []
    for did in files:
        did['state'] = ReplicaState.TEMPORARY_UNAVAILABLE
        rep.append(did)

    # Run the minos expiration
    minos_temp_run(threads=1, once=True)
    # Check the state in the replica table
    for did in files:
        rep = get_replicas_state(scope=mock_scope, name=did['name'])
        assert list(rep.keys())[0] == ReplicaState.AVAILABLE
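A minimal sketch of the client call this test exercises, assuming a configured Rucio client environment; the PFN value is purely illustrative. It declares PFNs as TEMPORARY_UNAVAILABLE with an expiration timestamp, which is what the minos daemon later picks up.

from datetime import datetime, timedelta

from rucio.client.replicaclient import ReplicaClient

client = ReplicaClient()
pfns = ['file:///tmp/rucio_rse/mock/ab/cd/file_example']  # hypothetical PFN, for illustration only
expires_at = (datetime.utcnow() + timedelta(hours=1)).isoformat()
client.add_bad_pfns(pfns=pfns, reason='temporary outage', state='TEMPORARY_UNAVAILABLE', expires_at=expires_at)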
Example #2
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """

    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

        stime = time.time()
        replicas = []
        try:
            replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                list_replicas = get_replicas_state(scope=scope, name=name)
                if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                    logging.info(prepend_str + 'File %s:%s has no other available or temporarily available replicas; it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                    unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000,
                                                         thread=heart_beat['assign_thread'],
                                                         total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logging.debug(prepend_str + 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logging.debug(prepend_str + ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            tottime = time.time() - stime
            if len(replicas) == bulk:
                logging.info(prepend_str + 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            elif tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
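The worker loop above is normally driven by a small runner that spawns one thread per worker and sets the module-level graceful_stop event on shutdown. The following is only a sketch of that pattern, assuming necromancer and graceful_stop are in scope as defined above; it is not Rucio's actual entry point.

import threading

def run(threads=1, bulk=100, once=False):
    # Spawn the workers and wait for them, waking periodically so signals can be handled.
    workers = [threading.Thread(target=necromancer, kwargs={'thread': i, 'bulk': bulk, 'once': once})
               for i in range(threads)]
    for worker in workers:
        worker.start()
    while any(worker.is_alive() for worker in workers):
        for worker in workers:
            worker.join(timeout=3.14)

def stop(signum=None, frame=None):
    # Signal handler: ask the workers to finish their current cycle and exit.
    graceful_stop.set()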
Example #3
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extracts the scope, name and rse_id and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        pfns = []
        try:
            bad_replicas = {}
            temporary_unavailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason, expires_at)] = []
                    temporary_unavailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                            unknown_replicas.extend(
                                tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(
                            prepend_str +
                            'The following replicas are unknown and will be removed : %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str +
                                      'Running on RSE %s%s with %s replicas' %
                                      (get_rse_name(rse_id=rse_id), vo_str,
                                       len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(
                            math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str +
                                              'Unknown replicas : %s' %
                                              (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unavailables:
                vo = account.vo
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas temporarily unavailable with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(
                        prepend_str +
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logging.debug(prepend_str +
                                  'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 vo=vo,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logging.debug(prepend_str +
                                  'Running on %s replicas on RSE %s' %
                                  (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=None,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation,
                                ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(
                                prepend_str +
                                'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s'
                                % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' %
                                              (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(
                                            prepend_str +
                                            '%s is in unavailable state. Will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    elif expires_at < datetime.now():
                                        logging.info(
                                            '%s PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns',
                                            prepend_str, str(rep['pfn']),
                                            expires_at)
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                except (DataIdentifierNotFound,
                                        ReplicaNotFound) as error:
                                    logging.error(
                                        prepend_str +
                                        'Will remove %s from the list of bad PFNs'
                                        % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logging.info(
                prepend_str +
                'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle'
            )
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
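The commit loops above rely on a chunks() helper that yields fixed-size slices of a list. A minimal equivalent is shown below as an assumption about its behaviour, not necessarily Rucio's exact implementation.

def chunks(items, n):
    """Yield successive n-sized slices from a list."""
    for i in range(0, len(items), n):
        yield items[i:i + n]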
Example #4
    def test_add_temporary_unavailable_pfns(self):
        """ REPLICA (CLIENT): Add temporary unavailable PFNs"""
        tmp_scope = 'mock'
        nbfiles = 5
        # Adding replicas to deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
        self.replica_client.add_replicas(rse='MOCK', files=files)

        # Listing replicas on deterministic RSE
        list_rep = []
        for replica in self.replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files], schemes=['srm'], unavailable=True):
            pfn = list(replica['pfns'].keys())[0]
            list_rep.append(pfn)

        # Submit bad PFNs
        now = datetime.utcnow()
        reason_str = generate_uuid()
        self.replica_client.add_bad_pfns(pfns=list_rep,
                                         reason=str(reason_str),
                                         state='TEMPORARY_UNAVAILABLE',
                                         expires_at=now.isoformat())
        result = get_bad_pfns(limit=10000,
                              thread=None,
                              total_threads=None,
                              session=None)
        bad_pfns = {}
        for res in result:
            bad_pfns[res['pfn']] = (res['state'], res['reason'],
                                    res['expires_at'])

        for pfn in list_rep:
            pfn = str(clean_surls([pfn])[0])
            assert_in(pfn, bad_pfns)
            assert_equal(str(bad_pfns[pfn][0]), 'TEMPORARY_UNAVAILABLE')
            assert_equal(bad_pfns[pfn][1], reason_str)

        # Submit with wrong state
        with assert_raises(RucioException):
            self.replica_client.add_bad_pfns(pfns=list_rep,
                                             reason=str(reason_str),
                                             state='BADSTATE',
                                             expires_at=now.isoformat())

        # Run minos once
        minos_run(threads=1, bulk=10000, once=True)
        result = get_bad_pfns(limit=10000,
                              thread=None,
                              total_threads=None,
                              session=None)
        pfns = [res['pfn'] for res in result]
        res_pfns = []
        for replica in list_rep:
            if replica in pfns:
                res_pfns.append(replica)
        assert_equal(res_pfns, [])

        # Check the state in the replica table
        for did in files:
            rep = get_replicas_state(scope=did['scope'], name=did['name'])
            assert_equal(str(list(rep.keys())[0]), 'TEMPORARY_UNAVAILABLE')

        rep = []
        for did in files:
            did['state'] = ReplicaState.from_sym('TEMPORARY_UNAVAILABLE')
            rep.append(did)

        # Run the minos expiration
        minos_temp_run(threads=1, once=True)
        # Check the state in the replica table
        for did in files:
            rep = get_replicas_state(scope=did['scope'], name=did['name'])
            assert_equal(str(list(rep.keys())[0]), 'AVAILABLE')
Example #5
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        try:
            # Get list of expired TU replicas
            logger(logging.INFO, 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logger(logging.INFO, '%s expired replicas returned', len(expired_replicas))
            logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                    else:
                        logger(logging.ERROR, 'Exception', exc_info=True)
                    session.rollback()
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logger(logging.CRITICAL, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
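The DatabaseException/DatabaseError branch above detects row-lock contention by inspecting the error text for the Oracle resource-busy and deadlock codes or the MySQL lock-wait timeout. Extracted into a standalone helper (the function name is illustrative), the same check looks like this:

import re

def is_lock_error(error):
    # ORA-00054: resource busy, ORA-00060: deadlock detected, ERROR 1205: MySQL lock wait timeout.
    message = error.args[0] if error.args else ''
    return bool(re.search('ORA-00054|ORA-00060', message)) or 'ERROR 1205 (HY000)' in message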
Example #6
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
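Example #5 delegates the inter-cycle wait to daemon_sleep(), whereas this older variant sleeps manually. A minimal sketch of what such a helper typically does, assuming it only needs to wait out the remainder of the cycle and wake early on shutdown (not necessarily Rucio's exact implementation):

import time

def daemon_sleep(start_time, sleep_time, graceful_stop, logger=None):
    # `logger` is accepted for signature compatibility but unused in this sketch.
    # Event.wait() returns early if a graceful stop is requested in the meantime.
    elapsed = time.time() - start_time
    if elapsed < sleep_time:
        graceful_stop.wait(sleep_time - elapsed)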
Example #7
def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():

        stime = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        logger(logging.DEBUG, 'Starting new cycle')

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            evaluator_backlog_count, evaluator_backlog_duration = get_evaluation_backlog(expiration_time=sleep_time)
            if max_evaluator_backlog_count and \
               evaluator_backlog_count and \
               max_evaluator_backlog_duration and \
               evaluator_backlog_duration and \
               evaluator_backlog_count > max_evaluator_backlog_count and \
               evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and evaluator_backlog_count and evaluator_backlog_count > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and evaluator_backlog_duration and evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        bad_replicas_backlog = REGION.get('bad_replicas_backlog')
        if bad_replicas_backlog is NO_VALUE:
            bad_replicas_backlog = get_bad_replicas_backlog()
            REGION.set('bad_replicas_backlog', bad_replicas_backlog)
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])
        list_of_rses = list()
        # If there are too many replicas, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bad replicas too big. Applying some sharing between different RSEs')
            rses = list()
            cnt = 0
            for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=True):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            list_of_rses.append(None)

        tot_processed = 0
        if tot_bad_files == 0:
            logger(logging.INFO, 'No bad replicas to process.')
        else:
            ttime = time.time()
            replicas = []
            try:
                for rses in list_of_rses:

                    replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                    for replica in replicas:
                        scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                        logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))

                        list_replicas = get_replicas_state(scope=scope, name=name)
                        if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                            logger(logging.INFO, 'File %s:%s has no other available or temporarily available replicas; it will be marked as lost' % (scope, name))
                            try:
                                update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.lostfile')
                            except (DatabaseException, DatabaseError) as error:
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))

                        else:
                            rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                            unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                            logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                            try:
                                update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.recovering')
                            except (DatabaseException, DatabaseError) as error:
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))

                    tot_processed += len(replicas)
                    logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - ttime), str(len(replicas))))
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        elif tot_processed == 0 or tot_bad_files == 0:
            daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
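The bad_replicas_backlog caching above goes through a dogpile.cache region, whose get() returns the NO_VALUE sentinel on a miss. A minimal in-memory region with an illustrative one-hour expiration, given only as an assumption about how REGION might be configured:

from dogpile.cache import make_region
from dogpile.cache.api import NO_VALUE

REGION = make_region().configure('dogpile.cache.memory', expiration_time=3600)

backlog = REGION.get('bad_replicas_backlog')
if backlog is NO_VALUE:
    backlog = {}  # would be recomputed here, e.g. via get_bad_replicas_backlog()
    REGION.set('bad_replicas_backlog', backlog)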
Example #8
def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                        heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get(
                'necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get(
                'necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(
                max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog(expiration_time=sleep_time)
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Necromancer: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Necromancer: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[1] and backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Necromancer: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get(
                'necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(
                max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        bad_replicas_backlog = get_bad_replicas_backlog()
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])
        list_of_rses = list()
        # If there are too many replicas, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bad replicas too big. Applying some sharing between different RSEs')
            rses = list()
            cnt = 0
            for key in sorted(bad_replicas_backlog,
                              key=bad_replicas_backlog.get,
                              reverse=False):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            list_of_rses.append(None)

        stime = time.time()
        replicas = []
        try:
            for rses in list_of_rses:
                replicas = list_bad_replicas(
                    limit=bulk,
                    thread=heart_beat['assign_thread'],
                    total_threads=heart_beat['nr_threads'],
                    rses=rses)

                for replica in replicas:
                    scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                    logger(logging.INFO,
                           'Working on %s:%s on %s' % (scope, name, rse))

                    list_replicas = get_replicas_state(scope=scope, name=name)
                    if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                        logger(
                            logging.INFO,
                            'File %s:%s has no other available or temporarily available replicas; it will be marked as lost'
                            % (scope, name))
                        try:
                            update_rules_for_lost_replica(scope=scope,
                                                          name=name,
                                                          rse_id=rse_id,
                                                          nowait=True)
                            monitor.record_counter(
                                name='necromancer.badfiles.lostfile')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))

                    else:
                        rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                        unavailable_rep = list_replicas.get(
                            ReplicaState.TEMPORARY_UNAVAILABLE, [])
                        logger(
                            logging.INFO,
                            'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s'
                            % (scope, name, str(rep), str(unavailable_rep)))
                        try:
                            update_rules_for_bad_replica(scope=scope,
                                                         name=name,
                                                         rse_id=rse_id,
                                                         nowait=True)
                            monitor.record_counter(
                                name='necromancer.badfiles.recovering')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))

                logger(
                    logging.INFO, 'It took %s seconds to process %s replicas' %
                    (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logger(
                logging.CRITICAL,
                ''.join(format_exception(exc_type, exc_value,
                                         exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logger(
                    logging.INFO,
                    'Last update of history table %s seconds ago. Running update.'
                    % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(
                    limit=1000000,
                    thread=heart_beat['assign_thread'],
                    total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logger(logging.DEBUG,
                           'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logger(
                            logging.DEBUG,
                            ' History for rse_id %s : chunk %i/%i' %
                            (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logger(
                    logging.INFO, 'History table updated in %s seconds' %
                    (time.time() - now))
                update_history_time = time.time()

            if len(replicas) == bulk:
                logger(
                    logging.INFO,
                    'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle'
                )
            else:
                daemon_sleep(start_time=stime,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
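Both Necromancer variants shard the work by grouping RSEs, sorted by backlog size, into batches whose combined backlog reaches the bulk size. The same grouping written as a standalone function (name and structure are illustrative; like the loop above, it drops a trailing batch that never reaches the bulk threshold):

def group_rses_by_backlog(bad_replicas_backlog, bulk):
    """Group RSE ids into batches whose summed backlog reaches at least `bulk` bad replicas."""
    list_of_rses, rses, cnt = [], [], 0
    for rse_id in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=True):
        rses.append({'id': rse_id})
        cnt += bad_replicas_backlog[rse_id]
        if cnt >= bulk:
            list_of_rses.append(rses)
            rses, cnt = [], 0
    return list_of_rses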