예제 #1
0
            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=10000000,
                                                         thread=hb['assign_thread'],
                                                         total_threads=hb['nr_threads'])
                for rse_id in bad_replicas:
                    update_bad_replicas_history(bad_replicas[rse_id], rse_id)
                logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            tottime = time.time() - stime
            if tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')


def run(threads=1, bulk=100, once=False):
예제 #2
0
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """

    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

        stime = time.time()
        replicas = []
        try:
            replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                list_replicas = get_replicas_state(scope=scope, name=name)
                if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                    logging.info(prepend_str + 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                    unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000,
                                                         thread=heart_beat['assign_thread'],
                                                         total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logging.debug(prepend_str + 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logging.debug(prepend_str + ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            tottime = time.time() - stime
            if len(replicas) == bulk:
                logging.info(prepend_str + 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            elif tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
예제 #3
0
파일: necromancer.py 프로젝트: rcarpa/rucio
def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                        heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get(
                'necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get(
                'necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(
                max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog(expiration_time=sleep_time)
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Necromancer: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Necromancer: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Necromancer: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get(
                'necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(
                max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        bad_replicas_backlog = get_bad_replicas_backlog()
        tot_bad_files = sum(
            [bad_replicas_backlog[key] for key in bad_replicas_backlog])
        list_of_rses = list()
        # If too many replica, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(
                bad_replicas_backlog) > 1:
            logger(
                logging.INFO,
                'Backlog of bads replica too big. Apply some sharing between different RSEs'
            )
            rses = list()
            cnt = 0
            for key in sorted(bad_replicas_backlog,
                              key=bad_replicas_backlog.get,
                              reverse=False):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            list_of_rses.append(None)

        stime = time.time()
        replicas = []
        try:
            for rses in list_of_rses:
                replicas = list_bad_replicas(
                    limit=bulk,
                    thread=heart_beat['assign_thread'],
                    total_threads=heart_beat['nr_threads'],
                    rses=rses)

                for replica in replicas:
                    scope, name, rse_id, rse = replica['scope'], replica[
                        'name'], replica['rse_id'], replica['rse']
                    logger(logging.INFO,
                           'Working on %s:%s on %s' % (scope, name, rse))

                    list_replicas = get_replicas_state(scope=scope, name=name)
                    if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                        logger(
                            logging.INFO,
                            'File %s:%s has no other available or temporary available replicas, it will be marked as lost'
                            % (scope, name))
                        try:
                            update_rules_for_lost_replica(scope=scope,
                                                          name=name,
                                                          rse_id=rse_id,
                                                          nowait=True)
                            monitor.record_counter(
                                name='necromancer.badfiles.lostfile')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))

                    else:
                        rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                        unavailable_rep = list_replicas.get(
                            ReplicaState.TEMPORARY_UNAVAILABLE, [])
                        logger(
                            logging.INFO,
                            'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s'
                            % (scope, name, str(rep), str(unavailable_rep)))
                        try:
                            update_rules_for_bad_replica(scope=scope,
                                                         name=name,
                                                         rse_id=rse_id,
                                                         nowait=True)
                            monitor.record_counter(
                                name='necromancer.badfiles.recovering')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))

                logger(
                    logging.INFO, 'It took %s seconds to process %s replicas' %
                    (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logger(
                logging.CRITICAL,
                ''.join(format_exception(exc_type, exc_value,
                                         exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logger(
                    logging.INFO,
                    'Last update of history table %s seconds ago. Running update.'
                    % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(
                    limit=1000000,
                    thread=heart_beat['assign_thread'],
                    total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logger(logging.DEBUG,
                           'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logger(
                            logging.DEBUG,
                            ' History for rse_id %s : chunk %i/%i' %
                            (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logger(
                    logging.INFO, 'History table updated in %s seconds' %
                    (time.time() - now))
                update_history_time = time.time()

            if len(replicas) == bulk:
                logger(
                    logging.INFO,
                    'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle'
                )
            else:
                daemon_sleep(start_time=stime,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')