def run(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5):
    """
    Starts up the Suspicious-Replica-Recoverer threads.
    """
    setup_logging()
    logger = formatted_logger(logging.log)

    if rucio.db.sqla.util.is_old_db():
        raise DatabaseException('Database was not updated, daemon won\'t start')

    client_time, db_time = datetime.utcnow(), get_db_time()
    max_offset = timedelta(hours=1, seconds=10)
    if isinstance(db_time, datetime):
        if db_time - client_time > max_offset or client_time - db_time > max_offset:
            logger(logging.CRITICAL, 'Offset between client and db time too big. Stopping Suspicious-Replica-Recoverer.')
            return

    sanity_check(executable='rucio-replica-recoverer', hostname=socket.gethostname())

    if once:
        declare_suspicious_replicas_bad(once, younger_than, nattempts, vos, limit_suspicious_files_on_rse)
    else:
        logger(logging.INFO, 'Suspicious file replicas recovery starting 1 worker.')
        t = threading.Thread(target=declare_suspicious_replicas_bad,
                             kwargs={'once': once,
                                     'younger_than': younger_than,
                                     'nattempts': nattempts,
                                     'vos': vos,
                                     'limit_suspicious_files_on_rse': limit_suspicious_files_on_rse})
        t.start()
        logger(logging.INFO, 'Waiting for interrupts')

        # Interruptible joins require a timeout.
        while t.is_alive():
            t.join(timeout=3.14)
示例#2
0
def kronos_dataset(thread=0, dataset_queue=None, sleep_time=60):
    logging.info('kronos-dataset[%d/?] starting', thread)

    executable = 'kronos-dataset'
    hostname = socket.gethostname()
    pid = getpid()
    hb_thread = current_thread()

    dataset_wait = config_get_int('tracer-kronos', 'dataset_wait')
    start = datetime.now()
    sanity_check(executable=executable, hostname=hostname)
    while not graceful_stop.is_set():
        start_time = time()
        heart_beat = live(executable, hostname, pid, hb_thread)
        prepend_str = 'kronos-dataset[%i/%i] ' % (heart_beat['assign_thread'],
                                                  heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prepend_str + '%s')
        if (datetime.now() - start).seconds > dataset_wait:
            __update_datasets(dataset_queue, logger=logger)
            start = datetime.now()

        tottime = time() - start_time
        if tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            sleep(sleep_time - tottime)

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)

    # once again for the backlog
    logger(logging.INFO, 'cleaning dataset backlog before shutdown...')
    __update_datasets(dataset_queue)
示例#3
0
def preparer(once, sleep_time, bulk):
    # Make an initial heartbeat so that all instanced daemons have the correct worker number on the next try
    executable = 'conveyor-preparer'
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    worker_number = current_thread
    total_workers = '?'
    prefix = 'conveyor-preparer[%s/%s] ' % (worker_number, total_workers)
    daemon_logger = formatted_logger(logging.log, prefix + '%s')
    heartbeat.sanity_check(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

    try:
        graceful_stop.wait(10)  # gathering of daemons/threads on first start
        while not graceful_stop.is_set():
            start_time = time()

            pulse = heartbeat.live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
            worker_number = pulse['assign_thread']
            total_workers = pulse['nr_threads']

            prefix = 'conveyor-preparer[%s/%s] ' % (worker_number, total_workers)
            daemon_logger = formatted_logger(logging.log, prefix + '%s')

            try:
                updated_msg = run_once(total_workers=total_workers, worker_number=worker_number, limit=bulk, logger=daemon_logger)
            except RucioException:
                daemon_logger(logging.ERROR, 'errored with a RucioException, retrying later', exc_info=True)
                updated_msg = 'errored'

            if once:
                break

            end_time = time()
            time_diff = end_time - start_time
            daemon_logger(logging.INFO, '%s, taking %.3f seconds' % (updated_msg, time_diff))
            if time_diff < sleep_time:
                sleep_remaining = sleep_time - time_diff
                daemon_logger(logging.DEBUG, 'sleeping for a while :  %.2f seconds' % sleep_remaining)
                graceful_stop.wait(sleep_remaining)

        daemon_logger(logging.INFO, 'gracefully stopping')

    finally:
        heartbeat.die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
示例#4
0
def deckard_loop(scope, rses, dark_min_age, dark_threshold_percent,
                 miss_threshold_percent, force_proceed, scanner_files_path):

    prefix = 'storage-consistency-actions (deckard_loop())'
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'A loop over all RSEs')
    for rse in rses:
        logger(logging.INFO, 'Now processing RSE: %s' % rse)
        deckard(scope, rse, dark_min_age, dark_threshold_percent,
                miss_threshold_percent, force_proceed, scanner_files_path)
示例#5
0
def test_formatted_logger():
    result = None

    def log_func(level, msg, *args, **kwargs):
        nonlocal result
        result = (level, msg)

    new_log_func = formatted_logger(log_func, "a %s c")

    new_log_func(logging.INFO, "b")
    assert result == (logging.INFO, "a b c")
示例#6
0
def throttler(once=False, sleep_time=600):
    """
    Main loop to check rse transfer limits.
    """

    logging.info('Throttler starting')

    executable = 'conveyor-throttler'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all throttlers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Throttler started - timeout (%s)' % (sleep_time))

    current_time = time.time()
    graceful_stop.wait(10)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
            if heart_beat['assign_thread'] != 0:
                logging.info(prepend_str + 'Throttler thread id is not 0, will sleep. Only thread 0 will work')
                if once:
                    break
                if time.time() < current_time + sleep_time:
                    graceful_stop.wait(int((current_time + sleep_time) - time.time()))
                current_time = time.time()
                continue

            logging.info(prepend_str + "Throttler - schedule requests")
            run_once(logger=formatted_logger(logging.log, prepend_str + '%s'))

            if once:
                break
            if time.time() < current_time + sleep_time:
                graceful_stop.wait(int((current_time + sleep_time) - time.time()))
            current_time = time.time()
        except Exception:
            logging.critical(prepend_str + 'Throtter crashed %s' % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Throtter - graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Throtter - graceful stop done')
示例#7
0
    def live(self, older_than=None):
        if older_than:
            self.last_heart_beat = heartbeat.live(self.executable, self.hostname, self.pid, self.hb_thread, older_than=older_than)
        else:
            self.last_heart_beat = heartbeat.live(self.executable, self.hostname, self.pid, self.hb_thread)

        prefix = '%s[%i/%i]: ' % (self.logger_prefix, self.last_heart_beat['assign_thread'], self.last_heart_beat['nr_threads'])
        self.logger = formatted_logger(logging.log, prefix + '%s')

        if not self.last_heart_beat:
            self.logger(logging.DEBUG, 'First heartbeat set')
        else:
            self.logger(logging.DEBUG, 'Heartbeat renewed')

        return self.last_heart_beat, self.logger
示例#8
0
def rse_update(once=False, sleep_time=10):
    """
    Main loop to check and update the RSE Counters.
    """

    # Make an initial heartbeat so that all abacus-rse daemons have the correct worker number on the next try
    executable = 'abacus-rse'
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()
    live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

    while not graceful_stop.is_set():
        try:
            # Heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)

            prepend_str = 'rse_update[%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')

            # Select a bunch of rses for to update for this worker
            start = time.time()  # NOQA
            rse_ids = get_updated_rse_counters(total_workers=heartbeat['nr_threads'],
                                               worker_number=heartbeat['assign_thread'])
            logger(logging.DEBUG, 'Index query time %f size=%d' % (time.time() - start, len(rse_ids)))

            # If the list is empty, sent the worker to sleep
            if not rse_ids and not once:
                logger(logging.INFO, 'did not get any work')
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop)
            else:
                for rse_id in rse_ids:
                    if graceful_stop.is_set():
                        break
                    start_time = time.time()
                    update_rse_counter(rse_id=rse_id)
                    logger(logging.DEBUG, 'update of rse "%s" took %f' % (rse_id, time.time() - start_time))
        except Exception:
            logger(logging.ERROR, traceback.format_exc())
        if once:
            break

    logging.info('rse_update: graceful stop requested')
    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
    logging.info('rse_update: graceful stop done')
示例#9
0
def preparer(once, sleep_time, bulk, partition_wait_time=10):
    # Make an initial heartbeat so that all instanced daemons have the correct worker number on the next try
    logger_prefix = executable = 'conveyor-preparer'
    with HeartbeatHandler(executable=executable,
                          logger_prefix=logger_prefix) as heartbeat_handler:

        if partition_wait_time is not None:
            graceful_stop.wait(partition_wait_time
                               )  # gathering of daemons/threads on first start
        while not graceful_stop.is_set():
            start_time = time()

            heart_beat, logger = heartbeat_handler.live()

            worker_number = heart_beat['assign_thread']
            total_workers = heart_beat['nr_threads']

            prefix = 'conveyor-preparer[%s/%s] ' % (worker_number,
                                                    total_workers)
            daemon_logger = formatted_logger(logging.log, prefix + '%s')

            count = 0
            try:
                count, updated_msg = run_once(total_workers=total_workers,
                                              worker_number=worker_number,
                                              limit=bulk,
                                              logger=logger)
            except RucioException:
                daemon_logger(logging.ERROR,
                              'errored with a RucioException, retrying later',
                              exc_info=True)
                updated_msg = 'errored'

            if once:
                break

            end_time = time()
            time_diff = end_time - start_time
            logger(logging.INFO,
                   '%s, taking %.3f seconds' % (updated_msg, time_diff))
            if count < bulk:
                daemon_sleep(start_time=start_time,
                             sleep_time=sleep_time,
                             graceful_stop=graceful_stop,
                             logger=logger)
示例#10
0
def create_protocol(rse_settings, operation, scheme=None, domain='wan', auth_token=None, protocol_attr=None, logger=logging.log):
    """
    Instanciates the protocol defined for the given operation.

    :param rse_settings:  RSE attributes
    :param operation:     Intended operation for this protocol
    :param scheme:        Optional filter if no specific protocol is defined in rse_setting for the provided operation
    :param domain:        Optional specification of the domain
    :param auth_token:    Optionally passing JSON Web Token (OIDC) string for authentication
    :param protocol_attr: Optionally passing the full protocol availability information to correctly select WAN/LAN
    :param logger:        Optional decorated logger that can be passed from the calling daemons or servers.
    :returns:             An instance of the requested protocol
    """

    # Verify feasibility of Protocol
    operation = operation.lower()
    if operation not in utils.rse_supported_protocol_operations():
        raise exception.RSEOperationNotSupported('Operation %s is not supported' % operation)

    if domain and domain not in utils.rse_supported_protocol_domains():
        raise exception.RSEProtocolDomainNotSupported('Domain %s not supported' % domain)

    if not protocol_attr:
        protocol_attr = select_protocol(rse_settings, operation, scheme, domain)
    else:
        candidates = _get_possible_protocols(rse_settings, operation, scheme, domain)
        if protocol_attr not in candidates:
            raise exception.RSEProtocolNotSupported('Protocol %s operation %s on domain %s not supported' % (protocol_attr, operation, domain))

    # Instantiate protocol
    comp = protocol_attr['impl'].split('.')
    prefix = '.'.join(comp[-2:]) + ': '
    logger = formatted_logger(logger, prefix + "%s")
    mod = __import__('.'.join(comp[:-1]))
    for n in comp[1:]:
        try:
            mod = getattr(mod, n)
        except AttributeError as e:
            logger(logging.DEBUG, 'Protocol implementations not supported.')
            raise exception.RucioException(str(e))  # TODO: provide proper rucio exception
    protocol_attr['auth_token'] = auth_token
    protocol = mod(protocol_attr, rse_settings, logger=logger)
    return protocol
示例#11
0
文件: common.py 项目: rak108/rucio
    def live(self):
        """
        :return: a tuple: <the number of the current worker>, <total number of workers>, <decorated logger>
        """
        if not self.last_time or self.last_time < datetime.datetime.now() - datetime.timedelta(seconds=self.renewal_interval):
            if self.older_than:
                self.last_heart_beat = heartbeat.live(self.executable, self.hostname, self.pid, self.hb_thread, older_than=self.older_than)
            else:
                self.last_heart_beat = heartbeat.live(self.executable, self.hostname, self.pid, self.hb_thread)

            prefix = '%s[%i/%i]: ' % (self.logger_prefix, self.last_heart_beat['assign_thread'], self.last_heart_beat['nr_threads'])
            self.logger = formatted_logger(logging.log, prefix + '%s')

            if not self.last_time:
                self.logger(logging.DEBUG, 'First heartbeat set')
            else:
                self.logger(logging.DEBUG, 'Heartbeat renewed')
            self.last_time = datetime.datetime.now()

        return self.last_heart_beat['assign_thread'], self.last_heart_beat['nr_threads'], self.logger
示例#12
0
def hermes2(once=False, thread=0, bulk=1000, sleep_time=10):
    """
    Creates a Hermes2 Worker that can submit messages to different services (InfluXDB, ElasticSearch, ActiveMQ)
    The list of services need to be define in the config service in the hermes section.
    The list of endpoints need to be defined in rucio.cfg in the hermes section.

    :param once:       Run only once.
    :param thread:     Thread number at startup.
    :param bulk:       The number of requests to process.
    :param sleep_time: Time between two cycles.
    """

    executable = 'hermes2'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable,
                           hostname=hostname,
                           pid=pid,
                           thread=hb_thread)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)

    # Make an initial heartbeat so that all daemons have the correct worker number on the next try
    GRACEFUL_STOP.wait(10)
    heart_beat = heartbeat.live(executable,
                                hostname,
                                pid,
                                hb_thread,
                                older_than=3600)
    prepend_str = 'hermes2[%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    try:
        services_list = get('hermes', 'services_list')
        services_list = services_list.split(',')
    except ConfigNotFound:
        logger(logging.DEBUG, 'No services found, exiting')
        sys.exit(1)
    if 'influx' in services_list:
        try:
            influx_endpoint = config_get('hermes', 'influxdb_endpoint', False,
                                         None)
            if not influx_endpoint:
                logger(
                    logging.ERROR,
                    'InfluxDB defined in the services list, but no endpoint can be find. Exiting'
                )
                sys.exit(1)
        except Exception as err:
            logger(logging.ERROR, str(err))
    if 'elastic' in services_list:
        try:
            elastic_endpoint = config_get('hermes', 'elastic_endpoint', False,
                                          None)
            if not elastic_endpoint:
                logger(
                    logging.ERROR,
                    'Elastic defined in the services list, but no endpoint can be find. Exiting'
                )
                sys.exit(1)
        except Exception as err:
            logger(logging.ERROR, str(err))
    if 'activemq' in services_list:
        try:
            # activemq_endpoint = config_get('hermes', 'activemq_endpoint', False, None)
            conns, destination, username, password, use_ssl = setup_activemq(
                logger)
            if not conns:
                logger(
                    logging.ERROR,
                    'ActiveMQ defined in the services list, cannot be setup')
                sys.exit(1)
        except Exception as err:
            logger(logging.ERROR, str(err))

    while not GRACEFUL_STOP.is_set():
        message_status = copy.deepcopy(services_list)
        message_statuses = {}
        stime = time.time()
        try:
            start_time = time.time()
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'hermes2[%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            messages = retrieve_messages(
                bulk=bulk,
                thread=heart_beat['assign_thread'],
                total_threads=heart_beat['nr_threads'])

            if messages:
                for message in messages:
                    message_statuses[message['id']] = copy.deepcopy(
                        services_list)
                logger(logging.DEBUG,
                       'Retrieved %i messages retrieved in %s seconds',
                       len(messages),
                       time.time() - start_time)

                if 'influx' in message_status:
                    t_time = time.time()
                    logger(logging.DEBUG, 'Will submit to influxDB')
                    try:
                        state = aggregate_to_influx(messages=messages,
                                                    bin_size='1m',
                                                    endpoint=influx_endpoint,
                                                    logger=logger)
                    except Exception as error:
                        logger(logging.ERROR, 'Error sending to InfluxDB : %s',
                               str(error))
                        state = 500
                    if state in [204, 200]:
                        logger(
                            logging.INFO,
                            'Messages successfully submitted to influxDB in %s seconds',
                            time.time() - t_time)
                        for message in messages:
                            message_statuses[message['id']].remove('influx')
                    else:
                        logger(logging.INFO, 'Failure to submit to influxDB')

                if 'elastic' in message_status:
                    t_time = time.time()
                    try:
                        state = submit_to_elastic(messages=messages,
                                                  endpoint=elastic_endpoint,
                                                  logger=logger)
                    except Exception as error:
                        logger(logging.ERROR, 'Error sending to Elastic : %s',
                               str(error))
                        state = 500
                    if state in [200, 204]:
                        logger(
                            logging.INFO,
                            'Messages successfully submitted to elastic in %s seconds',
                            time.time() - t_time)
                        for message in messages:
                            message_statuses[message['id']].remove('elastic')
                    else:
                        logger(logging.INFO, 'Failure to submit to elastic')

                if 'emails' in message_status:
                    t_time = time.time()
                    try:
                        to_delete = deliver_emails(messages=messages,
                                                   logger=logger)
                        logger(
                            logging.INFO,
                            'Messages successfully submitted by emails in %s seconds',
                            time.time() - t_time)
                        for message_id in to_delete:
                            message_statuses[message_id].remove('emails')
                    except Exception as error:
                        logger(logging.ERROR, 'Error sending email : %s',
                               str(error))

                if 'activemq' in message_status:
                    t_time = time.time()
                    try:
                        to_delete = deliver_to_activemq(
                            messages=messages,
                            conns=conns,
                            destination=destination,
                            username=username,
                            password=password,
                            use_ssl=use_ssl,
                            logger=logger)
                        logger(
                            logging.INFO,
                            'Messages successfully submitted to ActiveMQ in %s seconds',
                            time.time() - t_time)
                        for message_id in to_delete:
                            message_statuses[message_id].remove('activemq')
                    except Exception as error:
                        logger(logging.ERROR, 'Error sending to ActiveMQ : %s',
                               str(error))

                to_delete = []
                to_update = {}
                for message in messages:
                    status = message_statuses[message['id']]
                    if not status:
                        to_delete.append({
                            'id': message['id'],
                            'created_at': message['created_at'],
                            'updated_at': message['created_at'],
                            'payload': str(message['payload']),
                            'event_type': message['event_type']
                        })
                    else:
                        status = ",".join(status)
                        if status not in to_update:
                            to_update[status] = []
                        to_update[status].append({
                            'id':
                            message['id'],
                            'created_at':
                            message['created_at'],
                            'updated_at':
                            message['created_at'],
                            'payload':
                            str(message['payload']),
                            'event_type':
                            message['event_type']
                        })
                logger(logging.INFO, 'Deleting %s messages', len(to_delete))
                delete_messages(messages=to_delete)
                for status in to_update:
                    logger(
                        logging.INFO,
                        'Failure to submit %s messages to %s. Will update the message status',
                        str(len(to_update[status])), status)
                    update_messages_services(messages=to_update[status],
                                             services=status)

            if once:
                break
            daemon_sleep(start_time=stime,
                         sleep_time=sleep_time,
                         graceful_stop=GRACEFUL_STOP,
                         logger=logger)

        except Exception:
            logger(logging.ERROR, "Failed to submit messages", exc_info=True)
示例#13
0
def rule_injector(once=False, sleep_time=60):
    """
    Main loop to check for asynchronous creation of replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-inectors have the correct worker number on the next try
    executable = 'judge-injector'
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
    prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
            prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_injected_rules(total_workers=heartbeat['nr_threads'],
                                       worker_number=heartbeat['assign_thread'],
                                       limit=100,
                                       blocked_rules=[key for key in paused_rules])
            logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            if not rules and not once:
                logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    logger(logging.INFO, 'Injecting rule %s' % rule_id)
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        inject_rule(rule_id=rule_id, logger=logger)
                        logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logger(logging.ERROR, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except (RSEWriteBlocked) as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound:
                        pass
                    except InsufficientAccountLimit:
                        # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                        # It should be marked as SUSPENDED
                        logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id)
                        update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})

        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
示例#14
0
def deckard(scope, rse, dark_min_age, dark_threshold_percent,
            miss_threshold_percent, force_proceed, scanner_files_path):
    """
    The core of CC actions.
    Use the results of the CC Scanner to check one RSE for confirmed dark files and delete them.
    Re-subscribe missing files.
    """

    prefix = 'storage-consistency-actions (running original deckard code)'
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Now running the original deckard code...')

    path = scanner_files_path
    minagedark = dark_min_age
    max_dark_fraction = dark_threshold_percent
    max_miss_fraction = miss_threshold_percent
    logger(
        logging.INFO,
        'Scanner Output Path: %s \n minagedark: %d \n max_dark_fraction: %f\
      \n max_miss_fraction: %f' %
        (path, minagedark, max_dark_fraction, max_miss_fraction))

    scanner_files = 0
    dbdump_before_files = 0
    dbdump_after_files = 0

    # Check if we have any scans available for that RSE
    if rse in list_cc_scanned_rses(path):

        # Have any of them still not been processed?
        # (no CC_dark or CC-miss sections in _stats.json)
        np_runs = list_unprocessed_runs(path, rse)
        logger(logging.INFO,
               'Found %d unprocessed runs for RSE: %s' % (len(np_runs), rse))

        latest_run = list_runs(path, rse, 1)[0]

        # Get the number of files recorded by the scanner
        logger(logging.INFO, 'latest_run %s' % latest_run)
        with open(latest_run, "r") as f:
            fstats = json.loads(f.read())
            if "scanner" in fstats:
                scanner_stats = fstats["scanner"]
                if "total_files" in scanner_stats:
                    scanner_files = scanner_stats["total_files"]
                else:
                    scanner_files = 0
                    for root_info in scanner_stats["roots"]:
                        scanner_files += root_info["files"]
            if "dbdump_before" in fstats:
                dbdump_before_files = fstats["dbdump_before"]["files"]
            if "dbdump_after" in fstats:
                dbdump_after_files = fstats["dbdump_after"]["files"]

        max_files_at_site = max(scanner_files, dbdump_before_files,
                                dbdump_after_files)
        if max_files_at_site == 0:
            logger(
                logging.WARNING,
                '\n No files reported by scanner for this run.\
             Will skip processing.')

        logger(
            logging.INFO, 'scanner_files: %d \n dbdump_before_files: %d\
          \n dbdump_after_files: %d \n max_files_at_site: %d' %
            (scanner_files, dbdump_before_files, dbdump_after_files,
             max_files_at_site))

        # Was the latest run ever attempted to be processed?
        logger(
            logging.INFO,
            'Was the latest run %s attempted to be processed already? - %s' %
            (latest_run, was_cc_attempted(latest_run)))
        if max_files_at_site > 0 and (was_cc_attempted(latest_run) is False
                                      or force_proceed is True):
            logger(logging.INFO, 'Will try to process the run')

            # Is there another run, at least "minagedark" old, for this RSE?
            old_enough_run = None
            d = list_runs_by_age(path, rse, latest_run)
            if len([k for k in d if d[k] > minagedark]) > 0:
                # i.e. there is another dark run with appropriate age
                old_enough_run = [k for k in d if d[k] > minagedark][0]
                logger(
                    logging.INFO,
                    'Found another run %d days older than the latest.\
                  \n Will compare the dark files in the two.' % minagedark)
                logger(
                    logging.INFO, 'The first  %d days older run is: %s' %
                    (minagedark, old_enough_run))

                process_dark_files(path, scope, rse, latest_run,
                                   max_dark_fraction, max_files_at_site,
                                   old_enough_run, force_proceed)
            else:
                logger(
                    logging.INFO,
                    'There is no other run for this RSE at least %d days older,\
                 so cannot safely proceed with dark files deleteion.' %
                    minagedark)

            process_miss_files(path, scope, rse, latest_run, max_miss_fraction,
                               max_files_at_site, old_enough_run,
                               force_proceed)

        else:
            # This run was already processed
            logger(logging.INFO, 'Nothing to do here')

    else:
        # No scans outputs are available for this RSE
        logger(logging.INFO, 'No scans available for this RSE')
示例#15
0
def process_miss_files(path, scope, rse, latest_run, max_miss_fraction,
                       max_files_at_site, old_enough_run, force_proceed):
    """
    Process the Missing Replicas.
    """

    prefix = 'storage-consistency-actions (process_miss_files())'
    logger = formatted_logger(logging.log, prefix + '%s')

    latest_miss = re.sub('_stats.json$', '_M.list', latest_run)
    logger(logging.INFO, 'latest_missing = %s' % latest_miss)

    # Create a cc_miss section in the stats file

    t0 = time.time()
    stats_key = "cc_miss"
    cc_stats = stats = None
    stats = Stats(latest_run)
    cc_stats = {
        "start_time": t0,
        "end_time": None,
        "initial_miss_files": 0,
        "confirmed_miss_files": 0,
        "x-check_run": old_enough_run,
        "status": "started"
    }
    stats[stats_key] = cc_stats

    ###
    #   SAFEGUARD
    #   If a large fraction (larger than 'max_miss_fraction') of the files at a site are reported as
    #   'missing', do NOT proceed with the invalidation.
    #   Instead, put a warning in the _stats.json file, so that an operator can have a look.
    ###

    miss_files = sum(1 for line in open(latest_miss))
    logger(logging.INFO, 'miss_files = %d' % miss_files)
    logger(
        logging.INFO,
        'miss_files/max_files_at_site = %f' % (miss_files / max_files_at_site))
    logger(
        logging.INFO, 'max_miss_fraction configured for this RSE (in %%): %f' %
        max_miss_fraction)

    labels = {'rse': rse}
    record_gauge('storage.consistency.actions_miss_files_found',
                 miss_files,
                 labels=labels)

    if miss_files / max_files_at_site < max_miss_fraction or force_proceed is True:
        logger(logging.INFO, 'Can proceed with missing files retransfer')

        invalidated_files = 0
        issuer = InternalAccount('root')
        with open(latest_miss, 'r') as csvfile:
            reader = csv.reader(csvfile)
            reason = "invalidating damaged/missing replica"
            for name, in reader:
                logger(
                    logging.INFO,
                    'Processing invalid replica:\n RSE: %s Scope: %s Name: %s'
                    % (rse, scope, name))

                rse_id = get_rse_id(rse=rse)
                dids = [{'scope': scope, 'name': name}]
                declare_bad_file_replicas(dids=dids,
                                          rse_id=rse_id,
                                          reason=reason,
                                          issuer=issuer)
                invalidated_files += 1
                record_counter(
                    'storage.consistency.actions_miss_files_to_retransfer_counter',
                    delta=1,
                    labels=labels)


# TODO: The stats updating can be refactored in a future version of the Stats class.
# See: https://github.com/rucio/rucio/pull/5120#discussion_r792688019
# Update the stats
            t1 = time.time()

            cc_stats.update({
                "end_time": t1,
                "initial_miss_files": miss_files,
                "confirmed_miss": invalidated_files,
                "status": "done"
            })
            stats[stats_key] = cc_stats
            record_gauge(
                'storage.consistency.actions_miss_files_to_retransfer',
                invalidated_files,
                labels=labels)

    else:
        missperc = 100. * miss_files / max_files_at_site
        logger(
            logging.WARNING, '\n Too many MISS files (%3.2f%%)!\n\
         Stopping and asking for operators help.' % missperc)

        # Update the stats
        t1 = time.time()

        cc_stats.update({
            "end_time": t1,
            "initial_miss_files": miss_files,
            "confirmed_miss_files": 0,
            "status": "ABORTED",
            "aborted_reason": "%3.2f%% miss" % missperc,
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_miss_files_to_retransfer',
                     0,
                     labels=labels)
示例#16
0
def actions_loop(once, scope, rses, sleep_time, dark_min_age,
                 dark_threshold_percent, miss_threshold_percent, force_proceed,
                 scanner_files_path):
    """
    Main loop to apply the CC actions
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    executable = 'storage-consistency-actions'
    heartbeat = live(executable=executable,
                     hostname=hostname,
                     pid=pid,
                     thread=current_thread)

    # Make an initial heartbeat
    # so that all storage-consistency-actions have the correct worker number on the next try
    prefix = 'storage-consistency-actions[%i/%i] ' %\
             (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(
        logging.INFO, 'hostname: %s  pid: %d  current_thread: %s' %
        (hostname, pid, current_thread))

    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=current_thread)
            logger(logging.INFO, 'heartbeat? %s' % heartbeat)

            prefix = 'storage-consistency-actions[%i/%i] ' %\
                     (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger(logging.INFO, 'prefix: %s' % prefix)
            start = time.time()
            logger(logging.DEBUG, 'Start time: %f' % start)

            deckard_loop(scope, rses, dark_min_age, dark_threshold_percent,
                         miss_threshold_percent, force_proceed,
                         scanner_files_path)
            daemon_sleep(start_time=start,
                         sleep_time=sleep_time,
                         graceful_stop=graceful_stop,
                         logger=logger)

        except Exception as e:
            traceback.print_exc()
            logger(logging.WARNING, '\n Something went wrong here... %s' % e)
            logger(
                logging.WARNING,
                '\n Something went wrong here... %s ' % (e.__class__.__name__))
        if once:
            break

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=current_thread)
示例#17
0
def kronos_file(thread=0, dataset_queue=None, sleep_time=60):
    """
    Main loop to consume tracer reports.
    """

    logging.info('kronos_file[%i/?] starting', thread)

    executable = 'kronos-file'
    hostname = socket.gethostname()
    pid = getpid()
    hb_thread = current_thread()

    chunksize = config_get_int('tracer-kronos', 'chunksize')
    prefetch_size = config_get_int('tracer-kronos', 'prefetch_size')
    subscription_id = config_get('tracer-kronos', 'subscription_id')
    try:
        bad_files_patterns = []
        pattern = get(section='kronos',
                      option='bad_files_patterns',
                      session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            bad_files_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        bad_files_patterns = []
    except Exception as error:
        logging.log(logging.ERROR,
                    'kronos_file[%i/?] Failed to get bad_file_patterns %s',
                    thread, str(error))
        bad_files_patterns = []

    use_ssl = True
    try:
        use_ssl = config_get_bool('tracer-kronos', 'use_ssl')
    except Exception:
        pass

    if not use_ssl:
        username = config_get('tracer-kronos', 'username')
        password = config_get('tracer-kronos', 'password')

    excluded_usrdns = set(
        config_get('tracer-kronos', 'excluded_usrdns').split(','))
    vhost = config_get('tracer-kronos',
                       'broker_virtual_host',
                       raise_exception=False)

    brokers_alias = [
        b.strip() for b in config_get('tracer-kronos', 'brokers').split(',')
    ]
    port = config_get_int('tracer-kronos', 'port')
    reconnect_attempts = config_get_int('tracer-kronos', 'reconnect_attempts')
    ssl_key_file = config_get('tracer-kronos',
                              'ssl_key_file',
                              raise_exception=False)
    ssl_cert_file = config_get('tracer-kronos',
                               'ssl_cert_file',
                               raise_exception=False)

    sanity_check(executable=executable, hostname=hostname)
    while not graceful_stop.is_set():
        start_time = time()
        heart_beat = live(executable, hostname, pid, hb_thread)
        prepend_str = 'kronos-file[%i/%i] ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prepend_str + '%s')
        conns = __get_broker_conns(brokers=brokers_alias,
                                   port=port,
                                   use_ssl=use_ssl,
                                   vhost=vhost,
                                   reconnect_attempts=reconnect_attempts,
                                   ssl_key_file=ssl_key_file,
                                   ssl_cert_file=ssl_cert_file,
                                   timeout=sleep_time,
                                   logger=logger)
        for conn in conns:
            if not conn.is_connected():
                logger(
                    logging.INFO, 'connecting to %s' %
                    str(conn.transport._Transport__host_and_ports[0]))
                record_counter('daemons.tracer.kronos.reconnect.%s' %
                               conn.transport._Transport__host_and_ports[0][0])
                conn.set_listener(
                    'rucio-tracer-kronos',
                    AMQConsumer(
                        broker=conn.transport._Transport__host_and_ports[0],
                        conn=conn,
                        queue=config_get('tracer-kronos', 'queue'),
                        chunksize=chunksize,
                        subscription_id=subscription_id,
                        excluded_usrdns=excluded_usrdns,
                        dataset_queue=dataset_queue,
                        bad_files_patterns=bad_files_patterns,
                        logger=logger))
                if not use_ssl:
                    conn.connect(username, password)
                else:
                    conn.connect()
                conn.subscribe(
                    destination=config_get('tracer-kronos', 'queue'),
                    ack='client-individual',
                    id=subscription_id,
                    headers={'activemq.prefetchSize': prefetch_size})

        tottime = time() - start_time
        if tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            sleep(sleep_time - tottime)

    logger(logging.INFO, 'graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logger(logging.INFO, 'graceful stop done')
示例#18
0
def poller(once=False,
           activities=None,
           sleep_time=60,
           fts_bulk=100,
           db_bulk=1000,
           older_than=60,
           activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    executable = 'conveyor-poller'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if activity_shares:
        activities.sort()
        executable += '--activity_shares' + str(activity_shares)
    if FILTER_TRANSFERTOOL:
        executable += ' --filter-transfertool ' + FILTER_TRANSFERTOOL

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-poller[%i/%i] : ' % (heart_beat['assign_thread'],
                                            heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(
        logging.INFO,
        'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' %
        (db_bulk, fts_bulk, timeout))

    time.sleep(
        10
    )  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-poller[%i/%i] : ' % (heart_beat['assign_thread'],
                                            heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')

    logger(logging.INFO, 'Poller started')

    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prefix = 'conveyor-poller[%i/%i] : ' % (
                heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logger(
                    logging.DEBUG,
                    'Start to poll transfers older than %i seconds for activity %s using transfer tool: %s'
                    % (older_than, activity, FILTER_TRANSFERTOOL))
                transfs = request_core.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[RequestState.SUBMITTED],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow() -
                    datetime.timedelta(seconds=older_than),
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    mode_all=False,
                    hash_variable='id',
                    activity=activity,
                    activity_shares=activity_shares,
                    transfertool=FILTER_TRANSFERTOOL)

                record_timer('daemons.conveyor.poller.000-get_next',
                             (time.time() - start_time) * 1000)

                if transfs:
                    logger(
                        logging.DEBUG, 'Polling %i transfers for activity %s' %
                        (len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    if not transf['external_host'] in xfers_ids:
                        xfers_ids[transf['external_host']] = []
                    xfers_ids[transf['external_host']].append(
                        (transf['external_id'], transf['request_id']))

                for external_host in xfers_ids:
                    external_ids = list(
                        {trf[0]
                         for trf in xfers_ids[external_host]})
                    request_ids = [trf[1] for trf in xfers_ids[external_host]]
                    for xfers in chunks(external_ids, fts_bulk):
                        # poll transfers
                        poll_transfers(external_host=external_host,
                                       xfers=xfers,
                                       request_ids=request_ids,
                                       timeout=timeout,
                                       logger=logger)

                if len(transfs) < fts_bulk / 2:
                    logger(
                        logging.INFO,
                        "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds"
                        % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time(
                        ) + sleep_time
        except Exception:
            logger(logging.CRITICAL, "Exception", exc_info=True)

        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
示例#19
0
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash,
    identifies the subscriptions matching the DIDs and
    submit a replication rule for each DID matching a subscription.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'transmogrifier'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)

        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'transmogrifier[%i/%i] : ' % (
            heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prepend_str + '%s')

        try:
            #  Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'],
                                     total_threads=heart_beat['nr_threads'],
                                     chunk_size=bulk,
                                     did_type=None):
                dids.append({
                    'scope': did['scope'],
                    'did_type': str(did['did_type']),
                    'name': did['name']
                })

            sub_dict = {3: []}
            #  Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
            #  The priority is defined as 'policyid'
            for sub in list_subscriptions(None, None):
                if sub['state'] != SubscriptionState.INACTIVE and sub[
                        'lifetime'] and (datetime.now() > sub['lifetime']):
                    update_subscription(
                        name=sub['name'],
                        account=sub['account'],
                        metadata={'state': SubscriptionState.INACTIVE})

                elif sub['state'] in [
                        SubscriptionState.ACTIVE, SubscriptionState.UPDATED
                ]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            priorities = list(sub_dict.keys())
            priorities.sort()
            #  Order the subscriptions according to their priority
            for priority in priorities:
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logger(logging.WARNING,
                   'No subscriptions defined: %s' % (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            logger(
                logging.ERROR,
                'Failed to get list of new DIDs or subscriptions: %s' %
                (str(error)))

        try:
            results = {}
            start_time = time.time()
            blacklisted_rse_id = [
                rse['id'] for rse in list_rses({'availability_write': False})
            ]
            logger(logging.DEBUG, 'In transmogrifier worker')
            identifiers = []
            #  Loop over all the new dids
            for did in dids:
                did_success = True
                if did['did_type'] == str(
                        DIDType.DATASET) or did['did_type'] == str(
                            DIDType.CONTAINER):
                    did_tag = '%s:%s' % (did['scope'].internal, did['name'])
                    results[did_tag] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            #  Check if the DID match the subscription
                            if is_matching_subscription(
                                    subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get(
                                    'split_rule', False)
                                stime = time.time()
                                results[did_tag].append(subscription['id'])
                                logger(
                                    logging.INFO,
                                    '%s:%s matches subscription %s' %
                                    (did['scope'], did['name'],
                                     subscription['name']))
                                rules = loads(
                                    subscription['replication_rules'])
                                created_rules = {}
                                cnt = 0
                                for rule_dict in rules:
                                    cnt += 1
                                    created_rules[cnt] = []
                                    # Get all the rule and subscription parameters
                                    grouping = rule_dict.get(
                                        'grouping', 'DATASET')
                                    lifetime = rule_dict.get('lifetime', None)
                                    ignore_availability = rule_dict.get(
                                        'ignore_availability', None)
                                    weight = rule_dict.get('weight', None)
                                    source_replica_expression = rule_dict.get(
                                        'source_replica_expression', None)
                                    locked = rule_dict.get('locked', None)
                                    if locked == 'True':
                                        locked = True
                                    else:
                                        locked = False
                                    purge_replicas = rule_dict.get(
                                        'purge_replicas', False)
                                    if purge_replicas == 'True':
                                        purge_replicas = True
                                    else:
                                        purge_replicas = False
                                    rse_expression = str(
                                        rule_dict['rse_expression'])
                                    comment = str(subscription['comments'])
                                    if 'comments' in rule_dict:
                                        comment = str(rule_dict['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_dict['copies'])
                                    activity = rule_dict.get(
                                        'activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity',
                                                        obj=activity,
                                                        vo=account.vo)
                                    except InputValidationError as error:
                                        logger(
                                            logging.ERROR,
                                            'Error validating the activity %s'
                                            % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)

                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False

                                    selected_rses = []
                                    chained_idx = rule_dict.get(
                                        'chained_idx', None)
                                    if chained_idx:
                                        params = {}
                                        if rule_dict.get(
                                                'associated_site_idx', None):
                                            params[
                                                'associated_site_idx'] = rule_dict.get(
                                                    'associated_site_idx',
                                                    None)
                                        logger(
                                            logging.DEBUG,
                                            '%s Chained subscription identified. Will use %s',
                                            prepend_str,
                                            str(created_rules[chained_idx]))
                                        algorithm = rule_dict.get(
                                            'algorithm', None)
                                        selected_rses = select_algorithm(
                                            algorithm,
                                            created_rules[chained_idx], params)
                                    else:
                                        # In the case of chained subscription, don't use rseselector but use the rses returned by the algorithm
                                        if split_rule:
                                            vo = account.vo
                                            rses = parse_expression(
                                                rse_expression,
                                                filter={'vo': vo})
                                            list_of_rses = [
                                                rse['id'] for rse in rses
                                            ]
                                            # Check that some rule doesn't already exist for this DID and subscription
                                            preferred_rse_ids = []
                                            for rule in list_rules(
                                                    filters={
                                                        'subscription_id':
                                                        subscription_id,
                                                        'scope': did['scope'],
                                                        'name': did['name']
                                                    }):
                                                already_existing_rses = [
                                                    (rse['rse'], rse['id']) for
                                                    rse in parse_expression(
                                                        rule['rse_expression'],
                                                        filter={'vo': vo})
                                                ]
                                                for rse, rse_id in already_existing_rses:
                                                    if (rse_id in list_of_rses
                                                        ) and (
                                                            rse_id not in
                                                            preferred_rse_ids):
                                                        preferred_rse_ids.append(
                                                            rse_id)
                                            if len(preferred_rse_ids
                                                   ) >= copies:
                                                skip_rule_creation = True
                                            rse_id_dict = {}
                                            for rse in rses:
                                                rse_id_dict[
                                                    rse['id']] = rse['rse']
                                            try:
                                                rseselector = RSESelector(
                                                    account=account,
                                                    rses=rses,
                                                    weight=weight,
                                                    copies=copies -
                                                    len(preferred_rse_ids))
                                                selected_rses = [
                                                    rse_id_dict[rse_id]
                                                    for rse_id, _, _ in
                                                    rseselector.select_rse(
                                                        0,
                                                        preferred_rse_ids=
                                                        preferred_rse_ids,
                                                        copies=copies,
                                                        blacklist=
                                                        blacklisted_rse_id)
                                                ]
                                            except (InsufficientTargetRSEs,
                                                    InsufficientAccountLimit,
                                                    InvalidRuleWeight,
                                                    RSEOverQuota) as error:
                                                logger(
                                                    logging.WARNING,
                                                    'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites'
                                                    % (subscription['name'],
                                                       account, str(error)))
                                                # Now including the blacklisted sites
                                                try:
                                                    rseselector = RSESelector(
                                                        account=account,
                                                        rses=rses,
                                                        weight=weight,
                                                        copies=copies -
                                                        len(preferred_rse_ids))
                                                    selected_rses = [
                                                        rse_id_dict[rse_id]
                                                        for rse_id, _, _ in
                                                        rseselector.select_rse(
                                                            0,
                                                            preferred_rse_ids=
                                                            preferred_rse_ids,
                                                            copies=copies,
                                                            blacklist=[])
                                                    ]
                                                    ignore_availability = True
                                                except (InsufficientTargetRSEs,
                                                        InsufficientAccountLimit,
                                                        InvalidRuleWeight,
                                                        RSEOverQuota) as error:
                                                    logger(
                                                        logging.ERROR,
                                                        'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.'
                                                        %
                                                        (subscription['name'],
                                                         account, str(error)))
                                                    monitor.record_counter(
                                                        counters=
                                                        'transmogrifier.addnewrule.errortype.%s'
                                                        % (str(error.__class__.
                                                               __name__)),
                                                        delta=1)
                                                    # The DID won't be reevaluated at the next cycle
                                                    did_success = did_success and True
                                                    continue

                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        #  Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    for rse in selected_rses:
                                                        if isinstance(
                                                                selected_rses,
                                                                dict):
                                                            source_replica_expression = selected_rses[
                                                                rse].get(
                                                                    'source_replica_expression',
                                                                    None)
                                                            weight = selected_rses[
                                                                rse].get(
                                                                    'weight',
                                                                    None)
                                                        logger(
                                                            logging.INFO,
                                                            'Will insert one rule for %s:%s on %s'
                                                            %
                                                            (did['scope'],
                                                             did['name'], rse))
                                                        rule_ids = add_rule(
                                                            dids=[{
                                                                'scope':
                                                                did['scope'],
                                                                'name':
                                                                did['name']
                                                            }],
                                                            account=account,
                                                            copies=1,
                                                            rse_expression=rse,
                                                            grouping=grouping,
                                                            weight=weight,
                                                            lifetime=lifetime,
                                                            locked=locked,
                                                            subscription_id=
                                                            subscription_id,
                                                            source_replica_expression
                                                            =source_replica_expression,
                                                            activity=activity,
                                                            purge_replicas=
                                                            purge_replicas,
                                                            ignore_availability=
                                                            ignore_availability,
                                                            comment=comment)
                                                        created_rules[
                                                            cnt].append(
                                                                rule_ids[0])
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                rule_ids = add_rule(
                                                    dids=[{
                                                        'scope': did['scope'],
                                                        'name': did['name']
                                                    }],
                                                    account=account,
                                                    copies=copies,
                                                    rse_expression=
                                                    rse_expression,
                                                    grouping=grouping,
                                                    weight=weight,
                                                    lifetime=lifetime,
                                                    locked=locked,
                                                    subscription_id=
                                                    subscription['id'],
                                                    source_replica_expression=
                                                    source_replica_expression,
                                                    activity=activity,
                                                    purge_replicas=
                                                    purge_replicas,
                                                    ignore_availability=
                                                    ignore_availability,
                                                    comment=comment)
                                                created_rules[cnt].append(
                                                    rule_ids[0])
                                                nb_rule += 1
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.done',
                                                delta=nb_rule)
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.activity.%s'
                                                % str_activity,
                                                delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule,
                                                InvalidRuleWeight,
                                                InvalidRSEExpression,
                                                StagingAreaRuleRequiresLifetime,
                                                DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logger(logging.ERROR, str(error))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed,
                                                InsufficientTargetRSEs,
                                                InsufficientAccountLimit,
                                                DatabaseException,
                                                RSEBlacklisted,
                                                RSEWriteBlocked) as error:
                                            # Errors to be retried
                                            logger(
                                                logging.ERROR,
                                                '%s Will perform an other attempt %i/%i'
                                                % (str(error), attempt + 1,
                                                   nattempt))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.unknown',
                                                delta=1)
                                            logger(logging.ERROR,
                                                   "Unexpected error",
                                                   exc_info=True)

                                    did_success = (did_success and success)
                                    if (attemptnr +
                                            1) == nattempt and not success:
                                        logger(
                                            logging.ERROR,
                                            'Rule for %s:%s on %s cannot be inserted'
                                            % (did['scope'], did['name'],
                                               rse_expression))
                                    else:
                                        logger(
                                            logging.INFO,
                                            '%s rule(s) inserted in %f seconds'
                                            % (str(nb_rule),
                                               time.time() - stime))
                    except DataIdentifierNotFound as error:
                        logger(logging.WARNING, str(error))

                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(
                            counters='transmogrifier.did.file.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(
                            counters='transmogrifier.did.dataset.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(
                            counters='transmogrifier.did.container.processed',
                            delta=1)
                    monitor.record_counter(
                        counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({
                        'scope': did['scope'],
                        'name': did['name'],
                        'did_type': did['did_type']
                    })

            time1 = time.time()

            #  Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)

            logger(logging.DEBUG,
                   'Time to set the new flag : %f' % (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(
                    name=sub['name'],
                    account=sub['account'],
                    metadata={'last_processed': datetime.now()})
            logger(
                logging.INFO,
                'It took %f seconds to process %i DIDs' % (tottime, len(dids)))
            logger(logging.DEBUG, 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration',
                                 time=1000 * tottime)
        except Exception:
            logger(logging.ERROR,
                   "Failed to run transmogrifier",
                   exc_info=True)
            monitor.record_counter(counters='transmogrifier.job.error',
                                   delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error',
                                   delta=1)
        if once is True:
            break
        if tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
示例#20
0
def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400,
            once=True, unlock=False, spread_period=0, purge_replicas=False, sleep_time=60):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    executable = 'atropos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.DEBUG, 'Starting worker')
    summary = {}
    lifetime_exceptions = {}
    rand = random.Random(hb['assign_thread'])
    for excep in rucio.core.lifetime_exception.list_exceptions(exception_id=None, states=[LifetimeExceptionsState.APPROVED, ], session=None):
        key = '{}:{}'.format(excep['scope'].internal, excep['name'])
        if key not in lifetime_exceptions:
            lifetime_exceptions[key] = excep['expires_at']
        elif lifetime_exceptions[key] < excep['expires_at']:
            lifetime_exceptions[key] = excep['expires_at']
    logger(logging.DEBUG, '%s active exceptions' % len(lifetime_exceptions))
    if not dry_run and date_check > now:
        logger(logging.ERROR, 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not GRACEFUL_STOP.is_set():

            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')

            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'], session=None)
                logger(logging.INFO, '%s rules to process' % (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    did_key = '{}:{}'.format(rule.scope.internal, rule.name)
                    logger(logging.DEBUG, 'Working on rule %s on DID %s on %s' % (rule.id, did, rule.rse_expression))

                    if (rule_idx % 1000) == 0:
                        logger(logging.INFO, '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    try:
                        rses = parse_expression(rule.rse_expression, filter_={'vo': rule.account.vo})
                    except InvalidRSEExpression:
                        logger(logging.WARNING, 'Rule %s has an RSE expression that results in an empty set: %s' % (rule.id, rule.rse_expression))
                        continue
                    eol_at = rucio.core.lifetime_exception.define_eol(rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logger(logging.WARNING, 'The computed eol %s differs from the one recorded %s for rule %s on %s at %s' % (eol_at, rule.eol_at, rule.id,
                                                                                                                                  did, rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logger(logging.WARNING, 'Cannot find rule %s on DID %s' % (rule.id, did))
                            continue

                    # Check the exceptions
                    if did_key in lifetime_exceptions:
                        if eol_at > lifetime_exceptions[did_key]:
                            logger(logging.INFO, 'Rule %s on DID %s on %s has longer expiration date than the one requested : %s' % (rule.id, did, rule.rse_expression,
                                                                                                                                     lifetime_exceptions[did_key]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logger(logging.INFO, 'Updating rule %s on DID %s on %s according to the exception till %s' % (rule.id, did, rule.rse_expression,
                                                                                                                          lifetime_exceptions[did_key]))
                            eol_at = lifetime_exceptions[did_key]
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[did_key]})
                            except RuleNotFound:
                                logger(logging.WARNING, 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at <= date_check:
                        set_metadata(scope=rule.scope, name=rule.name, key='eol_at', value=eol_at)
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            if lock['rule_id'] == rule[4]:
                                no_locks = False
                                if lock['rse_id'] not in summary:
                                    summary[lock['rse_id']] = {}
                                if did_key not in summary[lock['rse_id']]:
                                    summary[lock['rse_id']][did_key] = {'length': lock['length'] or 0, 'bytes': lock['bytes'] or 0}
                        if no_locks:
                            logger(logging.WARNING, 'Cannot find a lock for rule %s on DID %s' % (rule.id, did))
                        if not dry_run:
                            lifetime = grace_period + rand.randrange(spread_period + 1)
                            logger(logging.INFO, 'Setting %s seconds lifetime for rule %s' % (lifetime, rule.id))
                            options = {'lifetime': lifetime}
                            if purge_replicas:
                                options['purge_replicas'] = True
                            if rule.locked and unlock:
                                logger(logging.INFO, 'Unlocking rule %s', rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logger(logging.WARNING, 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

            for rse_id in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse_id]:
                    tot_datasets += 1
                    tot_files += summary[rse_id][did].get('length', 0)
                    tot_size += summary[rse_id][did].get('bytes', 0)
                vo = get_rse_vo(rse_id=rse_id)
                logger(logging.INFO, 'For RSE %s %s %s datasets will be deleted representing %s files and %s bytes' % (get_rse_name(rse_id=rse_id),
                                                                                                                       '' if vo == 'def' else 'on VO ' + vo,
                                                                                                                       tot_datasets,
                                                                                                                       tot_files,
                                                                                                                       tot_size))

            if once:
                break
            else:
                daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        logger(logging.INFO, 'Graceful stop requested')
        heartbeat.die(executable, hostname, pid, hb_thread)
        logger(logging.INFO, 'Graceful stop done')
示例#21
0
def rule_cleaner(once=False):
    """
    Main loop to check for expired replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-cleaners have the correct worker number on the next try
    executable = 'judge-cleaner'
    heartbeat = live(executable=executable,
                     hostname=hostname,
                     pid=pid,
                     thread=current_thread)
    prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'],
                                        heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=current_thread)
            prefix = 'judge-cleaner[%i/%i] ' % (heartbeat['assign_thread'],
                                                heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            start = time.time()

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_expired_rules(
                total_workers=heartbeat['nr_threads'],
                worker_number=heartbeat['assign_thread'],
                limit=200,
                blocked_rules=[key for key in paused_rules])
            logger(
                logging.DEBUG, 'index query time %f fetch size is %d' %
                (time.time() - start, len(rules)))

            if not rules and not once:
                logger(
                    logging.DEBUG, 'did not get any work (paused_rules=%s)' %
                    str(len(paused_rules)))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logger(
                        logging.INFO, 'Deleting rule %s with expression %s' %
                        (rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logger(
                            logging.DEBUG, 'deletion of %s took %f' %
                            (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError,
                            UnsupportedOperation) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow(
                            ) + timedelta(seconds=randint(600, 2400))
                            record_counter(
                                'rule.judge.exceptions.LocksDetected')
                            logger(logging.WARNING,
                                   'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])):
                            logger(logging.WARNING,
                                   'DatabaseException',
                                   exc_info=True)
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                        elif match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING,
                                   'DatabaseException',
                                   exc_info=True)
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                        else:
                            logger(logging.ERROR,
                                   'DatabaseException',
                                   exc_info=True)
                            record_counter('rule.judge.exceptions.%s' %
                                           e.__class__.__name__)
                    except RuleNotFound:
                        pass
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
            elif match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
        except Exception as e:
            logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=current_thread)
示例#22
0
def run(once=False,
        scope=None,
        rses=None,
        sleep_time=60,
        default_dark_min_age=28,
        default_dark_threshold_percent=1.0,
        default_miss_threshold_percent=1.0,
        force_proceed=False,
        default_scanner_files_path="/var/cache/consistency-dump",
        threads=1):
    """
    Starts up the Consistency-Actions.
    """

    setup_logging()

    prefix = 'storage-consistency-actions (run())'
    logger = formatted_logger(logging.log, prefix + '%s')

    # TODO: These variables should be sourced from the RSE config in the future.
    # For now, they are passed as arguments, and to emphasize that fact, we are re-assigning them:
    dark_min_age = default_dark_min_age
    dark_threshold_percent = default_dark_threshold_percent
    miss_threshold_percent = default_miss_threshold_percent
    scanner_files_path = default_scanner_files_path

    if rses == []:
        logger(logging.INFO,
               'NO RSEs passed. Will loop over all writable RSEs.')

        rses = [rse['rse'] for rse in list_rses({'availability_write': True})]

# Could limit it only to Tier-2s:
#        rses = [rse['rse'] for rse in list_rses({'tier': 2, 'availability_write': True})]

    logging.info('\n RSEs: %s' % rses)
    logger(
        logging.INFO,
        '\n RSEs: %s  \n run once: %r \n Sleep time: %d  \n Dark min age (days): %d\
     \n Dark files threshold %%: %f  \n Missing files threshold %%: %f  \n Force proceed: %r\
     \n Scanner files path: %s ' %
        (rses, once, sleep_time, dark_min_age, dark_threshold_percent,
         miss_threshold_percent, force_proceed, scanner_files_path))

    executable = 'storage-consistency-actions'
    hostname = socket.gethostname()
    sanity_check(executable=executable, hostname=hostname)

    # It was decided that for the time being this daemon is best executed in a single thread
    # TODO: If this decicion is reversed in the future, the following line should be removed.
    threads = 1

    if once:
        actions_loop(once, scope, rses, sleep_time, dark_min_age,
                     dark_threshold_percent, miss_threshold_percent,
                     force_proceed, scanner_files_path)
    else:
        logging.info('Consistency Actions starting %s threads' % str(threads))
        threads = [
            threading.Thread(target=actions_loop,
                             kwargs={
                                 'once': once,
                                 'scope': scope,
                                 'rses': rses,
                                 'sleep_time': sleep_time,
                                 'dark_min_age': dark_min_age,
                                 'dark_threshold_percent':
                                 dark_threshold_percent,
                                 'miss_threshold_percent':
                                 miss_threshold_percent,
                                 'force_proceed': force_proceed,
                                 'scanner_files_path': scanner_files_path
                             }) for i in range(0, threads)
        ]
        logger(logging.INFO, 'Threads: %d' % len(threads))
        [t.start() for t in threads]
        # Interruptible joins require a timeout.
        while threads[0].is_alive():
            [t.join(timeout=3.14) for t in threads]
示例#23
0
def stager(once=False,
           rses=None,
           mock=False,
           bulk=100,
           group_bulk=1,
           group_policy='rule',
           source_strategy=None,
           activities=None,
           sleep_time=600,
           retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None

    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = 'conveyor-stager'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-stager[%i/%i] : ' % (heart_beat['assign_thread'],
                                            heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO,
           'Stager starting with bring_online %s seconds' % (bring_online))

    time.sleep(
        10
    )  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-stager[%i/%i] : ' % (heart_beat['assign_thread'],
                                            heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Stager started')

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
            prefix = 'conveyor-stager[%i/%i] : ' % (
                heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None

            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logger(logging.INFO,
                       'Starting to get stagein transfers for %s' % (activity))
                start_time = time.time()
                transfers = __get_stagein_transfers(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    mock=mock,
                    schemes=scheme,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts,
                    logger=logger)
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.stager.get_stagein_transfers',
                               len(transfers))
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.transfers',
                    len(transfers))
                logger(
                    logging.INFO, 'Got %s stagein transfers for %s' %
                    (len(transfers), activity))

                # group transfers
                logger(logging.INFO,
                       'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk, source_strategy,
                                                   max_time_in_queue)
                record_timer('daemons.conveyor.stager.bulk_group_transfer',
                             (time.time() - start_time) * 1000 /
                             (len(transfers) if transfers else 1))

                logger(logging.INFO,
                       'Starting to submit transfers for %s' % (activity))
                # submit transfers
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        submit_transfer(external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        logger=logger)

                if len(transfers) < group_bulk:
                    logger(
                        logging.INFO,
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time(
                        ) + sleep_time
        except Exception:
            logger(logging.CRITICAL, "Exception", exc_info=True)

        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
示例#24
0
文件: finisher.py 项目: rizart/rucio
def finisher(once=False,
             sleep_time=60,
             activities=None,
             bulk=100,
             db_bulk=1000,
             partition_wait_time=10):
    """
    Main loop to update the replicas and rules based on finished requests.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.log(logging.INFO, 'No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.log(
        logging.DEBUG, "Suspicious patterns: %s" %
        [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get(
        'retry_protocol_mismatches', False)

    executable = 'conveyor-finisher'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-finisher[%i/%i] : ' % (heart_beat['assign_thread'],
                                              heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', db_bulk,
           bulk)

    if partition_wait_time:
        graceful_stop.wait(partition_wait_time)
    while not graceful_stop.is_set():

        start_time = time.time()
        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prefix = 'conveyor-finisher[%i/%i] : ' % (
                heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')
            logger(logging.DEBUG, 'Starting new cycle')
            if activities is None:
                activities = [None]

            for activity in activities:
                logger(logging.DEBUG, 'Working on activity %s', activity)
                time1 = time.time()
                reqs = request_core.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[
                        RequestState.DONE, RequestState.FAILED,
                        RequestState.LOST, RequestState.SUBMITTING,
                        RequestState.SUBMISSION_FAILED,
                        RequestState.NO_SOURCES,
                        RequestState.ONLY_TAPE_SOURCES,
                        RequestState.MISMATCH_SCHEME
                    ],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow(),
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    mode_all=True,
                    hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next',
                             (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logger(logging.DEBUG,
                           'Updating %i requests for activity %s', len(reqs),
                           activity)

                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk,
                                          suspicious_patterns,
                                          retry_protocol_mismatches,
                                          logger=logger)
                        record_timer(
                            'daemons.conveyor.finisher.handle_requests',
                            (time.time() - time3) * 1000 /
                            (len(chunk) if chunk else 1))
                        record_counter(
                            'daemons.conveyor.finisher.handle_requests',
                            len(chunk))
                    except Exception as error:
                        logger(logging.WARNING, '%s', str(error))
                if reqs:
                    logger(
                        logging.DEBUG,
                        'Finish to update %s finished requests for activity %s in %s seconds',
                        len(reqs), activity,
                        time.time() - time2)

        except (DatabaseException, DatabaseError) as error:
            if re.match('.*ORA-00054.*', error.args[0]) or re.match(
                    '.*ORA-00060.*',
                    error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logger(logging.WARNING,
                       'Lock detected when handling request - skipping: %s',
                       str(error))
            else:
                logger(logging.ERROR, 'Exception', exc_info=True)
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)

        if once:
            break

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while :  %s seconds',
                   (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

    logger(logging.INFO, 'Graceful stop requests')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=300):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None, we either use all VOs if run from "def",
    :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE
                                          is considered problematic and the suspicious replicas on that RSE
                                          are labeled as 'TEMPORARY_UNAVAILABLE'.
    :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then
                       it should sleep until sleep_time is over.
    :returns: None
    """

    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES

    executable = argv[0]

    prepend_str = 'replica_recoverer: '
    logger = formatted_logger(logging.log, prepend_str + '%s')

    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    if not multi_vo:
        if vos:
            logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.')
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set([v['vo'] for v in list_vos()])
            if invalid:
                msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logger(logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos])))

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logger(logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname())
                die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
                break

            prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers)
            logger = formatted_logger(logging.log, prepend_str + '%s')

            start = time.time()

            try:
                json_file = open("/opt/rucio/etc/suspicious_replica_recoverer.json")
            except:
                logger(logging.WARNING, "An error occured whilst trying to open the JSON file.")
                break

            try:
                json_data = json.load(json_file)
            except ValueError:
                logger(logging.WARNING, "No JSON object could be decoded.")

            # Checking that the json file is formatedd properly.
            for i, entry in enumerate(json_data):
                if "datatype" not in entry or "action" not in entry:
                    logger(logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i)
                    break

            logger(logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts)

            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'is_suspicious': True}

            for vo in vos:
                logger(logging.INFO, 'Start replica recovery for VO: %s', vo)
                recoverable_replicas = {}
                if vo not in recoverable_replicas:
                    recoverable_replicas[vo] = {}

                # rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true', filter={'vo': vo})], key=lambda k: k['rse'])
                rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true') if rse['vo'] == vo], key=lambda k: k['rse'])

                logger(logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:")
                for i in rse_list:
                    logger(logging.DEBUG, '%s', i)

                for rse in rse_list:
                    time_start_rse = time.time()
                    rse_expr = rse['rse']
                    cnt_surl_not_found = 0
                    if rse_expr not in recoverable_replicas[vo]:
                        recoverable_replicas[vo][rse_expr] = {}
                    # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs
                    suspicious_replicas_avail_elsewhere = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs)
                    # Get the suspicious replicas that are the last remaining copies
                    suspicious_replicas_last_copy = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"].value, filter_={'vo': vo}, **getfileskwargs)

                    logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr)
                    logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere))
                    for i in suspicious_replicas_avail_elsewhere:
                        logger(logging.DEBUG, '%s', i)
                    logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy))
                    for i in suspicious_replicas_last_copy:
                        logger(logging.DEBUG, '%s', i)

                    # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE.
                    if (rse['availability'] not in {4, 5, 6, 7}) and ((len(suspicious_replicas_avail_elsewhere) > 0) or (len(suspicious_replicas_last_copy) > 0)):
                        logger(logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n", rse_expr, rse['availability'])
                        continue

                    if suspicious_replicas_avail_elsewhere:
                        for replica in suspicious_replicas_avail_elsewhere:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    for rse_ in rep['rses']:
                                        if rse_ == rse_id:
                                            recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_][0], 'available_elsewhere': True}
                                            surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    if suspicious_replicas_last_copy:
                        for replica in suspicious_replicas_last_copy:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                # Should only return one rse, as there is only one replica remaining
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False}
                                    surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    logger(logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.',
                           time.time() - time_start_rse, rse_expr, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy),
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) - cnt_surl_not_found, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy))

                    if len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) != 0:
                        logger(logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr)
                        for i in recoverable_replicas[vo][rse_expr]:
                            logger(logging.DEBUG, '%s', i)

                # Log file is long and hard to read -> implement some spacing
                logger(logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start)
                logger(logging.INFO, 'Begin check for problematic RSEs.')
                time_start_check_probl = time.time()

                # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE.
                # The suspicious files are marked as temporarily unavailable.
                list_problematic_rses = []
                for rse_key in list(recoverable_replicas[vo].keys()):
                    if len(recoverable_replicas[vo][rse_key].values()) > limit_suspicious_files_on_rse:
                        list_problematic_rses.append(rse_key)
                        surls_list = []
                        for replica_value in recoverable_replicas[vo][rse_key].values():
                            surls_list.append(replica_value['surl'])

                        add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3))

                        logger(logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse)
                        logger(logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key)
                        for rse_values in recoverable_replicas[vo][rse_key].values():
                            logger(logging.INFO, 'Scope: %s    Name: %s', rse_values['scope'], rse_values['name'])
                        # Remove the RSE from the dictionary as it has been dealt with.
                        del recoverable_replicas[vo][rse_key]

                logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses))
                for rse in list_problematic_rses:
                    logger(logging.INFO, "%s", rse)

                # Label suspicious replicas as bad if they have oher copies on other RSEs (that aren't also marked as suspicious).
                # If they are the last remaining copies, deal with them differently.
                for rse_key in list(recoverable_replicas[vo].keys()):
                    files_to_be_declared_bad = []
                    files_to_be_ignored = []
                    # Remove RSEs from dictionary that don't have any suspicious replicas
                    if len(recoverable_replicas[vo][rse_key]) == 0:
                        del recoverable_replicas[vo][rse_key]
                        continue
                    # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there
                    rse_id = list(recoverable_replicas[vo][rse_key].values())[0]['rse_id']
                    for replica_key in list(recoverable_replicas[vo][rse_key].keys()):
                        if recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is True:
                            # Replicas with other copies on at least one other RSE can safely be labeled as bad
                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                            # Remove replica from dictionary
                            del recoverable_replicas[vo][rse_key][replica_key]
                        elif recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is False:
                            if (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("log.")) or (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("user")):
                                # Don't keep log files or user files
                                files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                del recoverable_replicas[vo][rse_key][replica_key]
                            else:
                                # Deal with replicas based on their metadata.
                                file_metadata = get_metadata(recoverable_replicas[vo][rse_key][replica_key]["scope"], recoverable_replicas[vo][rse_key][replica_key]["name"])
                                if file_metadata["datatype"] is None:  # "None" type has no function "split()"
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                    continue
                                for i in json_data:
                                    if i["datatype"] == file_metadata["datatype"].split("_")[-1]:
                                        action = i["action"]
                                        if action == "ignore":
                                            files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        elif action == "declare bad":
                                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        else:
                                            logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)",
                                                   rse_key, replica_key, recoverable_replicas[vo][rse_key][replica_key]['surl'], i["datatype"], i["action"])
                                        break
                                    else:
                                        # If no policy has be set, default to ignoring the file (no action taken).
                                        files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])

                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key)
                    for i in files_to_be_ignored:
                        logger(logging.INFO, '%s', i)
                    logger(logging.INFO, '(%s) Remaining replica (pfns) that will be declared BAD:', rse_key)
                    for i in files_to_be_declared_bad:
                        logger(logging.INFO, '%s', i)

                    if files_to_be_declared_bad:
                        logger(logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id))

                        declare_bad_file_replicas(pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None)

                        logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key)

                logger(logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl)

                time_passed = time.time() - start
                logger(logging.INFO, 'Total time: %s seconds', time_passed)
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
            elif match('.*ORA-03135.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
            else:
                logger(logging.CRITICAL, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    logger(logging.INFO, 'Graceful stop done.')
示例#26
0
文件: bb8.py 项目: pradeepjasal/rucio
def rule_rebalancer(rse_expression, move_subscriptions=False, use_dump=False, sleep_time=300, once=True, dry_run=False):
    """
    Main loop to rebalancer rules automatically
    """

    total_rebalance_volume = 0
    executable = 'rucio-bb8'
    hostname = socket.gethostname()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'bb8[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.DEBUG, 'rse_expression: %s', rse_expression)
    logger(logging.INFO, 'BB8 started')

    while not GRACEFUL_STOP.is_set():
        logger(logging.INFO, 'Starting new cycle')
        heart_beat = live(executable, hostname, pid, hb_thread)
        start_time = time.time()
        total_rebalance_volume = 0
        tolerance = config_core.get('bb8', 'tolerance', default=0.05)
        max_total_rebalance_volume = config_core.get('bb8', 'max_total_rebalance_volume', default=10 * 1E12)
        max_rse_rebalance_volume = config_core.get('bb8', 'max_rse_rebalance_volume', default=500 * 1E9)
        min_total = config_core.get('bb8', 'min_total', default=20 * 1E9)
        payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None)
        if rse_expression in payload_cnt:
            logger(logging.WARNING, 'One BB8 instance already running with the same RSE expression. Stopping')
            break
        else:
            # List the RSEs represented by rse_expression
            try:
                rses = [rse for rse in parse_expression(rse_expression)]
                list_rses2 = [rse['rse'] for rse in rses]
            except InvalidRSEExpression as err:
                logger(logging.ERROR, err)
                break
            # List the RSEs represented by all the RSE expressions stored in heartbeat payload
            list_rses1 = []
            for rse_exp in payload_cnt:
                if rse_exp:
                    list_rses1 = [rse['rse'] for rse in parse_expression(rse_exp)]
            for rse in list_rses2:
                if rse in list_rses1:
                    logger(logging.WARNING, 'Overlapping RSE expressions %s vs %s. Stopping', rse_exp, rse_expression)
                    break

            logger(logging.INFO, 'Will process rebalancing on %s', rse_expression)
            heart_beat = live(executable, hostname, pid, hb_thread, older_than=max(600, sleep_time), hash_executable=None, payload=rse_expression, session=None)
            total_primary = 0
            total_secondary = 0
            total_total = 0
            global_ratio = float(0)
            for rse in rses:
                logger(logging.DEBUG, 'Getting RSE usage on %s', rse['rse'])
                rse_usage = get_rse_usage(rse_id=rse['id'])
                usage_dict = {}
                for item in rse_usage:
                    # TODO Check last update
                    usage_dict[item['source']] = {'used': item['used'], 'free': item['free'], 'total': item['total']}

                try:
                    rse['primary'] = usage_dict['rucio']['used'] - usage_dict['expired']['used']
                    rse['secondary'] = usage_dict['expired']['used']
                    rse['total'] = usage_dict['storage']['total'] - usage_dict['min_free_space']['used']
                    rse['ratio'] = float(rse['primary']) / float(rse['total'])
                except KeyError as err:
                    logger(logging.ERROR, 'Missing source usage %s for RSE %s. Exiting', err, rse['rse'])
                    break
                total_primary += rse['primary']
                total_secondary += rse['secondary']
                total_total += float(rse['total'])
                rse['receive_volume'] = 0  # Already rebalanced volume in this run
                global_ratio = float(total_primary) / float(total_total)
                logger(logging.INFO, 'Global ratio: %f' % (global_ratio))

            for rse in sorted(rses, key=lambda k: k['ratio']):
                logger(logging.INFO, '%s Sec/Prim local ratio (%f) vs global %s', rse['rse'], rse['ratio'], global_ratio)
            rses_over_ratio = sorted([rse for rse in rses if rse['ratio'] > global_ratio + global_ratio * tolerance], key=lambda k: k['ratio'], reverse=True)
            rses_under_ratio = sorted([rse for rse in rses if rse['ratio'] < global_ratio - global_ratio * tolerance], key=lambda k: k['ratio'], reverse=False)

            # Excluding RSEs
            logger(logging.DEBUG, 'Excluding RSEs as destination which are too small by size:')
            for des in rses_under_ratio:
                if des['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(logging.DEBUG, 'Excluding RSEs as sources which are too small by size:')
            for src in rses_over_ratio:
                if src['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)
            logger(logging.DEBUG, 'Excluding RSEs as destinations which are not available for write:')
            for des in rses_under_ratio:
                if des['availability'] & 2 == 0:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(logging.DEBUG, 'Excluding RSEs as sources which are not available for read:')
            for src in rses_over_ratio:
                if src['availability'] & 4 == 0:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)

            # Gets the number of active transfers per location
            dict_locks = get_active_locks(session=None)

            # Loop over RSEs over the ratio
            for index, source_rse in enumerate(rses_over_ratio):

                # The volume that would be rebalanced, not real availability of the data:
                available_source_rebalance_volume = int((source_rse['primary'] - global_ratio * source_rse['secondary']) / (global_ratio + 1))
                if available_source_rebalance_volume > max_rse_rebalance_volume:
                    available_source_rebalance_volume = max_rse_rebalance_volume
                if available_source_rebalance_volume > max_total_rebalance_volume - total_rebalance_volume:
                    available_source_rebalance_volume = max_total_rebalance_volume - total_rebalance_volume

                # Select a target:
                for destination_rse in rses_under_ratio:
                    if available_source_rebalance_volume > 0:
                        vo_str = ' on VO {}'.format(destination_rse['vo']) if destination_rse['vo'] != 'def' else ''
                        if index == 0 and destination_rse['id'] in dict_locks:
                            replicating_volume = dict_locks[destination_rse['id']]['bytes']
                            logger(logging.DEBUG, 'Already %f TB replicating to %s%s', replicating_volume / 1E12, destination_rse['rse'], vo_str)
                            destination_rse['receive_volume'] += replicating_volume
                        if destination_rse['receive_volume'] >= max_rse_rebalance_volume:
                            continue
                        available_target_rebalance_volume = max_rse_rebalance_volume - destination_rse['receive_volume']
                        if available_target_rebalance_volume >= available_source_rebalance_volume:
                            available_target_rebalance_volume = available_source_rebalance_volume

                        logger(logging.INFO, 'Rebalance %d TB from %s(%f) to %s(%f)%s', available_target_rebalance_volume / 1E12, source_rse['rse'], source_rse['ratio'], destination_rse['rse'], destination_rse['ratio'], vo_str)
                        expr = destination_rse['rse']
                        rebalance_rse(rse_id=source_rse['id'], max_bytes=available_target_rebalance_volume, dry_run=dry_run, comment='Background rebalancing', force_expression=expr, logger=logger)

                        destination_rse['receive_volume'] += available_target_rebalance_volume
                        total_rebalance_volume += available_target_rebalance_volume
                        available_source_rebalance_volume -= available_target_rebalance_volume

        if once:
            break

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while : %f seconds', sleep_time - time_diff)
            GRACEFUL_STOP.wait(sleep_time - time_diff)

    die(executable='rucio-bb8', hostname=hostname, pid=pid, thread=hb_thread)
示例#27
0
def automatix(sites,
              inputfile,
              sleep_time,
              account,
              worker_number=1,
              total_workers=1,
              scope='tests',
              once=False,
              dataset_lifetime=None,
              set_metadata=False):
    sleep(sleep_time * (total_workers - worker_number) / total_workers)

    executable = 'automatix'
    hostname = socket.getfqdn()
    pid = getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers)
    logger = formatted_logger(logging.log, prefix + '%s')
    while not GRACEFUL_STOP.is_set():
        heartbeat.live(executable, hostname, pid, hb_thread)
        starttime = time()
        prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers)
        logger = formatted_logger(logging.log, prefix + '%s')
        logger(logging.INFO, 'Getting data distribution')
        probabilities, data = get_data_distribution(inputfile)
        logger(logging.DEBUG, 'Probabilities %s', probabilities)
        totretries = 3
        status = False

        for site in sites:

            for retry in range(0, totretries):
                start_time = time()
                tmpdir = tempfile.mkdtemp()
                logger(logging.INFO, 'Running on site %s', site)
                dic = choose_element(probabilities, data)
                metadata = dic['metadata']
                try:
                    nbfiles = dic['nbfiles']
                except KeyError:
                    nbfiles = 2
                    logger(
                        logging.WARNING,
                        'No nbfiles defined in the configuration, will use 2')
                try:
                    filesize = dic['filesize']
                except KeyError:
                    filesize = 1000000
                    logger(
                        logging.WARNING,
                        'No filesize defined in the configuration, will use 1M files'
                    )
                dsn = generate_didname(metadata, None, 'dataset')
                fnames = []
                lfns = []
                physical_fnames = []
                for _ in range(nbfiles):
                    fname = generate_didname(metadata=metadata,
                                             dsn=dsn,
                                             did_type='file')
                    lfns.append(fname)
                    logger(logging.INFO, 'Generating file %s in dataset %s',
                           fname, dsn)
                    physical_fname = '%s/%s' % (tmpdir, "".join(
                        fname.split('/')))
                    physical_fnames.append(physical_fname)
                    generate_file(physical_fname, filesize, logger=logger)
                    fnames.append(fname)
                logger(logging.INFO, 'Upload %s to %s', dsn, site)
                dsn = '%s:%s' % (scope, dsn)
                status = upload(files=lfns,
                                scope=scope,
                                metadata=metadata,
                                rse=site,
                                account=account,
                                source_dir=tmpdir,
                                dataset_lifetime=dataset_lifetime,
                                did=dsn,
                                set_metadata=set_metadata,
                                logger=logger)
                for physical_fname in physical_fnames:
                    remove(physical_fname)
                rmdir(tmpdir)
                if status:
                    monitor.record_counter(
                        counters='automatix.addnewdataset.done', delta=1)
                    monitor.record_counter(
                        counters='automatix.addnewfile.done', delta=nbfiles)
                    monitor.record_timer('automatix.datasetinjection',
                                         (time() - start_time) * 1000)
                    break
                else:
                    logger(
                        logging.INFO,
                        'Failed to upload files. Will retry another time (attempt %s/%s)',
                        str(retry + 1), str(totretries))
        if once is True:
            logger(logging.INFO, 'Run with once mode. Exiting')
            break
        tottime = time() - starttime
        if status:
            logger(logging.INFO,
                   'It took %s seconds to upload one dataset on %s',
                   str(tottime), str(sites))
            if sleep_time > tottime:
                logger(logging.INFO, 'Will sleep for %s seconds',
                       str(sleep_time - tottime))
                sleep(sleep_time - tottime)
        else:
            logger(logging.INFO, 'Retrying a new upload')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
示例#28
0
def undertaker(worker_number=1,
               total_workers=1,
               chunk_size=5,
               once=False,
               sleep_time=60):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    executable = 'undertaker'
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)

    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             older_than=6000)
            prepend_str = 'undertaker [%i/%i] : ' % (
                heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))

            start = time.time()

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'],
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)

            dids = [
                did for did in dids
                if (did['scope'], did['name']) not in paused_dids
            ]

            if not dids and not once:
                logger(logging.INFO, 'Nothing to do. sleep 60.')
                daemon_sleep(start_time=start,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logger(logging.INFO, 'Receive %s dids to delete',
                           len(chunk))
                    delete_dids(dids=chunk,
                                account=InternalAccount('root', vo='def'),
                                expire_rules=True)
                    logger(logging.INFO, 'Delete %s dids', len(chunk))
                    record_counter(name='undertaker.delete_dids',
                                   delta=len(chunk))
                except RuleNotFound as error:
                    logger(logging.ERROR, error)
                except (DatabaseException, DatabaseError,
                        UnsupportedOperation) as e:
                    if match('.*ORA-00054.*', str(e.args[0])) or match(
                            '.*55P03.*', str(e.args[0])) or match(
                                '.*3572.*', str(e.args[0])):
                        for did in chunk:
                            paused_dids[(
                                did['scope'],
                                did['name'])] = datetime.utcnow() + timedelta(
                                    seconds=randint(600, 2400))
                        record_counter(
                            'undertaker.delete_dids.exceptions.{exception}',
                            labels={'exception': 'LocksDetected'})
                        logger(logging.WARNING, 'Locks detected for chunk')
                    else:
                        logger(logging.ERROR, 'Got database error %s.', str(e))
        except:
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
示例#29
0
文件: reaper.py 项目: rcarpa/rucio
def reaper(rses,
           include_rses,
           exclude_rses,
           vos=None,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60,
           auto_exclude_threshold=100,
           auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses:                   List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses:           RSE expression to include RSEs.
    :param exclude_rses:           RSE expression to exclude RSEs from the Reaper.
    :param vos:                    VOs on which to look for RSEs. Only used in multi-VO mode.
                                   If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size:             The size of chunk for deletion.
    :param once:                   If True, only runs one iteration of the main loop.
    :param greedy:                 If True, delete right away replicas with tombstone.
    :param scheme:                 Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:          The delay to query replicas in BEING_DELETED state.
    :param sleep_time:             Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout:   Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # try to get auto exclude parameters from the config table. Otherwise use CLI parameters.
        try:
            auto_exclude_threshold = config_get('reaper',
                                                'auto_exclude_threshold',
                                                default=auto_exclude_threshold)
            auto_exclude_timeout = config_get('reaper',
                                              'auto_exclude_timeout',
                                              default=auto_exclude_timeout)
        except (NoOptionError, NoSectionError, RuntimeError):
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get(
                'reaper', 'max_evaluator_backlog_count')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get(
                'reaper', 'max_evaluator_backlog_duration')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses,
                                              vos)
        if not rses_to_process:
            logger(logging.ERROR,
                   'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                              heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blocklisted
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blocklisted for delete',
                           rse['rse'])
                    continue
                needed_free_space, only_delete_obsolete = __check_rse_usage(
                    rse['rse'], rse['id'], greedy=greedy, logger=logger)
                if needed_free_space:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                    tot_needed_free_space += needed_free_space
                elif only_delete_obsolete:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                else:
                    logger(logging.DEBUG, 'Nothing to delete on %s',
                           rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True))
            logger(logging.DEBUG,
                   'List of RSEs to process ordered by needed space desc: %s',
                   str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversily with the number of workers
                # The ceil increase the weight of the RSE with small amount of files to delete
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] /
                                       tot_needed_free_space * 1000 /
                                       heart_beat['nr_threads'])
                else:
                    max_workers = 1

                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            paused_rses = []
            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id,
                                    expiration_time=120)
                if result is not NO_VALUE:
                    paused_rses.append(rse_name)
                    logger(
                        logging.DEBUG,
                        'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                        rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id,
                                    expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(
                        logging.WARNING,
                        'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.',
                        rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)
                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(
                    logging.DEBUG,
                    'Working on %s. Percentage of the total space needed %.2f',
                    rse_name, percent)
                try:
                    rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                except KeyError:
                    logger(logging.DEBUG, "Hostname lookup for %s failed.",
                           rse_name)
                    REGION.set('pause_deletion_%s' % rse_id, True)
                    continue
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]
                max_deletion_thread = get_max_deletion_threads_by_hostname(
                    rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(
                        logging.DEBUG,
                        'Too many deletion threads for %s on RSE %s. Back off',
                        rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue
                logger(
                    logging.INFO,
                    'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i',
                       rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][1]
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(
                                logging.DEBUG,
                                'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                                rse_name)
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes_=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            only_delete_obsolete=only_delete_obsolete,
                            session=None)
                    logger(
                        logging.DEBUG,
                        'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(
                            logging.DEBUG,
                            'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)
                # Physical  deletion will take place there
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme,
                                                  logger=logger)
                    for file_replicas in chunks(replicas, chunk_size):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope':
                                                replica['scope'].external,
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values())[0])
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logger(
                                    logging.WARNING,
                                    'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s',
                                    replica['scope'], replica['name'],
                                    rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logger(logging.CRITICAL,
                                       'Exception',
                                       exc_info=True)

                        deleted_files = delete_from_storage(
                            file_replicas,
                            prot,
                            rse_info,
                            staging_areas,
                            auto_exclude_threshold,
                            logger=logger)
                        logger(logging.INFO,
                               '%i files processed in %s seconds',
                               len(file_replicas),
                               time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(
                            logging.DEBUG,
                            'delete_replicas successed on %s : %s replicas in %s seconds',
                            rse_name, len(deleted_files),
                            time.time() - del_start)
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if paused_rses:
                logger(logging.INFO,
                       'Deletion paused for a while for following RSEs: %s',
                       ', '.join(paused_rses))

            if once:
                break

            daemon_sleep(start_time=start_time,
                         sleep_time=sleep_time,
                         graceful_stop=GRACEFUL_STOP,
                         logger=logger)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper:  %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
    return
示例#30
0
def reaper(rses=[],
           worker_number=0,
           total_workers=1,
           chunk_size=100,
           once=False,
           scheme=None,
           sleep_time=60):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param total_workers:  The total number of workers.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    logging.info('Starting Light Reaper %s-%s: Will work on RSEs: %s',
                 worker_number, total_workers,
                 ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(sys.argv[0] +
                                     ''.join([rse['rse']
                                              for rse in rses])).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            prepend_str = 'light-reaper [%i/%i] : ' % (
                heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            random.shuffle(rses)
            for rse in rses:
                rse_id = rse['id']
                rse = rse['rse']
                replicas = list_expired_temporary_dids(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=worker_number,
                    total_workers=total_workers)

                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse_protocol = rse_core.get_rse_protocols(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        try:
                            # pfn = str(rsemgr.lfns2pfns(rse_settings=rse_info,
                            #                            lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                            #                            operation='delete', scheme=scheme).values()[0])
                            pfn = 's3://%s%s%s' % (prot.attributes['hostname'],
                                                   prot.attributes['prefix'],
                                                   replica['name'])
                            # logging.debug('Light Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, total_workers, replica['scope'], replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(
                                logging.INFO,
                                'Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                replica['scope'], replica['name'], pfn, rse,
                                duration)
                            payload = {
                                'scope': replica['scope'].external,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica.get('bytes') or 0,
                                'bytes': replica.get('bytes') or 0,
                                'url': pfn,
                                'duration': duration,
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (
                                replica['scope'], replica['name'], pfn, rse)
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = 'Deletion NOACCESS of %s:%s as %s on %s: %s' % (
                                replica['scope'], replica['name'], pfn, rse,
                                str(error))
                            logger(logging.WARNING, err_msg)
                            payload = {
                                'scope': replica['scope'].external,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica['bytes'] or 0,
                                'bytes': replica['bytes'] or 0,
                                'url': pfn,
                                'reason': str(error),
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)

                        except:
                            logger(logging.CRITICAL, traceback.format_exc())
                finally:
                    prot.close()

                delete_temporary_dids(dids=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logger(logging.INFO, 'Nothing to do. I will sleep for 60s')
                daemon_sleep(start_time=start_time,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return