Example #1
    def load_info(self, session=None):
        # Lazily fetch the RSE settings once and cache them on the instance.
        if self.info is None:
            self.info = rsemgr.get_rse_info(
                rse=self.load_name(session=session),
                vo=rse_core.get_rse_vo(rse_id=self.id, session=session),
                session=session)
        return self.info
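The pattern above is lazy loading with instance-level caching: the expensive rsemgr.get_rse_info() call runs at most once per object and the result is reused afterwards. A minimal self-contained sketch of the same pattern (fetch_settings and the dict it returns are illustrative stand-ins, not Rucio API):

def fetch_settings(rse_id, session=None):
    # stand-in for rsemgr.get_rse_info(); imagine a costly DB or manager call
    return {'id': rse_id, 'protocols': []}

class CachedRSE:
    def __init__(self, rse_id):
        self.id = rse_id
        self.info = None  # filled on first access

    def load_info(self, session=None):
        if self.info is None:  # fetch once, then serve the cached dict
            self.info = fetch_settings(self.id, session=session)
        return self.info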
Example #2
def list_rebalance_rule_candidates(rse_id, mode=None, session=None):
    """
    List the rebalance rule candidates based on the agreed-upon specification.

    :param rse_id:       RSE of the source.
    :param mode:         Rebalancing mode.
    :param session:      DB Session.
    """

    vo = get_rse_vo(rse_id=rse_id)

    # dumps can be applied only for decommissioning, since the dumps don't contain info from DIDs
    if mode == 'decommission':
        return _list_rebalance_rule_candidates_dump(rse_id, mode)

    # the rest is done with an SQL query: keep rules that expire more than
    # 60 days from now (or never) and were created more than 60 days ago
    from_date = datetime.utcnow() + timedelta(days=60)
    to_date = datetime.utcnow() - timedelta(days=60)
    allowed_accounts = [
        InternalAccount(a, vo=vo) for a in ('panda', 'root', 'ddmadmin')
    ]
    allowed_grouping = [RuleGrouping.DATASET, RuleGrouping.ALL]
    external_dsl = aliased(models.DatasetLock)
    count_locks = select([func.count()]).where(
        and_(external_dsl.scope == models.DatasetLock.scope,
             external_dsl.name == models.DatasetLock.name,
             external_dsl.rse_id == models.DatasetLock.rse_id)).as_scalar()
    query = session.query(models.DatasetLock.scope,
                          models.DatasetLock.name,
                          models.ReplicationRule.id,
                          models.ReplicationRule.rse_expression,
                          models.ReplicationRule.subscription_id,
                          models.DataIdentifier.bytes,
                          models.DataIdentifier.length,
                          case([(or_(models.DatasetLock.length < 1, models.DatasetLock.length.is_(None)), 0)],
                               else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, Integer))).\
        join(models.ReplicationRule, models.ReplicationRule.id == models.DatasetLock.rule_id).\
        join(models.DataIdentifier, and_(models.DatasetLock.scope == models.DataIdentifier.scope, models.DatasetLock.name == models.DataIdentifier.name)).\
        filter(models.DatasetLock.rse_id == rse_id).\
        filter(or_(models.ReplicationRule.expires_at > from_date, models.ReplicationRule.expires_at.is_(None))).\
        filter(models.ReplicationRule.created_at < to_date).\
        filter(models.ReplicationRule.account.in_(allowed_accounts)).\
        filter(models.ReplicationRule.state == RuleState.OK).\
        filter(models.ReplicationRule.did_type == DIDType.DATASET).\
        filter(models.ReplicationRule.copies == 1).\
        filter(models.ReplicationRule.child_rule_id.is_(None)).\
        filter(models.ReplicationRule.grouping.in_(allowed_grouping)).\
        filter(models.DataIdentifier.bytes.isnot(None)).\
        filter(models.DataIdentifier.is_open == 0).\
        filter(models.DataIdentifier.did_type == DIDType.DATASET).\
        filter(case([(or_(models.DatasetLock.length < 1, models.DatasetLock.length.is_(None)), 0)],
                    else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, Integer)) > 1000000000).\
        filter(count_locks == 1)
    summary = query.order_by(
        case([(or_(models.DatasetLock.length < 1,
                   models.DatasetLock.length.is_(None)), 0)],
             else_=cast(models.DatasetLock.bytes / models.DatasetLock.length,
                        Integer)), models.DatasetLock.accessed_at).all()
    return summary
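A hedged usage sketch: the function returns plain result tuples in the column order of the query above. Here get_session is Rucio's session factory from rucio.db.sqla.session, and src_rse_id is a placeholder for a real RSE id:

from rucio.db.sqla.session import get_session

session = get_session()
candidates = list_rebalance_rule_candidates(rse_id=src_rse_id, session=session)
for (scope, name, rule_id, rse_expression, subscription_id,
        size, length, avg_file_size) in candidates:
    print('rule %s on %s:%s (%s bytes)' % (rule_id, scope, name, size))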
Example #3
def __dump_url(rse_id, logger=logging.log):
    """
    getting potential urls of the dump over last week
    :param rse_id:                     RSE where the dump is released.
    :param logger:                     Logger.
    """

    rse = get_rse_name(rse_id=rse_id)
    vo = get_rse_vo(rse_id=rse_id)

    # get the date of the most recent dump
    today = date.today()
    dump_dates = []
    dump_production_day = config_get('bb8',
                                     'dump_production_day',
                                     raise_exception=False,
                                     default=None)
    if dump_production_day is None:
        for idx in range(0, 7):
            dump_date = today - timedelta(idx)
            dump_dates.append(dump_date.strftime('%d-%m-%Y'))
    else:
        weekdays = {
            'Sunday': 6,
            'Monday': 0,
            'Tuesday': 1,
            'Wednesday': 2,
            'Thursday': 3,
            'Friday': 4,
            'Saturday': 5
        }
        if dump_production_day not in weekdays:
            logger(
                logging.WARNING,
                'Please set the day of dump creation in the bb8 config correctly, e.g. Monday'
            )
            return False
        today_idx = (today.weekday() - weekdays[dump_production_day]) % 7
        dump_date = today - timedelta(today_idx)
        dump_dates = [dump_date.strftime('%d-%m-%Y')]

    # get the structure (template) of the dump URL location
    url_template_str = config_get(
        'bb8',
        'dump_url_template',
        raise_exception=False,
        default=
        'http://rucio-analytix.cern.ch:8080/LOCKS/GetFileFromHDFS?date=${date}&rse=${rse}'
    )
    url_template = Template(url_template_str)

    # populating url template
    urls = []
    for d in dump_dates:
        url = url_template.substitute({'date': d, 'rse': rse, 'vo': vo})
        urls.append(url)
    return urls
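The URL construction relies on string.Template from the standard library: substitute() fills every ${name} placeholder from the mapping and silently ignores unused keys, which is why passing 'vo' is harmless even when the default template contains no ${vo} placeholder. A self-contained check:

from string import Template

template = Template('http://rucio-analytix.cern.ch:8080/LOCKS/GetFileFromHDFS?date=${date}&rse=${rse}')
print(template.substitute({'date': '01-01-2021', 'rse': 'MOCK', 'vo': 'def'}))
# http://rucio-analytix.cern.ch:8080/LOCKS/GetFileFromHDFS?date=01-01-2021&rse=MOCK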
Example #4
def __dump_url(rse_id, logger=logging.log):
    """
    getting potential urls of the dump over last week
    :param rse_id:                     RSE where the dump is released.
    :param logger:                     Logger.
    """

    rse = get_rse_name(rse_id=rse_id)
    vo = get_rse_vo(rse_id=rse_id)

    # get the date of the most recent dump
    today = date.today()
    dump_dates = []
    dump_production_day = config_get("bb8",
                                     "dump_production_day",
                                     raise_exception=False,
                                     default=None)
    if dump_production_day is None:
        for idx in range(0, 7):
            dump_date = today - timedelta(idx)
            dump_dates.append(dump_date.strftime("%d-%m-%Y"))
    else:
        weekdays = {
            "Sunday": 6,
            "Monday": 0,
            "Tuesday": 1,
            "Wednesday": 2,
            "Thursday": 3,
            "Friday": 4,
            "Saturday": 5,
        }
        if dump_production_day not in weekdays:
            logger(
                logging.WARNING,
                "Please set the day of dump creation in the bb8 config correctly, e.g. Monday",
            )
            return False
        today_idx = (today.weekday() - weekdays[dump_production_day]) % 7
        dump_date = today - timedelta(today_idx)
        dump_dates = [dump_date.strftime("%d-%m-%Y")]

    # get the structure (template) of the dump URL location; without a default,
    # config_get(..., raise_exception=False) returns None and Template(None) fails
    url_template_str = config_get(
        "bb8",
        "dump_url_template",
        raise_exception=False,
        default="http://rucio-analytix.cern.ch:8080/LOCKS/GetFileFromHDFS?date=${date}&rse=${rse}",
    )
    url_template = Template(url_template_str)

    # populating url template
    urls = []
    for d in dump_dates:
        url = url_template.substitute({"date": d, "rse": rse, "vo": vo})
        urls.append(url)
    return urls
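The only non-obvious step in both versions is the weekday arithmetic: (today.weekday() - weekdays[dump_production_day]) % 7 gives the number of days back to the most recent production day on or before today. A small worked example:

from datetime import date, timedelta

today = date(2021, 1, 6)                           # a Wednesday, weekday() == 2
monday = 0                                         # weekdays['Monday'] in the snippet
days_since_dump = (today.weekday() - monday) % 7   # -> 2
print(today - timedelta(days_since_dump))          # 2021-01-04, the previous Monday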
Example #5
    def _get_path_nondeterministic_server(self, scope, name):  # pylint: disable=invalid-name
        """ Provides the path of a replica for non-deterministic sites. Will be assigned to get_path by the __init__ method if necessary. """
        vo = get_rse_vo(self.rse['id'])
        scope = InternalScope(scope, vo=vo)
        rep = replica.get_replica(scope=scope, name=name, rse_id=self.rse['id'])
        if 'path' in rep and rep['path'] is not None:
            path = rep['path']
        elif 'state' in rep and (rep['state'] is None or rep['state'] == 'UNAVAILABLE'):
            raise exception.ReplicaUnAvailable('Missing path information and state is UNAVAILABLE for replica %s:%s on non-deterministic storage named %s' % (scope, name, self.rse['rse']))
        else:
            raise exception.ReplicaNotFound('Missing path information for replica %s:%s on non-deterministic storage named %s' % (scope, name, self.rse['rse']))
        if path.startswith('/'):
            path = path[1:]
        if path.endswith('/'):
            path = path[:-1]
        return path
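The closing lines strip at most one leading and one trailing slash; note this is deliberately not path.strip('/'), which would also remove runs of repeated slashes at either end. Extracted as a stand-alone helper for illustration:

def trim_path(path):
    # remove a single leading and a single trailing slash, if present
    if path.startswith('/'):
        path = path[1:]
    if path.endswith('/'):
        path = path[:-1]
    return path

assert trim_path('/a/b/c/') == 'a/b/c'
assert trim_path('//a/b//') == '/a/b/'  # path.strip('/') would give 'a/b'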
Example #6
    def test_account_counters_at_different_vos(self):
        """ MULTI VO (CLIENT): Test that account counters from 2nd vo don't interfere """

        session = db_session.get_session()

        # add some RSEs to test create_counters_for_new_account
        rse_client = RSEClient()
        rse_str = ''.join(choice(ascii_uppercase) for x in range(10))
        tst_rse1 = 'TST1_%s' % rse_str
        new_rse1 = 'NEW1_%s' % rse_str
        rse_client.add_rse(tst_rse1)
        add_rse(new_rse1, 'root', **self.new_vo)

        # add an account - should have counters created for RSEs on the same VO
        usr_uuid = str(generate_uuid()).lower()[:16]
        new_acc_str = 'shr-%s' % usr_uuid
        new_acc = InternalAccount(new_acc_str, **self.new_vo)
        add_account(new_acc_str, 'USER', '*****@*****.**', 'root', **self.new_vo)

        query = session.query(models.AccountUsage.account, models.AccountUsage.rse_id).\
            distinct(models.AccountUsage.account, models.AccountUsage.rse_id).\
            filter_by(account=new_acc)
        acc_counters = list(query.all())

        assert_not_equal(0, len(acc_counters))
        for counter in acc_counters:
            rse_id = counter[1]
            vo = get_rse_vo(rse_id)
            assert_equal(vo, self.new_vo['vo'])

        # add an RSE - should have counters created for accounts on the same VO
        new_rse2 = 'NEW2_' + rse_str
        new_rse2_id = add_rse(new_rse2, 'root', **self.new_vo)

        query = session.query(models.AccountUsage.account, models.AccountUsage.rse_id).\
            distinct(models.AccountUsage.account, models.AccountUsage.rse_id).\
            filter_by(rse_id=new_rse2_id)
        rse_counters = list(query.all())

        assert_not_equal(0, len(rse_counters))
        for counter in rse_counters:
            account = counter[0]
            assert_equal(account.vo, self.new_vo['vo'])

        session.commit()
Example #7
def atropos(thread,
            bulk,
            date_check,
            dry_run=True,
            grace_period=86400,
            once=True,
            unlock=False,
            spread_period=0,
            purge_replicas=False):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once: Run only once.
    """

    sleep_time = 60

    executable = 'atropos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
    logging.debug(prepend_str + 'Starting worker')
    summary = {}
    lifetime_exceptions = {}
    rand = random.Random(hb['assign_thread'])
    for excep in rucio.core.lifetime_exception.list_exceptions(
            exception_id=None,
            states=[
                LifetimeExceptionsState.APPROVED,
            ],
            session=None):
        key = '{}:{}'.format(excep['scope'].internal, excep['name'])
        if key not in lifetime_exceptions:
            lifetime_exceptions[key] = excep['expires_at']
        elif lifetime_exceptions[key] < excep['expires_at']:
            lifetime_exceptions[key] = excep['expires_at']
    logging.debug(prepend_str +
                  '%s active exceptions' % len(lifetime_exceptions))
    if not dry_run and date_check > now:
        logging.error(
            prepend_str +
            'Atropos cannot run in non-dry-run mode for a date in the future')
    else:
        while not GRACEFUL_STOP.is_set():

            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'],
                                                 hb['nr_threads'])

            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check,
                                             thread,
                                             hb['nr_threads'],
                                             session=None)
                logging.info(prepend_str + '%s rules to process' %
                             (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    did_key = '{}:{}'.format(rule.scope.internal, rule.name)
                    logging.debug(prepend_str +
                                  'Working on rule %s on DID %s on %s' %
                                  (rule.id, did, rule.rse_expression))

                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' %
                                     (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    try:
                        rses = parse_expression(rule.rse_expression,
                                                filter={'vo': rule.account.vo})
                    except InvalidRSEExpression:
                        logging.warning(
                            prepend_str +
                            'Rule %s has an RSE expression that results in an empty set: %s'
                            % (rule.id, rule.rse_expression))
                        continue
                    eol_at = rucio.core.lifetime_exception.define_eol(
                        rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logging.warning(
                            prepend_str +
                            'The computed eol %s differs from the one recorded %s for rule %s on %s at %s'
                            % (eol_at, rule.eol_at, rule.id, did,
                               rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str +
                                            'Cannot find rule %s on DID %s' %
                                            (rule.id, did))
                            continue

                    # Check the exceptions
                    if did_key in lifetime_exceptions:
                        if eol_at > lifetime_exceptions[did_key]:
                            logging.info(
                                prepend_str +
                                'Rule %s on DID %s on %s has longer expiration date than the one requested : %s'
                                % (rule.id, did, rule.rse_expression,
                                   lifetime_exceptions[did_key]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(
                                prepend_str +
                                'Updating rule %s on DID %s on %s according to the exception till %s'
                                % (rule.id, did, rule.rse_expression,
                                   lifetime_exceptions[did_key]))
                            eol_at = lifetime_exceptions[did_key]
                            try:
                                update_rule(rule.id,
                                            options={
                                                'eol_at':
                                                lifetime_exceptions[did_key]
                                            })
                            except RuleNotFound:
                                logging.warning(
                                    prepend_str +
                                    'Cannot find rule %s on DID %s' %
                                    (rule.id, did))
                                continue

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at < date_check:
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            if lock['rule_id'] == rule.id:
                                no_locks = False
                                if lock['rse_id'] not in summary:
                                    summary[lock['rse_id']] = {}
                                if did_key not in summary[lock['rse_id']]:
                                    summary[lock['rse_id']][did_key] = {
                                        'length': lock['length'] or 0,
                                        'bytes': lock['bytes'] or 0
                                    }
                        if no_locks:
                            logging.warning(
                                prepend_str +
                                'Cannot find a lock for rule %s on DID %s' %
                                (rule.id, did))
                        if not dry_run:
                            lifetime = grace_period + rand.randrange(
                                spread_period + 1)
                            logging.info(
                                prepend_str +
                                'Setting %s seconds lifetime for rule %s' %
                                (lifetime, rule.id))
                            options = {'lifetime': lifetime}
                            if purge_replicas:
                                options['purge_replicas'] = True
                            if rule.locked and unlock:
                                logging.info(prepend_str + 'Unlocking rule %s',
                                             rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logging.warning(
                                    prepend_str +
                                    'Cannot find rule %s on DID %s' %
                                    (rule.id, did))
                                continue
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(
                    format_exception(exc_type, exc_value,
                                     exc_traceback)).strip())

            for rse_id in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse_id]:
                    tot_datasets += 1
                    tot_files += summary[rse_id][did].get('length', 0)
                    tot_size += summary[rse_id][did].get('bytes', 0)
                vo = get_rse_vo(rse_id=rse_id)
                logging.info(
                    prepend_str +
                    'For RSE %s %s %s datasets will be deleted representing %s files and %s bytes'
                    % (get_rse_name(rse_id=rse_id), '' if vo == 'def' else
                       'on VO ' + vo, tot_datasets, tot_files, tot_size))

            if once:
                break
            else:
                tottime = time.time() - stime
                if tottime < sleep_time:
                    logging.info(prepend_str + 'Will sleep for %s seconds' %
                                 (str(sleep_time - tottime)))
                    time.sleep(sleep_time - tottime)
                    continue

        logging.info(prepend_str + 'Graceful stop requested')
        heartbeat.die(executable, hostname, pid, hb_thread)
        logging.info(prepend_str + 'Graceful stop done')
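The lifetime assigned to each expired rule is grace_period + rand.randrange(spread_period + 1): a fixed grace period plus a uniform random spread, so that many rules do not all expire at the same instant. A quick illustration, using the default grace period and an assumed one-hour spread:

import random

grace_period, spread_period = 86400, 3600
rand = random.Random(0)  # atropos seeds with the thread number
lifetimes = [grace_period + rand.randrange(spread_period + 1) for _ in range(3)]
print(lifetimes)         # three values in the range [86400, 90000]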
Example #8
def list_rebalance_rule_candidates(rse_id, mode=None, session=None):
    """
    List the rebalance rule candidates based on the agreed-upon specification.
    :param rse_id:       RSE of the source.
    :param mode:         Rebalancing mode.
    :param session:      DB Session.
    """

    vo = get_rse_vo(rse_id=rse_id)

    # dumps can be applied only for decommissioning, since the dumps don't contain info from DIDs
    if mode == 'decommission':
        return _list_rebalance_rule_candidates_dump(rse_id, mode)

    # If not decommissioning, the selection is done with an SQLAlchemy query

    # Rule constraints. By default only move rules in state OK that have no child rule and exactly one copy.
    # Additional constraints can be imposed by setting specific configuration options.
    rule_clause = [
        models.ReplicationRule.state == RuleState.OK,
        models.ReplicationRule.child_rule_id.is_(None),
        models.ReplicationRule.copies == 1
    ]

    # Only move rules without an expiration date, or with expires_at more than <min_expires_date_in_days> days in the future
    expiration_clause = models.ReplicationRule.expires_at.is_(None)
    min_expires_date_in_days = config_get_int(
        section='bb8',
        option='min_expires_date_in_days',
        raise_exception=False,
        default=-1,
        expiration_time=3600)
    if min_expires_date_in_days > 0:
        min_expires_date_in_days = datetime.utcnow() + timedelta(
            days=min_expires_date_in_days)
        expiration_clause = or_(
            models.ReplicationRule.expires_at > min_expires_date_in_days,
            models.ReplicationRule.expires_at.is_(None))
    rule_clause.append(expiration_clause)

    # Only move rules which were created more than <min_created_days> days ago
    min_created_days = config_get_int(section='bb8',
                                      option='min_created_days',
                                      raise_exception=False,
                                      default=-1,
                                      expiration_time=3600)
    if min_created_days > 0:
        min_created_days = datetime.utcnow() - timedelta(days=min_created_days)
        rule_clause.append(
            models.ReplicationRule.created_at < min_created_days)

    # Only move rules which are owned by <allowed_accounts> (comma-separated accounts, e.g. panda,root,ddmadmin,jdoe)
    allowed_accounts = config_get(section='bb8',
                                  option='allowed_accounts',
                                  raise_exception=False,
                                  default=None,
                                  expiration_time=3600)
    if allowed_accounts:
        allowed_accounts = [
            InternalAccount(acc.strip(' '), vo=vo)
            for acc in allowed_accounts.split(',')
        ]
        rule_clause.append(
            models.ReplicationRule.account.in_(allowed_accounts))

    # Only move rules that have a certain grouping <allowed_grouping> (accepted values: all, dataset, none)
    rule_grouping_mapping = {
        'all': RuleGrouping.ALL,
        'dataset': RuleGrouping.DATASET,
        'none': RuleGrouping.NONE
    }
    allowed_grouping = config_get(section='bb8',
                                  option='allowed_grouping',
                                  raise_exception=False,
                                  default=None,
                                  expiration_time=3600)
    if allowed_grouping:
        rule_clause.append(models.ReplicationRule.grouping ==
                           rule_grouping_mapping.get(allowed_grouping))

    # DID constraints. By default only move rules on DIDs whose size can be computed.
    # Additional constraints can be imposed by setting specific configuration options.
    did_clause = [models.DataIdentifier.bytes.isnot(None)]

    type_to_did_type_mapping = {
        'all': [DIDType.CONTAINER, DIDType.DATASET, DIDType.FILE],
        'collection': [DIDType.CONTAINER, DIDType.DATASET],
        'container': [DIDType.CONTAINER],
        'dataset': [DIDType.DATASET],
        'file': [DIDType.FILE]
    }

    # Only migrate rules of a certain did_type <allowed_did_type> (accepted values: all, collection, container, dataset, file)
    allowed_did_type = config_get(section='bb8',
                                  option='allowed_did_type',
                                  raise_exception=False,
                                  default=None,
                                  expiration_time=3600)
    if allowed_did_type:
        allowed_did_type = [
            models.DataIdentifier.did_type == did_type
            for did_type in type_to_did_type_mapping.get(allowed_did_type)
        ]
        did_clause.append(or_(*allowed_did_type))

    # Only migrate rules of closed DIDs if <only_move_closed_did> is set
    only_move_closed_did = config_get_bool(section='bb8',
                                           option='only_move_closed_did',
                                           raise_exception=False,
                                           default=None,
                                           expiration_time=3600)
    if only_move_closed_did:
        did_clause.append(models.DataIdentifier.is_open == False)  # NOQA

    # Now build the query
    external_dsl = aliased(models.DatasetLock)
    count_locks = select([func.count()]).where(
        and_(external_dsl.scope == models.DatasetLock.scope,
             external_dsl.name == models.DatasetLock.name,
             external_dsl.rse_id == models.DatasetLock.rse_id)).as_scalar()
    query = session.query(models.DatasetLock.scope,
                          models.DatasetLock.name,
                          models.ReplicationRule.id,
                          models.ReplicationRule.rse_expression,
                          models.ReplicationRule.subscription_id,
                          models.DataIdentifier.bytes,
                          models.DataIdentifier.length,
                          case([(or_(models.DatasetLock.length < 1, models.DatasetLock.length.is_(None)), 0)],
                               else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, BigInteger))).\
        join(models.ReplicationRule, models.ReplicationRule.id == models.DatasetLock.rule_id).\
        join(models.DataIdentifier, and_(models.DatasetLock.scope == models.DataIdentifier.scope, models.DatasetLock.name == models.DataIdentifier.name)).\
        filter(models.DatasetLock.rse_id == rse_id).\
        filter(and_(*rule_clause)).\
        filter(and_(*did_clause)).\
        filter(case([(or_(models.DatasetLock.length < 1, models.DatasetLock.length.is_(None)), 0)],
                    else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, BigInteger)) > 1000000000).\
        filter(count_locks == 1)
    summary = query.order_by(
        case([(or_(models.DatasetLock.length < 1,
                   models.DatasetLock.length.is_(None)), 0)],
             else_=cast(models.DatasetLock.bytes / models.DatasetLock.length,
                        BigInteger)), models.DatasetLock.accessed_at).all()
    return summary
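The structure worth noting here is the clause-list pattern: each optional bb8 setting appends one condition to rule_clause or did_clause, and the lists are later collapsed with and_(*clauses), so an unset option simply contributes nothing to the final filter. A minimal runnable sketch of that pattern, assuming SQLAlchemy 1.4+ and a toy Rule model standing in for models.ReplicationRule:

from datetime import datetime, timedelta
from sqlalchemy import Column, DateTime, Integer, String, and_, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Rule(Base):  # toy stand-in for models.ReplicationRule
    __tablename__ = 'rules'
    id = Column(Integer, primary_key=True)
    state = Column(String)
    created_at = Column(DateTime)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

clauses = [Rule.state == 'OK']                        # unconditional constraint
min_created = datetime.utcnow() - timedelta(days=60)  # optional constraint
clauses.append(Rule.created_at < min_created)

with Session(engine) as session:
    print(session.query(Rule).filter(and_(*clauses)).all())  # [] on the empty toy table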
Example #9
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs, extracts their
    scope, name and rse_id, and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    states_mapping = {
        BadPFNStatus.BAD: BadFilesStatus.BAD,
        BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
        BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE,
    }
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])
        pfns = []
        try:
            bad_replicas = {}
            temporary_unavailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailables
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason, expires_at)] = []
                    temporary_unavailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                pfns = bad_replicas[(account, reason, state)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme])
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                            unknown_replicas.extend(
                                tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(
                            prepend_str +
                            'The following replicas are unknown and will be removed : %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo = get_rse_vo(rse_id=rse_id)
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str +
                                      'Running on RSE %s%s with %s replicas' %
                                      (get_rse_name(rse_id=rse_id), vo_str,
                                       len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(
                            math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str +
                                              'Unknown replicas : %s' %
                                              (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unavailables:
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas temporarily unavailable with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme])
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(
                        prepend_str +
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    vo = get_rse_vo(rse_id=rse_id)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logging.debug(prepend_str +
                                  'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic:
                    # we update the replica states to TEMPORARY_UNAVAILABLE,
                    # then insert a row in the bad_replicas table (TODO: update the row if it already exists),
                    # then delete the corresponding rows from the bad_pfns table
                    logging.debug(prepend_str +
                                  'Running on %s replicas on RSE %s' %
                                  (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=None,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except UnsupportedOperation as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(
                                prepend_str +
                                'Failed to bulk-update the PFNs; they will be updated individually. Error: %s'
                                % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' %
                                              (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(
                                            prepend_str +
                                            '%s is in unavailable state. Will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                except DataIdentifierNotFound as error:
                                    logging.error(
                                        prepend_str +
                                        'Will remove %s from the list of bad PFNs'
                                        % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logging.info(
                prepend_str +
                'Processed the maximum number of PFNs for the bulk size; restarting the next cycle immediately'
            )
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
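minos relies on chunks() from rucio.common.utils to commit work in fixed-size batches; a hedged reimplementation of that helper (the standard slicing recipe, shown only to make the batching explicit):

def chunks(items, n):
    """Yield successive n-sized slices of a list."""
    for i in range(0, len(items), n):
        yield items[i:i + n]

print(list(chunks(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]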