Example #1
def test_archive_on_dataset_level(rse_factory, did_factory, root_account):
    rse_name, rse_id = rse_factory.make_xroot_rse()

    dataset1 = did_factory.make_dataset()
    dataset2 = did_factory.make_dataset()
    container = did_factory.make_container()
    attach_dids(dids=[dataset1, dataset2], account=root_account, **container)

    # Add a random file to the datasets to avoid dataset deletion when the archive is deleted
    a_file = did_factory.random_did()
    add_replicas(rse_id=rse_id,
                 files=[{
                     **a_file, 'bytes': 500,
                     'type': 'FILE',
                     'adler32': 'beefbeef'
                 }],
                 account=root_account)
    attach_dids(dids=[a_file], account=root_account, **dataset1)
    attach_dids(dids=[a_file], account=root_account, **dataset2)
    # adding a non-archive file should not set is_archive=True
    metadata = get_metadata(**dataset1)
    assert not metadata['is_archive']

    # Create an archive and its constituents, attach the archive to datasets
    archive = did_factory.random_did(name_prefix='archive', name_suffix='.zip')
    add_replicas(rse_id=rse_id,
                 files=[{
                     **archive, 'bytes': 500,
                     'type': 'FILE',
                     'adler32': 'beefbeef'
                 }],
                 account=root_account)
    constituents = [did_factory.random_did() for _ in range(2)]
    # Add archive to one dataset _before_ attaching files to the archive (before is_archive is set on the archive did)
    attach_dids(dids=[archive], account=root_account, **dataset1)
    attach_dids(dids=[{
        **c, 'bytes': 200,
        'adler32': 'ababbaba'
    } for c in constituents],
                account=root_account,
                **archive)
    # Attach to another dataset _after_ attaching files to the archive
    attach_dids(dids=[archive], account=root_account, **dataset2)

    # Both datasets must have is_archive = True
    metadata = get_metadata(**dataset1)
    assert metadata['is_archive'] is True
    metadata = get_metadata(**dataset2)
    assert metadata['is_archive'] is True

    # Delete the archive; the datasets must now have is_archive == False
    delete_replicas(rse_id=rse_id, files=[archive])

    metadata = get_metadata(**dataset1)
    assert not metadata['is_archive']
    metadata = get_metadata(**dataset2)
    assert not metadata['is_archive']
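For reference, a minimal client-side sketch of the same is_archive check; DIDClient is Rucio's standard client class, but the scope and name below are placeholders rather than values from the test above.

# Hedged client-side sketch; scope/name are placeholders.
from rucio.client.didclient import DIDClient

client = DIDClient()
meta = client.get_metadata(scope='user.jdoe', name='dataset_1')
print(meta.get('is_archive'))  # True while an attached archive replica exists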
Example #2
File: did.py Project: kbg/rucio
def get_metadata(scope, name):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    return did.get_metadata(scope=scope, name=name)
Example #3
def get_metadata(scope, name):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    return did.get_metadata(scope=scope, name=name)
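Examples 2 and 3 are the same thin API-layer wrapper, delegating straight to the core did module. A hedged call sketch with placeholder values:

# Assumes this wrapper is in scope; 'mock' and 'file_1' are placeholders.
meta = get_metadata(scope='mock', name='file_1')
print(meta['bytes'], meta['adler32'])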
Example #4
def test_atropos(root_account, rse_factory, mock_scope, did_factory, rucio_client):
    """
    Test the behaviour of atropos
    """
    today = datetime.now()
    check_date = datetime.now() + timedelta(days=365)
    check_date = check_date.isoformat().split('T')[0]

    # Define a policy
    lifetime_dir = '/opt/rucio/etc/policies'
    os.makedirs(lifetime_dir, exist_ok=True)
    lifetime_policy = [{'name': 'Test', 'include': {'datatype': ['RAW'], 'project': ['data%']}, 'age': '6', 'extension': '1'}]
    with open('%s/config_other.json' % lifetime_dir, 'w') as outfile:
        json.dump(lifetime_policy, outfile)
    REGION.invalidate()
    nb_datasets = 2
    rse, rse_id = rse_factory.make_posix_rse()
    datasets = [did_factory.make_dataset() for _ in range(nb_datasets)]
    rules = list()
    expiration_date = None

    # Check that the eol_at is properly set
    # Rule on dataset 0 that matches the policy should get an eol_at
    # Rule on dataset 1 that doesn't match the policy should not get an eol_at
    for cnt, dataset in enumerate(datasets):
        if cnt == 0:
            set_metadata(dataset['scope'], dataset['name'], 'datatype', 'RAW')
            set_metadata(dataset['scope'], dataset['name'], 'project', 'data')
        rule_ids = add_rule(dids=[{'scope': dataset['scope'], 'name': dataset['name']}], account=root_account, copies=1, rse_expression=rse, grouping='DATASET', weight=None, lifetime=None, locked=None, subscription_id=None)
        rules.append(rule_ids[0])
        rule = get_rule(rule_ids[0])
        if cnt == 0:
            expiration_date = rule['eol_at']
            assert expiration_date is not None
            assert expiration_date - today < timedelta(181)
            assert expiration_date - today > timedelta(179)
        else:
            assert rule['eol_at'] is None

    # Run atropos in dry-run mode to set eol_at on the dataset
    # Dataset 0 should get eol_at
    # Dataset 1 should not get eol_at
    atropos(thread=1, bulk=100, date_check=datetime.strptime(check_date, '%Y-%m-%d'), dry_run=True, grace_period=86400,
            once=True, unlock=False, spread_period=0, purge_replicas=False, sleep_time=60)

    for cnt, dataset in enumerate(datasets):
        meta = get_metadata(dataset['scope'], dataset['name'])
        if cnt == 0:
            assert meta['eol_at'] is not None
            assert meta['eol_at'] == expiration_date
        else:
            assert meta['eol_at'] is None

    # Clean-up
    os.remove('%s/config_other.json' % lifetime_dir)
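Reading the assertions above: with 'age': '6' (interpreted here as months) the rule's eol_at should land roughly 180 days out, which is what the 179/181-day bracket checks. A small sketch of that bracket, with the month-to-day conversion stated as an assumption:

# Hedged sketch of the eol_at window asserted in the test above.
from datetime import datetime, timedelta

today = datetime.now()
eol_at = today + timedelta(days=180)  # assumption: age '6' months is about 180 days
assert timedelta(179) < eol_at - today < timedelta(181)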
Example #5
    def test_update_dids(self):
        """ DATA IDENTIFIERS (CORE): Update file size and checksum"""
        tmp_scope = 'mock'
        dsn = 'dsn_%s' % generate_uuid()
        lfn = 'lfn.%s' % str(generate_uuid())
        add_did(scope=tmp_scope, name=dsn, type=DIDType.DATASET, account='root')

        files = [{'scope': tmp_scope, 'name': lfn,
                  'bytes': 724963570, 'adler32': '0cc737eb',
                  'meta': {'guid': str(generate_uuid()), 'events': 100}}]
        attach_dids(scope=tmp_scope, name=dsn, rse='MOCK', dids=files, account='root')

        set_metadata(scope=tmp_scope, name=lfn, key='adler32', value='0cc737ee')
        assert_equal(get_metadata(scope=tmp_scope, name=lfn)['adler32'], '0cc737ee')

        with assert_raises(UnsupportedOperation):
            set_metadata(scope=tmp_scope, name='Nimportnawak', key='adler32', value='0cc737ee')

        set_metadata(scope=tmp_scope, name=lfn, key='bytes', value=724963577)
        assert_equal(get_metadata(scope=tmp_scope, name=lfn)['bytes'], 724963577)
Example #6
    def test_update_dids(self):
        """ DATA IDENTIFIERS (CORE): Update file size and checksum"""
        tmp_scope = InternalScope('mock', **self.vo)
        root = InternalAccount('root', **self.vo)
        dsn = 'dsn_%s' % generate_uuid()
        lfn = 'lfn.%s' % str(generate_uuid())
        add_did(scope=tmp_scope, name=dsn, type=DIDType.DATASET, account=root)

        files = [{'scope': tmp_scope, 'name': lfn,
                  'bytes': 724963570, 'adler32': '0cc737eb',
                  'meta': {'guid': str(generate_uuid()), 'events': 100}}]
        attach_dids(scope=tmp_scope, name=dsn, rse_id=get_rse_id(rse='MOCK', **self.vo), dids=files, account=root)

        set_metadata(scope=tmp_scope, name=lfn, key='adler32', value='0cc737ee')
        assert get_metadata(scope=tmp_scope, name=lfn)['adler32'] == '0cc737ee'

        with pytest.raises(DataIdentifierNotFound):
            set_metadata(scope=tmp_scope, name='Nimportnawak', key='adler32', value='0cc737ee')

        set_metadata(scope=tmp_scope, name=lfn, key='bytes', value=724963577)
        assert get_metadata(scope=tmp_scope, name=lfn)['bytes'] == 724963577
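Example 6 is the multi-VO successor of Example 5: plain strings become InternalScope/InternalAccount, the RSE name is resolved to an id, and the expected exception changes. A minimal sketch of the type conversion, assuming the default 'def' VO:

# Hedged sketch of the internal types used in Example 6.
from rucio.common.types import InternalAccount, InternalScope

scope = InternalScope('mock', vo='def')
account = InternalAccount('root', vo='def')
print(scope.external, account.external)  # back to the plain client-facing strings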
Example #7
def get_metadata(scope, name):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    """

    scope = InternalScope(scope)

    d = did.get_metadata(scope=scope, name=name)
    return api_update_return_dict(d)
Example #8
File: did.py Project: rcarpa/rucio
def get_metadata(scope, name, plugin='DID_COLUMN', vo='def'):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    :param vo: The VO to act on.
    """

    scope = InternalScope(scope, vo=vo)

    d = did.get_metadata(scope=scope, name=name, plugin=plugin)
    return api_update_return_dict(d)
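Compared with Example 7, this version adds a VO and a metadata plugin selector; 'DID_COLUMN' reads the built-in table columns, and other configured backends can be selected by name. A hedged call sketch; the values are placeholders and the 'JSON' plugin assumes such a backend is configured:

meta = get_metadata(scope='mock', name='dataset_1', plugin='JSON', vo='def')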
Example #9
def perm_set_status(issuer, kwargs, session=None):
    """
    Checks if an account can set status on a data identifier.

    :param issuer: Account identifier which issues the command.
    :param kwargs: List of arguments for the action.
    :param session: The DB session to use
    :returns: True if account is allowed, otherwise False
    """
    meta = get_metadata(kwargs['scope'], kwargs['name'], session=session)
    return perm_default(issuer, kwargs, session=session)\
        or has_account_attribute(account=issuer, key='did_admin', session=session)\
        or meta.get('account', '') == issuer\
        or rucio.core.scope.is_scope_owner(scope=kwargs['scope'], account=issuer, session=session)
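A hedged sketch of how such a permission callable would be invoked; the issuer and DID values are placeholders, and only the kwargs keys the function actually reads ('scope', 'name') are supplied:

# Hypothetical invocation; account and DID values are placeholders.
kwargs = {'scope': 'mock', 'name': 'dataset_1'}
allowed = perm_set_status(issuer='root', kwargs=kwargs, session=None)
print(allowed)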
Example #10
def get_metadata(scope, name, plugin='DID_COLUMN', vo='def', session=None):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    :param vo: The VO to act on.
    :param session: The database session in use.
    """

    scope = InternalScope(scope, vo=vo)

    d = did.get_metadata(scope=scope, name=name, plugin=plugin, session=session)
    return api_update_return_dict(d, session=session)
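The extra session argument lets a caller thread an open database session through so the read joins the caller's transaction. A minimal sketch, assuming the get_session helper also used by the daemons below:

from rucio.db.sqla.session import get_session  # assumed import path

session = get_session()
meta = get_metadata(scope='mock', name='dataset_1', vo='def', session=session)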
Example #11
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash,
    identifies the subscriptions matching the DIDs and
    submits a replication rule for each DID matching a subscription.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'transmogrifier'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)

        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])

        try:
            #  Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'],
                                     total_threads=heart_beat['nr_threads'],
                                     chunk_size=bulk,
                                     did_type=None):
                dids.append({
                    'scope': did['scope'],
                    'did_type': str(did['did_type']),
                    'name': did['name']
                })

            sub_dict = {3: []}
            #  Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
            #  The priority is defined as 'policyid'
            for sub in list_subscriptions(None, None):
                if sub['state'] != SubscriptionState.INACTIVE and sub['lifetime'] and (datetime.now() > sub['lifetime']):
                    update_subscription(name=sub['name'],
                                        account=sub['account'],
                                        metadata={'state': SubscriptionState.INACTIVE})

                elif sub['state'] in [SubscriptionState.ACTIVE, SubscriptionState.UPDATED]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            priorities = sorted(sub_dict)
            #  Order the subscriptions according to their priority
            for priority in priorities:
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logging.warning(prepend_str + 'No subscriptions defined: %s' %
                            (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            logging.error(
                prepend_str +
                'Failed to get list of new DIDs or subscriptions: %s' %
                (str(error)))

        try:
            results = {}
            start_time = time.time()
            blacklisted_rse_id = [rse['id'] for rse in list_rses({'availability_write': False})]
            logging.debug(prepend_str + 'In transmogrifier worker')
            identifiers = []
            #  Loop over all the new dids
            for did in dids:
                did_success = True
                if did['did_type'] == str(DIDType.DATASET) or did['did_type'] == str(DIDType.CONTAINER):
                    did_tag = '%s:%s' % (did['scope'].internal, did['name'])
                    results[did_tag] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            #  Check if the DID match the subscription
                            if is_matching_subscription(subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get('split_rule', False)
                                stime = time.time()
                                results[did_tag].append(subscription['id'])
                                logging.info(prepend_str + '%s:%s matches subscription %s' %
                                             (did['scope'], did['name'], subscription['name']))
                                rules = loads(subscription['replication_rules'])
                                created_rules = {}
                                cnt = 0
                                for rule_dict in rules:
                                    cnt += 1
                                    created_rules[cnt] = []
                                    # Get all the rule and subscription parameters
                                    grouping = rule_dict.get('grouping', 'DATASET')
                                    lifetime = rule_dict.get('lifetime', None)
                                    ignore_availability = rule_dict.get('ignore_availability', None)
                                    weight = rule_dict.get('weight', None)
                                    source_replica_expression = rule_dict.get('source_replica_expression', None)
                                    locked = rule_dict.get('locked', None) == 'True'
                                    purge_replicas = rule_dict.get('purge_replicas', False) == 'True'
                                    rse_expression = str(rule_dict['rse_expression'])
                                    comment = str(subscription['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_dict['copies'])
                                    activity = rule_dict.get('activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity', obj=activity)
                                    except InputValidationError as error:
                                        logging.error(prepend_str + 'Error validating the activity %s' % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)

                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False

                                    selected_rses = []
                                    chained_idx = rule_dict.get('chained_idx', None)
                                    if chained_idx:
                                        params = {}
                                        if rule_dict.get('associated_site_idx', None):
                                            params['associated_site_idx'] = rule_dict.get('associated_site_idx', None)
                                        logging.debug('%s Chained subscription identified. Will use %s',
                                                      prepend_str, str(created_rules[chained_idx]))
                                        algorithm = rule_dict.get('algorithm', None)
                                        selected_rses = select_algorithm(algorithm, created_rules[chained_idx], params)
                                    else:
                                        # Non-chained case: pick the RSEs with the RSESelector
                                        # (a chained subscription uses the RSEs returned by the algorithm above)
                                        if split_rule:
                                            vo = account.vo
                                            rses = parse_expression(rse_expression, filter={'vo': vo})
                                            list_of_rses = [rse['id'] for rse in rses]
                                            # Check that some rule doesn't already exist for this DID and subscription
                                            preferred_rse_ids = []
                                            for rule in list_rules(filters={'subscription_id': subscription_id,
                                                                            'scope': did['scope'],
                                                                            'name': did['name']}):
                                                already_existing_rses = [(rse['rse'], rse['id'])
                                                                         for rse in parse_expression(rule['rse_expression'], filter={'vo': vo})]
                                                for rse, rse_id in already_existing_rses:
                                                    if (rse_id in list_of_rses) and (rse_id not in preferred_rse_ids):
                                                        preferred_rse_ids.append(rse_id)
                                            if len(preferred_rse_ids) >= copies:
                                                skip_rule_creation = True
                                            rse_id_dict = {}
                                            for rse in rses:
                                                rse_id_dict[rse['id']] = rse['rse']
                                            try:
                                                rseselector = RSESelector(account=account,
                                                                          rses=rses,
                                                                          weight=weight,
                                                                          copies=copies - len(preferred_rse_ids))
                                                selected_rses = [rse_id_dict[rse_id] for rse_id, _, _ in
                                                                 rseselector.select_rse(0,
                                                                                        preferred_rse_ids=preferred_rse_ids,
                                                                                        copies=copies,
                                                                                        blacklist=blacklisted_rse_id)]
                                            except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                                logging.warning(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites' %
                                                                (subscription['name'], account, str(error)))
                                                # Now including the blacklisted sites
                                                try:
                                                    rseselector = RSESelector(account=account,
                                                                              rses=rses,
                                                                              weight=weight,
                                                                              copies=copies - len(preferred_rse_ids))
                                                    selected_rses = [rse_id_dict[rse_id] for rse_id, _, _ in
                                                                     rseselector.select_rse(0,
                                                                                            preferred_rse_ids=preferred_rse_ids,
                                                                                            copies=copies,
                                                                                            blacklist=[])]
                                                    ignore_availability = True
                                                except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                                    logging.error(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' %
                                                                  (subscription['name'], account, str(error)))
                                                    monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)),
                                                                           delta=1)
                                                    # The DID won't be reevaluated at the next cycle
                                                    did_success = did_success and True
                                                    continue

                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        #  Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    for rse in selected_rses:
                                                        if isinstance(selected_rses, dict):
                                                            source_replica_expression = selected_rses[rse].get('source_replica_expression', None)
                                                            weight = selected_rses[rse].get('weight', None)
                                                        logging.info(prepend_str + 'Will insert one rule for %s:%s on %s' %
                                                                     (did['scope'], did['name'], rse))
                                                        rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}],
                                                                            account=account,
                                                                            copies=1,
                                                                            rse_expression=rse,
                                                                            grouping=grouping,
                                                                            weight=weight,
                                                                            lifetime=lifetime,
                                                                            locked=locked,
                                                                            subscription_id=subscription_id,
                                                                            source_replica_expression=source_replica_expression,
                                                                            activity=activity,
                                                                            purge_replicas=purge_replicas,
                                                                            ignore_availability=ignore_availability,
                                                                            comment=comment)
                                                        created_rules[cnt].append(rule_ids[0])
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}],
                                                                    account=account,
                                                                    copies=copies,
                                                                    rse_expression=rse_expression,
                                                                    grouping=grouping,
                                                                    weight=weight,
                                                                    lifetime=lifetime,
                                                                    locked=locked,
                                                                    subscription_id=subscription['id'],
                                                                    source_replica_expression=source_replica_expression,
                                                                    activity=activity,
                                                                    purge_replicas=purge_replicas,
                                                                    ignore_availability=ignore_availability,
                                                                    comment=comment)
                                                created_rules[cnt].append(rule_ids[0])
                                                nb_rule += 1
                                            monitor.record_counter(counters='transmogrifier.addnewrule.done', delta=nb_rule)
                                            monitor.record_counter(counters='transmogrifier.addnewrule.activity.%s' % str_activity, delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression, StagingAreaRuleRequiresLifetime, DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logging.error(prepend_str + '%s' % (str(error)))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs, InsufficientAccountLimit, DatabaseException, RSEBlacklisted, RSEWriteBlocked) as error:
                                            # Errors to be retried
                                            logging.error(prepend_str + '%s Will perform another attempt %i/%i' % (str(error), attempt + 1, nattempt))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.unknown', delta=1)
                                            exc_type, exc_value, exc_traceback = exc_info()
                                            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

                                    did_success = (did_success and success)
                                    if (attemptnr + 1) == nattempt and not success:
                                        logging.error(prepend_str + 'Rule for %s:%s on %s cannot be inserted' %
                                                      (did['scope'], did['name'], rse_expression))
                                    else:
                                        logging.info(prepend_str + '%s rule(s) inserted in %f seconds' %
                                                     (str(nb_rule), time.time() - stime))
                    except DataIdentifierNotFound as error:
                        logging.warning(prepend_str + str(error))

                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(counters='transmogrifier.did.file.processed', delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(counters='transmogrifier.did.dataset.processed', delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(counters='transmogrifier.did.container.processed', delta=1)
                    monitor.record_counter(counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({'scope': did['scope'],
                                        'name': did['name'],
                                        'did_type': DIDType.from_sym(did['did_type'])})

            time1 = time.time()

            #  Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)

            logging.info(prepend_str + 'Time to set the new flag : %f' %
                         (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(
                    name=sub['name'],
                    account=sub['account'],
                    metadata={'last_processed': datetime.now()})
            logging.info(prepend_str +
                         'It took %f seconds to process %i DIDs' %
                         (tottime, len(dids)))
            logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration',
                                 time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(
                format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters='transmogrifier.job.error',
                                   delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error',
                                   delta=1)
        if once is True:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
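For a single evaluation pass (tests, cron-style deployments), the daemon can be driven with once=True; a minimal usage sketch, assuming a configured Rucio server environment:

# Runs one cycle over at most 100 new DIDs, then exits.
transmogrifier(bulk=100, once=True, sleep_time=60)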
Example #12
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=bulk)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
Example #13
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        try:
            # Get list of expired TU replicas
            logger(logging.INFO, 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=bulk)
            logger(logging.INFO, '%s expired replicas returned', len(expired_replicas))
            logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                    else:
                        logger(logging.ERROR, 'Exception', exc_info=True)
                    session.rollback()
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logger(logging.CRITICAL, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
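Example 13 modernizes Example 12: the hand-built 'Thread [i/n]' prefix moves into formatted_logger and the manual sleep arithmetic becomes daemon_sleep. A hedged sketch of the logger idiom, with the import path as an assumption:

import logging
from rucio.common.logging import formatted_logger  # assumed helper location

logger = formatted_logger(logging.log, 'minos_temporary_expiration[0/1] : %s')
logger(logging.INFO, 'starting')  # emits 'minos_temporary_expiration[0/1] : starting'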
Example #14
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extract the scope, name and rse_id and fill the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        pfns = []
        try:
            bad_replicas = {}
            temporary_unvailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)
            nb_pfns = len(pfns)  # Remember the fetched count: 'pfns' is reused as a local below

            # Classify the PFNs into bad_replicas and temporary_unavailables
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason, expires_at)] = []
                    temporary_unavailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name and rse_id are extracted and inserted into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        # Collect unknowns once per scheme, even when no RSE could be resolved
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(
                            prepend_str +
                            'The following replicas are unknown and will be removed : %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str +
                                      'Running on RSE %s%s with %s replicas' %
                                      (get_rse_name(rse_id=rse_id), vo_str,
                                       len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(
                            math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str +
                                              'Unknown replicas : %s' %
                                              (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now handle the temporarily unavailable replicas and update their states
            for account, reason, expires_at in temporary_unavailables:
                vo = account.vo
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas temporarily unavailable with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                    # Collect unknowns once per scheme, even when no RSE could be resolved
                    unknown_replicas.extend(
                        tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(
                        prepend_str +
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logging.debug(prepend_str +
                                  'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 vo=vo,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic:
                    # update the replica states to TEMPORARY_UNAVAILABLE,
                    # then insert a row in the bad_replicas table (TODO: update the row if it already exists),
                    # then delete the corresponding rows from the bad_pfns table
                    logging.debug(prepend_str +
                                  'Running on %s replicas on RSE %s' %
                                  (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=None,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation,
                                ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(
                                prepend_str +
                                'Problem bulk-updating PFNs; they will be updated individually. Error : %s'
                                % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' %
                                              (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(
                                            prepend_str +
                                            '%s is in unavailable state. Will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    elif expires_at < datetime.now():
                                        logging.info(
                                            '%s PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns',
                                            prepend_str, str(rep['pfn']),
                                            expires_at)
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                except (DataIdentifierNotFound,
                                        ReplicaNotFound) as error:
                                    logging.error(
                                        prepend_str +
                                        'DID or replica not found (%s). Will remove %s from the list of bad PFNs'
                                        % (str(error), str(rep['pfn'])))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if nb_pfns == bulk:
            logging.info(
                prepend_str +
                'Processed the maximum number of PFNs according to the bulk size. Restarting the next cycle immediately'
            )
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
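For reference, a hypothetical single-cycle invocation of the worker above, assuming the module-level graceful_stop event and imports shown in this listing:

if __name__ == '__main__':
    # Process up to 100 bad PFNs in a single pass, then exit instead of daemonising.
    minos(bulk=100, once=True)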
Exemplo n.º 15
0
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired temporarily unavailable replicas and sets them back to AVAILABLE.

    :param bulk: The maximum number of expired replicas to fetch per cycle.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'],
                worker_number=heart_beat['assign_thread'],
                limit=bulk)
            logging.info(prepend_str + '%s expired replicas returned' %
                         len(expired_replicas))
            logging.debug(prepend_str +
                          'List of expired replicas returned %s' %
                          str(expired_replicas))
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                # Process and update the replicas in chunks
                replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                } for replica in chunk]
                # Remove the replicas from bad_replicas table in chunks
                bad_replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                } for replica in chunk]
                try:
                    nchunk += 1
                    logging.debug(prepend_str +
                                  'Running on %s chunk out of %s' %
                                  (nchunk, tot_chunk))
                    update_replicas_states(replicas,
                                           nowait=True,
                                           session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(
                        prepend_str +
                        'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s'
                        % str(error))
                    for idx in range(len(chunk)):
                        logging.debug(prepend_str + 'Working on %s' %
                                      (str(replicas[idx])))
                        try:
                            get_metadata(replicas[idx]['scope'],
                                         replicas[idx]['name'])
                            update_replicas_states([replicas[idx]],
                                                   nowait=True,
                                                   session=session)
                            bulk_delete_bad_replicas([bad_replicas[idx]],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                'DID %s:%s does not exist anymore.' %
                                (bad_replicas[idx]['scope'],
                                 bad_replicas[idx]['name']))
                            bulk_delete_bad_replicas([bad_replicas[idx]],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                '%s:%s on RSEID %s does not exist anymore.' %
                                (replicas[idx]['scope'], replicas[idx]['name'],
                                 replicas[idx]['rse_id']))
                            bulk_delete_bad_replicas([bad_replicas[idx]],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
                    session = get_session()

        except Exception:
            logging.critical(traceback.format_exc())

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
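Both Minos workers commit their database work through a chunks() helper. A stand-alone sketch of that chunked-commit pattern, modelled on its usage here rather than on the actual rucio.common.utils implementation:

def chunks(items, chunk_size):
    # Yield successive chunk_size-sized slices of a list.
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]


# Committing after each chunk limits a rollback to one batch instead of the whole cycle.
for batch in chunks(list(range(25)), 10):
    print(len(batch))  # 10, 10, 5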
Exemplo n.º 16
0
        assert_equal(None, get_did_atime(scope=tmp_scope, name=tmp_dsn2))

    def test_update_dids(self):
        """ DATA IDENTIFIERS (CORE): Update file size and checksum"""
        tmp_scope = 'mock'
        dsn = 'dsn_%s' % generate_uuid()
        lfn = 'lfn.%s' % str(generate_uuid())
        add_did(scope=tmp_scope, name=dsn, type=DIDType.DATASET, account='root')

        files = [{'scope': tmp_scope, 'name': lfn,
                  'bytes': 724963570, 'adler32': '0cc737eb',
                  'meta': {'guid': str(generate_uuid()), 'events': 100}}]
        attach_dids(scope=tmp_scope, name=dsn, rse='MOCK', dids=files, account='root')

        set_metadata(scope=tmp_scope, name=lfn, key='adler32', value='0cc737ee')
        assert_equal(get_metadata(scope=tmp_scope, name=lfn)['adler32'], '0cc737ee')

        with assert_raises(UnsupportedOperation):
            set_metadata(scope=tmp_scope, name='Nimportnawak', key='adler32', value='0cc737ee')

        set_metadata(scope=tmp_scope, name=lfn, key='bytes', value=724963577)
        assert_equal(get_metadata(scope=tmp_scope, name=lfn)['bytes'], 724963577)

    def test_get_did_with_dynamic(self):
        """ DATA IDENTIFIERS (CORE): Get did with dynamic resolve of size"""
        tmp_scope = 'mock'
        tmp_dsn1 = 'dsn_%s' % generate_uuid()
        tmp_dsn2 = 'dsn_%s' % generate_uuid()
        tmp_dsn3 = 'dsn_%s' % generate_uuid()
        tmp_dsn4 = 'dsn_%s' % generate_uuid()
Exemplo n.º 17
0
                                                     'scope': response['scope'],
                                                     'name': response['name'],
                                                     'state': ReplicaState.AVAILABLE}],
                                                   nowait=False,
                                                   session=session)
                except UnsupportedOperation:
                    # replica not found, but it has been transferred because cancel came too late.
                    # so we need to register the replica and schedule it for deletion again
                    record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time()-tss)*1000)
                    logging.warning('DID %s:%s AT RSE %s NOT FOUND - Registering replica and scheduling for immediate deletion' % (response['scope'],
                                                                                                                                   response['name'],
                                                                                                                                   rse_name))
                    did_meta = None
                    try:
                        did_meta = did.get_metadata(response['scope'],
                                                    response['name'],
                                                    session=session)
                    except Exception:
                        logging.critical('DID %s:%s NOT FOUND - Cannot re-register replica - potential dark data' % (response['scope'],
                                                                                                                     response['name']))
                        raise

                    if did_meta:
                        try:
                            replica.add_replica(rse_name,
                                                response['scope'],
                                                response['name'],
                                                did_meta['bytes'],
                                                did_meta['account'],
                                                adler32=did_meta['adler32'],
                                                tombstone=datetime.datetime.utcnow(),
Exemplo n.º 18
0
def run_once(heartbeat_handler: "HeartbeatHandler", bulk: int,
             **_kwargs) -> bool:

    worker_number, total_workers, logger = heartbeat_handler.live()
    dids, subscriptions = [], []
    tottime = 0
    try:
        #  Get the new DIDs based on the is_new flag
        logger(logging.DEBUG, "Listing new dids")
        for did in list_new_dids(
                thread=worker_number,
                total_threads=total_workers,
                chunk_size=bulk,
                did_type=None,
        ):
            dids.append({
                "scope": did["scope"],
                "did_type": str(did["did_type"]),
                "name": did["name"],
            })
        logger(logging.INFO, "%i new dids to process", len(dids))

        sub_dict = {3: []}
        #  Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
        #  The priority is defined as 'policyid'
        logger(logging.DEBUG, "Listing active subscriptions")
        for sub in list_subscriptions(None, None):
            if (sub["state"] != SubscriptionState.INACTIVE and sub["lifetime"]
                    and (datetime.now() > sub["lifetime"])):
                update_subscription(
                    name=sub["name"],
                    account=sub["account"],
                    metadata={"state": SubscriptionState.INACTIVE},
                )

            elif sub["state"] in [
                    SubscriptionState.ACTIVE, SubscriptionState.UPDATED
            ]:
                priority = 3
                if "policyid" in sub:
                    if int(sub["policyid"]) not in sub_dict:
                        sub_dict[int(sub["policyid"])] = []
                    priority = int(sub["policyid"])
                sub_dict[priority].append(sub)
        priorities = list(sub_dict.keys())
        priorities.sort()
        #  Order the subscriptions according to their priority
        for priority in priorities:
            subscriptions.extend(sub_dict[priority])
        logger(logging.INFO, "%i active subscriptions", len(subscriptions))
    except SubscriptionNotFound as error:
        logger(logging.WARNING, "No subscriptions defined: %s" % (str(error)))
        must_sleep = True
        return must_sleep
    except Exception as error:
        logger(
            logging.ERROR,
            "Failed to get list of new DIDs or subscriptions: %s" %
            (str(error)),
        )
        must_sleep = False
        return must_sleep

    results = {}
    start_time = time.time()
    blocklisted_rse_id = [
        rse["id"] for rse in list_rses({"availability_write": False})
    ]
    identifiers = []
    #  Loop over all the new dids
    for did in dids:
        _, _, logger = heartbeat_handler.live()
        did_success = True
        if did["did_type"] == str(DIDType.DATASET) or did["did_type"] == str(
                DIDType.CONTAINER):
            did_tag = "%s:%s" % (did["scope"].internal, did["name"])
            results[did_tag] = []
            try:
                metadata = get_metadata(did["scope"], did["name"])
                # Loop over all the subscriptions
                for subscription in subscriptions:
                    #  Check if the DID match the subscription
                    if is_matching_subscription(subscription, did,
                                                metadata) is True:
                        filter_string = loads(subscription["filter"])
                        split_rule = filter_string.get("split_rule", False)
                        stime = time.time()
                        results[did_tag].append(subscription["id"])
                        logger(
                            logging.INFO,
                            "%s:%s matches subscription %s" %
                            (did["scope"], did["name"], subscription["name"]),
                        )
                        rules = loads(subscription["replication_rules"])
                        created_rules = {}
                        cnt = 0
                        for rule_dict in rules:
                            cnt += 1
                            created_rules[cnt] = []
                            # Get all the rule and subscription parameters
                            grouping = rule_dict.get("grouping", "DATASET")
                            lifetime = rule_dict.get("lifetime", None)
                            ignore_availability = rule_dict.get(
                                "ignore_availability", None)
                            weight = rule_dict.get("weight", None)
                            source_replica_expression = rule_dict.get(
                                "source_replica_expression", None)
                            locked = rule_dict.get("locked") == "True"
                            purge_replicas = rule_dict.get("purge_replicas") == "True"
                            rse_expression = str(rule_dict["rse_expression"])
                            comment = str(subscription["comments"]
                                          )[:RULES_COMMENT_LENGTH]
                            if "comments" in rule_dict:
                                comment = str(rule_dict["comments"])
                            subscription_id = str(subscription["id"])
                            account = subscription["account"]
                            copies = int(rule_dict["copies"])
                            activity = rule_dict.get("activity",
                                                     "User Subscriptions")
                            try:
                                validate_schema(name="activity",
                                                obj=activity,
                                                vo=account.vo)
                            except InputValidationError as error:
                                logger(
                                    logging.ERROR,
                                    "Error validating the activity %s" %
                                    (str(error)),
                                )
                                activity = "User Subscriptions"
                            if lifetime:
                                lifetime = int(lifetime)

                            str_activity = "".join(activity.split())
                            success = False
                            nattempt = 5
                            attemptnr = 0
                            skip_rule_creation = False

                            selected_rses = []
                            chained_idx = rule_dict.get("chained_idx", None)
                            if chained_idx:
                                params = {}
                                if rule_dict.get("associated_site_idx", None):
                                    params[
                                        "associated_site_idx"] = rule_dict.get(
                                            "associated_site_idx", None)
                                logger(
                                    logging.DEBUG,
                                    "Chained subscription identified. Will use %s",
                                    str(created_rules[chained_idx]),
                                )
                                algorithm = rule_dict.get("algorithm", None)
                                selected_rses = select_algorithm(
                                    algorithm, created_rules[chained_idx],
                                    params)
                            else:
                                # Not a chained subscription: use the RSE selector (chained subscriptions use the RSEs returned by the algorithm above instead)
                                if split_rule:
                                    preferred_rses = set()
                                    for rule in list_rules(
                                            filters={
                                                "subscription_id":
                                                subscription_id,
                                                "scope": did["scope"],
                                                "name": did["name"],
                                            }):
                                        for rse_dict in parse_expression(
                                                rule["rse_expression"],
                                                filter_={"vo": account.vo},
                                        ):
                                            preferred_rses.add(rse_dict["rse"])
                                    preferred_rses = list(preferred_rses)

                                    try:
                                        (
                                            selected_rses,
                                            preferred_unmatched,
                                        ) = resolve_rse_expression(
                                            rse_expression,
                                            account,
                                            weight=weight,
                                            copies=copies,
                                            size=0,
                                            preferred_rses=preferred_rses,
                                            blocklist=blocklisted_rse_id,
                                        )

                                    except (
                                            InsufficientTargetRSEs,
                                            InsufficientAccountLimit,
                                            InvalidRuleWeight,
                                            RSEOverQuota,
                                    ) as error:
                                        logger(
                                            logging.WARNING,
                                            'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blocklisted sites'
                                            % (
                                                subscription["name"],
                                                account,
                                                str(error),
                                            ),
                                        )
                                        # Now including the blocklisted sites
                                        try:
                                            (
                                                selected_rses,
                                                preferred_unmatched,
                                            ) = resolve_rse_expression(
                                                rse_expression,
                                                account,
                                                weight=weight,
                                                copies=copies,
                                                size=0,
                                                preferred_rses=preferred_rses,
                                            )
                                            ignore_availability = True
                                        except (
                                                InsufficientTargetRSEs,
                                                InsufficientAccountLimit,
                                                InvalidRuleWeight,
                                                RSEOverQuota,
                                        ) as error:
                                            logger(
                                                logging.ERROR,
                                                'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.'
                                                % (
                                                    subscription["name"],
                                                    account,
                                                    str(error),
                                                ),
                                            )
                                            monitor.record_counter(
                                                name=
                                                "transmogrifier.addnewrule.errortype.{exception}",
                                                labels={
                                                    "exception":
                                                    str(error.__class__.
                                                        __name__)
                                                },
                                            )
                                            # The DID won't be reevaluated at the next cycle (did_success keeps its value)
                                            continue

                                    if (len(preferred_rses) -
                                            len(preferred_unmatched) >=
                                            copies):
                                        skip_rule_creation = True

                            for attempt in range(0, nattempt):
                                attemptnr = attempt
                                nb_rule = 0
                                #  Try to create the rule
                                try:
                                    if split_rule:
                                        if not skip_rule_creation:
                                            for rse in selected_rses:
                                                if isinstance(
                                                        selected_rses, dict):
                                                    source_replica_expression = (
                                                        selected_rses[rse].get(
                                                            "source_replica_expression",
                                                            None,
                                                        ))
                                                    weight = selected_rses[
                                                        rse].get(
                                                            "weight", None)
                                                logger(
                                                    logging.INFO,
                                                    "Will insert one rule for %s:%s on %s"
                                                    % (did["scope"],
                                                       did["name"], rse),
                                                )
                                                rule_ids = add_rule(
                                                    dids=[{
                                                        "scope": did["scope"],
                                                        "name": did["name"],
                                                    }],
                                                    account=account,
                                                    copies=1,
                                                    rse_expression=rse,
                                                    grouping=grouping,
                                                    weight=weight,
                                                    lifetime=lifetime,
                                                    locked=locked,
                                                    subscription_id=
                                                    subscription_id,
                                                    source_replica_expression=
                                                    source_replica_expression,
                                                    activity=activity,
                                                    purge_replicas=
                                                    purge_replicas,
                                                    ignore_availability=
                                                    ignore_availability,
                                                    comment=comment,
                                                )
                                                created_rules[cnt].append(
                                                    rule_ids[0])
                                                nb_rule += 1
                                                if nb_rule == copies:
                                                    success = True
                                                    break
                                    else:
                                        rule_ids = add_rule(
                                            dids=[{
                                                "scope": did["scope"],
                                                "name": did["name"],
                                            }],
                                            account=account,
                                            copies=copies,
                                            rse_expression=rse_expression,
                                            grouping=grouping,
                                            weight=weight,
                                            lifetime=lifetime,
                                            locked=locked,
                                            subscription_id=subscription["id"],
                                            source_replica_expression=
                                            source_replica_expression,
                                            activity=activity,
                                            purge_replicas=purge_replicas,
                                            ignore_availability=
                                            ignore_availability,
                                            comment=comment,
                                        )
                                        created_rules[cnt].append(rule_ids[0])
                                        nb_rule += 1
                                    monitor.record_counter(
                                        name="transmogrifier.addnewrule.done",
                                        delta=nb_rule,
                                    )
                                    monitor.record_counter(
                                        name=
                                        "transmogrifier.addnewrule.activity.{activity}",
                                        delta=nb_rule,
                                        labels={"activity": str_activity},
                                    )
                                    success = True
                                    break
                                except (
                                        InvalidReplicationRule,
                                        InvalidRuleWeight,
                                        InvalidRSEExpression,
                                        StagingAreaRuleRequiresLifetime,
                                        DuplicateRule,
                                ) as error:
                                    # Errors that won't be retried
                                    success = True
                                    logger(logging.ERROR, str(error))
                                    monitor.record_counter(
                                        name=
                                        "transmogrifier.addnewrule.errortype.{exception}",
                                        labels={
                                            "exception":
                                            str(error.__class__.__name__)
                                        },
                                    )
                                    break
                                except (
                                        ReplicationRuleCreationTemporaryFailed,
                                        InsufficientTargetRSEs,
                                        InsufficientAccountLimit,
                                        DatabaseException,
                                        RSEWriteBlocked,
                                ) as error:
                                    # Errors to be retried
                                    logger(
                                        logging.ERROR,
                                        "%s Will perform another attempt %i/%i"
                                        % (str(error), attempt + 1, nattempt),
                                    )
                                    monitor.record_counter(
                                        name=
                                        "transmogrifier.addnewrule.errortype.{exception}",
                                        labels={
                                            "exception":
                                            str(error.__class__.__name__)
                                        },
                                    )
                                except Exception:
                                    # Unexpected errors
                                    monitor.record_counter(
                                        name=
                                        "transmogrifier.addnewrule.errortype.{exception}",
                                        labels={"exception": "unknown"},
                                    )
                                    logger(logging.ERROR,
                                           "Unexpected error",
                                           exc_info=True)

                            did_success = did_success and success
                            if (attemptnr + 1) == nattempt and not success:
                                logger(
                                    logging.ERROR,
                                    "Rule for %s:%s on %s cannot be inserted" %
                                    (did["scope"], did["name"],
                                     rse_expression),
                                )
                            else:
                                logger(
                                    logging.INFO,
                                    "%s rule(s) inserted in %f seconds" %
                                    (str(nb_rule), time.time() - stime),
                                )
            except DataIdentifierNotFound as error:
                logger(logging.WARNING, str(error))

        if did_success:
            if did["did_type"] == str(DIDType.FILE):
                monitor.record_counter(
                    name="transmogrifier.did.file.processed")
            elif did["did_type"] == str(DIDType.DATASET):
                monitor.record_counter(
                    name="transmogrifier.did.dataset.processed")
            elif did["did_type"] == str(DIDType.CONTAINER):
                monitor.record_counter(
                    name="transmogrifier.did.container.processed", delta=1)
            monitor.record_counter(name="transmogrifier.did.processed",
                                   delta=1)
            identifiers.append({
                "scope": did["scope"],
                "name": did["name"],
                "did_type": did["did_type"],
            })

    time1 = time.time()

    #  Mark the DIDs as processed
    for identifier in chunks(identifiers, 100):
        _retrial(set_new_dids, identifier, None)

    logger(logging.DEBUG,
           "Time to set the new flag : %f" % (time.time() - time1))
    tottime = time.time() - start_time
    for sub in subscriptions:
        update_subscription(
            name=sub["name"],
            account=sub["account"],
            metadata={"last_processed": datetime.now()},
        )
    logger(logging.INFO,
           "It took %f seconds to process %i DIDs" % (tottime, len(dids)))
    logger(logging.DEBUG, "DIDs processed : %s" % (str(dids)))
    monitor.record_counter(name="transmogrifier.job.done", delta=1)
    monitor.record_timer(name="transmogrifier.job.duration",
                         time=1000 * tottime)
    must_sleep = True
    return must_sleep
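The subscription ordering above buckets subscriptions by 'policyid' (0 is the highest priority, 5 the lowest, 3 the default) and flattens the buckets in ascending order. A self-contained sketch of just that step, with illustrative names:

def order_subscriptions_by_priority(subscriptions, default_priority=3):
    buckets = {default_priority: []}
    for sub in subscriptions:
        buckets.setdefault(int(sub.get('policyid', default_priority)), []).append(sub)
    # Flatten from the lowest policyid (highest priority) to the highest
    return [sub for priority in sorted(buckets) for sub in buckets[priority]]


subs = [{'name': 'b', 'policyid': 0}, {'name': 'c', 'policyid': 5}, {'name': 'a'}]
print([s['name'] for s in order_subscriptions_by_priority(subs)])  # ['b', 'a', 'c']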
Exemplo n.º 19
0
        assert_equal(None, get_did_atime(scope=tmp_scope, name=tmp_dsn2))

    def test_update_dids(self):
        """ DATA IDENTIFIERS (CORE): Update file size and checksum"""
        tmp_scope = 'mock'
        dsn = 'dsn_%s' % generate_uuid()
        lfn = 'lfn.%s' % str(generate_uuid())
        add_did(scope=tmp_scope, name=dsn, type=DIDType.DATASET, account='root')

        files = [{'scope': tmp_scope, 'name': lfn,
                  'bytes': 724963570, 'adler32': '0cc737eb',
                  'meta': {'guid': str(generate_uuid()), 'events': 100}}]
        attach_dids(scope=tmp_scope, name=dsn, rse='MOCK', dids=files, account='root')

        set_metadata(scope=tmp_scope, name=lfn, key='adler32', value='0cc737ee')
        assert_equal(get_metadata(scope=tmp_scope, name=lfn)['adler32'], '0cc737ee')

        with assert_raises(UnsupportedOperation):
            set_metadata(scope=tmp_scope, name='Nimportnawak', key='adler32', value='0cc737ee')

        set_metadata(scope=tmp_scope, name=lfn, key='bytes', value=724963577)
        assert_equal(get_metadata(scope=tmp_scope, name=lfn)['bytes'], 724963577)


class TestDIDApi:

    def test_list_new_dids(self):
        """ DATA IDENTIFIERS (API): List new identifiers """
        tmp_scope = scope_name_generator()
        tmp_dsn = 'dsn_%s' % generate_uuid()
        scope.add_scope(tmp_scope, 'jdoe', 'jdoe')
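The tests above exercise the set_metadata/get_metadata round trip. A minimal sketch of the same pattern as a helper, assuming the core API signatures shown in these examples:

from rucio.core.did import get_metadata, set_metadata


def update_checksum(scope, name, new_adler32):
    # Overwrite the stored adler32 of a DID, then read it back for verification.
    set_metadata(scope=scope, name=name, key='adler32', value=new_adler32)
    return get_metadata(scope=scope, name=name)['adler32']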
Exemplo n.º 20
0
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=300):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in the 'replicas' table
    and available on at least one other RSE. Finds the SURLs of these replicas and declares them bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE
                                          is considered problematic and the suspicious replicas on that RSE
                                          are labeled as 'TEMPORARY_UNAVAILABLE'.
    :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then
                       it should sleep until sleep_time is over.
    :returns: None
    """

    # assembling the worker name identifier ('executable'), including the RSEs from <rse_expression>,
    # so that the start of a second instance with the same set of RSEs can be detected

    executable = argv[0]

    prepend_str = 'replica_recoverer: '
    logger = formatted_logger(logging.log, prepend_str + '%s')

    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    if not multi_vo:
        if vos:
            logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.')
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set([v['vo'] for v in list_vos()])
            if invalid:
                msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logger(logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos])))

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logger(logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname())
                die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
                break

            prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers)
            logger = formatted_logger(logging.log, prepend_str + '%s')

            start = time.time()

            try:
                json_file = open("/opt/rucio/etc/suspicious_replica_recoverer.json")
            except OSError:
                logger(logging.WARNING, "An error occurred whilst trying to open the JSON file.")
                break

            try:
                json_data = json.load(json_file)
            except ValueError:
                logger(logging.WARNING, "No JSON object could be decoded.")
                break

            # Checking that the JSON file is formatted properly.
            for i, entry in enumerate(json_data):
                if "datatype" not in entry or "action" not in entry:
                    logger(logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i)
                    break

            logger(logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts)

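            # Exclude replicas whose state already marks them as handled:
            # (B)ad, (R)ecovered, (D)eleted, (L)ost, (T)emporarily unavailable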
            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'is_suspicious': True}

            for vo in vos:
                logger(logging.INFO, 'Start replica recovery for VO: %s', vo)
                recoverable_replicas = {vo: {}}

                # rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true', filter={'vo': vo})], key=lambda k: k['rse'])
                rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true') if rse['vo'] == vo], key=lambda k: k['rse'])

                logger(logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:")
                for i in rse_list:
                    logger(logging.DEBUG, '%s', i)

                for rse in rse_list:
                    time_start_rse = time.time()
                    rse_expr = rse['rse']
                    cnt_surl_not_found = 0
                    if rse_expr not in recoverable_replicas[vo]:
                        recoverable_replicas[vo][rse_expr] = {}
                    # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs
                    suspicious_replicas_avail_elsewhere = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs)
                    # Get the suspicious replicas that are the last remaining copies
                    suspicious_replicas_last_copy = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"].value, filter_={'vo': vo}, **getfileskwargs)

                    logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr)
                    logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere))
                    for i in suspicious_replicas_avail_elsewhere:
                        logger(logging.DEBUG, '%s', i)
                    logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy))
                    for i in suspicious_replicas_last_copy:
                        logger(logging.DEBUG, '%s', i)

                    # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE.
                    if (rse['availability'] not in {4, 5, 6, 7}) and ((len(suspicious_replicas_avail_elsewhere) > 0) or (len(suspicious_replicas_last_copy) > 0)):
                        logger(logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n", rse_expr, rse['availability'])
                        continue

                    if suspicious_replicas_avail_elsewhere:
                        for replica in suspicious_replicas_avail_elsewhere:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
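                                # Each entry returned by list_replicas() carries an 'rses' dict mapping
                                # rse_id -> list of pfns; keep the pfn that belongs to this RSE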
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    for rse_ in rep['rses']:
                                        if rse_ == rse_id:
                                            recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_][0], 'available_elsewhere': True}
                                            surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    if suspicious_replicas_last_copy:
                        for replica in suspicious_replicas_last_copy:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                # Should only return one rse, as there is only one replica remaining
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False}
                                    surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    logger(logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.',
                           time.time() - time_start_rse, rse_expr, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy),
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) - cnt_surl_not_found, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy))

                    if len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) != 0:
                        logger(logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr)
                        for i in recoverable_replicas[vo][rse_expr]:
                            logger(logging.DEBUG, '%s', i)

                # The log file is long and hard to read -> add some spacing between sections
                logger(logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start)
                logger(logging.INFO, 'Begin check for problematic RSEs.')
                time_start_check_probl = time.time()

                # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE.
                # The suspicious files are marked as temporarily unavailable.
                list_problematic_rses = []
                for rse_key in list(recoverable_replicas[vo].keys()):
                    if len(recoverable_replicas[vo][rse_key].values()) > limit_suspicious_files_on_rse:
                        list_problematic_rses.append(rse_key)
                        surls_list = []
                        for replica_value in recoverable_replicas[vo][rse_key].values():
                            surls_list.append(replica_value['surl'])

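                        # Declare every collected pfn on this RSE as temporarily unavailable for three days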
                        add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3))

                        logger(logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse)
                        logger(logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key)
                        for rse_values in recoverable_replicas[vo][rse_key].values():
                            logger(logging.INFO, 'Scope: %s    Name: %s', rse_values['scope'], rse_values['name'])
                        # Remove the RSE from the dictionary as it has been dealt with.
                        del recoverable_replicas[vo][rse_key]

                logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses))
                for rse in list_problematic_rses:
                    logger(logging.INFO, "%s", rse)

                # Label suspicious replicas as bad if they have other copies on other RSEs (that aren't also marked as suspicious).
                # If they are the last remaining copies, deal with them differently.
                for rse_key in list(recoverable_replicas[vo].keys()):
                    files_to_be_declared_bad = []
                    files_to_be_ignored = []
                    # Remove RSEs from dictionary that don't have any suspicious replicas
                    if len(recoverable_replicas[vo][rse_key]) == 0:
                        del recoverable_replicas[vo][rse_key]
                        continue
                    # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there
                    rse_id = list(recoverable_replicas[vo][rse_key].values())[0]['rse_id']
                    for replica_key in list(recoverable_replicas[vo][rse_key].keys()):
                        if recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is True:
                            # Replicas with other copies on at least one other RSE can safely be labeled as bad
                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                            # Remove replica from dictionary
                            del recoverable_replicas[vo][rse_key][replica_key]
                        elif recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is False:
                            if (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("log.")) or (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("user")):
                                # Don't keep log files or user files
                                files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                del recoverable_replicas[vo][rse_key][replica_key]
                            else:
                                # Deal with replicas based on their metadata.
                                file_metadata = get_metadata(recoverable_replicas[vo][rse_key][replica_key]["scope"], recoverable_replicas[vo][rse_key][replica_key]["name"])
                                if file_metadata["datatype"] is None:  # "None" type has no function "split()"
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                    continue
                                for i in json_data:
                                    if i["datatype"] == file_metadata["datatype"].split("_")[-1]:
                                        action = i["action"]
                                        if action == "ignore":
                                            files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        elif action == "declare bad":
                                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        else:
                                            logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)",
                                                   rse_key, replica_key, recoverable_replicas[vo][rse_key][replica_key]['surl'], i["datatype"], i["action"])
                                        break
                                else:
                                    # No break above: no policy matched this datatype, so default to ignoring the file (no action taken).
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])

                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key)
                    for i in files_to_be_ignored:
                        logger(logging.INFO, '%s', i)
                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be declared BAD:', rse_key)
                    for i in files_to_be_declared_bad:
                        logger(logging.INFO, '%s', i)

                    if files_to_be_declared_bad:
                        logger(logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id))

                        declare_bad_file_replicas(pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None)

                        logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key)

                logger(logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl)

                time_passed = time.time() - start
                logger(logging.INFO, 'Total time: %s seconds', time_passed)
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
            elif match('.*ORA-03135.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
            else:
                logger(logging.CRITICAL, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    logger(logging.INFO, 'Graceful stop done.')
Exemplo n.º 21
0
def lfn2pfn_DUNE(scope, name, rse, rse_attrs, protocol_attrs):
    import os
    from datetime import datetime

    from rucio.common import config
    from rucio.common.types import InternalScope
    from rucio.rse import rsemanager
    from metacat.webapi import MetaCatClient

    # current URL: https://metacat.fnal.gov:9443/dune_meta_demo/app
    metacat_url = config.config_get('policy', 'metacat_base_url',
                                    raise_exception=False, default=None) or os.environ.get("METACAT_SERVER_URL")
    if metacat_url is None:
        raise ValueError("MetaCat client URL is not configured")

    metacat_client = MetaCatClient(metacat_url)

    def get_metadata_field(metadata, field):
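        # Missing fields fall back to the literal string 'None' so they can still be used as path components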
        if field in metadata:
            return metadata[field]
        return 'None'

    # check to see if PFN is already cached in Rucio's metadata system
    didclient = None
    didmd = {}
    internal_scope = InternalScope(scope)
    if getattr(rsemanager, 'CLIENT_MODE', None):
        from rucio.client.didclient import DIDClient
        didclient = DIDClient()
        didmd = didclient.get_metadata(internal_scope, name)
    if getattr(rsemanager, 'SERVER_MODE', None):
        from rucio.core.did import get_metadata
        didmd = get_metadata(internal_scope, name)

    # if it is, just return it
    md_key = 'PFN_' + rse
    if md_key in didmd:
        return didmd[md_key]

    lfn = scope + ':' + name
    jsondata = metacat_client.get_file(name=lfn)
    metadata = jsondata["metadata"]

    # determine year from timestamps
    timestamp = None
    if 'core.start_time' in metadata:
        timestamp = metadata['core.start_time']
    elif 'core.end_time' in metadata:
        timestamp = metadata['core.end_time']
    elif 'created_timestamp' in jsondata:
        timestamp = jsondata['created_timestamp']
    if timestamp is None:
        year = 'None'
    else:
        dt = datetime.utcfromtimestamp(timestamp)
        year = str(dt.year)

    # determine hashes from run number
    run_number = 0
    if 'core.runs' in metadata:
        run_number = int(metadata['core.runs'][0])

    hash1 = "%02d" % ((run_number // 1000000) % 100)
    hash2 = "%02d" % ((run_number // 10000) % 100)
    hash3 = "%02d" % ((run_number // 100) % 100)
    hash4 = "%02d" % (run_number % 100)
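    # e.g. run_number 12345678 -> hash path components 12/34/56/78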

    run_type = get_metadata_field(metadata, 'core.run_type')
    data_tier = get_metadata_field(metadata, 'core.data_tier')
    file_type = get_metadata_field(metadata, 'core.file_type')
    data_stream = get_metadata_field(metadata, 'core.data_stream')
    data_campaign = get_metadata_field(metadata, 'DUNE.campaign')
    filename = name

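    # Layout: pnfs/dune/tape_backed/dunepro/<run_type>/<data_tier>/<year>/<file_type>/<data_stream>/<campaign>/<hash1>/<hash2>/<hash3>/<hash4>/<filename>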
    pfn = '/'.join(['pnfs/dune/tape_backed/dunepro', run_type, data_tier, year,
                    file_type, data_stream, data_campaign, hash1, hash2, hash3, hash4, filename])

    # store the PFN in Rucio metadata for next time
    if getattr(rsemanager, 'CLIENT_MODE', None):
        didclient.set_metadata(internal_scope, name, md_key, pfn)
    if getattr(rsemanager, 'SERVER_MODE', None):
        from rucio.core.did import set_metadata
        set_metadata(internal_scope, name, md_key, pfn)

    return pfn
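
# A minimal usage sketch, assuming a reachable MetaCat instance and a file already
# registered there; every literal below is illustrative, not from the source:
#
#     pfn = lfn2pfn_DUNE(scope='dune', name='raw_run01234567_0001.hdf5',
#                        rse='SOME_RSE', rse_attrs={}, protocol_attrs={})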