Example #1
def bulk_group_transfer(transfers, policy='rule', group_bulk=200, source_strategy=None, max_time_in_queue=None, session=None, logger=logging.log, group_by_scope=False, archive_timeout_override=None):
    """
    Group transfers in bulk based on certain criteria

    :param transfers:                List of transfers to group.
    :param policy:                   Policy to use to group.
    :param group_bulk:               Bulk sizes.
    :param source_strategy:          Strategy to group sources
    :param max_time_in_queue:        Maximum time in queue
    :param session:                  Database session (unused here; kept for interface compatibility).
    :param logger:                   Optional decorated logger that can be passed from the calling daemons or servers.
    :param group_by_scope:           If True, group jobs per scope; otherwise use a single catch-all group per host.
    :param archive_timeout_override: Override the archive_timeout parameter for any transfers with it set (0 to unset)
    :return:                         List of grouped transfers.
    """

    # grouped_transfers: external_host -> scope -> job_key -> policy_key -> {'files': [...], 'job_params': {...}}
    # grouped_jobs:      external_host -> scope -> [{'files': [...], 'job_params': {...}}, ...]
    grouped_transfers = {}
    grouped_jobs = {}

    # Use empty string, but any string is OK, it is internal to this function only
    _catch_all_scopes_str = ''

    try:
        default_source_strategy = get(section='conveyor', option='default-source-strategy')
    except ConfigNotFound:
        default_source_strategy = 'orderly'

    try:
        activity_source_strategy = get(section='conveyor', option='activity-source-strategy')
        activity_source_strategy = loads(activity_source_strategy)
    except ConfigNotFound:
        activity_source_strategy = {}
    except ValueError:
        # Config value present but not valid JSON: fall back to no per-activity overrides.
        logger(logging.WARNING, 'activity_source_strategy not properly defined')
        activity_source_strategy = {}

    for request_id in transfers:
        transfer = transfers[request_id]

        # Decide which side(s) of the transfer to checksum-validate based on RSE capabilities.
        verify_checksum, checksums_to_use = transfer_core.checksum_validation_strategy(transfer.src.rse.attributes, transfer.dst.rse.attributes, logger=logger)
        t_file = {'sources': transfer['sources'],
                  'destinations': transfer['dest_urls'],
                  'metadata': transfer['file_metadata'],
                  'filesize': int(transfer['file_metadata']['filesize']),
                  'checksum': None,
                  'verify_checksum': verify_checksum,
                  'selection_strategy': source_strategy if source_strategy else activity_source_strategy.get(str(transfer['file_metadata']['activity']), default_source_strategy),
                  'request_type': transfer['file_metadata'].get('request_type', None),
                  'activity': str(transfer['file_metadata']['activity'])}

        if verify_checksum != 'none':
            set_checksum_value(t_file, checksums_to_use)

        multihop = transfer.get('multihop', False)
        strict_copy = transfer.get('strict_copy', False)
        use_ipv4 = transfer.get('use_ipv4', False)

        external_host = transfer['external_host']
        scope = t_file['metadata']['scope']
        activity = t_file['activity']
        if group_by_scope:
            scope_str = scope.internal
        else:
            # Use a catch-all scope which will be removed at the end
            scope_str = _catch_all_scopes_str

        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            grouped_jobs[external_host] = {}
        # This check must NOT be nested inside the block above: a new scope for an
        # already-known host must still be initialized, otherwise the lookups
        # below raise KeyError (reachable when group_by_scope is True).
        if scope_str not in grouped_transfers[external_host]:
            grouped_transfers[external_host][scope_str] = {}
            grouped_jobs[external_host][scope_str] = []

        current_transfers_group = grouped_transfers[external_host][scope_str]
        current_jobs_group = grouped_jobs[external_host][scope_str]

        job_params = {'account': transfer['account'],
                      'use_oidc': transfer_core.oidc_supported(transfer),
                      'verify_checksum': verify_checksum,
                      'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
                      'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
                      'job_metadata': {'issuer': 'rucio'},  # finaly job_meta will like this. currently job_meta will equal file_meta to include request_id and etc.
                      'overwrite': transfer['overwrite'],
                      'priority': 3}
        if transfer.get('archive_timeout', None):
            if archive_timeout_override is None:
                job_params['archive_timeout'] = transfer['archive_timeout']
            elif archive_timeout_override != 0:
                job_params['archive_timeout'] = archive_timeout_override
            # else don't set the value
        if multihop:
            job_params['multihop'] = True
        if strict_copy:
            job_params['strict_copy'] = True
        if use_ipv4:
            job_params['ipv4'] = True
            job_params['ipv6'] = False

        # Don't put optional & missing keys in the parameters
        if transfer['dest_spacetoken']:
            job_params.update({'spacetoken': transfer['dest_spacetoken']})
        if transfer['src_spacetoken']:
            job_params.update({'source_spacetoken': transfer['src_spacetoken']})

        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            current_jobs_group.append({'files': [t_file], 'job_params': job_params})
        else:
            job_params['job_metadata']['multi_sources'] = False
            # Transfers can only be batched into one job when all job-level
            # parameters agree, so the key is built from those parameters.
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (job_params['verify_checksum'], job_params.get('spacetoken', None),
                                                   job_params['copy_pin_lifetime'],
                                                   job_params['bring_online'], job_params['job_metadata'],
                                                   job_params.get('source_spacetoken', None),
                                                   job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            if multihop:
                # All hops of one multihop request must end up in the same job.
                job_key = 'multihop_%s' % (transfer['initial_request_id'])

            if job_key not in current_transfers_group:
                current_transfers_group[job_key] = {}

            if multihop:
                policy_key = 'multihop_%s' % (transfer['initial_request_id'])
            else:
                if policy == 'rule':
                    policy_key = '%s' % (transfer['rule_id'])
                elif policy == 'dest':
                    policy_key = '%s' % (t_file['metadata']['dst_rse'])
                elif policy == 'src_dest':
                    policy_key = '%s,%s' % (t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                elif policy == 'rule_src_dest':
                    policy_key = '%s,%s,%s' % (transfer['rule_id'], t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                elif policy == 'activity_dest':
                    policy_key = '%s %s' % (activity, t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                elif policy == 'activity_src_dest':
                    policy_key = '%s %s %s' % (activity, t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                    # maybe here we need to hash the key if it's too long
                # NOTE(review): an unrecognised policy leaves policy_key unset and
                # raises UnboundLocalError below — confirm callers only pass the
                # policies handled above.

            if policy_key not in current_transfers_group[job_key]:
                current_transfers_group[job_key][policy_key] = {'files': [], 'job_params': job_params}
            current_transfers_policy = current_transfers_group[job_key][policy_key]
            if multihop:
                # The parent transfer should be the first of the list
                # TODO : Only work for a single hop now, need to be able to handle multiple hops
                if transfer['parent_request']:  # This is the child
                    current_transfers_policy['files'].append(t_file)
                else:
                    current_transfers_policy['files'].insert(0, t_file)
            else:
                current_transfers_policy['files'].append(t_file)

    # for jobs with different job_key, we cannot put in one job.
    for external_host in grouped_transfers:
        for scope_key in grouped_transfers[external_host]:
            for job_key in grouped_transfers[external_host][scope_key]:
                # for all policy groups in job_key, the job_params is the same.
                for policy_key in grouped_transfers[external_host][scope_key][job_key]:
                    job_params = grouped_transfers[external_host][scope_key][job_key][policy_key]['job_params']
                    for xfers_files in chunks(grouped_transfers[external_host][scope_key][job_key][policy_key]['files'], group_bulk):
                        # for the last small piece, just submit it.
                        grouped_jobs[external_host][scope_key].append({'files': xfers_files, 'job_params': job_params})

    if not group_by_scope:
        # Flatten away the internal catch-all scope level.
        for external_host in grouped_jobs:
            grouped_jobs[external_host] = grouped_jobs[external_host][_catch_all_scopes_str]

    return grouped_jobs
Example #2
File: common.py — Project: pic-es/rucio
def bulk_group_transfer(transfers, policy='rule', group_bulk=200, source_strategy=None, max_time_in_queue=None, session=None):
    """
    Group transfers in bulk based on certain criteria

    :param transfers:             List of transfers to group.
    :param policy:                Policy to use to group.
    :param group_bulk:            Bulk sizes.
    :param source_strategy:       Strategy to group sources
    :param max_time_in_queue:     Maximum time in queue
    :param session:               Database session, forwarded to RSE checksum lookups.
    :return:                      List of grouped transfers.
    """

    # grouped_transfers/grouped_jobs layout depends on the USER_TRANSFERS mode:
    # default:        external_host -> job_key -> policy_key -> group
    # user activity:  external_host -> scope -> job_key -> policy_key -> group
    grouped_transfers = {}
    grouped_jobs = {}

    try:
        default_source_strategy = get(section='conveyor', option='default-source-strategy')
    except ConfigNotFound:
        default_source_strategy = 'orderly'

    try:
        activity_source_strategy = get(section='conveyor', option='activity-source-strategy')
        activity_source_strategy = loads(activity_source_strategy)
    except ConfigNotFound:
        activity_source_strategy = {}
    except ValueError:
        # Config value present but not valid JSON: no per-activity overrides.
        logging.warning('activity_source_strategy not properly defined')
        activity_source_strategy = {}

    for request_id in transfers:
        transfer = transfers[request_id]
        verify_checksum = transfer['file_metadata'].get('verify_checksum', 'both')

        dest_rse_id = transfer['file_metadata']['dest_rse_id']
        source_rse_id = transfer['file_metadata']['src_rse_id']

        dest_supported_checksums = get_rse_supported_checksums(rse_id=dest_rse_id, session=session)
        source_supported_checksums = get_rse_supported_checksums(rse_id=source_rse_id, session=session)
        common_checksum_names = set(source_supported_checksums).intersection(dest_supported_checksums)

        # Downgrade the validation strategy to whatever both endpoints can support.
        if source_supported_checksums == ['none']:
            if dest_supported_checksums == ['none']:
                # both endpoints support none
                verify_checksum = 'none'
            else:
                # src supports none but dst does
                verify_checksum = 'destination'
        else:
            if dest_supported_checksums == ['none']:
                # source supports some but destination does not
                verify_checksum = 'source'
            else:
                if len(common_checksum_names) == 0:
                    # source and dst support some bot none in common (dst priority)
                    verify_checksum = 'destination'
                else:
                    # Don't override the value in the file_metadata
                    pass

        t_file = {'sources': transfer['sources'],
                  'destinations': transfer['dest_urls'],
                  'metadata': transfer['file_metadata'],
                  'filesize': int(transfer['file_metadata']['filesize']),
                  'checksum': None,
                  'verify_checksum': verify_checksum,
                  'selection_strategy': source_strategy if source_strategy else activity_source_strategy.get(str(transfer['file_metadata']['activity']), default_source_strategy),
                  'request_type': transfer['file_metadata'].get('request_type', None),
                  'activity': str(transfer['file_metadata']['activity'])}

        if verify_checksum != 'none':
            if verify_checksum == 'both':
                set_checksum_value(t_file, common_checksum_names)
            if verify_checksum == 'source':
                set_checksum_value(t_file, source_supported_checksums)
            if verify_checksum == 'destination':
                set_checksum_value(t_file, dest_supported_checksums)

        multihop = transfer.get('multihop', False)
        strict_copy = transfer.get('strict_copy', False)

        external_host = transfer['external_host']
        scope = t_file['metadata']['scope']
        scope_str = scope.internal
        activity = t_file['activity']

        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host] = []
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host] = {}
                # NOTE(review): this scope initialization only runs when the host is
                # first seen — a second scope for a known host looks like it would
                # KeyError on the per-scope appends below; confirm against callers.
                if scope_str not in grouped_transfers[external_host]:
                    grouped_transfers[external_host][scope_str] = {}
                    grouped_jobs[external_host][scope_str] = []

        job_params = {'account': transfer['account'],
                      'use_oidc': transfer.get('use_oidc', False),
                      'verify_checksum': verify_checksum,
                      'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
                      'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
                      'job_metadata': {'issuer': 'rucio'},  # finaly job_meta will like this. currently job_meta will equal file_meta to include request_id and etc.
                      'overwrite': transfer['overwrite'],
                      'priority': 3,
                      's3alternate': True}
        if multihop:
            job_params['multihop'] = True
        if strict_copy:
            job_params['strict_copy'] = True

        # Don't put optional & missing keys in the parameters
        if transfer['dest_spacetoken']:
            job_params.update({'spacetoken': transfer['dest_spacetoken']})
        if transfer['src_spacetoken']:
            job_params.update({'source_spacetoken': transfer['src_spacetoken']})

        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host].append({'files': [t_file], 'job_params': job_params})
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host][scope_str].append({'files': [t_file], 'job_params': job_params})
        else:
            job_params['job_metadata']['multi_sources'] = False
            # Transfers can only be batched into one job when all job-level
            # parameters agree, so the key is built from those parameters.
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (job_params['verify_checksum'], job_params.get('spacetoken', None),
                                                   job_params['copy_pin_lifetime'],
                                                   job_params['bring_online'], job_params['job_metadata'],
                                                   job_params.get('source_spacetoken', None),
                                                   job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            if multihop:
                # All hops of one multihop request must end up in the same job.
                job_key = 'multihop_%s' % (transfer['initial_request_id'])

            if job_key not in grouped_transfers[external_host]:
                if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                    grouped_transfers[external_host][job_key] = {}
                elif activity in USER_ACTIVITY:
                    grouped_transfers[external_host][scope_str][job_key] = {}

            if multihop:
                policy_key = 'multihop_%s' % (transfer['initial_request_id'])
            else:
                if policy == 'rule':
                    policy_key = '%s' % (transfer['rule_id'])
                if policy == 'dest':
                    policy_key = '%s' % (t_file['metadata']['dst_rse'])
                if policy == 'src_dest':
                    policy_key = '%s,%s' % (t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'rule_src_dest':
                    policy_key = '%s,%s,%s' % (transfer['rule_id'], t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'activity_dest':
                    policy_key = '%s %s' % (activity, t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                if policy == 'activity_src_dest':
                    policy_key = '%s %s %s' % (activity, t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                    # maybe here we need to hash the key if it's too long

            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][job_key]:
                    grouped_transfers[external_host][job_key][policy_key] = {'files': [t_file], 'job_params': job_params}
                else:
                    if multihop:
                        # The parent transfer should be the first of the list
                        # TODO : Only work for a single hop now, need to be able to handle multiple hops
                        if transfer['parent_request']:  # This is the child
                            grouped_transfers[external_host][job_key][policy_key]['files'].append(t_file)
                        else:
                            grouped_transfers[external_host][job_key][policy_key]['files'].insert(0, t_file)
                    else:
                        grouped_transfers[external_host][job_key][policy_key]['files'].append(t_file)
            elif activity in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][scope_str][job_key]:
                    grouped_transfers[external_host][scope_str][job_key][policy_key] = {'files': [t_file], 'job_params': job_params}
                else:
                    if multihop:
                        # The parent transfer should be the first of the list
                        # TODO : Only work for a single hop now, need to be able to handle multiple hops
                        if transfer['parent_request']:  # This is the child
                            grouped_transfers[external_host][scope_str][job_key][policy_key]['files'].append(t_file)
                        else:
                            grouped_transfers[external_host][scope_str][job_key][policy_key]['files'].insert(0, t_file)
                    else:
                        # Mirror the non-user branch above: without this append,
                        # non-multihop files landing in an existing policy group
                        # were silently dropped.
                        grouped_transfers[external_host][scope_str][job_key][policy_key]['files'].append(t_file)

    # for jobs with different job_key, we cannot put in one job.
    # NOTE(review): 'activity' below is whatever the LAST loop iteration left
    # behind, so mixed-activity batches may aggregate through the wrong branch —
    # confirm whether callers only ever pass single-activity batches.
    for external_host in grouped_transfers:
        if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
            for job_key in grouped_transfers[external_host]:
                # for all policy groups in job_key, the job_params is the same.
                for policy_key in grouped_transfers[external_host][job_key]:
                    job_params = grouped_transfers[external_host][job_key][policy_key]['job_params']
                    for xfers_files in chunks(grouped_transfers[external_host][job_key][policy_key]['files'], group_bulk):
                        # for the last small piece, just submit it.
                        grouped_jobs[external_host].append({'files': xfers_files, 'job_params': job_params})
        elif activity in USER_ACTIVITY:
            for scope_key in grouped_transfers[external_host]:
                for job_key in grouped_transfers[external_host][scope_key]:
                    # for all policy groups in job_key, the job_params is the same.
                    for policy_key in grouped_transfers[external_host][scope_key][job_key]:
                        job_params = grouped_transfers[external_host][scope_key][job_key][policy_key]['job_params']
                        for xfers_files in chunks(grouped_transfers[external_host][scope_key][job_key][policy_key]['files'], group_bulk):
                            # for the last small piece, just submit it.
                            grouped_jobs[external_host][scope_key].append({'files': xfers_files, 'job_params': job_params})

    return grouped_jobs