def bulk_group_transfer(transfers, policy='rule', group_bulk=200, source_strategy=None, max_time_in_queue=None, session=None, logger=logging.log, group_by_scope=False, archive_timeout_override=None):
    """
    Group transfers in bulk based on certain criteria

    :param transfers: List of transfers to group.
    :param policy: Policy to use to group.
    :param group_bulk: Bulk sizes.
    :param source_strategy: Strategy to group sources
    :param max_time_in_queue: Maximum time in queue
    :param session: Database session (not referenced in this function body).
    :param archive_timeout_override: Override the archive_timeout parameter for any transfers with it set (0 to unset)
    :param logger: Optional decorated logger that can be passed from the calling daemons or servers.
    :param group_by_scope: If True, jobs are grouped per scope; otherwise all scopes share a single catch-all group.
    :return: List of grouped transfers.
    """
    # grouped_transfers: external_host -> scope -> job_key -> policy_key -> {'files', 'job_params'}
    # grouped_jobs:      external_host -> scope -> [{'files', 'job_params'}, ...]  (the returned structure)
    grouped_transfers = {}
    grouped_jobs = {}

    # Use empty string, but any string is OK, it is internal to this function only
    _catch_all_scopes_str = ''

    # Source-selection strategy: explicit argument wins, then the per-activity
    # config mapping, then the configured (or hard-coded 'orderly') default.
    try:
        default_source_strategy = get(section='conveyor', option='default-source-strategy')
    except ConfigNotFound:
        default_source_strategy = 'orderly'

    try:
        activity_source_strategy = get(section='conveyor', option='activity-source-strategy')
        activity_source_strategy = loads(activity_source_strategy)
    except ConfigNotFound:
        activity_source_strategy = {}
    except ValueError:
        # Config value exists but is not valid JSON; fall back to the default strategy only.
        logger(logging.WARNING, 'activity_source_strategy not properly defined')
        activity_source_strategy = {}

    for request_id in transfers:
        transfer = transfers[request_id]

        # Decide how checksums are verified for this src/dst RSE pair and which
        # checksum algorithms are usable.
        verify_checksum, checksums_to_use = transfer_core.checksum_validation_strategy(transfer.src.rse.attributes, transfer.dst.rse.attributes, logger=logger)

        # Per-file description as expected by the submission layer.
        t_file = {'sources': transfer['sources'],
                  'destinations': transfer['dest_urls'],
                  'metadata': transfer['file_metadata'],
                  'filesize': int(transfer['file_metadata']['filesize']),
                  'checksum': None,
                  'verify_checksum': verify_checksum,
                  'selection_strategy': source_strategy if source_strategy else activity_source_strategy.get(str(transfer['file_metadata']['activity']), default_source_strategy),
                  'request_type': transfer['file_metadata'].get('request_type', None),
                  'activity': str(transfer['file_metadata']['activity'])}
        if verify_checksum != 'none':
            set_checksum_value(t_file, checksums_to_use)

        multihop = transfer.get('multihop', False)
        strict_copy = transfer.get('strict_copy', False)
        use_ipv4 = transfer.get('use_ipv4', False)

        external_host = transfer['external_host']
        scope = t_file['metadata']['scope']
        activity = t_file['activity']
        if group_by_scope:
            scope_str = scope.internal
        else:
            # Use a catch-all scope which will be removed at the end
            scope_str = _catch_all_scopes_str

        # Lazily create the per-host / per-scope buckets.
        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            grouped_jobs[external_host] = {}
        if scope_str not in grouped_transfers[external_host]:
            grouped_transfers[external_host][scope_str] = {}
            grouped_jobs[external_host][scope_str] = []
        current_transfers_group = grouped_transfers[external_host][scope_str]
        current_jobs_group = grouped_jobs[external_host][scope_str]

        # Job-level submission parameters shared by every file in the job.
        job_params = {'account': transfer['account'],
                      'use_oidc': transfer_core.oidc_supported(transfer),
                      'verify_checksum': verify_checksum,
                      'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
                      'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
                      'job_metadata': {'issuer': 'rucio'},  # Eventually job_meta will look like this; currently job_meta equals file_meta so it includes request_id etc.
                      'overwrite': transfer['overwrite'],
                      'priority': 3}

        # archive_timeout: keep the transfer's own value unless the caller
        # overrides it; an override of 0 means "unset".
        if transfer.get('archive_timeout', None):
            if archive_timeout_override is None:
                job_params['archive_timeout'] = transfer['archive_timeout']
            elif archive_timeout_override != 0:
                job_params['archive_timeout'] = archive_timeout_override
            # else don't set the value

        if multihop:
            job_params['multihop'] = True
        if strict_copy:
            job_params['strict_copy'] = True
        if use_ipv4:
            job_params['ipv4'] = True
            job_params['ipv6'] = False

        # Don't put optional & missing keys in the parameters
        if transfer['dest_spacetoken']:
            job_params.update({'spacetoken': transfer['dest_spacetoken']})
        if transfer['src_spacetoken']:
            job_params.update({'source_spacetoken': transfer['src_spacetoken']})

        # Per-activity queue-time limit, with an optional 'default' fallback.
        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            current_jobs_group.append({'files': [t_file], 'job_params': job_params})
        else:
            job_params['job_metadata']['multi_sources'] = False

            # Transfers may only share a job when all their job parameters match;
            # the job_key string encodes exactly those parameters.
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (job_params['verify_checksum'], job_params.get('spacetoken', None),
                                                   job_params['copy_pin_lifetime'], job_params['bring_online'],
                                                   job_params['job_metadata'], job_params.get('source_spacetoken', None),
                                                   job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            # All hops of a multihop transfer must land in the same job,
            # keyed by the initial request.
            if multihop:
                job_key = 'multihop_%s' % (transfer['initial_request_id'])

            if job_key not in current_transfers_group:
                current_transfers_group[job_key] = {}

            # Secondary grouping inside a job according to the chosen policy.
            if multihop:
                policy_key = 'multihop_%s' % (transfer['initial_request_id'])
            else:
                if policy == 'rule':
                    policy_key = '%s' % (transfer['rule_id'])
                if policy == 'dest':
                    policy_key = '%s' % (t_file['metadata']['dst_rse'])
                if policy == 'src_dest':
                    policy_key = '%s,%s' % (t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'rule_src_dest':
                    policy_key = '%s,%s,%s' % (transfer['rule_id'], t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'activity_dest':
                    policy_key = '%s %s' % (activity, t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                if policy == 'activity_src_dest':
                    policy_key = '%s %s %s' % (activity, t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                # maybe here we need to hash the key if it's too long

            if policy_key not in current_transfers_group[job_key]:
                current_transfers_group[job_key][policy_key] = {'files': [], 'job_params': job_params}
            current_transfers_policy = current_transfers_group[job_key][policy_key]
            if multihop:
                # The parent transfer should be the first of the list
                # TODO : Only work for a single hop now, need to be able to handle multiple hops
                if transfer['parent_request']:  # This is the child
                    current_transfers_policy['files'].append(t_file)
                else:
                    current_transfers_policy['files'].insert(0, t_file)
            else:
                current_transfers_policy['files'].append(t_file)

    # for jobs with different job_key, we cannot put in one job.
    for external_host in grouped_transfers:
        for scope_key in grouped_transfers[external_host]:
            for job_key in grouped_transfers[external_host][scope_key]:
                # for all policy groups in job_key, the job_params is the same.
                for policy_key in grouped_transfers[external_host][scope_key][job_key]:
                    job_params = grouped_transfers[external_host][scope_key][job_key][policy_key]['job_params']
                    for xfers_files in chunks(grouped_transfers[external_host][scope_key][job_key][policy_key]['files'], group_bulk):
                        # for the last small piece, just submit it.
                        grouped_jobs[external_host][scope_key].append({'files': xfers_files, 'job_params': job_params})

    # Without scope grouping, collapse the single catch-all scope level so the
    # result is external_host -> [jobs] instead of external_host -> scope -> [jobs].
    if not group_by_scope:
        for external_host in grouped_jobs:
            grouped_jobs[external_host] = grouped_jobs[external_host][_catch_all_scopes_str]

    return grouped_jobs
def bulk_group_transfer(transfers, policy='rule', group_bulk=200, source_strategy=None, max_time_in_queue=None, session=None):
    """
    Group transfers in bulk based on certain criteria

    :param transfers: List of transfers to group.
    :param policy: Policy to use to group.
    :param group_bulk: Bulk sizes.
    :param source_strategy: Strategy to group sources
    :param max_time_in_queue: Maximum time in queue
    :param session: Database session, forwarded to the RSE checksum lookups.
    :return: List of grouped transfers.
    """
    # grouped_transfers: staging area keyed by external_host (and, for
    # USER_ACTIVITY transfers, by scope), further split by job_key/policy_key.
    # grouped_jobs: the returned structure of submittable jobs.
    grouped_transfers = {}
    grouped_jobs = {}

    # Source-selection strategy: explicit argument wins, then the per-activity
    # config mapping, then the configured (or hard-coded 'orderly') default.
    try:
        default_source_strategy = get(section='conveyor', option='default-source-strategy')
    except ConfigNotFound:
        default_source_strategy = 'orderly'

    try:
        activity_source_strategy = get(section='conveyor', option='activity-source-strategy')
        activity_source_strategy = loads(activity_source_strategy)
    except ConfigNotFound:
        activity_source_strategy = {}
    except ValueError:
        # Config value exists but is not valid JSON; fall back to the default strategy only.
        logging.warning('activity_source_strategy not properly defined')
        activity_source_strategy = {}

    for request_id in transfers:
        transfer = transfers[request_id]
        verify_checksum = transfer['file_metadata'].get('verify_checksum', 'both')

        # Negotiate the checksum-verification side based on which algorithms
        # each endpoint supports (['none'] marks an endpoint with no support).
        dest_rse_id = transfer['file_metadata']['dest_rse_id']
        source_rse_id = transfer['file_metadata']['src_rse_id']
        dest_supported_checksums = get_rse_supported_checksums(rse_id=dest_rse_id, session=session)
        source_supported_checksums = get_rse_supported_checksums(rse_id=source_rse_id, session=session)
        common_checksum_names = set(source_supported_checksums).intersection(dest_supported_checksums)

        if source_supported_checksums == ['none']:
            if dest_supported_checksums == ['none']:
                # both endpoints support none
                verify_checksum = 'none'
            else:
                # src supports none but dst does
                verify_checksum = 'destination'
        else:
            if dest_supported_checksums == ['none']:
                # source supports some but destination does not
                verify_checksum = 'source'
            else:
                if len(common_checksum_names) == 0:
                    # source and dst support some but none in common (dst priority)
                    verify_checksum = 'destination'
                else:
                    # Don't override the value in the file_metadata
                    pass

        # Per-file description as expected by the submission layer.
        t_file = {'sources': transfer['sources'],
                  'destinations': transfer['dest_urls'],
                  'metadata': transfer['file_metadata'],
                  'filesize': int(transfer['file_metadata']['filesize']),
                  'checksum': None,
                  'verify_checksum': verify_checksum,
                  'selection_strategy': source_strategy if source_strategy else activity_source_strategy.get(str(transfer['file_metadata']['activity']), default_source_strategy),
                  'request_type': transfer['file_metadata'].get('request_type', None),
                  'activity': str(transfer['file_metadata']['activity'])}

        # Attach the concrete checksum value matching the verification side.
        if verify_checksum != 'none':
            if verify_checksum == 'both':
                set_checksum_value(t_file, common_checksum_names)
            if verify_checksum == 'source':
                set_checksum_value(t_file, source_supported_checksums)
            if verify_checksum == 'destination':
                set_checksum_value(t_file, dest_supported_checksums)

        multihop = transfer.get('multihop', False)
        strict_copy = transfer.get('strict_copy', False)

        external_host = transfer['external_host']
        scope = t_file['metadata']['scope']
        scope_str = scope.internal
        activity = t_file['activity']

        # Lazily create the per-host buckets. For CMS user activities the jobs
        # are additionally keyed by scope (dict of lists); otherwise a flat list.
        if external_host not in grouped_transfers:
            grouped_transfers[external_host] = {}
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host] = []
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host] = {}
                if scope_str not in grouped_transfers[external_host]:
                    grouped_transfers[external_host][scope_str] = {}
                    grouped_jobs[external_host][scope_str] = []

        # Job-level submission parameters shared by every file in the job.
        job_params = {'account': transfer['account'],
                      'use_oidc': transfer.get('use_oidc', False),
                      'verify_checksum': verify_checksum,
                      'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
                      'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
                      'job_metadata': {'issuer': 'rucio'},  # Eventually job_meta will look like this; currently job_meta equals file_meta so it includes request_id etc.
                      'overwrite': transfer['overwrite'],
                      'priority': 3,
                      's3alternate': True}
        if multihop:
            job_params['multihop'] = True
        if strict_copy:
            job_params['strict_copy'] = True

        # Don't put optional & missing keys in the parameters
        if transfer['dest_spacetoken']:
            job_params.update({'spacetoken': transfer['dest_spacetoken']})
        if transfer['src_spacetoken']:
            job_params.update({'source_spacetoken': transfer['src_spacetoken']})

        # Per-activity queue-time limit, with an optional 'default' fallback.
        if max_time_in_queue:
            if transfer['file_metadata']['activity'] in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue[transfer['file_metadata']['activity']]
            elif 'default' in max_time_in_queue:
                job_params['max_time_in_queue'] = max_time_in_queue['default']

        # for multiple source replicas, no bulk submission
        if len(transfer['sources']) > 1:
            job_params['job_metadata']['multi_sources'] = True
            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                grouped_jobs[external_host].append({'files': [t_file], 'job_params': job_params})
            elif activity in USER_ACTIVITY:
                grouped_jobs[external_host][scope_str].append({'files': [t_file], 'job_params': job_params})
        else:
            job_params['job_metadata']['multi_sources'] = False

            # Transfers may only share a job when all their job parameters match;
            # the job_key string encodes exactly those parameters.
            job_key = '%s,%s,%s,%s,%s,%s,%s,%s' % (job_params['verify_checksum'], job_params.get('spacetoken', None),
                                                   job_params['copy_pin_lifetime'], job_params['bring_online'],
                                                   job_params['job_metadata'], job_params.get('source_spacetoken', None),
                                                   job_params['overwrite'], job_params['priority'])
            if 'max_time_in_queue' in job_params:
                job_key = job_key + ',%s' % job_params['max_time_in_queue']

            # All hops of a multihop transfer must land in the same job,
            # keyed by the initial request.
            if multihop:
                job_key = 'multihop_%s' % (transfer['initial_request_id'])

            if job_key not in grouped_transfers[external_host]:
                if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                    grouped_transfers[external_host][job_key] = {}
                elif activity in USER_ACTIVITY:
                    grouped_transfers[external_host][scope_str][job_key] = {}

            # Secondary grouping inside a job according to the chosen policy.
            if multihop:
                policy_key = 'multihop_%s' % (transfer['initial_request_id'])
            else:
                if policy == 'rule':
                    policy_key = '%s' % (transfer['rule_id'])
                if policy == 'dest':
                    policy_key = '%s' % (t_file['metadata']['dst_rse'])
                if policy == 'src_dest':
                    policy_key = '%s,%s' % (t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'rule_src_dest':
                    policy_key = '%s,%s,%s' % (transfer['rule_id'], t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                if policy == 'activity_dest':
                    policy_key = '%s %s' % (activity, t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                if policy == 'activity_src_dest':
                    policy_key = '%s %s %s' % (activity, t_file['metadata']['src_rse'], t_file['metadata']['dst_rse'])
                    policy_key = "_".join(policy_key.split(' '))
                # maybe here we need to hash the key if it's too long

            if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][job_key]:
                    grouped_transfers[external_host][job_key][policy_key] = {'files': [t_file], 'job_params': job_params}
                else:
                    if multihop:
                        # The parent transfer should be the first of the list
                        # TODO : Only work for a single hop now, need to be able to handle multiple hops
                        if transfer['parent_request']:  # This is the child
                            grouped_transfers[external_host][job_key][policy_key]['files'].append(t_file)
                        else:
                            grouped_transfers[external_host][job_key][policy_key]['files'].insert(0, t_file)
                    else:
                        grouped_transfers[external_host][job_key][policy_key]['files'].append(t_file)
            elif activity in USER_ACTIVITY:
                if policy_key not in grouped_transfers[external_host][scope_str][job_key]:
                    grouped_transfers[external_host][scope_str][job_key][policy_key] = {'files': [t_file], 'job_params': job_params}
                else:
                    # NOTE(review): unlike the branch above, there is no
                    # non-multihop append here, so a second non-multihop file
                    # with an existing policy_key is silently dropped — verify
                    # whether that is intentional.
                    if multihop:
                        # The parent transfer should be the first of the list
                        # TODO : Only work for a single hop now, need to be able to handle multiple hops
                        if transfer['parent_request']:  # This is the child
                            grouped_transfers[external_host][scope_str][job_key][policy_key]['files'].append(t_file)
                        else:
                            grouped_transfers[external_host][scope_str][job_key][policy_key]['files'].insert(0, t_file)

    # for jobs with different job_key, we cannot put in one job.
    # NOTE(review): 'activity' below is the leftover value from the last
    # iteration of the per-transfer loop above, so the branch chosen here
    # depends on whichever transfer happened to come last — looks like a bug;
    # confirm against the intended per-host grouping mode.
    for external_host in grouped_transfers:
        if USER_TRANSFERS not in ['cms'] or activity not in USER_ACTIVITY:
            for job_key in grouped_transfers[external_host]:
                # for all policy groups in job_key, the job_params is the same.
                for policy_key in grouped_transfers[external_host][job_key]:
                    job_params = grouped_transfers[external_host][job_key][policy_key]['job_params']
                    for xfers_files in chunks(grouped_transfers[external_host][job_key][policy_key]['files'], group_bulk):
                        # for the last small piece, just submit it.
                        grouped_jobs[external_host].append({'files': xfers_files, 'job_params': job_params})
        elif activity in USER_ACTIVITY:
            for scope_key in grouped_transfers[external_host]:
                for job_key in grouped_transfers[external_host][scope_key]:
                    # for all policy groups in job_key, the job_params is the same.
                    for policy_key in grouped_transfers[external_host][scope_key][job_key]:
                        job_params = grouped_transfers[external_host][scope_key][job_key][policy_key]['job_params']
                        for xfers_files in chunks(grouped_transfers[external_host][scope_key][job_key][policy_key]['files'], group_bulk):
                            # for the last small piece, just submit it.
                            grouped_jobs[external_host][scope_key].append({'files': xfers_files, 'job_params': job_params})

    return grouped_jobs