import collections
import logging
import time

# Framework imports: these names come from the surrounding dynamo package. The
# original file does not show its import block, so the module paths below are
# assumed and may need adjusting to the actual package layout.
from dynamo.dataformat import Configuration, Site, Group, DatasetReplica, BlockReplica, OperationalError
from dynamo.operation.copy import CopyInterface
from dynamo.operation.deletion import DeletionInterface
from dynamo.history.history import HistoryDatabase
from dynamo.utils.interface.phedex import PhEDEx
from dynamo.utils.interface.webservice import POST

LOG = logging.getLogger(__name__)


class PhEDExDeletionInterface(DeletionInterface):
    """Deletion using PhEDEx."""

    def __init__(self, config):
        DeletionInterface.__init__(self, config)
        self._phedex = PhEDEx(config.phedex)

        self.auto_approval = config.auto_approval
        self.allow_tape_deletion = config.allow_tape_deletion
        self.tape_auto_approval = config.tape_auto_approval

        # chunk_size is given in TB
        self.deletion_chunk_size = config.chunk_size * 1.e+12

    def schedule_deletion(self, replica, comments = ''): #override
        request_mapping = {}

        if replica.site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
            LOG.warning('Deletion from MSS is not allowed by configuration.')
            return request_mapping

        deletion_list = []
        if type(replica) is DatasetReplica:
            replica_blocks = set(r.block for r in replica.block_replicas)

            if replica_blocks == replica.dataset.blocks:
                deletion_list.append(replica.dataset)
                level = 'dataset'
            else:
                deletion_list.extend(replica_blocks)
                level = 'block'

        else: #BlockReplica
            deletion_list.append(replica.block)
            level = 'block'

        self._run_deletion_request(request_mapping, replica.site, level, deletion_list, comments)

        return request_mapping

    def schedule_deletions(self, replica_list, comments = ''): #override
        request_mapping = {}

        replicas_by_site = collections.defaultdict(list)
        for replica in replica_list:
            replicas_by_site[replica.site].append(replica)

            if replica.site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
                LOG.warning('Deletion from MSS not allowed by configuration.')
                return {}

        for site, replica_list in replicas_by_site.iteritems():
            # execute the deletions in two steps: one for dataset-level and one for block-level
            deletion_lists = {'dataset': [], 'block': []}

            for replica in replica_list:
                if type(replica) is DatasetReplica:
                    blocks = set(r.block for r in replica.block_replicas)

                    if blocks == replica.dataset.blocks:
                        deletion_lists['dataset'].append(replica.dataset)
                    else:
                        deletion_lists['block'].extend(blocks)

                else: #BlockReplica
                    deletion_lists['block'].append(replica.block)

            self._run_deletion_request(request_mapping, site, 'dataset', deletion_lists['dataset'], comments)
            self._run_deletion_request(request_mapping, site, 'block', deletion_lists['block'], comments)

        return request_mapping

    def _run_deletion_request(self, request_mapping, site, level, deletion_list, comments):
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in deletion_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in deletion_list:
                full_catalog[block.dataset].append(block)

        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.deletion_chunk_size and len(full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'rm_subscriptions': 'y',
                'comments': comments
            }

            # result = [{'id': <id>}] (item 'request_created' of PhEDEx response) if successful
            if self.dry_run:
                result = [{'id': '0'}]
            else:
                try:
                    result = self._phedex.make_request('delete', options, method = POST)
                except:
                    if self._phedex.last_errorcode == 400:
                        # Sometimes we have invalid data in the list of objects to delete.
                        # PhEDEx throws a 400 error in such a case. We have to then try to identify the
                        # problematic item through trial and error.
                        if len(items) == 1:
                            LOG.error('Could not delete %s from %s', str(items[0]), site.name)
                            result = []
                        else:
                            # bisect the item list and retry each half separately
                            self._run_deletion_request(request_mapping, site, level, items[:len(items) / 2], comments)
                            self._run_deletion_request(request_mapping, site, level, items[len(items) / 2:], comments)
                            result = []
                    else:
                        result = []

            if len(result) != 0:
                request_id = int(result[0]['id']) # return value is a string
                LOG.warning('PhEDEx deletion request id: %d', request_id)

                approved = False

                if self.dry_run:
                    approved = True
                elif self.auto_approval:
                    try:
                        result = self._phedex.make_request('updaterequest', {'decision': 'approve', 'request': request_id, 'node': site.name}, method = POST)
                    except:
                        LOG.error('Deletion approval of request %d failed.', request_id)
                    else:
                        approved = True

                request_mapping[request_id] = (approved, site, items)

            else:
                LOG.error('Deletion %s failed.', str(options))
                # we should probably do something here

            request_catalog = {}
            chunk_size = 0
            items = []

    def deletion_status(self, request_id): #override
        request = self._phedex.make_request('deleterequests', 'request=%d' % request_id)
        if len(request) == 0:
            return {}

        node_info = request[0]['nodes']['node'][0]
        site_name = node_info['name']
        last_update = node_info['decided_by']['time_decided']

        status = {}
        for ds_entry in request[0]['data']['dbs']['dataset']:
            status[ds_entry['name']] = (ds_entry['bytes'], ds_entry['bytes'], last_update)

        return status
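
# Usage sketch (not part of the original module): shows the shape of the
# request_mapping returned by schedule_deletion above. `interface` and `replica`
# are assumed to be a configured PhEDExDeletionInterface and a DatasetReplica
# from the dynamo inventory; neither is constructed here.
def _example_report_deletions(interface, replica):
    # request_mapping maps PhEDEx request id -> (approved, site, items)
    request_mapping = interface.schedule_deletion(replica, comments = 'example cleanup')
    for request_id, (approved, site, items) in request_mapping.iteritems():
        LOG.info('Deletion request %d at %s (approved: %s) covers %d items',
                 request_id, site.name, approved, len(items))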

class PhEDExCopyInterface(CopyInterface):
    """Copy using PhEDEx."""

    def __init__(self, config = None):
        config = Configuration(config)

        CopyInterface.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))
        self._history = HistoryDatabase(config.get('history', None))

        # chunk_size is given in TB
        self.subscription_chunk_size = config.get('chunk_size', 50.) * 1.e+12

    def schedule_copies(self, replica_list, operation_id, comments = ''): #override
        sites = set(r.site for r in replica_list)
        if len(sites) != 1:
            raise OperationalError('schedule_copies should be called with a list of replicas at a single site.')

        site = list(sites)[0]

        LOG.info('Scheduling copy of %d replicas to %s using PhEDEx (operation %d)', len(replica_list), site, operation_id)

        # sort the subscriptions into dataset level / block level and by groups
        subscription_lists = {}
        subscription_lists['dataset'] = collections.defaultdict(list) # {group: [datasets]}
        subscription_lists['block'] = collections.defaultdict(list) # {group: [blocks]}

        for replica in replica_list:
            if replica.growing:
                subscription_lists['dataset'][replica.group].append(replica.dataset)
            else:
                blocks_by_group = collections.defaultdict(set)
                for block_replica in replica.block_replicas:
                    blocks_by_group[block_replica.group].add(block_replica.block)

                for group, blocks in blocks_by_group.iteritems():
                    subscription_lists['block'][group].extend(blocks)

        # for convenience, mapping dataset -> replica
        result = {}

        for level in ['dataset', 'block']:
            for group, items in subscription_lists[level].iteritems():
                success = self._run_subscription_request(operation_id, site, group, level, items, comments)

                for replica in success:
                    if replica.dataset in result:
                        booked = result[replica.dataset]
                        # need to merge
                        for block_replica in replica.block_replicas:
                            # there shouldn't be any block replica overlap but we will be careful
                            if booked.find_block_replica(block_replica.block) is None:
                                booked.block_replicas.add(block_replica)
                    else:
                        result[replica.dataset] = replica

        return result.values()

    def _run_subscription_request(self, operation_id, site, group, level, subscription_list, comments):
        # Make a subscription request for potentially multiple datasets or blocks but to one site and one group
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in subscription_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in subscription_list:
                full_catalog[block.dataset].append(block)

        history_sql = 'INSERT INTO `phedex_requests` (`id`, `operation_type`, `operation_id`, `approved`) VALUES (%s, \'copy\', %s, %s)'

        success = []

        # make requests in chunks
        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.subscription_chunk_size and len(full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'priority': 'low',
                'move': 'n',
                'static': 'n',
                'custodial': 'n',
                'group': group.name,
                'request_only': 'n',
                'no_mail': 'n',
                'comments': comments
            }

            try:
                if self._read_only:
                    result = [{'id': 0}]
                else:
                    result = self._phedex.make_request('subscribe', options, method = POST)
            except:
                LOG.error('Copy %s failed.', str(options))
                # we should probably do something here
            else:
                request_id = int(result[0]['id']) # return value is a string
                LOG.warning('PhEDEx subscription request id: %d', request_id)

                if not self._read_only:
                    self._history.db.query(history_sql, request_id, operation_id, True)

                for dataset, blocks in request_catalog.iteritems():
                    if level == 'dataset':
                        replica = DatasetReplica(dataset, site, growing = True, group = group)
                        for block in dataset.blocks:
                            replica.block_replicas.add(BlockReplica(block, site, group, size = 0, last_update = int(time.time())))
                    else:
                        replica = DatasetReplica(dataset, site, growing = False)
                        for block in blocks:
                            replica.block_replicas.add(BlockReplica(block, site, group, size = 0, last_update = int(time.time())))

                    success.append(replica)

            request_catalog = {}
            chunk_size = 0
            items = []

        return success

    def copy_status(self, history_record, inventory): #override
        request_ids = self._history.db.query('SELECT `id` FROM `phedex_requests` WHERE `operation_type` = \'copy\' AND `operation_id` = %s', history_record.operation_id)
        if len(request_ids) == 0:
            return {}

        return self.transfer_request_status(request_ids)

    def transfer_request_status(self, request_ids):
        status = {}

        LOG.debug('Querying PhEDEx transferrequests for requests %s', request_ids)
        requests = self._phedex.make_request('transferrequests', [('request', i) for i in request_ids], method = POST)
        if len(requests) == 0:
            return status

        for request in requests:
            # A single request can have multiple destinations
            site_names = [d['name'] for d in request['destinations']['node']]

            dataset_names = []
            for ds_entry in request['data']['dbs']['dataset']:
                dataset_names.append(ds_entry['name'])

            block_names = []
            for ds_entry in request['data']['dbs']['block']:
                block_names.append(ds_entry['name'])

            if len(dataset_names) != 0:
                # Process dataset-level subscriptions
                subscriptions = []

                chunks = [dataset_names[i:i + 35] for i in xrange(0, len(dataset_names), 35)]
                for site_name in site_names:
                    for chunk in chunks:
                        subscriptions.extend(self._phedex.make_request('subscriptions', ['node=%s' % site_name] + ['dataset=%s' % n for n in chunk]))

                for dataset in subscriptions:
                    dataset_name = dataset['name']
                    try:
                        cont = dataset['subscription'][0]
                    except KeyError:
                        LOG.error('Subscription of %s should exist but doesn\'t', dataset_name)
                        continue

                    site_name = cont['node']

                    bytes = dataset['bytes']

                    node_bytes = cont['node_bytes']
                    if node_bytes is None:
                        node_bytes = 0
                    elif node_bytes != bytes:
                        # it's possible that there were block-level deletions
                        blocks = self._phedex.make_request('blockreplicas', ['node=%s' % site_name, 'dataset=%s' % dataset_name])
                        bytes = sum(b['bytes'] for b in blocks)

                    status[(site_name, dataset_name)] = (bytes, node_bytes, cont['time_update'])

            if len(block_names) != 0:
                # Process block-level subscriptions
                subscriptions = []

                chunks = [block_names[i:i + 35] for i in xrange(0, len(block_names), 35)]
                for site_name in site_names:
                    for chunk in chunks:
                        subscriptions.extend(self._phedex.make_request('subscriptions', ['node=%s' % site_name] + ['block=%s' % n for n in chunk]))

                overridden = set()

                for dataset in subscriptions:
                    dataset_name = dataset['name']
                    try:
                        blocks = dataset['block']
                    except KeyError:
                        try:
                            cont = dataset['subscription'][0]
                        except KeyError:
                            LOG.error('Subscription of %s neither block-level nor dataset-level', dataset_name)
                            continue

                        site_name = cont['node']

                        if (site_name, dataset_name) in overridden:
                            # this is a dataset-level subscription and we've processed this dataset already
                            continue

                        overridden.add((site_name, dataset_name))

                        LOG.debug('Block-level subscription of %s at %s is overridden', dataset_name, site_name)

                        requested_blocks = [name for name in block_names if name.startswith(dataset_name + '#')]
                        blocks = self._phedex.make_request('blockreplicas', ['node=%s' % site_name, 'dataset=%s' % dataset_name])

                        for block in blocks:
                            block_name = block['name']
                            if block_name not in requested_blocks:
                                continue

                            replica = block['replica'][0]

                            status[(site_name, block_name)] = (block['bytes'], replica['bytes'], replica['time_update'])

                        continue

                    for block in blocks:
                        block_name = block['name']
                        try:
                            cont = block['subscription'][0]
                        except KeyError:
                            LOG.error('Subscription of %s should exist but doesn\'t', block_name)
                            continue

                        node_bytes = cont['node_bytes']
                        if node_bytes is None:
                            node_bytes = 0

                        status[(cont['node'], block_name)] = (block['bytes'], node_bytes, cont['time_update'])

            # now we pick up whatever did not appear in the subscriptions call
            for site_name in site_names:
                for dataset_name in dataset_names:
                    key = (site_name, dataset_name)
                    if key not in status:
                        status[key] = None

                for block_name in block_names:
                    key = (site_name, block_name)
                    if key not in status:
                        status[key] = None

        return status
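
# Usage sketch (not part of the original module): schedule_copies above returns
# the list of DatasetReplica objects that were successfully booked. The
# `interface`, `replica_list` and `operation_id` arguments are assumed to come
# from the surrounding dynamo framework.
def _example_report_copies(interface, replica_list, operation_id):
    booked = interface.schedule_copies(replica_list, operation_id, comments = 'example transfer')
    for replica in booked:
        LOG.info('Booked %s at %s with %d block replicas',
                 replica.dataset.name, replica.site.name, len(replica.block_replicas))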

class PhEDExCopyInterface(CopyInterface):
    """Copy using PhEDEx."""

    def __init__(self, config):
        CopyInterface.__init__(self, config)
        self._phedex = PhEDEx(config.phedex)

        # chunk_size is given in TB
        self.subscription_chunk_size = config.get('chunk_size', 50.) * 1.e+12

    def schedule_copy(self, replica, comments = ''): #override
        request_mapping = {}

        subscription_list = []

        if type(replica) is DatasetReplica:
            blocks_by_group = collections.defaultdict(set)
            for block_replica in replica.block_replicas:
                blocks_by_group[block_replica.group].add(block_replica.block)

            if len(blocks_by_group) > 1:
                # this was called as a dataset-level copy, but in fact we have multiple
                # sets of blocks with different groups -> recall block-level schedule_copies
                return self.schedule_copies(replica.block_replicas, comments)

            group, blocks = blocks_by_group.items()[0]

            if blocks == replica.dataset.blocks:
                subscription_list.append(replica.dataset)
                level = 'dataset'
            else:
                subscription_list.extend(blocks)
                level = 'block'

        else: #BlockReplica
            group = replica.group
            subscription_list.append(replica.block)
            level = 'block'

        self._run_subscription_request(request_mapping, replica.site, group, level, subscription_list, comments)

        return request_mapping

    def schedule_copies(self, replicas, comments = ''): #override
        request_mapping = {}

        replicas_by_site = collections.defaultdict(list)
        for replica in replicas:
            replicas_by_site[replica.site].append(replica)

        for site, replica_list in replicas_by_site.iteritems():
            # sort the subscriptions into dataset level / block level and by groups
            subscription_lists = {}
            subscription_lists['dataset'] = collections.defaultdict(list) # {group: [datasets]}
            subscription_lists['block'] = collections.defaultdict(list) # {group: [blocks]}

            for replica in replica_list:
                if type(replica) is DatasetReplica:
                    blocks_by_group = collections.defaultdict(set)
                    for block_replica in replica.block_replicas:
                        blocks_by_group[block_replica.group].add(block_replica.block)

                    for group, blocks in blocks_by_group.iteritems():
                        if blocks == replica.dataset.blocks:
                            subscription_lists['dataset'][group].append(replica.dataset)
                        else:
                            subscription_lists['block'][group].extend(blocks)
                else:
                    subscription_lists['block'][replica.group].append(replica.block)

            for level in ['dataset', 'block']:
                for group, items in subscription_lists[level].iteritems():
                    self._run_subscription_request(request_mapping, site, group, level, items, comments)

        return request_mapping

    def _run_subscription_request(self, request_mapping, site, group, level, subscription_list, comments):
        # Make a subscription request for potentially multiple datasets or blocks but to one site and one group
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in subscription_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in subscription_list:
                full_catalog[block.dataset].append(block)

        LOG.info('Subscribing %d datasets for %s at %s', len(full_catalog), group.name, site.name)

        # make requests in chunks
        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.subscription_chunk_size and len(full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'priority': 'normal',
                'move': 'n',
                'static': 'n',
                'custodial': 'n',
                'group': group.name,
                'request_only': 'n',
                'no_mail': 'n',
                'comments': comments
            }

            # result = [{'id': <id>}] (item 'request_created' of PhEDEx response)
            if self.dry_run:
                result = [{'id': '0'}]
            else:
                try:
                    result = self._phedex.make_request('subscribe', options, method = POST)
                except:
                    result = []

            if len(result) != 0:
                request_id = int(result[0]['id']) # return value is a string
                LOG.warning('PhEDEx subscription request id: %d', request_id)
                request_mapping[request_id] = (True, site, items)
            else:
                LOG.error('Copy %s failed.', str(options))
                # we should probably do something here

            request_catalog = {}
            chunk_size = 0
            items = []

    def copy_status(self, request_id): #override
        request = self._phedex.make_request('transferrequests', 'request=%d' % request_id)
        if len(request) == 0:
            return {}

        site_name = request[0]['destinations']['node'][0]['name']

        dataset_names = []
        for ds_entry in request[0]['data']['dbs']['dataset']:
            dataset_names.append(ds_entry['name'])

        block_names = []
        for ds_entry in request[0]['data']['dbs']['block']:
            block_names.append(ds_entry['name'])

        subscriptions = []

        if len(dataset_names) != 0:
            chunks = [dataset_names[i:i + 35] for i in xrange(0, len(dataset_names), 35)]
            for chunk in chunks:
                subscriptions.extend(self._phedex.make_request('subscriptions', ['node=%s' % site_name] + ['dataset=%s' % n for n in chunk]))

        if len(block_names) != 0:
            chunks = [block_names[i:i + 35] for i in xrange(0, len(block_names), 35)]
            for chunk in chunks:
                subscriptions.extend(self._phedex.make_request('subscriptions', ['node=%s' % site_name] + ['block=%s' % n for n in chunk]))

        status = {}
        for dataset in subscriptions:
            try:
                cont = dataset['subscription'][0]
                bytes = dataset['bytes']
                node_bytes = cont['node_bytes']
                time_update = cont['time_update']
            except KeyError:
                # this was a block-level subscription (no 'subscription' field for the dataset)
                bytes = 0
                node_bytes = 0
                time_update = 0
                for block in dataset['block']:
                    cont = block['subscription'][0]
                    bytes += block['bytes']
                    node_bytes += cont['node_bytes']
                    time_update = max(time_update, cont['time_update'])

            status[(site_name, dataset['name'])] = (bytes, node_bytes, time_update)

        return status
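
# Usage sketch (not part of the original module): polling copy_status above for
# a known PhEDEx request id. The completion test here simply compares requested
# and transferred bytes per item; `interface` is assumed to be a configured
# PhEDExCopyInterface.
def _example_poll_copy(interface, request_id):
    # status maps (site_name, item_name) -> (requested bytes, bytes at node, last update)
    status = interface.copy_status(request_id)
    for (site_name, item_name), (total, copied, last_update) in status.iteritems():
        LOG.info('%s at %s: %d / %d bytes (last update %d)',
                 item_name, site_name, copied, total, last_update)
    return all(copied == total for total, copied, _ in status.itervalues())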

class PhEDExDeletionInterface(DeletionInterface):
    """Deletion using PhEDEx."""

    def __init__(self, config = None):
        config = Configuration(config)

        DeletionInterface.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))
        self._history = HistoryDatabase(config.get('history', None))

        self.auto_approval = config.get('auto_approval', True)
        self.allow_tape_deletion = config.get('allow_tape_deletion', True)
        self.tape_auto_approval = config.get('tape_auto_approval', False)

        # chunk_size is given in TB
        self.deletion_chunk_size = config.get('chunk_size', 50.) * 1.e+12

    def schedule_deletions(self, replica_list, operation_id, comments = ''): #override
        sites = set(r.site for r, b in replica_list)
        if len(sites) != 1:
            raise OperationalError('schedule_deletions should be called with a list of replicas at a single site.')

        site = list(sites)[0]

        if site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
            LOG.warning('Deletion from MSS not allowed by configuration.')
            return []

        if self.allow_tape_deletion and self.auto_approval:
            LOG.warning('You cannot have auto-approved tape deletions. Set auto-approval to False.')
            return []

        # execute the deletions in two steps: one for dataset-level and one for block-level
        datasets = []
        blocks = []

        # maps used later for cloning
        # getting ugly here.. should come up with a better way of making clones
        replica_map = {}
        block_replica_map = {}

        for dataset_replica, block_replicas in replica_list:
            if block_replicas is None:
                datasets.append(dataset_replica.dataset)
            else:
                blocks.extend(br.block for br in block_replicas)

                replica_map[dataset_replica.dataset] = dataset_replica
                block_replica_map.update((br.block, br) for br in block_replicas)

        success = []

        deleted_datasets = self._run_deletion_request(operation_id, site, 'dataset', datasets, comments)

        for dataset in deleted_datasets:
            replica = DatasetReplica(dataset, site, growing = False, group = Group.null_group)
            success.append((replica, None))

        tmp_map = dict((dataset, []) for dataset in replica_map.iterkeys())

        deleted_blocks = self._run_deletion_request(operation_id, site, 'block', blocks, comments)

        for block in deleted_blocks:
            tmp_map[block.dataset].append(block)

        for dataset, blocks in tmp_map.iteritems():
            replica = DatasetReplica(dataset, site)
            replica.copy(replica_map[dataset])

            success.append((replica, []))

            for block in blocks:
                block_replica = BlockReplica(block, site, Group.null_group)
                block_replica.copy(block_replica_map[block])
                block_replica.last_update = int(time.time())
                success[-1][1].append(block_replica)

        return success

    def _run_deletion_request(self, operation_id, site, level, deletion_list, comments):
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in deletion_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in deletion_list:
                full_catalog[block.dataset].append(block)

        history_sql = 'INSERT INTO `phedex_requests` (`id`, `operation_type`, `operation_id`, `approved`) VALUES (%s, \'deletion\', %s, %s)'

        deleted_items = []

        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.deletion_chunk_size and len(full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'rm_subscriptions': 'y',
                'comments': comments
            }

            # result = [{'id': <id>}] (item 'request_created' of PhEDEx response) if successful
            try:
                if self._read_only:
                    result = [{'id': 0}]
                else:
                    result = self._phedex.make_request('delete', options, method = POST)
            except:
                LOG.error('Deletion %s failed.', str(options))

                if self._phedex.last_errorcode == 400:
                    # Sometimes we have invalid data in the list of objects to delete.
                    # PhEDEx throws a 400 error in such a case. We have to then try to identify the
                    # problematic item through trial and error.
                    if len(items) == 1:
                        LOG.error('Could not delete %s from %s', str(items[0]), site.name)
                    else:
                        LOG.info('Retrying with a reduced item list.')
                        deleted_items.extend(self._run_deletion_request(operation_id, site, level, items[:len(items) / 2], comments))
                        deleted_items.extend(self._run_deletion_request(operation_id, site, level, items[len(items) / 2:], comments))
                else:
                    raise
            else:
                request_id = int(result[0]['id']) # return value is a string
                LOG.warning('PhEDEx deletion request id: %d', request_id)

                approved = False

                if self._read_only:
                    approved = True
                elif self.auto_approval:
                    try:
                        result = self._phedex.make_request('updaterequest', {'decision': 'approve', 'request': request_id, 'node': site.name}, method = POST)
                    except:
                        LOG.error('Deletion approval of request %d failed.', request_id)
                    else:
                        approved = True

                if not self._read_only:
                    self._history.db.query(history_sql, request_id, operation_id, approved)

                if approved:
                    deleted_items.extend(items)

            request_catalog = {}
            chunk_size = 0
            items = []

        return deleted_items

    def deletion_status(self, request_id): #override
        request = self._phedex.make_request('deleterequests', 'request=%d' % request_id)
        if len(request) == 0:
            return {}

        node_info = request[0]['nodes']['node'][0]
        site_name = node_info['name']
        last_update = node_info['decided_by']['time_decided']

        status = {}
        for ds_entry in request[0]['data']['dbs']['dataset']:
            status[ds_entry['name']] = (ds_entry['bytes'], ds_entry['bytes'], last_update)

        return status
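
# Usage sketch (not part of the original module): schedule_deletions above takes
# (DatasetReplica, [BlockReplica] or None) pairs, all at a single site, and
# returns the successfully scheduled clones in the same pair format. The
# `interface` and `dataset_replica` objects are assumed to come from the
# surrounding dynamo framework.
def _example_delete_full_replica(interface, dataset_replica, operation_id):
    # None in place of the block replica list requests a dataset-level deletion
    success = interface.schedule_deletions([(dataset_replica, None)], operation_id, comments = 'example deletion')
    for replica, block_replicas in success:
        scope = 'entire dataset' if block_replicas is None else '%d blocks' % len(block_replicas)
        LOG.info('Scheduled deletion of %s (%s) at %s', replica.dataset.name, scope, replica.site.name)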