class TapeCopyRequested(object):
    """
    Check for pending tape transfer requests.
    Sets one attr: tape_copy_requested
    """

    produces = ['tape_copy_requested']

    def __init__(self, config):
        self._phedex = PhEDEx(config.get('phedex', None))

    def load(self, inventory):
        for site in inventory.sites.itervalues():
            if site.storage_type != Site.TYPE_MSS:
                continue

            requests = self._phedex.make_request('transferrequests', ['node=' + site.name, 'approval=pending'])

            for request in requests:
                for dest in request['destinations']['node']:
                    if dest['name'] != site.name:
                        continue
                    if 'decided_by' in dest:
                        break

                    for dataset_entry in request['data']['dbs']['dataset']:
                        try:
                            dataset = inventory.datasets[dataset_entry['name']]
                        except KeyError:
                            continue

                        dataset.attr['tape_copy_requested'] = True

                    for block_entry in request['data']['dbs']['block']:
                        dataset_name, block_name = Block.from_full_name(block_entry['name'])
                        try:
                            dataset = inventory.datasets[dataset_name]
                        except KeyError:
                            continue

                        # just label the entire dataset
                        dataset.attr['tape_copy_requested'] = True
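# Hedged usage sketch (not part of the original module): any consumer, e.g. a policy line,
# could read the attribute set by TapeCopyRequested.load(). The helper name below is
# hypothetical, and dataset.attr is assumed to be dict-like, as the assignment above suggests.
def has_pending_tape_copy(dataset):
    return dataset.attr.get('tape_copy_requested', False)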
class PhEDExReplicaInfoSource(ReplicaInfoSource): """ReplicaInfoSource using PhEDEx.""" def __init__(self, config=None): if config is None: config = Configuration() ReplicaInfoSource.__init__(self, config) self._phedex = PhEDEx(config.get('phedex', None)) self._parallelizer_config = config def replica_exists_at_site(self, site, item): #override options = ['node=' + site.name] if type(item) == Dataset: options += ['dataset=' + item.name, 'show_dataset=y'] elif type(item) == DatasetReplica: options += ['dataset=' + item.dataset.name, 'show_dataset=y'] elif type(item) == Block: options += ['block=' + item.full_name()] elif type(item) == BlockReplica: options += ['block=' + item.block.full_name()] else: raise RuntimeError('Invalid input passed: ' + repr(item)) source = self._phedex.make_request('blockreplicas', options, timeout=600) if len(source) != 0: return True options = ['node=' + site.name] if type(item) == Dataset: # check both dataset-level and block-level subscriptions options += ['dataset=' + item.name, 'block=%s#*' % item.name] elif type(item) == DatasetReplica: options += [ 'dataset=' + item.dataset.name, 'block=%s#*' % item.dataset.name ] elif type(item) == Block: options += ['block=' + item.full_name()] elif type(item) == BlockReplica: options += ['block=' + item.block.full_name()] # blockreplicas has max ~20 minutes latency source = self._phedex.make_request('subscriptions', options, timeout=600) return len(source) != 0 def get_replicas(self, site=None, dataset=None, block=None): #override if site is None: site_check = self.check_allowed_site else: site_check = None if not self.check_allowed_site(site): return [] if dataset is None and block is None: dataset_check = self.check_allowed_dataset else: dataset_check = None if dataset is not None: if not self.check_allowed_dataset(dataset): return [] if block is not None: if not self.check_allowed_dataset(block[:block.find('#')]): return [] options = [] if site is not None: options.append('node=' + site) if dataset is not None: options.append('dataset=' + dataset) if block is not None: options.append('block=' + block) LOG.info('get_replicas(' + ','.join(options) + ') Fetching the list of replicas from PhEDEx') if len(options) == 0: return [] block_entries = self._phedex.make_request('blockreplicas', options, timeout=7200) parallelizer = Map() parallelizer.timeout = 7200 # Automatically starts a thread as we add the output of block_entries combine_file = parallelizer.get_starter(self._combine_file_info) for block_entry in block_entries: for replica_entry in block_entry['replica']: if replica_entry['complete'] == 'n': break else: continue # there is at least one incomplete replica try: dataset_name, block_name = Block.from_full_name( block_entry['name']) except ObjectError: # invalid name continue if dataset_check and not dataset_check(dataset_name): continue combine_file.add_input(block_entry) combine_file.close() # _combine_file_info alters block_entries directly - no need to deal with output combine_file.get_outputs() block_replicas = PhEDExReplicaInfoSource.make_block_replicas( block_entries, PhEDExReplicaInfoSource.maker_blockreplicas, site_check=site_check, dataset_check=dataset_check) # Also use subscriptions call which has a lower latency than blockreplicas # For example, group change on a block replica at time T may not show up in blockreplicas until up to T + 15 minutes # while in subscriptions it is visible within a few seconds # But subscriptions call without a dataset or block takes too long if dataset is None and block is 
None:
            return block_replicas

        indexed = collections.defaultdict(dict)
        for replica in block_replicas:
            indexed[(replica.site.name, replica.block.dataset.name)][replica.block.name] = replica

        dataset_entries = self._phedex.make_request('subscriptions', options, timeout=3600)

        for dataset_entry in dataset_entries:
            dataset_name = dataset_entry['name']
            if not self.check_allowed_dataset(dataset_name):
                continue

            try:
                subscriptions = dataset_entry['subscription']
            except KeyError:
                pass
            else:
                for sub_entry in subscriptions:
                    site_name = sub_entry['node']
                    if not self.check_allowed_site(site_name):
                        continue

                    replicas = indexed[(site_name, dataset_name)]
                    for replica in replicas.itervalues():
                        replica.group = Group(sub_entry['group'])
                        replica.is_custodial = (sub_entry['custodial'] == 'y')

            try:
                block_entries = dataset_entry['block']
            except KeyError:
                pass
            else:
                for block_entry in block_entries:
                    try:
                        _, block_name = Block.from_full_name(block_entry['name'])
                    except ObjectError:
                        continue

                    try:
                        subscriptions = block_entry['subscription']
                    except KeyError:
                        continue

                    for sub_entry in subscriptions:
                        site_name = sub_entry['node']
                        if not self.check_allowed_site(site_name):
                            continue

                        try:
                            replica = indexed[(site_name, dataset_name)][block_name]
                        except KeyError:
                            continue

                        replica.group = Group(sub_entry['group'])

                        if sub_entry['node_bytes'] == block_entry['bytes']:
                            # complete
                            replica.size = sub_entry['node_bytes']
                            if replica.size is None:
                                replica.size = 0
                            replica.files = None
                        else:
                            # incomplete - since we cannot know what files are there, we'll just have to pretend there is none
                            replica.size = 0
                            replica.files = tuple()

                        replica.is_custodial = (sub_entry['custodial'] == 'y')

                        if sub_entry['time_update'] is None:
                            replica.last_update = 0
                        else:
                            replica.last_update = int(sub_entry['time_update'])

        return block_replicas

    def get_updated_replicas(self, updated_since, inventory): #override
        LOG.info('get_updated_replicas(%d) Fetching the list of replicas from PhEDEx', updated_since)

        nodes = []
        for entry in self._phedex.make_request('nodes', timeout=600):
            if not self.check_allowed_site(entry['name']):
                continue
            if entry['name'] not in inventory.sites:
                continue
            nodes.append(entry['name'])

        try:
            tmpconfig = Configuration(self._parallelizer_config.get('parallel', None))
        except Exception as e:
            LOG.error(str(e))
            tmpconfig = Configuration()

        parallelizer = Map(tmpconfig)
        parallelizer.timeout = 5400

        def get_node_replicas(node):
            options = ['update_since=%d' % updated_since, 'node=%s' % node]
            results = self._phedex.make_request('blockreplicas', options)
            return node, results

        # Use async to fire threads on demand
        node_results = parallelizer.execute(get_node_replicas, nodes, async=True)

        # Automatically starts a thread as we add the output of block_replicas
        combine_file = parallelizer.get_starter(self._combine_file_info)

        all_block_entries = []

        for node, block_entries in node_results:
            site = inventory.sites[node]

            for block_entry in block_entries:
                all_block_entries.append(block_entry)

                replica_entry = block_entry['replica'][0]

                if replica_entry['complete'] == 'y':
                    continue

                # incomplete block replica - should we fetch file info?
try: dataset_name, block_name = Block.from_full_name( block_entry['name']) except ObjectError: pass else: try: dataset = inventory.datasets[dataset_name] block = dataset.find_block(block_name) replica = block.find_replica(site) if replica.file_ids is None: num_files = block.num_files else: num_files = len(replica.file_ids) if replica.size == replica_entry[ 'bytes'] and num_files == replica_entry[ 'files']: # no we don't have to continue except: # At any point of the above lookups we may hit a None object or KeyError or what not pass LOG.debug( 'Replica %s:%s is incomplete. Fetching file information.', replica_entry['node'], block_entry['name']) combine_file.add_input(block_entry) combine_file.close() # _combine_file_info alters block_entries directly - no need to deal with output combine_file.get_outputs() LOG.info('get_updated_replicas(%d) Got outputs' % updated_since) return PhEDExReplicaInfoSource.make_block_replicas( all_block_entries, PhEDExReplicaInfoSource.maker_blockreplicas, dataset_check=self.check_allowed_dataset) def get_deleted_replicas(self, deleted_since): #override LOG.info( 'get_deleted_replicas(%d) Fetching the list of replicas from PhEDEx', deleted_since) result = self._phedex.make_request( 'deletions', ['complete_since=%d' % deleted_since], timeout=7200) # result is by dataset block_entries = [] for dataset_entry in result: block_entries.extend(dataset_entry['block']) return PhEDExReplicaInfoSource.make_block_replicas( block_entries, PhEDExReplicaInfoSource.maker_deletions) def _combine_file_info(self, block_entry): try: LOG.debug( '_combine_file_info(%s) Fetching file replicas from PhEDEx', block_entry['name']) file_info = self._phedex.make_request( 'filereplicas', ['block=%s' % block_entry['name']])[0]['file'] except (IndexError, KeyError): # Somehow PhEDEx didn't have a filereplicas entry for this block at this node block_entry['file'] = [] else: block_entry['file'] = file_info @staticmethod def make_block_replicas(block_entries, replica_maker, site_check=None, dataset_check=None): """Return a list of block replicas linked to Dataset, Block, Site, and Group""" dataset = None block_replicas = [] for block_entry in block_entries: try: dataset_name, block_name = Block.from_full_name( block_entry['name']) except ObjectError: # invalid name continue if dataset is None or dataset.name != dataset_name: if dataset_check and not dataset_check(dataset_name): continue try: dataset = Dataset(dataset_name) except ObjectError: # invalid name dataset = None if dataset is None: continue block = Block(block_name, dataset, block_entry['bytes']) if block.size is None: block.size = 0 block_replicas.extend( replica_maker(block, block_entry, site_check=site_check)) return block_replicas @staticmethod def maker_blockreplicas(block, block_entry, site_check=None): """Return a list of block replicas using blockreplicas data or a combination of blockreplicas and filereplicas calls.""" sites = {} invalid_sites = set() groups = {} block_replicas = {} for replica_entry in block_entry['replica']: site_name = replica_entry['node'] try: site = sites[site_name] except KeyError: if site_check: if site_name in invalid_sites: continue if not site_check(site_name): invalid_sites.add(site_name) continue site = sites[site_name] = Site(site_name) group_name = replica_entry['group'] try: group = groups[group_name] except KeyError: group = groups[group_name] = Group(group_name) try: time_update = int(replica_entry['time_update']) except TypeError: # time_update was None time_update = 0 block_replica = 
BlockReplica( block, site, group, is_custodial=(replica_entry['custodial'] == 'y'), last_update=time_update) block_replicas[site_name] = block_replica if replica_entry['complete'] == 'n': # temporarily make this a list block_replica.file_ids = [] block_replica.size = 0 LOG.info("Incomplete %s" % str(block_replica)) if 'file' in block_entry: for file_entry in block_entry['file']: for replica_entry in file_entry['replica']: site_name = replica_entry['node'] try: block_replica = block_replicas[site_name] except KeyError: continue if block_replica.file_ids is None: continue # add LFN instead of file id block_replica.file_ids.append(file_entry['name']) file_size = file_entry['bytes'] if file_size is not None: block_replica.size += file_size try: time_create = int(replica_entry['time_create']) except TypeError: pass else: if time_create > block_replica.last_update: block_replica.last_update = time_create for block_replica in block_replicas.itervalues(): if block_replica.file_ids is not None: block_replica.file_ids = tuple(block_replica.file_ids) return block_replicas.values() @staticmethod def maker_deletions(block, block_entry, site_check=None): replicas = [] for deletion_entry in block_entry['deletion']: if site_check and not site_check(deletion_entry['node']): continue block_replica = BlockReplica(block, Site(deletion_entry['node']), Group.null_group) replicas.append(block_replica) return replicas
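# Hedged usage sketch (not part of the original source); the site and dataset names are
# placeholders. PhEDExReplicaInfoSource falls back to a default Configuration when none is
# given, so this only demonstrates the get_replicas call shape defined above.
def _print_dataset_replicas():
    source = PhEDExReplicaInfoSource()
    for block_replica in source.get_replicas(site='T2_CH_CERN', dataset='/ExampleDataset/Run2016B-v1/MINIAOD'):
        print block_replica.site.name, block_replica.block.full_name(), block_replica.size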
class PhEDExCopyInterface(CopyInterface): """Copy using PhEDEx.""" def __init__(self, config): CopyInterface.__init__(self, config) self._phedex = PhEDEx(config.phedex) self.subscription_chunk_size = config.get('chunk_size', 50.) * 1.e+12 def schedule_copy(self, replica, comments=''): #override request_mapping = {} subscription_list = [] if type(replica) is DatasetReplica: blocks_by_group = collections.defaultdict(set) for block_replica in replica.block_replicas: blocks_by_group[block_replica.group].add(block_replica.block) if len(blocks_by_group) > 1: # this was called as a dataset-level copy, but in fact we have multiple # sets of blocks with different groups -> recall block-level schedule_copies return self.schedule_copies(replica.block_replicas, comments) group, block_replicas = blocks_by_group.items()[0] if block_replicas == replica.dataset.blocks: subscription_list.append(replica.dataset) level = 'dataset' else: subscription_list.extend(block_replicas) level = 'block' else: #BlockReplica group = replica.group subscription_list.append(replica.block) level = 'block' self._run_subscription_request(request_mapping, replica.site, group, level, subscription_list, comments) return request_mapping def schedule_copies(self, replicas, comments=''): #override request_mapping = {} replicas_by_site = collections.defaultdict(list) for replica in replicas: replicas_by_site[replica.site].append(replica) for site, replica_list in replicas_by_site.iteritems(): # sort the subscriptions into dataset level / block level and by groups subscription_lists = {} subscription_lists['dataset'] = collections.defaultdict( list) # {(level, group_name): [replicas]} subscription_lists['block'] = collections.defaultdict( list) # {(level, group_name): [replicas]} for replica in replica_list: if type(replica) is DatasetReplica: blocks_by_group = collections.defaultdict(set) for block_replica in replica.block_replicas: blocks_by_group[block_replica.group].add( block_replica.block) for group, blocks in blocks_by_group.iteritems(): if blocks == replica.dataset.blocks: subscription_lists['dataset'][group].append( replica.dataset) else: subscription_lists['block'][group].extend(blocks) else: subscription_lists['block'][replica.group].append( replica.block) for level in ['dataset', 'block']: for group, items in subscription_lists[level].iteritems(): self._run_subscription_request(request_mapping, site, group, level, items, comments) return request_mapping def _run_subscription_request(self, request_mapping, site, group, level, subscription_list, comments): # Make a subscription request for potentitally multiple datasets or blocks but to one site and one group full_catalog = collections.defaultdict(list) if level == 'dataset': for dataset in subscription_list: full_catalog[dataset] = [] elif level == 'block': for block in subscription_list: full_catalog[block.dataset].append(block) LOG.info('Subscribing %d datasets for %s at %s', len(full_catalog), group.name, site.name) # make requests in chunks request_catalog = {} chunk_size = 0 items = [] while len(full_catalog) != 0: dataset, blocks = full_catalog.popitem() request_catalog[dataset] = blocks if level == 'dataset': chunk_size += dataset.size items.append(dataset) elif level == 'block': chunk_size += sum(b.size for b in blocks) items.extend(blocks) if chunk_size < self.subscription_chunk_size and len( full_catalog) != 0: continue options = { 'node': site.name, 'data': self._phedex.form_catalog_xml(request_catalog), 'level': level, 'priority': 'normal', 'move': 'n', 
'static': 'n', 'custodial': 'n', 'group': group.name, 'request_only': 'n', 'no_mail': 'n', 'comments': comments } # result = [{'id': <id>}] (item 'request_created' of PhEDEx response) if self.dry_run: result = [{'id': '0'}] else: try: result = self._phedex.make_request('subscribe', options, method=POST) except: result = [] if len(result) != 0: request_id = int(result[0]['id']) # return value is a string LOG.warning('PhEDEx subscription request id: %d', request_id) request_mapping[request_id] = (True, site, items) else: LOG.error('Copy %s failed.', str(options)) # we should probably do something here request_catalog = {} chunk_size = 0 items = [] def copy_status(self, request_id): #override request = self._phedex.make_request('transferrequests', 'request=%d' % request_id) if len(request) == 0: return {} site_name = request[0]['destinations']['node'][0]['name'] dataset_names = [] for ds_entry in request[0]['data']['dbs']['dataset']: dataset_names.append(ds_entry['name']) block_names = [] for ds_entry in request[0]['data']['dbs']['block']: block_names.append(ds_entry['name']) subscriptions = [] if len(dataset_names) != 0: chunks = [ dataset_names[i:i + 35] for i in xrange(0, len(dataset_names), 35) ] for chunk in chunks: subscriptions.extend( self._phedex.make_request( 'subscriptions', ['node=%s' % site_name] + ['dataset=%s' % n for n in chunk])) if len(block_names) != 0: chunks = [ block_names[i:i + 35] for i in xrange(0, len(block_names), 35) ] for chunk in chunks: subscriptions.extend( self._phedex.make_request('subscriptions', ['node=%s' % site_name] + ['block=%s' % n for n in chunk])) status = {} for dataset in subscriptions: try: cont = dataset['subscription'][0] bytes = dataset['bytes'] node_bytes = cont['node_bytes'] time_update = cont['time_update'] except KeyError: # this was a block-level subscription (no 'subscription' field for the dataset) bytes = 0 node_bytes = 0 time_update = 0 for block in dataset['block']: cont = block['subscription'][0] bytes += block['bytes'] node_bytes += cont['node_bytes'] time_update = max(time_update, cont['time_update']) status[(site_name, dataset['name'])] = (bytes, node_bytes, time_update) return status
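# Self-contained sketch of the chunking strategy used by _run_subscription_request above:
# items are accumulated until their total size crosses the chunk threshold (here 50 TB),
# then a request is flushed. The generator name and sizes below are illustrative only.
def chunk_by_size(sized_items, threshold):
    chunk, total = [], 0.
    for name, size in sized_items:
        chunk.append(name)
        total += size
        if total >= threshold:
            yield chunk
            chunk, total = [], 0.
    if chunk:
        yield chunk

# example: two batches, [('/A/x/y', '/B/x/y'), ('/C/x/y')]
for _batch in chunk_by_size([('/A/x/y', 30.e+12), ('/B/x/y', 30.e+12), ('/C/x/y', 10.e+12)], 50.e+12):
    print _batch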
class PhEDExDatasetInfoSource(DatasetInfoSource): """DatasetInfoSource using PhEDEx and DBS.""" def __init__(self, config): DatasetInfoSource.__init__(self, config) self._phedex = PhEDEx(config.phedex) self._dbs = RESTService(config.dbs) def get_dataset_names(self, include=['*'], exclude=[]): dataset_names = [] exclude_exps = [] for pattern in exclude: exclude_exps.append(re.compile(fnmatch.translate(pattern))) def add_datasets(result): for entry in result: name = entry['dataset'] for ex_exp in exclude_exps: if ex_exp.match(name): break else: # not excluded dataset_names.append(name) if len(include) == 1 and include[0] == '/*/*/*': # all datasets requested - will do this efficiently result = self._dbs.make_request('acquisitioneras') sds = [entry['acquisition_era_name'] for entry in result] # query DBS in parallel args = [('datasets', ['acquisition_era_name=' + sd]) for sd in sds] results = Map().execute(self._dbs.make_request, args) for result in results: add_datasets(result) for in_pattern in include: result = self._dbs.make_request('datasets', ['dataset=' + in_pattern]) add_datasets(result) return dataset_names def get_updated_datasets(self, updated_since): #override LOG.warning( 'PhEDExDatasetInfoSource can only return a list of datasets and blocks that are created since the given timestamp.' ) result = self._phedex.make_request('data', [ 'dataset=' + name, 'level=block', 'create_since=%d' % updated_since ]) if len(result) == 0 or 'dataset' not in result[0]: return [] updated_datasets = [] for dataset_entry in result[0]['dataset']: dataset = self._create_dataset(dataset_entry) updated_datasets.append(dataset) return updated_datasets def get_dataset(self, name, with_files=False): #override ## Get the full dataset-block-file data from PhEDEx if with_files: level = 'file' else: level = 'block' result = self._phedex.make_request( 'data', ['dataset=' + name, 'level=' + level]) try: dataset_entry = result[0]['dataset'][0] except: return None ## Create the dataset object dataset = self._create_dataset(dataset_entry) ## Fill block and file data if 'block' in dataset_entry: for block_entry in dataset_entry['block']: block = self._create_block(block_entry, dataset) dataset.blocks.add(block) # size and num_files are left 0 in _create_dataset (PhEDEx does not tell) dataset.size += block.size dataset.num_files += block.num_files if with_files and 'file' in block_entry: # See comments in get_block block._files = set() for file_entry in block_entry['file']: block._files.add(self._create_file(file_entry, block)) return dataset def get_block(self, name, dataset=None, with_files=False): #override ## Get the full block-file data from PhEDEx if with_files: level = 'file' else: level = 'block' result = self._phedex.make_request('data', ['block=' + name, 'level=' + level]) try: dataset_entry = result[0]['dataset'][0] block_entry = dataset_entry['block'][0] except: return None if dataset is None: link_dataset = False # Just need a named object dataset = Dataset(dataset_entry['name']) else: link_dataset = True if dataset.name != dataset_entry['name']: raise IntegrityError( 'Inconsistent dataset %s passed to get_block(%s)', dataset.name, name) block = self._create_block(block_entry, dataset) if with_files and 'file' in block_entry: # _create_block sets size and num_files; just need to update the files list # Directly creating the _files set # This list will persist (unlike the weak proxy version loaded from inventory), but the returned block # from this function is only used temporarily anyway block._files = 
set() for file_entry in block_entry['file']: block._files.add(self._create_file(file_entry, block)) if link_dataset: existing = dataset.find_block(block.name) if existing is None: dataset.blocks.add(block) dataset.size += block.size dataset.num_files += block.num_files else: dataset.blocks.remove(existing) dataset.size += block.size - existing.size dataset.num_files += block.num_files - existing.num_files return block def get_file(self, name, block=None): ## Get the file data from PhEDEx result = self._phedex.make_request('data', ['file=' + name, 'level=file']) try: block_entry = result[0]['dataset'][0]['block'][0] file_entry = block_entry['file'][0] except: return None bname = block_entry['name'] block_name = Block.to_internal_name(bname[bname.find('#') + 1:]) if block is None: link_block = False # Just need a named object dataset = Dataset(dataset_entry['name']) block = Block(block_name, dataset) else: link_block = True if block.name != block_name: raise IntegrityError( 'Inconsistent block %s passed to get_file(%s)', block.full_name(), name) lfile = self._create_file(file_entry, block) if link_block: # Caution - by adding this file we edit the block properties too existing = block.find_file(lfile.fid()) if existing is None: block.add_file(lfile) else: block.remove_file(existing) block.add_file(lfile) return lfile def get_files(self, dataset_or_block): #override files = set() if type(dataset_or_block) is Dataset: result = self._phedex.make_request( 'data', ['dataset=' + dataset_or_block.name, 'level=file']) blocks = dict((b.name, b) for b in dataset_or_block.blocks) else: result = self._phedex.make_request( 'data', ['block=' + dataset_or_block.full_name(), 'level=file']) blocks = {dataset_or_block.name: dataset_or_block} try: block_entries = result[0]['dataset'][0]['block'] except: return files for block_entry in block_entries: try: file_entries = block_entry['file'] except: continue bname = block_entry['name'] block_name = Block.to_internal_name(bname[bname.find('#') + 1:]) try: block = blocks[block_name] except: # unknown block! maybe should raise? continue for file_entry in file_entries: files.add(self._create_file(file_entry, block)) return files def _create_dataset(self, dataset_entry): """ Create a dataset object with blocks and files from a PhEDEx dataset entry """ dataset = Dataset(dataset_entry['name'], is_open=(dataset_entry['is_open'] == 'y')) if 'time_update' in dataset_entry and dataset_entry[ 'time_update'] is not None: dataset.last_update = int(dataset_entry['time_update']) else: dataset.last_update = int(dataset_entry['time_create']) ## Get other details of the dataset from DBS self._fill_dataset_details(dataset) return dataset def _create_block(self, block_entry, dataset): """ Create a block object with files from a PhEDEx block entry """ bname = block_entry['name'] block_name = Block.to_internal_name(bname[bname.find('#') + 1:]) block = Block(block_name, dataset, size=block_entry['bytes'], num_files=block_entry['files'], is_open=(block_entry['is_open'] == 'y')) if 'time_update' in block_entry and block_entry[ 'time_update'] is not None: block.last_update = int(block_entry['time_update']) else: block.last_update = int(block_entry['time_create']) return block def _create_file(self, file_entry, block): lfile = File(file_entry['lfn'], block=block, size=file_entry['size']) return lfile def _fill_dataset_details(self, dataset): # 1. 
status and PD type result = self._dbs.make_request('datasets', [ 'dataset=' + dataset.name, 'dataset_access_type=*', 'detail=True' ]) if len(result) != 0: dbs_entry = result[0] dataset.status = Dataset.status_val( dbs_entry['dataset_access_type']) dataset.data_type = Dataset.data_type_val( dbs_entry['primary_ds_type']) else: dataset.status = Dataset.STAT_UNKNOWN dataset.data_type = Dataset.TYPE_UNKNOWN # 2. software version result = self._dbs.make_request('releaseversions', ['dataset=' + dataset.name]) if len(result) != 0: try: version = result[0]['release_version'][0] except KeyError: pass else: matches = re.match('CMSSW_([0-9]+)_([0-9]+)_([0-9]+)(|_.*)', version) if matches: cycle, major, minor = map( int, [matches.group(i) for i in range(1, 4)]) if matches.group(4): suffix = matches.group(4)[1:] else: suffix = '' dataset.software_version = (cycle, major, minor, suffix)
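# Standalone sketch of the release-version parsing performed in _fill_dataset_details above;
# the version string is an illustrative example.
import re

_matches = re.match('CMSSW_([0-9]+)_([0-9]+)_([0-9]+)(|_.*)', 'CMSSW_10_2_5_patch1')
if _matches:
    _cycle, _major, _minor = map(int, [_matches.group(i) for i in range(1, 4)])
    _suffix = _matches.group(4)[1:] if _matches.group(4) else ''
    print (_cycle, _major, _minor, _suffix)   # (10, 2, 5, 'patch1')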
class PhEDExReplicaInfoSource(ReplicaInfoSource): """ReplicaInfoSource using PhEDEx.""" def __init__(self, config): ReplicaInfoSource.__init__(self, config) self._phedex = PhEDEx(config.phedex) def replica_exists_at_site(self, site, item): #override options = ['node=' + site.name] if type(item) == Dataset: options += ['dataset=' + item.name, 'show_dataset=y'] elif type(item) == DatasetReplica: options += ['dataset=' + item.dataset.name, 'show_dataset=y'] elif type(item) == Block: options += ['block=' + item.full_name()] elif type(item) == BlockReplica: options += ['block=' + item.block.full_name()] else: raise RuntimeError('Invalid input passed: ' + repr(item)) source = self._phedex.make_request('blockreplicas', options) return len(source) != 0 def get_replicas(self, site=None, dataset=None, block=None): #override options = [] if site is not None: options.append('node=' + site) if dataset is not None: options.append('dataset=' + dataset) if block is not None: options.append('block=' + block) LOG.info('get_replicas(' + ','.join(options) + ') Fetching the list of replicas from PhEDEx') if len(options) == 0: return [] result = self._phedex.make_request('blockreplicas', ['show_dataset=y'] + options) return PhEDExReplicaInfoSource.make_block_replicas( result, PhEDExReplicaInfoSource.maker_blockreplicas) def get_updated_replicas(self, updated_since): #override LOG.info( 'get_updated_replicas(%d) Fetching the list of replicas from PhEDEx', updated_since) result = self._phedex.make_request( 'blockreplicas', ['show_dataset=y', 'update_since=%d' % updated_since]) return PhEDExReplicaInfoSource.make_block_replicas( result, PhEDExReplicaInfoSource.maker_blockreplicas) def get_deleted_replicas(self, deleted_since): #override LOG.info( 'get_deleted_replicas(%d) Fetching the list of replicas from PhEDEx', deleted_since) result = self._phedex.make_request( 'deletions', ['complete_since=%d' % deleted_since]) return PhEDExReplicaInfoSource.make_block_replicas( result, PhEDExReplicaInfoSource.maker_deletions) @staticmethod def make_block_replicas(dataset_entries, replica_maker): """Return a list of block replicas linked to Dataset, Block, Site, and Group""" block_replicas = [] for dataset_entry in dataset_entries: dataset = Dataset(dataset_entry['name']) for block_entry in dataset_entry['block']: name = block_entry['name'] try: block_name = Block.to_internal_name(name[name.find('#') + 1:]) except ValueError: # invalid name continue block = Block(block_name, dataset, block_entry['bytes']) block_replicas.extend(replica_maker(block, block_entry)) return block_replicas @staticmethod def maker_blockreplicas(block, block_entry): replicas = [] for replica_entry in block_entry['replica']: block_replica = BlockReplica( block, Site(replica_entry['node']), Group(replica_entry['group']), is_complete=(replica_entry['bytes'] == block.size), is_custodial=(replica_entry['custodial'] == 'y'), size=replica_entry['bytes'], last_update=int(replica_entry['time_update'])) replicas.append(block_replica) return replicas @staticmethod def maker_deletions(block, block_entry): replicas = [] for deletion_entry in block_entry['deletion']: block_replica = BlockReplica(block, Site(deletion_entry['node']), Group.null_group) replicas.append(block_replica) return replicas
class PhEDExGroupInfoSource(GroupInfoSource): """GroupInfoSource using PhEDEx.""" def __init__(self, config): GroupInfoSource.__init__(self, config) self._phedex = PhEDEx(config.phedex) def get_group(self, name): #override if self.include is not None: matched = False for pattern in self.include: if fnmatch.fnmatch(name, pattern): matched = True break if not matched: LOG.info('get_group(%s) %s is not included by configuration', name, name) return None if self.exclude is not None: for pattern in self.exclude: if fnmatch.fnmatch(name, pattern): LOG.info('get_group(%s) %s is excluded by configuration', name, name) return None LOG.info('get_group(%s) Fetching info on group %s', name, name) result = self._phedex.make_request('groups', ['group=' + name]) if len(result) == 0: return None group = Group(name) if name in self.dataset_level_groups: group.olevel = Dataset else: group.olevel = Block return group def get_group_list(self): #override LOG.info('get_group_list Fetching the list of groups from PhEDEx') LOG.debug('Groups with dataset-level ownership: %s', str(self.dataset_level_groups)) group_list = [] for entry in self._phedex.make_request('groups'): if self.include is not None: matched = False for pattern in self.include: if fnmatch.fnmatch(entry['name'], pattern): matched = True break if not matched: continue if self.exclude is not None: matched = False for pattern in self.exclude: if fnmatch.fnmatch(entry['name'], pattern): matched = True break if matched: continue if entry['name'] in self.dataset_level_groups: olevel = Dataset else: olevel = Block group_list.append(Group(entry['name'], olevel=olevel)) return group_list
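# Standalone sketch of the include/exclude pattern matching used by get_group and
# get_group_list above; the patterns and group name are illustrative.
import fnmatch

_include, _exclude = ['AnalysisOps', 'DataOps'], ['*test*']
_name = 'AnalysisOps'
_allowed = any(fnmatch.fnmatch(_name, p) for p in _include) and \
    not any(fnmatch.fnmatch(_name, p) for p in _exclude)
print _allowed   # True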
    def __init__(self, config):
        self._phedex = PhEDEx(config.get('phedex', None))
class PhEDExCopyInterface(CopyInterface): """Copy using PhEDEx.""" def __init__(self, config=None): config = Configuration(config) CopyInterface.__init__(self, config) self._phedex = PhEDEx(config.get('phedex', None)) self._history = HistoryDatabase(config.get('history', None)) self.subscription_chunk_size = config.get('chunk_size', 50.) * 1.e+12 def schedule_copies(self, replica_list, operation_id, comments=''): #override sites = set(r.site for r in replica_list) if len(sites) != 1: raise OperationalError( 'schedule_copies should be called with a list of replicas at a single site.' ) site = list(sites)[0] LOG.info( 'Scheduling copy of %d replicas to %s using PhEDEx (operation %d)', len(replica_list), site, operation_id) # sort the subscriptions into dataset level / block level and by groups subscription_lists = {} subscription_lists['dataset'] = collections.defaultdict( list) # {(level, group_name): [replicas]} subscription_lists['block'] = collections.defaultdict( list) # {(level, group_name): [replicas]} for replica in replica_list: if replica.growing: subscription_lists['dataset'][replica.group].append( replica.dataset) else: blocks_by_group = collections.defaultdict(set) for block_replica in replica.block_replicas: blocks_by_group[block_replica.group].add( block_replica.block) for group, blocks in blocks_by_group.iteritems(): subscription_lists['block'][group].extend(blocks) # for convenience, mapping dataset -> replica result = {} for level in ['dataset', 'block']: for group, items in subscription_lists[level].iteritems(): success = self._run_subscription_request( operation_id, site, group, level, items, comments) for replica in success: if replica.dataset in result: booked = result[replica.dataset] # need to merge for block_replica in replica.block_replicas: # there shouldn't be any block replica overlap but we will be careful if booked.find_block_replica( block_replica.block) is None: booked.block_replicas.add(block_replica) else: result[replica.dataset] = replica return result.values() def _run_subscription_request(self, operation_id, site, group, level, subscription_list, comments): # Make a subscription request for potentitally multiple datasets or blocks but to one site and one group full_catalog = collections.defaultdict(list) if level == 'dataset': for dataset in subscription_list: full_catalog[dataset] = [] elif level == 'block': for block in subscription_list: full_catalog[block.dataset].append(block) history_sql = 'INSERT INTO `phedex_requests` (`id`, `operation_type`, `operation_id`, `approved`) VALUES (%s, \'copy\', %s, %s)' success = [] # make requests in chunks request_catalog = {} chunk_size = 0 items = [] while len(full_catalog) != 0: dataset, blocks = full_catalog.popitem() request_catalog[dataset] = blocks if level == 'dataset': chunk_size += dataset.size items.append(dataset) elif level == 'block': chunk_size += sum(b.size for b in blocks) items.extend(blocks) if chunk_size < self.subscription_chunk_size and len( full_catalog) != 0: continue options = { 'node': site.name, 'data': self._phedex.form_catalog_xml(request_catalog), 'level': level, 'priority': 'low', 'move': 'n', 'static': 'n', 'custodial': 'n', 'group': group.name, 'request_only': 'n', 'no_mail': 'n', 'comments': comments } try: if self._read_only: result = [{'id': 0}] else: result = self._phedex.make_request('subscribe', options, method=POST) except: LOG.error('Copy %s failed.', str(options)) # we should probably do something here else: request_id = int(result[0]['id']) # return value is a string 
LOG.warning('PhEDEx subscription request id: %d', request_id) if not self._read_only: self._history.db.query(history_sql, request_id, operation_id, True) for dataset, blocks in request_catalog.iteritems(): if level == 'dataset': replica = DatasetReplica(dataset, site, growing=True, group=group) for block in dataset.blocks: replica.block_replicas.add( BlockReplica(block, site, group, size=0, last_update=int(time.time()))) else: replica = DatasetReplica(dataset, site, growing=False) for block in blocks: replica.block_replicas.add( BlockReplica(block, site, group, size=0, last_update=int(time.time()))) success.append(replica) request_catalog = {} chunk_size = 0 items = [] return success def copy_status(self, history_record, inventory): #override request_ids = self._history.db.query( 'SELECT `id` FROM `phedex_requests` WHERE `operation_type` = \'copy\' AND `operation_id` = %s', history_record.operation_id) if len(request_ids) == 0: return {} return self.transfer_request_status(request_ids) def transfer_request_status(self, request_ids): status = {} LOG.debug('Querying PhEDEx transferrequests for requests %s', request_ids) requests = self._phedex.make_request('transferrequests', [('request', i) for i in request_ids], method=POST) if len(requests) == 0: return status for request in requests: # A single request can have multiple destinations site_names = [d['name'] for d in request['destinations']['node']] dataset_names = [] for ds_entry in request['data']['dbs']['dataset']: dataset_names.append(ds_entry['name']) block_names = [] for ds_entry in request['data']['dbs']['block']: block_names.append(ds_entry['name']) if len(dataset_names) != 0: # Process dataset-level subscriptions subscriptions = [] chunks = [ dataset_names[i:i + 35] for i in xrange(0, len(dataset_names), 35) ] for site_name in site_names: for chunk in chunks: subscriptions.extend( self._phedex.make_request( 'subscriptions', ['node=%s' % site_name] + ['dataset=%s' % n for n in chunk])) for dataset in subscriptions: dataset_name = dataset['name'] try: cont = dataset['subscription'][0] except KeyError: LOG.error('Subscription of %s should exist but doesn\'t', dataset_name) continue site_name = cont['node'] bytes = dataset['bytes'] node_bytes = cont['node_bytes'] if node_bytes is None: node_bytes = 0 elif node_bytes != bytes: # it's possible that there were block-level deletions blocks = self._phedex.make_request( 'blockreplicas', ['node=%s' % site_name, 'dataset=%s' % dataset_name]) bytes = sum(b['bytes'] for b in blocks) status[(site_name, dataset_name)] = (bytes, node_bytes, cont['time_update']) if len(block_names) != 0: # Process block-level subscriptions subscriptions = [] chunks = [ block_names[i:i + 35] for i in xrange(0, len(block_names), 35) ] for site_name in site_names: for chunk in chunks: subscriptions.extend( self._phedex.make_request( 'subscriptions', ['node=%s' % site_name] + ['block=%s' % n for n in chunk])) overridden = set() for dataset in subscriptions: dataset_name = dataset['name'] try: blocks = dataset['block'] except KeyError: try: cont = dataset['subscription'][0] except KeyError: LOG.error( 'Subscription of %s neither block-level nor dataset-level', dataset_name) continue site_name = cont['node'] if (site_name, dataset_name) in overridden: # this is a dataset-level subscription and we've processed this dataset already continue overridden.add((site_name, dataset_name)) LOG.debug( 'Block-level subscription of %s at %s is overridden', dataset_name, site_name) requested_blocks = [ name for name in block_names if 
name.startswith(dataset_name + '#') ] blocks = self._phedex.make_request( 'blockreplicas', ['node=%s' % site_name, 'dataset=%s' % dataset_name]) for block in blocks: block_name = block['name'] if block_name not in requested_blocks: continue replica = block['replica'][0] status[(site_name, block_name)] = (block['bytes'], replica['bytes'], replica['time_update']) continue for block in blocks: block_name = block['name'] try: cont = block['subscription'][0] except KeyError: LOG.error( 'Subscription of %s should exist but doesn\'t', block_name) continue node_bytes = cont['node_bytes'] if node_bytes is None: node_bytes = 0 status[(cont['node'], block_name)] = (block['bytes'], node_bytes, cont['time_update']) # now we pick up whatever did not appear in the subscriptions call for site_name in site_names: for dataset_name in dataset_names: key = (site_name, dataset_name) if key not in status: status[key] = None for block_name in block_names: key = (site_name, block_name) if key not in status: status[key] = None return status
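# Standalone sketch of the 35-name chunking used above when querying the PhEDEx
# 'subscriptions' API; the dataset names are placeholders.
_dataset_names = ['/Example%d/Run/AOD' % i for i in xrange(80)]
_chunks = [_dataset_names[i:i + 35] for i in xrange(0, len(_dataset_names), 35)]
print [len(c) for c in _chunks]   # [35, 35, 10]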
class PhEDExSiteInfoSource(SiteInfoSource): """SiteInfoSource for PhEDEx. Also use CMS Site Status Board for additional information.""" def __init__(self, config=None): config = Configuration(config) SiteInfoSource.__init__(self, config) self._phedex = PhEDEx(config.get('phedex', None)) self._ssb = SiteStatusBoard(config.get('ssb', None)) self.ssb_cache_lifetime = config.get('ssb_cache_lifetime', 3600) self._ssb_cache_timestamp = 0 self._caching_lock = threading.Lock() self._waitroom_sites = set() self._morgue_sites = set() def get_site(self, name): #override if not self.check_allowed_site(name): LOG.info('get_site(%s) %s is excluded by configuration.', name, name) return None LOG.info('get_site(%s) Fetching information of %s from PhEDEx', name, name) # General site info result = self._phedex.make_request('nodes', ['node=' + name]) if len(result) == 0: return None entry = result[0] host = entry['se'] storage_type = Site.storage_type_val(entry['kind']) return Site(name, host=host, storage_type=storage_type) def get_site_list(self, inventory): #override LOG.info('get_site_list Fetching the list of nodes from PhEDEx') site_list = [] for entry in self._phedex.make_request('nodes'): site_name = entry['name'] if not self.check_allowed_site(site_name): continue siteObj_new = Site(site_name, host=entry['se'], storage_type=Site.storage_type_val( entry['kind'])) if site_name in inventory.sites: siteObj_old = inventory.sites[site_name] siteObj_new.backend = siteObj_old.backend siteObj_new.x509proxy = siteObj_old.x509proxy site_list.append(siteObj_new) return site_list def get_site_status(self, site_name): #override with self._caching_lock: if time.time( ) > self._ssb_cache_timestamp + self.ssb_cache_lifetime: self._waitroom_sites = set() self._morgue_sites = set() latest_status = {} # get list of sites in waiting room (153) and morgue (199) for colid, stat, sitelist in [ (153, Site.STAT_WAITROOM, self._waitroom_sites), (199, Site.STAT_MORGUE, self._morgue_sites) ]: result = self._ssb.make_request( 'getplotdata', 'columnid=%d&time=2184&dateFrom=&dateTo=&sites=all&clouds=undefined&batch=1' % colid) for entry in result: site = entry['VOName'] # entry['Time'] is UTC but we are only interested in relative times here timestamp = time.mktime( time.strptime(entry['Time'], '%Y-%m-%dT%H:%M:%S')) if site in latest_status and latest_status[site][ 0] > timestamp: continue if entry['Status'] == 'in': latest_status[site] = (timestamp, stat) else: latest_status[site] = (timestamp, Site.STAT_READY) for site, (_, stat) in latest_status.items(): if stat == Site.STAT_WAITROOM: self._waitroom_sites.add(site) elif stat == Site.STAT_MORGUE: self._morgue_sites.add(site) self._ssb_cache_timestamp = time.time() if site_name in self._waitroom_sites: return Site.STAT_WAITROOM elif site_name in self._morgue_sites: return Site.STAT_MORGUE else: return Site.STAT_READY def get_filename_mapping(self, site_name): #override tfc = self._phedex.make_request('tfc', ['node=' + site_name])['array'] conversions = {} for elem in tfc: if elem['element_name'] != 'lfn-to-pfn': continue if 'destination-match' in elem and re.match( elem['destination-match'], site_name) is None: continue if 'chain' in elem: chain = elem['chain'] else: chain = None result = elem['result'] i = 1 while '$' in result: result = result.replace('$%d' % i, '{%d}' % (i - 1)) i += 1 if i == 100: # can't be possibly right break result = result.replace('\\', '') if elem['protocol'] in conversions: conversions[elem['protocol']].append( (elem['path-match'], result, chain)) 
else: conversions[elem['protocol']] = [(elem['path-match'], result, chain)] def make_mapping_chains(rule): if rule[2] is None: return [[(rule[0], rule[1])]] else: if rule[2] not in conversions: return None chains = [] for chained_rule in conversions[rule[2]]: mapped_chains = make_mapping_chains(chained_rule) if mapped_chains is None: continue chains.extend(mapped_chains) for chain in chains: chain.append((rule[0], rule[1])) return chains mappings = {} for protocol, rules in conversions.items(): if protocol == 'direct': continue if protocol == 'srmv2': # for historic reasons PhEDEx calls gfal2 srmv2 protocol = 'gfal2' mapping = [] for rule in rules: chains = make_mapping_chains(rule) if chains is None: continue mapping.extend(chains) mappings[protocol] = mapping return mappings
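# Hedged sketch of how a single lfn-to-pfn rule produced by get_filename_mapping could be
# applied. The rule below is hypothetical; the grounded part is that get_filename_mapping
# rewrites the PhEDEx TFC '$1' placeholders into '{0}' so that str.format can be used.
import re

_path_match, _result = '/+store/(.*)', 'root://xrootd.example.org//store/{0}'
_lfn = '/store/data/Run2016B/Example/MINIAOD/file.root'
_match = re.match(_path_match, _lfn)
if _match:
    print _result.format(*_match.groups())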
class PhEDExDeletionInterface(DeletionInterface): """Deletion using PhEDEx.""" def __init__(self, config=None): config = Configuration(config) DeletionInterface.__init__(self, config) self._phedex = PhEDEx(config.get('phedex', None)) self._history = HistoryDatabase(config.get('history', None)) self.auto_approval = config.get('auto_approval', True) self.allow_tape_deletion = config.get('allow_tape_deletion', True) self.tape_auto_approval = config.get('tape_auto_approval', False) self.deletion_chunk_size = config.get('chunk_size', 50.) * 1.e+12 def schedule_deletions(self, replica_list, operation_id, comments=''): #override sites = set(r.site for r, b in replica_list) if len(sites) != 1: raise OperationalError( 'schedule_copies should be called with a list of replicas at a single site.' ) site = list(sites)[0] if site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion: LOG.warning('Deletion from MSS not allowed by configuration.') return [] if self.allow_tape_deletion and self.auto_approval: LOG.warning( 'You cannot have auto-approved tape deletions. Set auto-approval to False.' ) return [] # execute the deletions in two steps: one for dataset-level and one for block-level datasets = [] blocks = [] # maps used later for cloning # getting ugly here.. should come up with a better way of making clones replica_map = {} block_replica_map = {} for dataset_replica, block_replicas in replica_list: if block_replicas is None: datasets.append(dataset_replica.dataset) else: blocks.extend(br.block for br in block_replicas) replica_map[dataset_replica.dataset] = dataset_replica block_replica_map.update( (br.block, br) for br in block_replicas) success = [] deleted_datasets = self._run_deletion_request(operation_id, site, 'dataset', datasets, comments) for dataset in deleted_datasets: replica = DatasetReplica(dataset, site, growing=False, group=Group.null_group) success.append((replica, None)) tmp_map = dict((dataset, []) for dataset in replica_map.iterkeys()) deleted_blocks = self._run_deletion_request(operation_id, site, 'block', blocks, comments) for block in deleted_blocks: tmp_map[block.dataset].append(block) for dataset, blocks in tmp_map.iteritems(): replica = DatasetReplica(dataset, site) replica.copy(replica_map[dataset]) success.append((replica, [])) for block in blocks: block_replica = BlockReplica(block, site, Group.null_group) block_replica.copy(block_replica_map[block]) block_replica.last_update = int(time.time()) success[-1][1].append(block_replica) return success def _run_deletion_request(self, operation_id, site, level, deletion_list, comments): full_catalog = collections.defaultdict(list) if level == 'dataset': for dataset in deletion_list: full_catalog[dataset] = [] elif level == 'block': for block in deletion_list: full_catalog[block.dataset].append(block) history_sql = 'INSERT INTO `phedex_requests` (`id`, `operation_type`, `operation_id`, `approved`) VALUES (%s, \'deletion\', %s, %s)' deleted_items = [] request_catalog = {} chunk_size = 0 items = [] while len(full_catalog) != 0: dataset, blocks = full_catalog.popitem() request_catalog[dataset] = blocks if level == 'dataset': chunk_size += dataset.size items.append(dataset) elif level == 'block': chunk_size += sum(b.size for b in blocks) items.extend(blocks) if chunk_size < self.deletion_chunk_size and len( full_catalog) != 0: continue options = { 'node': site.name, 'data': self._phedex.form_catalog_xml(request_catalog), 'level': level, 'rm_subscriptions': 'y', 'comments': comments } # result = [{'id': <id>}] (item 
'request_created' of PhEDEx response) if successful try: if self._read_only: result = [{'id': 0}] else: result = self._phedex.make_request('delete', options, method=POST) except: LOG.error('Deletion %s failed.', str(options)) if self._phedex.last_errorcode == 400: # Sometimes we have invalid data in the list of objects to delete. # PhEDEx throws a 400 error in such a case. We have to then try to identify the # problematic item through trial and error. if len(items) == 1: LOG.error('Could not delete %s from %s', str(items[0]), site.name) else: LOG.info('Retrying with a reduced item list.') deleted_items.extend( self._run_deletion_request(operation_id, site, level, items[:len(items) / 2], comments)) deleted_items.extend( self._run_deletion_request(operation_id, site, level, items[len(items) / 2:], comments)) else: raise else: request_id = int(result[0]['id']) # return value is a string LOG.warning('PhEDEx deletion request id: %d', request_id) approved = False if self._read_only: approved = True elif self.auto_approval: try: result = self._phedex.make_request('updaterequest', { 'decision': 'approve', 'request': request_id, 'node': site.name }, method=POST) except: LOG.error('deletion approval of request %d failed.', request_id) else: approved = True if not self._read_only: self._history.db.query(history_sql, request_id, operation_id, approved) if approved: deleted_items.extend(items) request_catalog = {} chunk_size = 0 items = [] return deleted_items def deletion_status(self, request_id): #override request = self._phedex.make_request('deleterequests', 'request=%d' % request_id) if len(request) == 0: return {} node_info = request[0]['nodes']['node'][0] site_name = node_info['name'] last_update = node_info['decided_by']['time_decided'] status = {} for ds_entry in request[0]['data']['dbs']['dataset']: status[ds_entry['name']] = (ds_entry['bytes'], ds_entry['bytes'], last_update) return status
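# Self-contained sketch of the bisection retry used in _run_deletion_request above when
# PhEDEx rejects a request with HTTP 400: the item list is halved until the offending item
# is isolated. The is_bad predicate stands in for the PhEDEx 400 response.
def _bisect_delete(items, is_bad):
    deleted = []
    if len(items) == 0:
        return deleted
    if any(is_bad(i) for i in items):
        if len(items) == 1:
            print 'Could not delete %s' % items[0]
        else:
            deleted.extend(_bisect_delete(items[:len(items) / 2], is_bad))
            deleted.extend(_bisect_delete(items[len(items) / 2:], is_bad))
    else:
        deleted.extend(items)
    return deleted

print _bisect_delete(['blockA', 'blockB', 'bad#block', 'blockC'], lambda x: x == 'bad#block')
# ['blockA', 'blockB', 'blockC']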
    def __init__(self, config):
        GroupInfoSource.__init__(self, config)
        self._phedex = PhEDEx(config.get('phedex', None))
class PhEDExDatasetInfoSource(DatasetInfoSource): """DatasetInfoSource using PhEDEx and DBS.""" def __init__(self, config=None): if config is None: config = Configuration() DatasetInfoSource.__init__(self, config) self._phedex = PhEDEx(config.get('phedex', None)) self._dbs = DBS(config.get('dbs', None)) def get_dataset_names(self, include=['*'], exclude=[]): dataset_names = [] exclude_exps = [] for pattern in exclude: exclude_exps.append(re.compile(fnmatch.translate(pattern))) def add_datasets(result): for entry in result: name = entry['dataset'] for ex_exp in exclude_exps: if ex_exp.match(name): break else: # not excluded by args, now check my include/exclude list if self.check_allowed_dataset(name): dataset_names.append(name) if len(include) == 1 and include[0] == '/*/*/*': # all datasets requested - will do this efficiently result = self._dbs.make_request('acquisitioneras') sds = [entry['acquisition_era_name'] for entry in result] # query DBS in parallel args = [('datasets', ['acquisition_era_name=' + sd]) for sd in sds] results = Map().execute(self._dbs.make_request, args) for result in results: add_datasets(result) for in_pattern in include: if in_pattern.startswith('/') and in_pattern.count('/') == 3: result = self._dbs.make_request('datasets', ['dataset=' + in_pattern]) add_datasets(result) return dataset_names def get_updated_datasets(self, updated_since): #override LOG.warning( 'PhEDExDatasetInfoSource can only return a list of datasets and blocks that are created since the given timestamp.' ) result = self._phedex.make_request('data', [ 'dataset=' + name, 'level=block', 'create_since=%d' % updated_since ]) try: dataset_entries = result[0]['dataset'] except: return [] if self.include is not None or self.exclude is not None: ientry = 0 while ientry != len(dataset_entries): if self.check_allowed_dataset(dataset_entries[ientry]['name']): ientry += 1 else: dataset_entries.pop(ientry) return Map().execute(self._create_dataset, dataset_entries) def get_dataset(self, name, with_files=False): #override ## Get the full dataset-block-file data from PhEDEx if not name.startswith('/') or name.count('/') != 3: return None if not self.check_allowed_dataset(name): return None def get_dbs_datasets(name, dbs_data): dbs_data['datasets'] = self._dbs.make_request( 'datasets', ['dataset=' + name, 'dataset_access_type=*', 'detail=True']) def get_dbs_releaseversions(name, dbs_data): dbs_data['releaseversions'] = self._dbs.make_request( 'releaseversions', ['dataset=' + name]) dbs_data = {} th1 = threading.Thread(target=get_dbs_datasets, args=(name, dbs_data)) th1.start() th2 = threading.Thread(target=get_dbs_releaseversions, args=(name, dbs_data)) th2.start() if with_files: level = 'file' else: level = 'block' result = self._phedex.make_request( 'data', ['dataset=' + name, 'level=' + level]) th1.join() th2.join() try: dataset_entry = result[0]['dataset'][0] except: return None ## Create the dataset object dataset = self._create_dataset(dataset_entry, dbs_data) ## Fill block and file data if 'block' in dataset_entry: for block_entry in dataset_entry['block']: block = self._create_block(block_entry, dataset) dataset.blocks.add(block) if with_files and 'file' in block_entry: # See comments in get_block block._files = set() for file_entry in block_entry['file']: block._files.add(self._create_file(file_entry, block)) return dataset def get_block(self, name, with_files=False): #override ## Get the full block-file data from PhEDEx if not name.startswith('/') or name.count('/') != 3 or '#' in name: return None 
        if not self.check_allowed_dataset(name[:name.find('#')]):
            return None

        if with_files:
            level = 'file'
        else:
            level = 'block'

        result = self._phedex.make_request('data', ['block=' + name, 'level=' + level])

        try:
            dataset_entry = result[0]['dataset'][0]
            block_entry = dataset_entry['block'][0]
        except:
            return None

        # Just need a named object
        dataset = Dataset(dataset_entry['name'])

        block = self._create_block(block_entry, dataset)

        if with_files and 'file' in block_entry:
            # _create_block sets size and num_files; just need to update the files list
            # Directly creating the _files set
            # This list will persist (unlike the weak proxy version loaded from inventory), but the returned block
            # from this function is only used temporarily anyway
            block._files = set()
            for file_entry in block_entry['file']:
                block._files.add(self._create_file(file_entry, block))

        return block

    def get_file(self, name):
        ## Get the file data from PhEDEx
        result = self._phedex.make_request('data', ['file=' + name, 'level=file'])
        try:
            dataset_entry = result[0]['dataset'][0]
            block_entry = dataset_entry['block'][0]
            file_entry = block_entry['file'][0]
        except:
            return None

        if not self.check_allowed_dataset(dataset_entry['name']):
            return None

        bname = block_entry['name']
        block_name = Block.to_internal_name(bname[bname.find('#') + 1:])

        # Just need a named object
        dataset = Dataset(dataset_entry['name'])
        block = Block(block_name, dataset)

        lfile = self._create_file(file_entry, block)

        return lfile

    def get_files(self, dataset_or_block): #override
        files = set()

        if type(dataset_or_block) is Dataset:
            result = self._phedex.make_request('data', ['dataset=' + dataset_or_block.name, 'level=file'])
            blocks = dict((b.name, b) for b in dataset_or_block.blocks)
        else:
            result = self._phedex.make_request('data', ['block=' + dataset_or_block.full_name(), 'level=file'])
            blocks = {dataset_or_block.name: dataset_or_block}

        try:
            block_entries = result[0]['dataset'][0]['block']
        except:
            return files

        for block_entry in block_entries:
            try:
                file_entries = block_entry['file']
            except:
                continue

            bname = block_entry['name']
            block_name = Block.to_internal_name(bname[bname.find('#') + 1:])
            try:
                block = blocks[block_name]
            except:
                # unknown block! maybe should raise?
continue for file_entry in file_entries: files.add(self._create_file(file_entry, block)) return files def _create_dataset(self, dataset_entry, dbs_data=None): """ Create a dataset object with blocks and files from a PhEDEx dataset entry """ dataset = Dataset(dataset_entry['name'], is_open=(dataset_entry['is_open'] == 'y')) if 'time_update' in dataset_entry and dataset_entry[ 'time_update'] is not None: dataset.last_update = int(dataset_entry['time_update']) else: dataset.last_update = int(dataset_entry['time_create']) ## Get other details of the dataset from DBS self._fill_dataset_details(dataset, dbs_data) return dataset def _create_block(self, block_entry, dataset): """ Create a block object with files from a PhEDEx block entry """ bname = block_entry['name'] block_name = Block.to_internal_name(bname[bname.find('#') + 1:]) block = Block(block_name, dataset, size=block_entry['bytes'], num_files=block_entry['files'], is_open=(block_entry['is_open'] == 'y')) if 'time_update' in block_entry and block_entry[ 'time_update'] is not None: block.last_update = int(block_entry['time_update']) else: block.last_update = int(block_entry['time_create']) return block def _create_file(self, file_entry, block): adler32 = '' crc32 = 0 for cksum in file_entry['checksum'].split(','): if cksum.startswith('adler32'): adler32 = cksum[8:] elif cksum.startswith('cksum'): crc32 = int(cksum[6:]) lfile = File(file_entry['lfn'], block=block, size=file_entry['size'], checksum=(crc32, adler32)) return lfile def _fill_dataset_details(self, dataset, dbs_data=None): if dbs_data is None: dbs_data = {} if dataset.name.startswith('/') and dataset.name.count('/') == 3: dbs_data['datasets'] = self._dbs.make_request( 'datasets', [ 'dataset=' + dataset.name, 'dataset_access_type=*', 'detail=True' ]) else: dbs_data['datasets'] = [] dbs_data['releaseversions'] = self._dbs.make_request( 'releaseversions', ['dataset=' + dataset.name]) # 1. status and PD type if len(dbs_data['datasets']) != 0: dbs_entry = dbs_data['datasets'][0] dataset.status = Dataset.status_val( dbs_entry['dataset_access_type']) dataset.data_type = Dataset.data_type_val( dbs_entry['primary_ds_type']) else: dataset.status = Dataset.STAT_UNKNOWN dataset.data_type = Dataset.TYPE_UNKNOWN # 2. software version if len(dbs_data['releaseversions']) != 0: try: version = dbs_data['releaseversions'][0]['release_version'][0] except KeyError: pass else: matches = re.match('CMSSW_([0-9]+)_([0-9]+)_([0-9]+)(|_.*)', version) if matches: cycle, major, minor = map( int, [matches.group(i) for i in range(1, 4)]) if matches.group(4): suffix = matches.group(4)[1:] else: suffix = '' dataset.software_version = (cycle, major, minor, suffix)
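# Standalone sketch of the checksum-string parsing done in _create_file above; the input
# format ('adler32:<hex>,cksum:<crc32>') is inferred from the slicing in that method.
_checksum = 'adler32:a1b2c3d4,cksum:1234567890'
_adler32, _crc32 = '', 0
for _cksum in _checksum.split(','):
    if _cksum.startswith('adler32'):
        _adler32 = _cksum[8:]
    elif _cksum.startswith('cksum'):
        _crc32 = int(_cksum[6:])
print (_crc32, _adler32)   # (1234567890, 'a1b2c3d4')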
class PhEDExDeletionInterface(DeletionInterface):
    """Deletion using PhEDEx."""

    def __init__(self, config):
        DeletionInterface.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)

        self.auto_approval = config.auto_approval
        self.allow_tape_deletion = config.allow_tape_deletion
        self.tape_auto_approval = config.tape_auto_approval

        self.deletion_chunk_size = config.chunk_size * 1.e+12

    def schedule_deletion(self, replica, comments=''): #override
        request_mapping = {}

        if replica.site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
            LOG.warning('Deletion from MSS is not allowed by configuration.')
            return request_mapping

        deletion_list = []
        if type(replica) is DatasetReplica:
            replica_blocks = set(r.block for r in replica.block_replicas)

            if replica_blocks == replica.dataset.blocks:
                deletion_list.append(replica.dataset)
                level = 'dataset'
            else:
                deletion_list.extend(replica_blocks)
                level = 'block'

        else: #BlockReplica
            deletion_list.append(replica.block)
            level = 'block'

        self._run_deletion_request(request_mapping, replica.site, level, deletion_list, comments)

        return request_mapping

    def schedule_deletions(self, replica_list, comments=''): #override
        request_mapping = {}

        replicas_by_site = collections.defaultdict(list)
        for replica in replica_list:
            replicas_by_site[replica.site].append(replica)

            if replica.site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
                LOG.warning('Deletion from MSS not allowed by configuration.')
                return {}

        for site, replica_list in replicas_by_site.iteritems():
            # execute the deletions in two steps: one for dataset-level and one for block-level
            deletion_lists = {'dataset': [], 'block': []}

            for replica in replica_list:
                if type(replica) is DatasetReplica:
                    blocks = set(r.block for r in replica.block_replicas)

                    if blocks == replica.dataset.blocks:
                        deletion_lists['dataset'].append(replica.dataset)
                    else:
                        deletion_lists['block'].extend(blocks)

                else: #BlockReplica
                    deletion_lists['block'].append(replica.block)

            self._run_deletion_request(request_mapping, site, 'dataset', deletion_lists['dataset'], comments)
            self._run_deletion_request(request_mapping, site, 'block', deletion_lists['block'], comments)

        return request_mapping

    def _run_deletion_request(self, request_mapping, site, level, deletion_list, comments):
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in deletion_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in deletion_list:
                full_catalog[block.dataset].append(block)

        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()

            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.deletion_chunk_size and len(full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'rm_subscriptions': 'y',
                'comments': comments
            }

            # result = [{'id': <id>}] (item 'request_created' of PhEDEx response) if successful
            if self.dry_run:
                result = [{'id': '0'}]
            else:
                try:
                    result = self._phedex.make_request('delete', options, method=POST)
                except:
                    if self._phedex.last_errorcode == 400:
                        # Sometimes we have invalid data in the list of objects to delete.
                        # PhEDEx throws a 400 error in such a case. We have to then try to identify the
                        # problematic item through trial and error.
                        if len(items) == 1:
                            LOG.error('Could not delete %s from %s', str(items[0]), site.name)
                            result = []
                        else:
                            # bisect: submit each half separately to isolate the problematic item
                            self._run_deletion_request(request_mapping, site, level, items[:len(items) / 2], comments)
                            self._run_deletion_request(request_mapping, site, level, items[len(items) / 2:], comments)

                            # this chunk has been handled by the recursive calls; move on to the next one
                            request_catalog = {}
                            chunk_size = 0
                            items = []
                            continue
                    else:
                        result = []

            if len(result) != 0:
                request_id = int(result[0]['id']) # return value is a string
                LOG.warning('PhEDEx deletion request id: %d', request_id)

                approved = False

                if self.dry_run:
                    approved = True
                elif self.auto_approval:
                    try:
                        result = self._phedex.make_request('updaterequest', {'decision': 'approve', 'request': request_id, 'node': site.name}, method=POST)
                    except:
                        LOG.error('deletion approval of request %d failed.', request_id)
                    else:
                        approved = True

                request_mapping[request_id] = (approved, site, items)

            else:
                LOG.error('Deletion %s failed.', str(options))
                # we should probably do something here

            request_catalog = {}
            chunk_size = 0
            items = []

    def deletion_status(self, request_id): #override
        request = self._phedex.make_request('deleterequests', 'request=%d' % request_id)

        if len(request) == 0:
            return {}

        node_info = request[0]['nodes']['node'][0]
        site_name = node_info['name']
        last_update = node_info['decided_by']['time_decided']

        status = {}
        for ds_entry in request[0]['data']['dbs']['dataset']:
            status[ds_entry['name']] = (ds_entry['bytes'], ds_entry['bytes'], last_update)

        return status
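# Illustration, not part of the original module: a standalone sketch of the chunking
# policy implemented by _run_deletion_request above. Items are accumulated until their
# combined size reaches deletion_chunk_size (config.chunk_size in TB), and each chunk is
# submitted as one PhEDEx 'delete' call. The item sizes below are made up.
def _example_chunk_by_size(item_sizes, chunk_size=50.e+12):
    chunks = []
    current, current_size = [], 0.

    for size in item_sizes:
        current.append(size)
        current_size += size
        if current_size >= chunk_size:
            chunks.append(current)
            current, current_size = [], 0.

    # the last, possibly undersized chunk is submitted as well
    if current:
        chunks.append(current)

    return chunks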
class InvalidationRequest(WebModule):
    def __init__(self, config):
        WebModule.__init__(self, config)

        self.dbs = DBS()
        self.phedex = PhEDEx()
        self.registry = RegistryDatabase()

        self.authorized_users = list(config.file_invalidation.authorized_users)

    def run(self, caller, request, inventory):
        if caller.name not in self.authorized_users:
            raise AuthorizationError()

        try:
            item = request['item']
        except KeyError:
            raise MissingParameter('item')

        if type(item) is list:
            items = item
        else:
            items = [item]

        invalidated_items = []

        sql = 'INSERT INTO `invalidations` (`item`, `db`, `user_id`, `timestamp`) VALUES (%s, %s, %s, NOW())'

        for item in items:
            invalidated = False

            if item in inventory.datasets:
                # item is a dataset
                result = self.dbs.make_request('datasets', ['dataset=' + item, 'dataset_access_type=*', 'detail=true'])
                if len(result) != 0:
                    status = result[0]['dataset_access_type']
                    if status in ('VALID', 'PRODUCTION'):
                        self.registry.db.query(sql, item, 'dbs', caller.id)
                        for entry in self.dbs.make_request('files', ['dataset=' + item, 'validFileOnly=1']):
                            self.registry.db.query(sql, entry['logical_file_name'], 'dbs', caller.id)

                        invalidated = True

                result = self.phedex.make_request('data', ['dataset=' + item, 'level=block'])
                if len(result) != 0:
                    self.registry.db.query(sql, item, 'tmdb', caller.id)
                    invalidated = True

            else:
                try:
                    dataset_name, block_name = Block.from_full_name(item)
                except:
                    lfile = inventory.find_file(item)
                    if lfile is not None:
                        # item is a file
                        result = self.dbs.make_request('files', ['logical_file_name=' + item, 'validFileOnly=1'])
                        if len(result) != 0:
                            self.registry.db.query(sql, result[0]['logical_file_name'], 'dbs', caller.id)
                            invalidated = True

                        result = self.phedex.make_request('data', ['file=' + item])
                        if len(result) != 0:
                            self.registry.db.query(sql, item, 'tmdb', caller.id)
                            invalidated = True

                else:
                    # item is a block
                    for entry in self.dbs.make_request('files', ['block_name=' + item, 'validFileOnly=1']):
                        self.registry.db.query(sql, entry['logical_file_name'], 'dbs', caller.id)
                        invalidated = True

                    result = self.phedex.make_request('data', ['block=' + item, 'level=block'])
                    if len(result) != 0:
                        self.registry.db.query(sql, item, 'tmdb', caller.id)
                        invalidated = True

            if invalidated:
                invalidated_items.append({'item': item})

        return invalidated_items
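# Illustration, not part of the original module: a hedged sketch of the web API contract
# of InvalidationRequest.run() above. 'item' may be a single name or a list of dataset,
# block or file names; the return value lists only the items that were actually
# invalidated in DBS and/or TMDB. All names below are invented placeholders.
def _example_invalidation_payload():
    request = {'item': ['/Prim/Proc-v1/TIER',
                        '/Prim/Proc-v1/TIER#0123abcd',
                        '/store/data/example.root']}
    # shape of the reply when only the first item was found and invalidated
    reply = [{'item': '/Prim/Proc-v1/TIER'}]
    return request, reply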
    def __init__(self, config):
        GroupInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)
    def __init__(self, config):
        ReplicaInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)