class RunSync(object):
    """
    Register the files of one run found on the origin RSE storage in Rucio.

    On construction the object lists the run directory on the origin
    storage through gfal, asks Rucio which files of the run already have a
    replica on the origin RSE, and collects size and adler32 checksum for
    the files that are not registered yet.  Calling :meth:`register` then
    creates the dataset, registers the replicas, optionally attaches the
    dataset to a container and creates a replication rule towards the
    destination RSE.
    """

    # Upper bound on the number of files picked up from one run directory.
    MAX_FILES_PER_RUN = 5000
    # Directory entries whose name is this short or shorter are ignored.
    # NOTE(review): presumably meant to skip entries such as '.' and '..'
    # - confirm against what the storage listing actually returns.
    MIN_FILENAME_LENGTH = 3

    def __init__(self, run, originrse=DEFAULT_ORIGIN_RSE, destrse=None,
                 scope=DEFAULT_SCOPE, check=True, lifetime=None,
                 dry_run=False, container=None):
        """
        :param run: Path of the run relative to the scope prefix, e.g.
                    IceCube/2016/filtered/level2pass2/0101/Run00127347.
                    Its last path component is used as the dataset name.
        :param originrse: Name of the RSE the files are registered at.
        :param destrse: RSE expression for the replication rule created by
                        :meth:`register`; no rule is created when empty.
        :param scope: Rucio scope used for every DID that is created.
        :param check: Stored on the instance; not used by this class.
        :param lifetime: Lifetime passed on when creating the dataset and
                         the container.
        :param dry_run: When true nothing is written to Rucio.
        :param container: Optional container the dataset is attached to.
        """
        self.run = run
        self.originrse = originrse
        self.destrse = destrse
        self.scope = scope
        self.check = check
        self.lifetime = lifetime
        self.dry_run = dry_run
        self.container = container

        self.rucio_datasets = {}
        # file name -> metadata dict ('size', 'adler32', 'name')
        self.run_files = {}
        # Names of the files that already have a replica at the origin RSE.
        self.existent_replica_files = set()
        self.url = ''
        self.gfal = Gfal2Context()

        self.run_Number = None
        self.get_run_Number()
        self.files_storage = {}
        self.get_global_url()

        self.didc = DIDClient()
        self.repc = ReplicaClient()
        self.rulesClient = RuleClient()

        # Right now obtaining the Metadata from the storage at WIPAC
        # Hopefully in the future from JADE
        # TODO
        self.get_run_Files()
        self.get_rucio_metadata()
        self.update_run_Files()
        self.get_files_metadata()

    def update_run_Files(self):
        """
        Drop from ``run_files`` every file that already has a replica
        registered at the origin RSE, so only new files remain.
        """
        for replica_name in self.existent_replica_files:
            file_name = replica_name.rsplit('/', 1)[-1]
            if file_name in self.run_files:
                print("File: %s already registered. Skipping it" % file_name)
                self.run_files.pop(file_name)

    def get_files_metadata(self):
        """
        Collect size and checksum for every file still waiting to be
        registered.  Files whose metadata cannot be read are removed from
        ``run_files`` so that they are not offered for registration with
        incomplete information.
        """
        # Iterate over a snapshot: entries may be removed inside the loop.
        for file_name in list(self.run_files):
            if self.run + '/' + file_name in self.existent_replica_files:
                continue
            if not self.obtain_metadata(file_name):
                self.run_files.pop(file_name, None)
        print("Metadat initialization done")

    def obtain_metadata(self, filename):
        """
        Get the size and adler32 checksum of one file of the run from the
        storage and store them in ``run_files``.

        :param filename: Name of the file inside the run directory.
        :returns: True when the metadata was stored, False when gfal could
                  not access the file.
        """
        url = self.get_file_url(filename)
        print("checking metadata for url %s" % url)
        try:
            size = self.gfal.stat(str(url)).st_size
            adler32 = self.gfal.checksum(str(url), 'adler32')
        except GError:
            print("no file found at %s" % url)
            return False
        print(
            "got size and adler 32checksum of file: pfn=%s size=%s checksum=%s"
            % (url, size, adler32))
        self.run_files[filename] = {
            'size': size,
            'adler32': adler32,
            'name': self.run + '/' + filename
        }
        return True

    def get_file_url(self, filename):
        """
        Return the full storage URL of a file of the run.
        """
        return self.url + '/' + self.run + '/' + filename

    def get_global_url(self):
        """
        Build the base URL of the origin RSE from its first protocol entry
        and store it in ``self.url``.
        """
        print("Getting parameters for rse %s" % self.originrse)
        rse = rsemgr.get_rse_info(self.originrse)
        proto = rse['protocols'][0]

        schema = proto['scheme']
        prefix = proto['prefix'] + self.scope.replace('.', '/')
        if schema == 'srm':
            prefix = proto['extended_attributes']['web_service_path'] + prefix
        url = schema + '://' + proto['hostname']
        # A port of 0 is left out of the URL.
        if proto['port'] != 0:
            url = url + ':' + str(proto['port'])
        self.url = url + prefix
        print("Determined base url %s" % self.url)

    def get_run_Number(self):
        """
        Obtain the run number out of whole run
        IceCube/2016/filtered/level2pass2/0101/Run00127347
        (the last path component) and store it in ``self.run_Number``.
        """
        print("Obtaining run number out of run(dataset): %s" % self.run)
        self.run_Number = self.run.split("/")[-1]
        print("Run number (dataset): %s" % self.run_Number)

    def get_run_Files(self):
        """
        List the run directory on the storage and remember every file name
        in ``run_files`` (with empty metadata for now).  At most
        ``MAX_FILES_PER_RUN`` files are picked up.
        """
        self.run_url = self.url + '/' + self.run
        print("Listin files from url : %s" % self.run_url)
        run_files = []
        try:
            run_files = self.gfal.listdir(str(self.run_url))
        except GError:
            print("No files found at %s" % str(self.run_url))
        print("Files found in storage:")
        count = 0
        for entry in run_files:
            if len(entry) <= self.MIN_FILENAME_LENGTH:
                continue
            if count >= self.MAX_FILES_PER_RUN:
                break
            self.run_files[entry] = {}
            count += 1

    def get_rucio_metadata(self):
        """
        Gets the list of datasets at the Rucio RSE and the names of the
        files of this run that already have a replica there.
        """
        print(
            "Initializing Rucio... getting the list of blocks and files at %s"
            % self.originrse)
        registered_datasets = self.repc.list_datasets_per_rse(self.originrse)
        for dataset in registered_datasets:
            # NOTE(review): the Rucio client is documented to yield one
            # dict per dataset; a dict cannot be used as a key, so its
            # name is used instead.  Confirm against the client version.
            key = dataset['name'] if isinstance(dataset, dict) else dataset
            self.rucio_datasets[key] = {}

        replica_info = self.repc.list_replicas([{
            "scope": self.scope,
            "name": '/' + self.run_Number
        }], rse_expression="rse=%s" % self.originrse)
        self.existent_replica_files = set(
            file_info['name'] for file_info in replica_info
            if self.originrse in file_info['rses'])
        print("Rucio initialization done.")

    def register(self):
        """
        Create the dataset, register the file replicas, create the
        container, attach the dataset to it and add the replication rule.
        """
        print("Registering...")
        self.register_dataset(self.run_Number)
        self.register_replicas(self.run_files)
        self.register_container(self.container)
        self.attach_dataset_to_container(self.run_Number, self.container)
        self.add_replica_rule(dataset=self.run_Number, destRSE=self.destrse)

    def register_container(self, container):
        """
        Register the container in Rucio.  Nothing is done when no container
        was given or on a dry run.
        """
        print("Registering the container %s with scope: %s" %
              (container, self.scope))
        if container is None:
            print('No container added, not registering any container')
            return
        if self.dry_run:
            print('Dry run only, not registering the container')
            return
        try:
            self.didc.add_container(scope=self.scope, name=container,
                                    lifetime=self.lifetime)
        except DataIdentifierAlreadyExists:
            print("Container %s already exists" % container)
        except InvalidObject:
            print("Problem with container name: %s" % container)

    def attach_dataset_to_container(self, dataset, container):
        """
        Attach the dataset to a container.  Nothing is done when no
        container was given or on a dry run.
        """
        print("Attaching dataset %s, to container: %s" % (dataset, container))
        if container is None:
            print('No container added, not registering dataset in container')
            return
        if self.dry_run:
            print('Dry run only, not attaching dataset container')
            return
        try:
            self.didc.attach_dids(scope=self.scope, name=container, dids=[{
                'scope': self.scope,
                'name': '/' + dataset
            }])
        except RucioException:
            print("dataset already attached to container")
            return

    def register_dataset(self, run):
        """
        Register a dataset in the rucio database (skipped on a dry run).
        """
        print("registering dataset %s" % run)
        if self.dry_run:
            print(' Dry run only. Not creating dataset.')
            return
        try:
            self.didc.add_dataset(scope=self.scope, name=run,
                                  lifetime=self.lifetime)
        except DataIdentifierAlreadyExists:
            print(" Dataset %s already exists" % run)

    def register_replicas(self, replicas):
        """
        Register file replicas at the origin RSE and attach the files to
        the run dataset.

        :param replicas: Mapping of file name to a metadata dict with the
                         keys 'name', 'adler32' and 'size'.  Entries with
                         incomplete metadata are skipped.
        """
        if not replicas:
            return

        # An entry without metadata (e.g. the storage lookup failed) would
        # otherwise abort the whole registration with a KeyError.
        complete = []
        for file_name in replicas:
            metadata = replicas[file_name]
            if all(key in metadata for key in ('name', 'adler32', 'size')):
                complete.append(metadata)
            else:
                print("Skipping file without metadata: %s" % file_name)
        if not complete:
            return

        print("registering files in Rucio: %s" %
              ", ".join([metadata['name'] for metadata in complete]))
        if self.dry_run:
            print(' Dry run only. Not registering files.')
            return

        try:
            self.repc.add_replicas(rse=self.originrse, files=[{
                'scope': self.scope,
                'name': metadata['name'],
                'adler32': metadata['adler32'],
                'bytes': metadata['size'],
            } for metadata in complete])
            print("Adding files to dataset: %s" % self.run_Number)
        except InvalidObject:
            print("Problem with file name does not match pattern")

        # Attach one file at a time so that a file which is already part of
        # the dataset does not prevent the remaining ones from being added.
        for metadata in complete:
            try:
                self.didc.attach_dids(scope=self.scope, name=self.run_Number,
                                      dids=[{
                                          'scope': self.scope,
                                          'name': metadata['name']
                                      }])
            except FileAlreadyExists:
                print("File already attached")

    def add_replica_rule(self, destRSE, dataset):
        """
        Create a replication rule for one dataset "Run" at an RSE.
        Nothing is done on a dry run or when no destination is given.
        """
        print("Creating replica rule for dataset %s at rse: %s" %
              (dataset, destRSE))
        if self.dry_run:
            print(' Dry run only. Not creating rules')
            return
        if destRSE:
            try:
                self.rulesClient.add_replication_rule([{
                    "scope": self.scope,
                    "name": "/" + dataset
                }], copies=1, rse_expression=destRSE)
            except DuplicateRule:
                print('Rule already exists')
class Rucio:
    """
    Convenience wrapper around the Rucio clients and a gfal2 context for
    one scope, one origin RSE and one destination RSE.

    It offers helpers to build file metadata, create datasets/containers,
    attach DIDs, create replication rules and summarise rule/usage
    statistics as plain dictionaries.
    """

    # Rule state reported by Rucio -> counter incremented for it.
    _STATE_COUNTERS = {
        'REPLICATING': 'total_replicating',
        'STUCK': 'total_stuck',
        'OK': 'total_ok',
    }
    # DID type reported by Rucio -> counter incremented for it.
    _DID_TYPE_COUNTERS = {
        'CONTAINER': 'container',
        'DATASET': 'dataset',
        'FILE': 'file',
    }

    def __init__(self, myscope, orgRse, destRse, account='bruzzese',
                 working_folder=None):
        """
        :param myscope: Rucio scope used for every DID.
        :param orgRse: Name of the origin RSE.
        :param destRse: Name of the destination RSE.
        :param account: Rucio account used by the client.
        :param working_folder: Optional folder below the RSE prefix; see
                               :meth:`get_rse_url`.
        """
        self.myscope = myscope
        self.orgRse = orgRse
        self.destRse = destRse
        self.working_folder = working_folder

        self.gfal = Gfal2Context()

        self.didc = DIDClient()
        self.repc = ReplicaClient()
        self.rulesClient = RuleClient()

        # Configuration
        self.account = account

        # account=account
        self.client = Client(account=self.account)

    def rses(self):
        """
        Get list of all RSEs (their names).
        """
        return [single_rse['rse'] for single_rse in self.client.list_rses()]

    def usage(self, s_rse):
        """
        Return the first local account usage record of the account at the
        given RSE.
        """
        return list(self.client.get_local_account_usage(
            account=self.account, rse=s_rse))[0]

    def rules(self):
        """
        Return the list of all replication rules of the account.
        """
        return list(self.client.list_account_rules(account=self.account))

    def myfunc(self):
        """
        Print the settings this object was created with.
        """
        print("Hello your setting are account=%s, scope=%s, origin RSE =%s and destination RSE =%s" % (self.account, self.myscope, self.orgRse, self.destRse))

    def file_exists(self, pfn):
        """
        Return True when gfal can stat the given PFN, False otherwise.
        """
        try:
            self.gfal.stat(pfn).st_size
            return True
        except Exception:
            # Any gfal failure counts as "not there"; unlike a bare except
            # this lets KeyboardInterrupt / SystemExit through.
            return False

    def get_rse_url(self):
        """
        Return the base path of the rucio url.

        The URL is built from the first protocol entry of the origin RSE.
        When a working folder is configured it is appended to the URL and
        created on the storage (mode 0775) before the path is returned.

        :returns: The URL, or the string 'Wrong url parameters' when the
                  protocol entry lacks scheme, hostname, port or prefix.
        """
        rse_settings = rsemgr.get_rse_info(self.orgRse)
        protocol = rse_settings['protocols'][0]

        schema = protocol['scheme']
        prefix = protocol['prefix']
        port = protocol['port']
        rucioserver = protocol['hostname']

        # Test the raw values: a string built from them is never None.
        if None in (schema, rucioserver, port, prefix):
            return 'Wrong url parameters'

        base_url = urlunsplit(
            [schema, rucioserver + ':' + str(port), prefix, '', ''])
        if self.working_folder is None:
            return base_url

        # Check if our test folder exists
        path = os.path.join(base_url, self.working_folder)
        # The mode must be octal; decimal 775 would be 0o1407.
        self.gfal.mkdir_rec(path, 0o775)
        return path

    def check_replica(self, lfn, dest_rse=None):
        """
        Check if a replica of the given file at the site already exists.

        :param lfn: Name of the file inside the scope.
        :param dest_rse: RSE to look at (also used as RSE expression).
        :returns: The first PFN of the replica at ``dest_rse``, or False
                  when there is none.
        """
        if not lfn:
            return False
        replicas = self.client.list_replicas(
            [{'scope': self.myscope, 'name': lfn}], rse_expression=dest_rse)
        for replica in replicas:
            if isinstance(replica, dict) and dest_rse in replica['rses']:
                return replica['rses'][dest_rse][0]
        return False

    ############################

    ## Create Metadata for DIDs

    ############################
    def getFileMetaData(self, p_file, origenrse=None):
        """
        Get the size and checksum of the file at the given path and build
        the dictionary used to register it in a RSE.

        :param p_file: PFN of the file; its base name (with '+' replaced
                       by '_') becomes the DID name.
        :param origenrse: Not used.
        :returns: A dict with the keys 'replica' (scope, name, adler32
                  checksum, size in bytes, pfn and a freshly generated
                  guid as meta data) and 'scope'.
        """
        name = os.path.basename(p_file)
        name = name.replace('/', '')

        replica = {
            'scope': self.myscope,
            'name': name.replace('+', '_'),
            'adler32': self.gfal.checksum(p_file, 'adler32'),
            'bytes': self.gfal.stat(p_file).st_size,
            'pfn': p_file,
            "meta": {"guid": str(generate_uuid())}
        }

        return {'replica': replica, 'scope': self.myscope}

    ############################

    ## Create Groups of DIDs

    ############################
    def createDataset(self, new_dataset):
        """
        Create a dataset in the scope.

        :returns: True when the dataset was created, False when it already
                  existed, an HTTP error object for duplicate / unknown
                  account errors, and None for any other Rucio error
                  (which is only logged).
        """
        logger.debug("| - - Checking if a provided dataset exists: %s for a scope %s" % (new_dataset, self.myscope))
        try:
            self.client.add_dataset(scope=self.myscope, name=new_dataset)
            return True
        except DataIdentifierAlreadyExists:
            return False
        except Duplicate as error:
            return generate_http_error_flask(409, 'Duplicate', error.args[0])
        except AccountNotFound as error:
            return generate_http_error_flask(404, 'AccountNotFound', error.args[0])
        except RucioException as error:
            logger.debug(error)
            return None

    def createcontainer(self, name_container):
        """
        Create a container in the scope.

        :param name_container: the container's name
        :returns: None, or an HTTP error object for duplicate / unknown
                  account errors.  Other Rucio errors are only logged.
        """
        logger.debug("| - - - registering container %s" % name_container)
        try:
            self.client.add_container(scope=self.myscope, name=name_container)
        except DataIdentifierAlreadyExists:
            logger.debug("| - - - Container %s already exists" % name_container)
        except Duplicate as error:
            return generate_http_error_flask(409, 'Duplicate', error.args[0])
        except AccountNotFound as error:
            return generate_http_error_flask(404, 'AccountNotFound', error.args[0])
        except RucioException as error:
            logger.debug(error)
        return None

    ############################

    ## General function for registering a did into a GROUP of DID (CONTAINER/DATASET)

    ############################
    def registerIntoGroup(self, n_file, new_dataset):
        """
        Attaching a DID to a GROUP

        :param n_file: Name of the DID to attach.
        :param new_dataset: Name of the dataset/container to attach it to.
        """
        try:
            self.client.attach_dids(
                scope=self.myscope, name=new_dataset,
                dids=[{'scope': self.myscope, 'name': n_file}])
        except RucioException:
            # The DID types are only needed for this message, so they are
            # looked up here instead of before every attach.
            type_1 = self.client.get_did(scope=self.myscope, name=new_dataset)
            type_2 = self.client.get_did(scope=self.myscope, name=n_file)
            logger.debug("| - - - %s already attached to %s" % (type_2['type'], type_1['type']))

    ############################

    ## MAGIC functions

    ############################
    def create_groups(self, organization):
        """
        Create the dataset and the three nested containers described by
        ``organization`` and chain them together:
        replica -> dataset_1 -> container_1 -> container_2 -> container_3.

        :param organization: Dict with the keys 'replica', 'dataset_1',
                             'container_1', 'container_2', 'container_3'.
        """
        # 2.1) Create the dataset and attach the file to it
        self.createDataset(organization['dataset_1'])
        self.registerIntoGroup(organization['replica'], organization['dataset_1'])

        # 2.2) - 2.4) Create each container and attach the previous level
        child = 'dataset_1'
        for parent in ('container_1', 'container_2', 'container_3'):
            self.createcontainer(organization[parent])
            self.registerIntoGroup(organization[child], organization[parent])
            child = parent

    ############################

    ## Create Rule for DIDs

    ############################
    def _log_existing_rules(self, destRSE, group, error):
        """
        Log every rule of the account that already covers ``group`` at
        ``destRSE``.
        """
        for rule in self.rules():
            if rule['rse_expression'] == destRSE and rule['scope'] == self.myscope and rule['name'] == group:
                logger.debug('| - - - - Rule already exists %s which contains the following DID %s:%s %s' % (rule['id'], self.myscope, group, str(error)))

    def addReplicaRule(self, destRSE, group):
        """
        Create a replication rule for one dataset at a destination RSE

        :param destRSE: RSE expression of the rule; nothing is created
                        when it is empty.
        :param group: Name of the DID the rule is created for.
        :returns: The id of the new rule, or None when no rule was
                  created (no destination, duplicate rule or temporary
                  creation failure).
        """
        type_1 = self.client.get_did(scope=self.myscope, name=group)
        logger.debug("| - - - Creating replica rule for %s %s at rse: %s" % (type_1['type'], group, destRSE))
        if not destRSE:
            return None
        try:
            rule = self.rulesClient.add_replication_rule(
                [{"scope": self.myscope, "name": group}], copies=1,
                rse_expression=destRSE, grouping='ALL',
                account=self.account, purge_replicas=True)
        except (DuplicateRule, ReplicationRuleCreationTemporaryFailed) as error:
            self._log_existing_rules(destRSE, group, error)
            return None
        logger.debug("| - - - - Rule succesfully replicated at %s" % destRSE)
        logger.debug("| - - - - - The %s has the following id %s" % (group, rule[0]))
        return rule[0]

    ############################

    ## Create Rules for not registered DIDs

    ############################
    def outdated_register_replica(self, filemds, dest_RSE, org_RSE):
        """
        Replicate already registered files to ``dest_RSE`` through a
        short-lived carrier dataset, linking the rule at ``org_RSE`` to the
        one at ``dest_RSE`` so that the source data can be deleted.

        :param filemds: Iterable of dicts as returned by
                        :meth:`getFileMetaData`.
        :param dest_RSE: RSE expression of the destination.
        :param org_RSE: RSE expression of the origin.
        """
        carrier_dataset = 'outdated_replication_dataset' + '-' + str(uuid.uuid4())

        self.createDataset(carrier_dataset)

        # Make sure your dataset is ephemeral
        self.client.set_metadata(scope=self.myscope, name=carrier_dataset, key='lifetime', value=86400)  # 86400 in seconds = 1 day

        # Attach every file to the carrier dataset
        for filemd in filemds:
            outdated = filemd['replica']['name']
            self.registerIntoGroup(outdated, carrier_dataset)

        # Add dummy dataset for replicating at Destination RSE
        rule_child = self.addReplicaRule(dest_RSE, group=carrier_dataset)

        # Add dummy dataset for replicating Origin RSE
        rule_parent = self.addReplicaRule(org_RSE, group=carrier_dataset)

        # Create a relation rule between origin and destiny RSE, so that the source data can be deleted
        rule = self.client.update_replication_rule(rule_id=rule_parent, options={'lifetime': 10, 'child_rule_id': rule_child, 'purge_replicas': True})
        logger.debug('| - - - - Creating relationship between parent %s and child %s : %s' % (rule_parent, rule_child, rule))

        # Create a relation rule between the destinity rule RSE with itself, to delete the dummy rule, whiles keeping the destiny files
        rule = self.client.update_replication_rule(rule_id=rule_child, options={'lifetime': 10, 'child_rule_id': rule_child})
        logger.debug('| - - - - Creating relationship between parent %s and child %s : %s' % (rule_child, rule_child, rule))

    ############################

    ## Create Dictionary for Grafana

    ############################
    def _count_rule_state(self, counters, rule):
        """
        Add one rule to a state counter dict.
        """
        counters['total_rules'] += 1
        key = self._STATE_COUNTERS.get(rule['state'])
        if key is not None:
            counters[key] += 1

    def stats_rules(self, rules):
        """
        Gather general information about total number of rules, and stats.

        :param rules: Iterable of rule dicts with the keys 'name', 'state'
                      and 'did_type'.
        :returns: A dict with up to three entries: 'Rules' (state counters
                  of the rules whose name does not contain
                  'outdated_replication_dataset'), 'AllRules' (state
                  counters of every rule) and 'Grouping' (number of rules
                  per DID type).  An entry only exists once a rule has
                  been counted for it; no rules gives an empty dict.
        """
        RUCIO = dict()
        for rule in rules or ():
            if 'outdated_replication_dataset' not in rule['name']:
                self._count_rule_state(
                    RUCIO.setdefault('Rules', {
                        'total_stuck': 0,
                        'total_replicating': 0,
                        'total_ok': 0,
                        'total_rules': 0
                    }), rule)

            self._count_rule_state(
                RUCIO.setdefault('AllRules', {
                    'total_stuck': 0,
                    'total_replicating': 0,
                    'total_ok': 0,
                    'total_rules': 0
                }), rule)

            grouping = RUCIO.setdefault('Grouping', {
                'file': 0,
                'dataset': 0,
                'container': 0
            })
            did_key = self._DID_TYPE_COUNTERS.get(rule['did_type'])
            if did_key is not None:
                grouping[did_key] += 1
        return RUCIO

    def stats_replica_rules(self, rules):
        """
        Gather specific information about state and number of replicas.

        :param rules: Iterable of rule dicts with the keys
                      'rse_expression', 'locks_stuck_cnt',
                      'locks_replicating_cnt' and 'locks_ok_cnt'.
        :returns: ``{'RSE': {rse_expression: counters}}`` where the
                  counters sum the lock counts of all rules with that
                  RSE expression.
        """
        per_rse = {}
        for rule in rules or ():
            # One entry per RSE expression, created on first sight.
            counters = per_rse.setdefault(rule['rse_expression'], {
                'total_replica_stuck': 0,
                'total_replica_replicating': 0,
                'total_replica_ok': 0
            })
            counters['total_replica_stuck'] += rule['locks_stuck_cnt']
            counters['total_replica_replicating'] += rule['locks_replicating_cnt']
            counters['total_replica_ok'] += rule['locks_ok_cnt']
        return {'RSE': per_rse}

    def stats_usage_rules(self, all_rses):
        """
        Sum the bytes used by the account per RSE.

        :param all_rses: Iterable of RSE names.
        :returns: ``{'USAGE': {rse: {'total_bytes_used': bytes}}}``; RSEs
                  with zero bytes used are left out.
        """
        per_rse = {}
        for x_rse in all_rses:
            rse_usage = self.usage(x_rse)
            if rse_usage['bytes'] == 0:
                continue
            entry = per_rse.setdefault(rse_usage['rse'], {'total_bytes_used': 0})
            entry['total_bytes_used'] += rse_usage['bytes']
        return {'USAGE': per_rse}