Exemplo n.º 1
0
class RunSync(object):
    """
    Synchronize the replica of a given run at WIPAC-ORIG with
    the corresponding Rucio site.
    """
    def __init__(self,
                 run,
                 originrse=DEFAULT_ORIGIN_RSE,
                 destrse=None,
                 scope=DEFAULT_SCOPE,
                 check=True,
                 lifetime=None,
                 dry_run=False,
                 container=None):
        """
        Collect everything needed to synchronize one run.

        Construction is not lazy: it lists the run directory on storage,
        queries Rucio for already known replicas and fetches size/checksum
        for the remaining files.

        :param run: Run path; its last '/'-separated component becomes ``run_Number``.
        :param originrse: RSE used to build the storage URL and to register replicas.
        :param destrse: RSE expression used by ``add_replica_rule``.
        :param scope: Rucio scope used for every DID handled by this object.
        :param check: Stored as ``self.check``; not read by the methods of this class.
        :param lifetime: Lifetime passed when creating the dataset and the container.
        :param dry_run: When true, the register_* methods only print what they would do.
        :param container: Optional container name the run dataset is attached to.
        """
        # Caller-supplied configuration.
        self.run, self.scope = run, scope
        self.originrse, self.destrse = originrse, destrse
        self.check, self.lifetime = check, lifetime
        self.dry_run, self.container = dry_run, container

        # Working state filled in by the helper calls below.
        self.url = ''
        self.run_Number = None
        self.rucio_datasets = {}
        self.run_files = {}
        self.existent_replica_files = {}
        self.files_storage = {}

        self.gfal = Gfal2Context()
        self.get_run_Number()
        self.get_global_url()

        self.didc = DIDClient()
        self.repc = ReplicaClient()
        self.rulesClient = RuleClient()

        # Right now obtaining the Metadata from the storage at WIPAC
        # Hopefully in the future from JADE  # TODO
        self.get_run_Files()
        self.get_rucio_metadata()
        self.update_run_Files()
        self.get_files_metadata()

    def update_run_Files(self):
        """
        Remove from ``self.run_files`` every file that already has a replica.

        Only the last path component of each name in
        ``self.existent_replica_files`` is compared with the keys of
        ``self.run_files``.
        """
        for replica_name in self.existent_replica_files:
            base_name = replica_name.split('/')[-1]
            if base_name not in self.run_files:
                continue
            print("File: %s already registered. Skipping it" % base_name)
            del self.run_files[base_name]

    def get_files_metadata(self):
        """
        Fetch size and checksum for every pending file of the run.

        Files whose ``<run>/<file>`` name is already listed in
        ``self.existent_replica_files`` are left untouched.
        """
        for file_name in self.run_files:
            replica_name = self.run + '/' + file_name
            if replica_name in self.existent_replica_files:
                continue
            self.obtain_metadata(file_name)
        print("Metadat initialization done")

    def obtain_metadata(self, filename):
        """
        Read size and adler32 checksum of one file through gfal.

        On success the entry ``self.run_files[filename]`` is replaced by a
        dict with 'size', 'adler32' and 'name' and nothing is returned.

        :param filename: File name relative to the run directory.
        :returns: ``False`` when gfal raises ``GError``; ``None`` otherwise.
        """
        url = self.get_file_url(filename)
        print("checking metadata for url %s" % url)
        target = str(url)
        try:
            size = self.gfal.stat(target).st_size
            adler32 = self.gfal.checksum(target, 'adler32')
        except GError:
            print("no file found at %s" % url)
            return False
        else:
            print(
                "got size and adler 32checksum of file: pfn=%s size=%s checksum=%s"
                % (url, size, adler32))
            self.run_files[filename] = dict(
                size=size,
                adler32=adler32,
                name=self.run + '/' + filename,
            )

    def get_file_url(self, filename):
        """
        Return the storage URL of ``filename`` inside the run directory.

        :param filename: File name relative to the run directory.
        """
        return '/'.join((self.url, self.run, filename))

    def get_global_url(self):
        """
        Compute the base URL of the origin RSE and store it in ``self.url``.

        Only the first protocol reported for the RSE is used. The scope, with
        dots turned into slashes, is appended to the protocol prefix. Nothing
        is returned.
        """
        print("Getting parameters for rse %s" % self.originrse)
        protocol = rsemgr.get_rse_info(self.originrse)['protocols'][0]

        scheme = protocol['scheme']
        path = protocol['prefix'] + self.scope.replace('.', '/')
        if scheme == 'srm':
            # srm endpoints carry an extra web-service path in front of the prefix.
            path = protocol['extended_attributes']['web_service_path'] + path

        authority = scheme + '://' + protocol['hostname']
        port = protocol['port']
        if port != 0:
            authority += ':' + str(port)

        self.url = authority + path
        print("Determined base url %s" % self.url)

    def get_run_Number(self):
        """
        Store the last path component of ``self.run`` in ``self.run_Number``.

        Example: IceCube/2016/filtered/level2pass2/0101/Run00127347 gives
        Run00127347.
        """
        print("Obtaining run number out of run(dataset): %s" % self.run)
        last_component = self.run.rsplit("/", 1)[-1]
        self.run_Number = last_component
        print("Run number (dataset): %s" % self.run_Number)

    def get_run_Files(self):
        """
        List the run directory on storage and seed ``self.run_files``.

        Each accepted entry is stored with an empty metadata dict. Names of
        three characters or fewer are ignored and at most 5000 entries are
        taken; the rest of the listing is dropped silently.
        """
        self.run_url = '/'.join((self.url, self.run))
        print("Listin files from url : %s" % self.run_url)
        listing = []
        try:
            listing = self.gfal.listdir(str(self.run_url))
        except GError:
            print("No files found at %s" % str(self.run_url))
        print("Files found in storage:")

        remaining = 5000
        for entry in listing:
            if len(entry) <= 3:
                continue
            if remaining <= 0:
                break
            self.run_files[entry] = {}
            remaining -= 1

    def get_rucio_metadata(self):
        """
        Load what Rucio already knows about the origin RSE and this run.

        Fills ``self.rucio_datasets`` with one empty dict per dataset reported
        for the origin RSE, and ``self.existent_replica_files`` with the names
        of the run's files that have a replica there.
        """
        print(
            "Initializing Rucio... getting the list of blocks and files at %s"
            % self.originrse)
        # NOTE(review): each dataset is used directly as a dict key, so the
        # client must yield hashable items - verify against the Rucio client.
        for dataset in self.repc.list_datasets_per_rse(self.originrse):
            self.rucio_datasets[dataset] = {}

        run_did = {"scope": self.scope, "name": '/' + self.run_Number}
        replica_info = self.repc.list_replicas(
            [run_did], rse_expression="rse=%s" % self.originrse)

        self.existent_replica_files = {
            info['name']
            for info in replica_info
            if self.originrse in info['rses']
        }
        print("Rucio initialization done.")

    def register(self):
        """
        Run the whole registration sequence for this run.

        Order: dataset, file replicas, container, dataset-to-container
        attachment, replication rule towards ``self.destrse``.
        """
        print("Registering...")
        dataset = self.run_Number
        container = self.container

        self.register_dataset(dataset)
        self.register_replicas(self.run_files)
        self.register_container(container)
        self.attach_dataset_to_container(dataset, container)
        self.add_replica_rule(dataset=dataset, destRSE=self.destrse)

    def register_container(self, container):
        """
        Create ``container`` in the configured scope.

        Does nothing beyond printing when ``container`` is None or when this
        is a dry run. An already existing container or an invalid name is
        reported and otherwise ignored.

        :param container: Container name, or None to skip.
        """
        print("Registering the container %s with scope: %s" %
              (container, self.scope))

        skip_reason = None
        if container is None:
            skip_reason = 'No container added, not registering any container'
        elif self.dry_run:
            skip_reason = 'Dry run only, not registering the container'
        if skip_reason is not None:
            print(skip_reason)
            return

        request = {
            'scope': self.scope,
            'name': container,
            'lifetime': self.lifetime,
        }
        try:
            self.didc.add_container(**request)
        except DataIdentifierAlreadyExists:
            print("Container %s already exists" % container)
        except InvalidObject:
            print("Problem with container name: %s" % container)

    def attach_dataset_to_container(self, dataset, container):
        """
        Attach the dataset ``'/' + dataset`` to ``container``.

        Skipped (with a message) when no container is given or on a dry run.
        Any ``RucioException`` raised by the attach call is reported as
        "already attached".

        :param dataset: Dataset name without the leading slash.
        :param container: Container name, or None to skip.
        """
        print("Attaching dataset %s, to container: %s" % (dataset, container))

        skip_reason = None
        if container is None:
            skip_reason = 'No container added, not registering dataset in container'
        elif self.dry_run:
            skip_reason = 'Dry run only, not attaching dataset container'
        if skip_reason is not None:
            print(skip_reason)
            return

        member = {'scope': self.scope, 'name': '/' + dataset}
        try:
            self.didc.attach_dids(scope=self.scope,
                                  name=container,
                                  dids=[member])
        except RucioException:
            print("dataset already attached to container")

    def register_dataset(self, run):
        """
        Create the dataset ``run`` in the configured scope.

        On a dry run only a message is printed. An already existing dataset
        is reported and ignored.

        :param run: Dataset name to create.
        """
        print("registering dataset %s" % run)
        if not self.dry_run:
            try:
                self.didc.add_dataset(scope=self.scope,
                                      name=run,
                                      lifetime=self.lifetime)
            except DataIdentifierAlreadyExists:
                print(" Dataset %s already exists" % run)
        else:
            print(' Dry run only. Not creating dataset.')

    def register_replicas(self, replicas):
        """
        Register file replicas at the origin RSE and attach them to the run dataset.

        Entries without complete metadata are skipped with a message. Such
        entries exist when ``obtain_metadata`` failed and left the placeholder
        ``{}`` in ``self.run_files``; indexing them used to raise ``KeyError``
        before anything was registered.

        :param replicas: Mapping of file name to a metadata dict holding
                         'name', 'size' and 'adler32'.
        """
        if not replicas:
            return

        required = ('name', 'size', 'adler32')
        files = []
        for key in replicas:
            meta = replicas[key]
            if not all(field in meta for field in required):
                print("Skipping %s: incomplete metadata, not registering" % key)
                continue
            files.append({
                'scope': self.scope,
                'name': meta['name'],
                'adler32': meta['adler32'],
                'bytes': meta['size'],
            })
        if not files:
            return

        print("registering files in Rucio: %s" %
              ", ".join(entry['name'] for entry in files))
        if self.dry_run:
            print(' Dry run only. Not registering files.')
            return

        try:
            self.repc.add_replicas(rse=self.originrse, files=files)
        except InvalidObject:
            print("Problem with file name does not match pattern")
        else:
            print("Adding files to dataset: %s" % self.run_Number)

        # Attach one file at a time so an already attached file does not
        # prevent the remaining ones from being attached.
        for entry in files:
            try:
                self.didc.attach_dids(scope=self.scope,
                                      name=self.run_Number,
                                      dids=[{
                                          'scope': self.scope,
                                          'name': entry['name']
                                      }])
            except FileAlreadyExists:
                print("File already attached")

    def add_replica_rule(self, destRSE, dataset):
        """
        Request one copy of the dataset ``'/' + dataset`` at ``destRSE``.

        Nothing is created on a dry run or when ``destRSE`` is empty. A
        duplicate rule is reported and ignored.

        :param destRSE: RSE expression of the destination.
        :param dataset: Dataset name without the leading slash.
        """
        print("Creating replica rule for dataset %s at rse: %s" %
              (dataset, destRSE))
        if self.dry_run:
            print(' Dry run only. Not creating rules')
            return
        if not destRSE:
            return

        did = {"scope": self.scope, "name": "/" + dataset}
        try:
            self.rulesClient.add_replication_rule([did],
                                                  copies=1,
                                                  rse_expression=destRSE)
        except DuplicateRule:
            print('Rule already exists')
Exemplo n.º 2
0
class Rucio :
    def __init__(self, myscope, orgRse, destRse, account='bruzzese', working_folder=None):
        """
        Store the configuration and open the gfal and Rucio clients.

        :param myscope: Rucio scope used for every DID handled by this object.
        :param orgRse: Origin RSE name.
        :param destRse: Destination RSE name.
        :param account: Rucio account used for the generic client and rule queries.
        :param working_folder: Optional folder appended to the RSE URL by ``get_rse_url``.
        """
        # Plain configuration values.
        self.myscope, self.working_folder = myscope, working_folder
        self.orgRse, self.destRse = orgRse, destRse

        # Storage access plus the specialised Rucio clients.
        self.gfal = Gfal2Context()
        self.didc = DIDClient()
        self.repc = ReplicaClient()
        self.rulesClient = RuleClient()

        # The generic client is bound to the configured account.
        self.account = account
        self.client = Client(account=self.account)
        
        # Get list of all RSEs 
    def rses(self) :
        rses_lists = list()
        for single_rse in list(self.client.list_rses()) :
            rses_lists.append(single_rse['rse'])
        return(rses_lists)
    
    def usage(self,s_rse) :
        return(list(self.client.get_local_account_usage(account=self.account,rse=s_rse))[0])
        
    def rules(self) :
        return(list(self.client.list_account_rules(account=self.account)))
    
    def myfunc(self):
        """
        Print the account, scope and the two RSEs this object was configured with.
        """
        template = "Hello your setting are account=%s, scope=%s, origin RSE =%s and destination RSE =%s"
        settings = (self.account, self.myscope, self.orgRse, self.destRse)
        print(template % settings)

    def file_exists(self, pfn) :
        try :
            self.gfal.stat(pfn).st_size
            return(True)
        except : 
            return(False)
        
        
    def get_rse_url(self):
        """
        Build the base URL of the origin RSE from its first protocol.

        When ``self.working_folder`` is set, the folder is appended to the
        URL and created recursively through gfal.

        :returns: The URL string, or the literal 'Wrong url parameters' when
                  scheme, hostname or prefix is missing.
        """
        rse_settings = rsemgr.get_rse_info(self.orgRse)
        protocol = rse_settings['protocols'][0]

        schema = protocol['scheme']
        prefix = protocol['prefix']
        port = protocol['port']
        rucioserver = protocol['hostname']

        # Check the raw values. The previous test wrapped host+port in str(),
        # which is never None, and raised TypeError on a missing hostname.
        if schema is None or rucioserver is None or prefix is None:
            return 'Wrong url parameters'

        base_url = urlunsplit(
            [schema, rucioserver + ':' + str(port), prefix, '', ''])
        if self.working_folder is None:
            return base_url

        path = os.path.join(base_url, self.working_folder)
        # Permission bits must be octal: decimal 775 is 0o1407, not rwxrwxr-x.
        self.gfal.mkdir_rec(path, 0o775)
        return path

    def check_replica(self, lfn, dest_rse=None):
        """
        Check if a replica of the given file at the site already exists.
        """
        if lfn : 
            replicas = list(
                self.client.list_replicas([{
                    'scope': self.myscope,
                    'name': lfn
                }], rse_expression=dest_rse))

            if replicas:
                for replica in replicas:
                    if isinstance(replica,dict) :
                        if dest_rse in replica['rses']:
                            path = replica['rses'][dest_rse][0]
                            return(path)
            return(False)
        
    ############################

    ## Create Metadata for DIDs

    ############################    
    def getFileMetaData(self, p_file, origenrse=None):
        """
        Build the registration record for one file.

        The replica name is the base name of ``p_file`` with '+' replaced by
        '_'. Checksum and size are read through gfal and a fresh guid is
        generated.

        :param p_file: PFN of the file.
        :param origenrse: Accepted but not used by this method.
        :returns: dict with 'replica' (scope, name, adler32, bytes, pfn, meta)
                  and 'scope'.
        """
        base_name = os.path.basename(p_file).replace('/', '')
        did_name = base_name.replace('+', '_')

        # Checksum is read before the size, as before.
        checksum = self.gfal.checksum(p_file, 'adler32')
        size = self.gfal.stat(p_file).st_size
        guid = str(generate_uuid())

        replica = {
            'scope': self.myscope,
            'name': did_name,
            'adler32': checksum,
            'bytes': size,
            'pfn': p_file,
            "meta": {"guid": guid},
        }
        return {'replica': replica, 'scope': self.myscope}

    ############################

    ## Create Groups of DIDs

    ############################
    def createDataset(self, new_dataset) :
        """
        Create the dataset ``new_dataset`` in the configured scope.

        :param new_dataset: Name of the dataset to create.
        :returns: True when created, False when it already exists, the result
                  of ``generate_http_error_flask`` for Duplicate and
                  AccountNotFound, None after any other RucioException
                  (which is only logged).
        """
        logger.debug("|  -  - Checking if a provided dataset exists: %s for a scope %s" % (new_dataset, self.myscope))
        try:
            self.client.add_dataset(scope=self.myscope, name=new_dataset)
        except DataIdentifierAlreadyExists:
            return False
        except Duplicate as error:
            return generate_http_error_flask(409, 'Duplicate', error.args[0])
        except AccountNotFound as error:
            return generate_http_error_flask(404, 'AccountNotFound', error.args[0])
        except RucioException as error:
            logger.debug(error)
        else:
            return True

    def createcontainer(self, name_container):
        """
        Create the container ``name_container`` in the configured scope.

        :param name_container: The container's name.
        :returns: None in the normal and "already exists" cases; the result of
                  ``generate_http_error_flask`` for Duplicate and
                  AccountNotFound. Other RucioExceptions are only logged.
        """
        logger.debug("|  -  -  - registering container %s" % name_container)

        request = {'scope': self.myscope, 'name': name_container}
        try:
            self.client.add_container(**request)
        except DataIdentifierAlreadyExists:
            logger.debug("|  -  -  - Container %s already exists" % name_container)
        except Duplicate as error:
            return generate_http_error_flask(409, 'Duplicate', error.args[0])
        except AccountNotFound as error:
            return generate_http_error_flask(404, 'AccountNotFound', error.args[0])
        except RucioException as error:
            logger.debug(error)
    
    ############################

    ## General funciotn for registering a did into a GROUP of DID (CONTAINER/DATASET)

    ############################
    def registerIntoGroup(self,n_file, new_dataset):
        """
        Attaching a DID to a GROUP
        """
        type_1 = self.client.get_did(scope=self.myscope, name=new_dataset)
        type_2 = self.client.get_did(scope=self.myscope, name=n_file)

        try:
            self.client.attach_dids(scope=self.myscope, name=new_dataset, dids=[{'scope':self.myscope, 'name':n_file}])
        except RucioException:
            logger.debug("| - - - %s already attached to %s" %(type_2['type'],type_1['type']))    

    ############################

    ## MAGIC functions 

    ############################
    def create_groups(self, organization) :

        # 2.1) Create the dataset and containers for the file 
        self.createDataset(organization['dataset_1']) 
        # 2.1.1) Attach the dataset and containers for the file 
        self.registerIntoGroup(organization['replica'], organization['dataset_1'])        

        # 2.2) Create the dataset and containers for the file 
        self.createcontainer(organization['container_1']) 
        # 2.2.1) Attach the dataset and containers for the file 
        self.registerIntoGroup(organization['dataset_1'], organization['container_1'])        

        # 2.3) Create the dataset and containers for the file 
        self.createcontainer(organization['container_2']) 
        # 2.3.1) Attach the dataset and containers for the file 
        self.registerIntoGroup(organization['container_1'], organization['container_2'])        

        # 2.4) Create the dataset and containers for the file 
        self.createcontainer(organization['container_3']) 
        # 2.4.1) Attach the dataset and containers for the file             
        self.registerIntoGroup(organization['container_2'], organization['container_3'])   

    
    ############################

    ## Create Rule for DIDs

    ############################            
    def addReplicaRule(self, destRSE, group):
        """
        Create a replication rule for ``group`` at ``destRSE``.

        :param destRSE: RSE expression of the destination; nothing is created
                        when it is empty.
        :param group: Name of the DID (in the configured scope) to replicate.
        :returns: The id of the new rule, or None when no rule was created
                  (empty destination, duplicate rule, temporary failure).
        """
        type_1 = self.client.get_did(scope=self.myscope, name=group)
        logger.debug("| - - - Creating replica rule for %s %s at rse: %s" % (type_1['type'], group, destRSE))
        if not destRSE:
            return None

        try:
            rule = self.rulesClient.add_replication_rule([{"scope":self.myscope,"name":group}],copies=1, rse_expression=destRSE, grouping='ALL', account=self.account, purge_replicas=True)
        except (DuplicateRule, ReplicationRuleCreationTemporaryFailed) as error:
            # Both failures are reported the same way: list the account's
            # rules that already cover this DID at this destination.
            if isinstance(error, DuplicateRule):
                template = '| - - - - Rule already exists %s which contains the following DID %s:%s %s'
            else:
                # This branch used to claim "Rule already exists" and used
                # print() instead of the logger.
                template = '| - - - - Rule creation temporarily failed; existing rule %s contains the following DID %s:%s %s'
            for existing in self.client.list_account_rules(account=self.account):
                if existing['rse_expression'] == destRSE and existing['scope'] == self.myscope and existing['name'] == group:
                    logger.debug(template % (existing['id'], self.myscope, group, str(error)))
            return None

        logger.debug("| - - - - Rule succesfully replicated at %s" % destRSE)
        logger.debug("| - - - - - The rule at %s has the following id %s" % (destRSE, rule[0]))
        return rule[0]
                
                
    ############################

    ## Create Rules for not registered DIDs

    ############################  
    def outdated_register_replica(self, filemds, dest_RSE, org_RSE):
        """
        Replicate already registered files through a short-lived carrier dataset.

        A uniquely named dataset with a one-day lifetime is created, the files
        are attached to it, and one rule is added at each of the two RSEs.
        Both rules are then given a lifetime of 10 and a child rule.

        :param filemds: Iterable of dicts exposing ``['replica']['name']``.
        :param dest_RSE: Destination RSE expression.
        :param org_RSE: Origin RSE expression.
        """
        carrier = '-'.join(('outdated_replication_dataset', str(uuid.uuid4())))
        self.createDataset(carrier)

        # Make sure the carrier dataset is ephemeral: 86400 seconds = 1 day.
        self.client.set_metadata(scope=self.myscope, name=carrier, key='lifetime', value=86400)

        for file_record in filemds :
            self.registerIntoGroup(file_record['replica']['name'], carrier)

        # Destination rule first, then the origin rule.
        child_id = self.addReplicaRule(dest_RSE, group=carrier)
        parent_id = self.addReplicaRule(org_RSE, group=carrier)

        # Link origin to destination so that the source data can be deleted.
        parent_options = {'lifetime': 10, 'child_rule_id': child_id, 'purge_replicas': True}
        outcome = self.client.update_replication_rule(rule_id=parent_id, options=parent_options)
        logger.debug('| - - - - Creating relationship between parent %s and child %s : %s' % (parent_id, child_id, outcome))

        # Link the destination rule to itself: the dummy rule expires while
        # the destination files are kept.
        child_options = {'lifetime': 10, 'child_rule_id': child_id}
        outcome = self.client.update_replication_rule(rule_id=child_id, options=child_options)
        logger.debug('| - - - - Creating relationship between parent %s and child %s : %s' % (parent_id, child_id, outcome))
                        
    ############################

    ## Create Dictionary for Grafana

    ############################              
    def stats_rules(self, rules) :
        '''
        Gather general information about 
        total number of rules, and stats.
        '''
        RUCIO = dict()
        if rules : 
            for rule in rules :
                if 'outdated_replication_dataset' not in rule['name'] :
                    if 'Rules' not in RUCIO :
                        RUCIO['Rules'] = {
                            'total_stuck' : 0, 
                            'total_replicating' : 0,
                            'total_ok' : 0,
                            'total_rules': 0 
                        }

                        RUCIO['Rules']['total_rules'] += 1
                        if rule['state'] == 'REPLICATING' : 
                            RUCIO['Rules']['total_replicating'] += 1
                        elif rule['state'] == 'STUCK' :
                            RUCIO['Rules']['total_stuck'] += 1
                        elif rule['state'] == 'OK' :
                            RUCIO['Rules']['total_ok'] += 1

                    else :     
                        RUCIO['Rules']['total_rules'] += 1
                        if rule['state'] == 'REPLICATING' : 
                            RUCIO['Rules']['total_replicating'] += 1
                        elif rule['state'] == 'STUCK' :
                            RUCIO['Rules']['total_stuck'] += 1
                        elif rule['state'] == 'OK' :
                            RUCIO['Rules']['total_ok'] += 1

                if 'AllRules' not in RUCIO : 
                    RUCIO['AllRules'] = {
                        'total_stuck' : 0, 
                        'total_replicating' : 0,
                        'total_ok' : 0,
                        'total_rules': 0 
                    }

                    RUCIO['AllRules']['total_rules'] += 1
                    if rule['state'] == 'REPLICATING' : 
                        RUCIO['AllRules']['total_replicating'] += 1
                    elif rule['state'] == 'STUCK' :
                        RUCIO['AllRules']['total_stuck'] += 1
                    elif rule['state'] == 'OK' :
                        RUCIO['AllRules']['total_ok'] += 1

                else :     
                    RUCIO['AllRules']['total_rules'] += 1
                    if rule['state'] == 'REPLICATING' : 
                        RUCIO['AllRules']['total_replicating'] += 1
                    elif rule['state'] == 'STUCK' :
                        RUCIO['AllRules']['total_stuck'] += 1
                    elif rule['state'] == 'OK' :
                        RUCIO['AllRules']['total_ok'] += 1 

                ##################
                if 'Grouping' not in RUCIO : 
                    RUCIO['Grouping'] = {
                        'file' : 0, 
                        'dataset' : 0,
                        'container' : 0 
                    }

                    if rule['did_type'] == 'CONTAINER' : 
                        RUCIO['Grouping']['container'] += 1
                    elif rule['did_type'] == 'DATASET' :
                        RUCIO['Grouping']['dataset'] += 1
                    elif rule['did_type'] == 'FILE' :
                        RUCIO['Grouping']['file'] += 1

                else :     
                    if rule['did_type'] == 'CONTAINER' : 
                        RUCIO['Grouping']['container'] += 1
                    elif rule['did_type'] == 'DATASET' :
                        RUCIO['Grouping']['dataset'] += 1
                    elif rule['did_type'] == 'FILE' :
                        RUCIO['Grouping']['file'] += 1 
            return(RUCIO)

    def stats_replica_rules(self, rules) :

        '''
        Gather specific information about 
        state and number of replicas.
        '''
        REPLICAS = dict()
        REPLICAS['RSE'] = {}
        if rules : 
            # Creates a key for all the RSEs that we have replicas
            for rule in rules :
                # if the RSE is not in the dictionary
                #print(rule['rse_expression'], REPLICAS['RSE'])
                if rule['rse_expression'] not in REPLICAS['RSE'] : 
                    #print(REPLICAS)
                    REPLICAS['RSE'][rule['rse_expression']] = { 
                        'total_replica_stuck' : rule['locks_stuck_cnt'], 
                        'total_replica_replicating' : rule['locks_replicating_cnt'],
                        'total_replica_ok' : rule['locks_ok_cnt']
                    } 
                # else if it  is, update replica numbers
                else :
                    REPLICAS['RSE'][rule['rse_expression']]['total_replica_stuck'] += rule['locks_stuck_cnt']
                    REPLICAS['RSE'][rule['rse_expression']]['total_replica_replicating'] += rule['locks_replicating_cnt']
                    REPLICAS['RSE'][rule['rse_expression']]['total_replica_ok'] += rule['locks_ok_cnt']
            return(REPLICAS)

    def stats_usage_rules(self, all_rses) :    
        STORAGE = dict()
        STORAGE['USAGE'] = {}
        for x_rse in all_rses :
            rses = self.usage(x_rse)
            if rses['bytes'] != 0 :
                if rses['rse'] not in STORAGE['USAGE'] : 
                    STORAGE['USAGE'][rses['rse']] = { 
                        'total_bytes_used' : rses['bytes']
                    } 
                # else if it  is, update replica numbers
                else :
                    STORAGE['USAGE'][rses['rse']]['total_bytes_used'] += rses['bytes']
        return(STORAGE)