Example No. 1
def test__init__(df):
    assert df.namePattern == 'np', 'namePattern not initialised as np'
    assert df.lfn == 'lfn', 'lfn not initialised as lfn'
    assert df.localDir == 'ld', 'localDir not initialised as ld'

    d1 = DiracFile()
    assert d1.namePattern == '', 'namePattern not default initialised as empty'
    assert d1.lfn == '', 'lfn not default initialised as empty'
    assert d1.localDir is None, 'localDir not default initialised as None'
    assert d1.locations == [], 'locations not initialised as empty list'

    d2 = DiracFile(namePattern='np', lfn='lfn', localDir='ld')
    assert d2.namePattern == 'np', 'namePattern not keyword initialised as np, initialized as: %s\n%s' % (d2.namePattern, str(d2))
    assert d2.lfn == 'lfn', 'lfn not keyword initialised as lfn, initialized as: %s\n%s' % (d2.lfn, str(d2))
    assert d2.localDir == 'ld', 'localDir not keyword initialised as ld, initialized as: %s\n%s' % (d2.localDir, str(d2))
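
For context, the df argument above is a pytest-style fixture like the ones in Examples 8 and 20. A minimal sketch of the wiring, assuming pytest is the test runner (the positional order namePattern, localDir, lfn matches the assertions above):

import pytest
from GangaDirac.Lib.Files.DiracFile import DiracFile

@pytest.fixture
def df():
    # build the object the assertions above expect
    f = DiracFile('np', 'ld', 'lfn')
    f.locations = ['location']
    f.guid = 'guid'
    return f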
Example No. 2
def uploadLocalFile(job, namePattern, localDir, should_del=True):
    """
    Upload a locally available file to the grid as a DiracFile.
    Randomly chooses an SE.

    Args:
        namePattern (str): name of the file
        localDir (str): localDir of the file
        should_del = (bool): should we delete the local file?
    Return
        DiracFile: a DiracFile of the uploaded LFN on the grid
    """

    new_df = DiracFile(namePattern, localDir=localDir)
    trySEs = getConfig('DIRAC')['allDiracSE']
    random.shuffle(trySEs)
    new_lfn = os.path.join(getInputFileDir(job), namePattern)
    returnable = None
    for SE in trySEs:
        # Check that the SE is writable before attempting the upload
        if execute('checkSEStatus("%s", "%s")' % (SE, 'Write')):
            try:
                returnable = new_df.put(force=True, uploadSE=SE, lfn=new_lfn)[0]
                break
            except GangaDiracError:
                # Try the next SE rather than aborting on the first failure
                logger.warning("Upload of input file as LFN %s to SE %s failed" % (new_lfn, SE))
    if not returnable:
        raise GangaException("Failed to upload input file to any SE")
    if should_del:
        os.unlink(os.path.join(localDir, namePattern))

    return returnable
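
A minimal usage sketch, assuming a prepared Ganga job object j and a tarball already present in /tmp (both names are hypothetical here):

# hypothetical call: upload /tmp/scripts.tgz for job j, keeping the local copy
df = uploadLocalFile(j, 'scripts.tgz', '/tmp', should_del=False)
print(df.lfn)  # LFN under the job's input-file directory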
Example No. 3
 def test_getDataFile(self):
     from GangaLHCb.Lib.LHCbDataset.LHCbDatasetUtils import getDataFile
     lfn = DiracFile('a')
     pfn = LocalFile('a')
     assert getDataFile(lfn) == lfn
     assert getDataFile(pfn) == pfn
     assert getDataFile('lfn:a') == strToDataFile('lfn:a')
Example No. 4
    def replicate(self, destSE=''):
        '''Replicate all LFNs to destSE.  For a list of valid SE\'s, type
        ds.replicate().'''

        if not destSE:
            from GangaDirac.Lib.Files.DiracFile import DiracFile
            DiracFile().replicate('')
            return
        if not self.hasLFNs():
            raise GangaException('Cannot replicate dataset w/ no LFNs.')

        retry_files = []

        for f in self.files:
            if not isDiracFile(f):
                continue
            try:
                f.replicate(destSE=destSE)
            except Exception as err:
                msg = 'Replication error for file %s (will retry in a bit).' % f.lfn
                logger.warning(msg)
                logger.warning("Error: %s" % str(err))
                retry_files.append(f)

        for f in retry_files:
            try:
                f.replicate(destSE=destSE)
            except Exception as err:
                msg = '2nd replication attempt failed for file %s. (will not retry)' % f.lfn
                logger.warning(msg)
                logger.warning(str(err))
Example No. 5
 def getOutputDataLFNs(self):
     """Get a list of outputdata that has been uploaded by Dirac. Excludes
     the outputsandbox if it is there."""
     lfns = super(Dirac, self).getOutputDataLFNs()
     ds = LHCbDataset()
     for f in lfns:
         ds.files.append(DiracFile(lfn=f))
     return GPIProxyObjectFactory(ds)
Example No. 6
    def upload(self, lfn, diracSE, guid=None):

        from GangaDirac.Lib.Files.DiracFile import DiracFile
        diracFile = DiracFile(namePattern=self.name, lfn=lfn)

        diracFile.put(force=True)

        return diracFile
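
A usage sketch, assuming this method sits on a file object exposing a .name attribute (the LFN and SE strings are hypothetical). Note that in this excerpt the diracSE and guid arguments are accepted but not forwarded to put():

# hypothetical: push the local file to the grid under an explicit LFN
uploaded = some_file.upload('/lhcb/user/u/user/inputs/data.tgz', 'CERN-USER')
print(uploaded.lfn)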
Example No. 7
    def _create_subjob(self, job, dataset):
        logger.debug("_create_subjob")

        datatmp = []
        if isinstance(dataset, LHCbDataset):
            for i in dataset:
                if isinstance(i, DiracFile):
                    datatmp.append(i)
                else:
                    logger.error(
                        "Unknown file-type %s, cannot perform split with file %s"
                        % (type(i), str(i)))
                    from Ganga.Core.exceptions import GangaException
                    raise GangaException(
                        "Unknown file-type %s, cannot perform split with file %s"
                        % (type(i), str(i)))
        elif isinstance(dataset, list):
            from Ganga.GPIDev.Base.Proxy import isType
            for i in dataset:
                if type(i) is str:
                    datatmp.append(DiracFile(lfn=i))
                elif isType(i, DiracFile):
                    datatmp.append(i)
                else:
                    raise GangaException("Unknown (unexpected) file object: %s" % i)
        else:
            logger.error("Unkown dataset type, cannot perform split here")
            from Ganga.Core.exceptions import GangaException
            raise GangaException(
                "Unkown dataset type, cannot perform split here")

        logger.debug("Creating new Job in Splitter")
        j = Job()
        j.copyFrom(stripProxy(job))
        j.splitter = None
        j.merger = None
        j.inputsandbox = []  # master added automatically
        j.inputfiles = []
        j.inputdata = LHCbDataset(files=datatmp[:],
                                  persistency=self.persistency,
                                  depth=self.depth)
        j.inputdata.XMLCatalogueSlice = self.XMLCatalogueSlice

        return j
Example No. 8
def df():
    load_config_files()

    from GangaDirac.Lib.Files.DiracFile import DiracFile
    # positional args: namePattern, localDir, lfn
    f = DiracFile('np', 'ld', 'lfn')
    f.locations = ['location']
    f.guid = 'guid'
    yield f
    clear_config()
Example No. 9
 def getOutputData(self, outputDir=None, names=None, force=False):
     """Retrieve data stored on SE to outputDir (default=job output workspace).
     If names=None, then all outputdata is downloaded otherwise names should
     be a list of files to download. If force is True then download performed
     even if data already exists."""
     downloaded_files = super(Dirac, self).getOutputData(outputDir, names, force)
     ds = LHCbDataset()
     for f in downloaded_files:
         ds.files.append(DiracFile(lfn=f))
     return GPIProxyObjectFactory(ds)
Example No. 10
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need a parent job that has completed to get the output files
        incl_pat_list = []
        excl_pat_list = []
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return None

            for inds in self.inputdata:
                from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
                if isType(
                        inds, TaskChainInput
                ) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask

        # go over the output files and copy the appropriate ones over as input
        # files
        flist = []
        import re
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list = job.subjobs
            else:
                job_list = [job]

            for sj in job_list:
                for f in sj.outputfiles:

                    # match any dirac files that are allowed in the file mask
                    if isType(f, DiracFile):
                        if len(incl_pat_list) > 0:
                            for pat in incl_pat_list:
                                if re.search(pat, f.lfn):
                                    flist.append("LFN:" + f.lfn)
                        else:
                            flist.append("LFN:" + f.lfn)

                        if len(excl_pat_list) > 0:
                            for pat in excl_pat_list:
                                if re.search(
                                        pat,
                                        f.lfn) and "LFN:" + f.lfn in flist:
                                    flist.remove("LFN:" + f.lfn)

        # just do one unit that uses all data
        unit = LHCbUnit()
        unit.name = "Unit %d" % len(self.units)
        unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

        return unit
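
A sketch of how the file masks behave, assuming (as the re.search calls above suggest) that the include/exclude masks are regular expressions matched against the LFN; the patterns and LFN below are hypothetical:

import re

lfn = '/lhcb/user/u/user/job1/output.dst'
incl_pat_list = [r'\.dst$']   # keep DST output
excl_pat_list = [r'hist']     # drop histogram files

# an empty include mask means "keep everything"; the exclude mask is applied afterwards
keep = (not incl_pat_list or any(re.search(p, lfn) for p in incl_pat_list)) \
    and not any(re.search(p, lfn) for p in excl_pat_list)
print(keep)  # True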
Example No. 11
def strToDataFile(name, allowNone=True):
    if len(name) >= 4 and name[:4].upper() == 'LFN:':
        return DiracFile(lfn=name[4:])
    elif len(name) >= 4 and name[:4].upper() == 'PFN:':
        logger.warning("PFN is slightly ambiguous, constructing LocalFile")
        return LocalFile(name[4:])
    else:
        if allowNone:
            return None
        else:
            raise GangaException( "Cannot construct file object: %s" % str(name) )
Example No. 12
    def _create_subjob(self, job, dataset):
        logger.debug("_create_subjob")
        datatmp = []

        logger.debug("dataset size: %s" % str(len(dataset)))
        #logger.debug( "dataset: %s" % str(dataset) )

        from GangaLHCb.Lib.LHCbDataset.LHCbDataset import LHCbDataset

        if isinstance(dataset, LHCbDataset):
            for i in dataset:
                if isType(i, DiracFile):
                    datatmp.append(i)
                else:
                    logger.error("Unkown file-type %s, cannot perform split with file %s" % (type(i), str(i)))
                    from Ganga.Core.exceptions import GangaException
                    raise GangaException("Unkown file-type %s, cannot perform split with file %s" % (type(i), str(i)))
        elif isinstance(dataset, (list, GangaList)):
            for this_file in dataset:
                if type(this_file) is str:
                    datatmp.append(allComponentFilters['gangafiles'](this_file, None))
                elif isType(this_file, IGangaFile):
                    datatmp.append(this_file)
                else:
                    logger.error("Unexpected type: %s" % str(type(this_file)))
                    logger.error("Wanted object to inherit from type: %s: %s" % (str(type(IGangaFile()))))
                    from Ganga.Core.exceptions import GangaException
                    x = GangaException("Unknown(unexpected) file object: %s" % this_file)
                    raise x
        elif type(dataset) is str:
            datatmp.append(DiracFile(lfn=dataset))
        else:
            logger.error("Unkown dataset type, cannot perform split here")
            from Ganga.Core.exceptions import GangaException
            logger.error("Dataset found: " + str(dataset))
            raise GangaException("Unkown dataset type, cannot perform split here")

        logger.debug("Creating new Job in Splitter")
        j = Job()
        logger.debug("Copying From Job")
        j.copyFrom(stripProxy(job), ['splitter', 'subjobs', 'inputdata', 'inputsandbox', 'inputfiles'])
        logger.debug("Unsetting Splitter")
        j.splitter = None
        #logger.debug("Unsetting Merger")
        #j.merger = None
        #j.inputsandbox = [] ## master added automatically
        #j.inputfiles = []
        logger.debug("Setting InputData")
        j.inputdata = LHCbDataset(files=datatmp[:],
                                  persistency=self.persistency,
                                  depth=self.depth)
        #j.inputdata.XMLCatalogueSlice = self.XMLCatalogueSlice
        logger.debug("Returning new subjob")
        return j
Example No. 13
def getDiracFiles():
    import os
    from GangaDirac.Lib.Files.DiracFile import DiracFile
    from Ganga.GPIDev.Lib.GangaList.GangaList import GangaList
    filename = DiracFile.diracLFNBase().replace('/', '-') + '.lfns'
    logger.info(
        'Creating list, this can take a while if you have a large number of SE files, please wait...'
    )
    execute('dirac-dms-user-lfns &> /dev/null', shell=True, timeout=None)
    g = GangaList()
    # diracLFNBase() begins with '/', so the generated filename starts with '-'; strip it
    with open(filename[1:], 'r') as lfnlist:
        lfnlist.seek(0)
        g.extend(DiracFile(lfn=lfn.strip()) for lfn in lfnlist)
    return addProxy(g)
Example No. 14
def generateDiracScripts(app):
    """
    Construct a DIRAC scripts which must be unique to each job to have unique checksum.
    This generates a unique file, uploads it to DRIAC and then stores the LFN in app.uploadedInput
    Args:
        app (GaudiExec): This expects a GaudiExec app to be passed so that the constructed
    """
    generateJobScripts(app, appendJobScripts=True)

    job = app.getJobObject()

    new_df = uploadLocalFile(job, app.jobScriptArchive.namePattern,
                             app.jobScriptArchive.localDir)

    app.jobScriptArchive = new_df

    app.is_prepared.addAssociatedFile(DiracFile(lfn=new_df.lfn))
Example No. 15
def uploadLocalFile(job, namePattern, localDir, should_del=True):
    """
    Upload a locally available file to the grid as a DiracFile

    Args:
        namePattern (str): name of the file
        localDir (str): localDir of the file
        should_del = (bool): should we delete the local file?
    Return
        DiracFile: a DiracFile of the uploaded LFN on the grid
    """

    new_df = DiracFile(namePattern, localDir=localDir)
    random_SE = random.choice(getConfig('DIRAC')['allDiracSE'])
    new_lfn = os.path.join(getInputFileDir(job), namePattern)
    returnable = new_df.put(force=True, uploadSE=random_SE, lfn=new_lfn)[0]

    if should_del:
        os.unlink(os.path.join(localDir, namePattern))

    return returnable
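
Note the contrast with Example No. 2: this variant picks a single SE at random and lets any upload error propagate, whereas Example No. 2 shuffles allDiracSE and works through the list until a writable SE accepts the upload.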
Example No. 16
    def getDataset(self):
        '''Gets the dataset from the bookkeeping for current dict.'''
        if not self.dict:
            return None
        cmd = 'bkQueryDict(%s)' % self.dict
        result = get_result(cmd, 'BK query error.', 'BK query error.')
        files = []
        value = result['Value']
        if 'LFNs' in value:
            files = value['LFNs']
        if not type(files) is list:
            if 'LFNs' in files:  # i.e. a dict of LFN:Metadata
                files = files['LFNs'].keys()

        from GangaDirac.Lib.Files.DiracFile import DiracFile
        this_list = [DiracFile(lfn=f) for f in files]

        from GangaLHCb.Lib.LHCbDataset import LHCbDataset
        ds = LHCbDataset(files=this_list, fromRef=True)

        return addProxy(ds)
Example No. 17
def generateDiracInput(app):
    """
    Construct a DIRAC input which does not need to be unique to each job but is required to have a unique checksum.
    This generates a unique file, uploads it to DIRAC and then stores the LFN in app.uploadedInput
    Args:
        app (GaudiExec): This expects a GaudiExec app to be passed so that the constructed
    """

    input_files, input_folders = collectPreparedFiles(app)

    job = app.getJobObject()

    if input_folders:
        raise ApplicationConfigurationError(
            'Prepared folders not supported yet, please fix this in future')
    else:
        prep_dir = app.getSharedPath()
        addTimestampFile(prep_dir)
        prep_file = _pseudo_session_id + '.tgz'
        tmp_dir = tempfile.gettempdir()
        compressed_file = os.path.join(
            tmp_dir, 'diracInputFiles_' + os.path.basename(prep_file))

        if not job.master:
            rjobs = job.subjobs
        else:
            rjobs = [job]

        with tarfile.open(compressed_file, "w:gz") as tar_file:
            for name in input_files:
                # FIXME Add support for subfiles here once it's working across multiple IGangaFile objects in a consistent way
                # Not hacking this in for now just in-case we end up with a mess as a result
                tar_file.add(name, arcname=os.path.basename(name))

    new_df = uploadLocalFile(job, os.path.basename(compressed_file), tmp_dir)

    app.uploadedInput = new_df
    app.is_prepared.addAssociatedFile(DiracFile(lfn=new_df.lfn))
Example No. 18
 def test_isLFN(self):
     from GangaLHCb.Lib.LHCbDataset.LHCbDatasetUtils import isLFN
     assert isLFN(DiracFile('test')), 'should be true'
     assert not isLFN(PhysicalFile('test')), 'should be false'
Example No. 19
def OfflineGangaDiracSplitter(_inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for DIRAC split jobs
    """

    if maxFiles is not None and maxFiles > 0:
        inputs = _inputs[:maxFiles]
    else:
        inputs = _inputs

    # First FIND ALL LFN REPLICAS AND SE<->SITE MAPPINGS AND STORE THIS IN MEMORY
    # THIS IS DONE IN PARALLEL TO AVOID OVERLOADING DIRAC WITH THOUSANDS OF
    # REQUESTS AT ONCE ON ONE CONNECTION

    wanted_common_site = configDirac['OfflineSplitterMaxCommonSites']
    iterative_limit = configDirac['OfflineSplitterLimit']
    good_fraction = configDirac['OfflineSplitterFraction']
    uniqueSE = configDirac['OfflineSplitterUniqueSE']

    split_files = []

    if inputs is None:
        raise SplittingError(
            "Cannot Split Job as the inputdata appears to be None!")

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata"
        )

    file_replicas = {}

    logger.info("Requesting LFN replica info")

    allLFNData = {}

    # Perform a lookup of where LFNs are all stored
    allLFNs, LFNdict = lookUpLFNReplicas(inputs, allLFNData)

    for _lfn in allLFNData:
        if allLFNData[_lfn] is None:
            logger.error(
                "Error in Getting LFN Replica information, aborting split")
            raise SplittingError(
                "Error in Getting LFN Replica information, aborting split")

    bad_lfns = []

    # Sort this information and store it in the relevant Ganga objects
    errors = sortLFNreplicas(bad_lfns, allLFNs, LFNdict, ignoremissing,
                             allLFNData, inputs)

    if len(bad_lfns) != 0:
        if ignoremissing is False:
            logger.error("Errors found getting LFNs:\n%s" % str(errors))
            raise SplittingError(
                "Error trying to split dataset with invalid LFN and ignoremissing = False"
            )

    # This finds all replicas for all LFNs...
    # This will probably struggle for LFNs which don't exist
    # Bad LFN should have been removed by this point however
    all_lfns = [
        LFNdict[this_lfn].locations for this_lfn in LFNdict
        if this_lfn not in bad_lfns
    ]

    logger.info("Got replicas")

    for this_input in inputs:
        if this_input.lfn not in bad_lfns:
            file_replicas[this_input.lfn] = this_input.locations

    logger.info("found all replicas")

    logger.info("Calculating site<->SE Mapping")

    site_to_SE_mapping = {}
    SE_to_site_mapping = {}

    # Now let's generate a dictionary of some chosen site vs LFN to use in
    # constructing subsets
    site_dict, allSubSets, allChosenSets = calculateSiteSEMapping(
        file_replicas, wanted_common_site, uniqueSE, site_to_SE_mapping,
        SE_to_site_mapping)

    logger.debug("Found all SE in use")

    # BELOW IS WHERE THE ACTUAL SPLITTING IS DONE

    logger.info("Calculating best data subsets")

    iterations = 0
    # Loop over all LFNs
    while len(site_dict.keys()) > 0:

        # LFNs left to be used
        # NB: take a copy, as we can't modify the dict while iterating over it
        LFN_instances = list(site_dict.keys())
        # Already used LFN
        chosen_lfns = set()

        for iterating_LFN in LFN_instances:

            # If this has previously been selected lets ignore it and move on
            if iterating_LFN in chosen_lfns:
                continue

            # Use this seed to try and construct a subset
            req_sitez = allChosenSets[iterating_LFN]
            _this_subset = []

            #logger.debug("find common LFN for: " + str(allChosenSets[iterating_LFN]))

            # Construct subset
            # Starting with this seed, populate the subset with LFNs whose
            # replicas cover the required sites
            for this_LFN in LFN_instances:
                if this_LFN in chosen_lfns:
                    continue
                if req_sitez.issubset(site_dict[this_LFN]):
                    if len(_this_subset) >= filesPerJob:
                        break
                    _this_subset.append(this_LFN)

            limit = int(math.floor(float(filesPerJob) * good_fraction))

            #logger.debug("Size limit: %s" % str(limit))

            # If subset is too small throw it away
            if len(_this_subset) < limit:
                #logger.debug("%s < %s" % (str(len(_this_subset)), str(limit)))
                allChosenSets[iterating_LFN] = generate_site_selection(
                    site_dict[iterating_LFN], wanted_common_site, uniqueSE,
                    site_to_SE_mapping, SE_to_site_mapping)
                continue
            else:
                logger.debug("found common LFN for: " +
                             str(allChosenSets[iterating_LFN]))
                logger.debug("%s > %s" % (str(len(_this_subset)), str(limit)))
                # else Dataset was large enough to be considered useful
                logger.debug("Generating Dataset of size: %s" %
                             str(len(_this_subset)))
                ## Construct DiracFile here as we want to keep the above combination
                allSubSets.append([
                    DiracFile(lfn=str(this_LFN)) for this_LFN in _this_subset
                ])

                for lfn in _this_subset:
                    site_dict.pop(lfn)
                    allChosenSets.pop(lfn)
                    chosen_lfns.add(lfn)

        # Let's keep track of how many times we've tried this
        iterations = iterations + 1

        # Can take a while so let's not let threads become un-locked
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

        # If on the final run we will exit the loop after this, so let's try to clean up
        if iterations >= iterative_limit:

            if good_fraction < 0.5:
                good_fraction = good_fraction * 0.75
                iterations = 0
            elif wanted_common_site > 1:
                logger.debug("Reducing Common Site Size")
                wanted_common_site = wanted_common_site - 1
                iterations = 0
                good_fraction = 0.75
            else:
                good_fraction = good_fraction * 0.75

            logger.debug("good_fraction: %s" % str(good_fraction))

    split_files = allSubSets

    avg = 0.0
    for this_set in allSubSets:
        avg += float(len(this_set))
    if allSubSets:
        avg /= float(len(allSubSets))

    logger.info("Average Subset size is: %s" % (str(avg)))

    # FINISHED SPLITTING CHECK!!!

    check_count = 0
    for i in split_files:
        check_count = check_count + len(i)

    if check_count != len(inputs) - len(bad_lfns):
        logger.error("SERIOUS SPLITTING ERROR!!!!!")
        raise SplitterError("Files Missing after Splitting!")
    else:
        logger.info("File count checked! Ready to Submit")

    # RETURN THE RESULT

    logger.info("Created %s subsets" % str(len(split_files)))

    #logger.info( "Split Files: %s" % str(split_files) )

    for dataset in split_files:
        yield dataset
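
A minimal consumption sketch, assuming inputs is a dataset of DiracFiles accepted by the splitter (the parameter values are hypothetical):

# one yielded subset per subjob
for subset in OfflineGangaDiracSplitter(inputs, filesPerJob=10, maxFiles=None, ignoremissing=False):
    print(len(subset), [f.lfn for f in subset][:3])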
Example No. 20
def df():
    # positional args: namePattern, localDir, lfn
    f = DiracFile('np', 'ld', 'lfn')
    f.locations = ['location']
    f.guid = 'guid'
    return f
Example No. 21
 def _setup_subjob_dataset(self, dataset):
     return LHCbDataset(files=[DiracFile(lfn=f) for f in dataset])
Example No. 22
def DiracSplitter(inputs, filesPerJob, maxFiles, ignoremissing):
    """
    Generator that yields datasets for DIRAC split jobs
    """
    #logger.debug( "DiracSplitter" )
    #logger.debug( "inputs: %s" % str( inputs ) )
    split_files = []
    i = inputs.__class__()

    if len(inputs.getLFNs()) != len(inputs.files):
        raise SplittingError(
            "Error trying to split dataset using DIRAC backend with non-DiracFile in the inputdata"
        )

    all_files = igroup(inputs.files[:maxFiles],
                       getConfig('DIRAC')['splitFilesChunks'],
                       leftovers=True)

    #logger.debug( "Looping over all_files" )
    #logger.debug( "%s" % str( all_files ) )

    for files in all_files:

        i.files = files

        LFNsToSplit = i.getLFNs()

        if len(LFNsToSplit) > 1:

            result = execute('splitInputData(%s, %d)' %
                             (i.getLFNs(), filesPerJob))

            if not result_ok(result):
                logger.error('DIRAC:: Error splitting files: %s' % str(result))
                raise SplittingError('Error splitting files.')

            split_files += result.get('Value', [])

        else:

            # Append rather than overwrite so earlier chunks are not lost
            split_files.append(LFNsToSplit)

    if len(split_files) == 0:
        raise SplittingError('An unknown error occurred.')

    # FIXME
    # check that all files were available on the grid
    big_list = []
    for l in split_files:
        big_list.extend(l)
    diff = set(inputs.getFileNames()[:maxFiles]).difference(big_list)
    if len(diff) > 0:
        for f in diff:
            logger.warning('Ignored file: %s' % f)
        if not ignoremissing:
            raise SplittingError('Some files not found!')
    ###

    logger.debug("Split Files: %s" % str(split_files))

    for _dataset in split_files:
        dataset = []
        for _lfn in _dataset:
            dataset.append(DiracFile(lfn=_lfn))
        yield dataset
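
For contrast with Example No. 19: DiracSplitter hands the grouping to the DIRAC service via splitInputData, whereas OfflineGangaDiracSplitter builds its subsets client-side from the replica information it gathers.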
Example No. 23
def performSplitting(site_dict, filesPerJob, allChosenSets, wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping):
    """
    This is the main method which loops through the LFNs and creates subsets which are returned a list of list of LFNs

    Args:
        site_dict (dict): This is a dict with LFNs as keys and sites for each LFN as value
        filesPerJob (int): Max files per jobs as defined by splitter
        allChosenSets (dict): A dict with LFNs as keys and a sub-set of sites where each LFN is replicated
        wanted_common_site (int): Number of sites which we want to have in common for each LFN

        uniqueSE (bool): Should we check to make sure sites don't share an SE
        site_to_SE_mapping (dict): Dict which has sites as keys and SE as values
        SE_to_site_mapping (dict): Dict which has sites as values and SE as keys

    Returns:
        allSubSets (list): Return a list of subsets each subset being a list of LFNs
    """

    good_fraction = configDirac['OfflineSplitterFraction']
    iterative_limit = configDirac['OfflineSplitterLimit']

    allSubSets = []

    iterations = 0
    # Loop over all LFNs
    while len(site_dict.keys()) > 0:

        # LFNs left to be used
        # NB: take a copy, as we can't modify the dict while iterating over it
        LFN_instances = list(site_dict.keys())
        # Already used LFN
        chosen_lfns = set()

        for iterating_LFN in LFN_instances:

            # If this has previously been selected lets ignore it and move on
            if iterating_LFN in chosen_lfns:
                continue

            # Use this seed to try and construct a subset
            req_sitez = allChosenSets[iterating_LFN]
            _this_subset = []

            #logger.debug("find common LFN for: " + str(allChosenSets[iterating_LFN]))

            # Construct subset
            # Starting with this seed, populate the subset with LFNs whose
            # replicas cover the required sites
            for this_LFN in LFN_instances:
                if this_LFN in chosen_lfns:
                    continue
                if req_sitez.issubset(site_dict[this_LFN]):
                    if len(_this_subset) >= filesPerJob:
                        break
                    _this_subset.append(this_LFN)

            limit = int(math.floor(float(filesPerJob) * good_fraction))

            #logger.debug("Size limit: %s" % str(limit))

            # If subset is too small throw it away
            if len(_this_subset) < limit:
                #logger.debug("%s < %s" % (str(len(_this_subset)), str(limit)))
                allChosenSets[iterating_LFN] = generate_site_selection(site_dict[iterating_LFN], wanted_common_site, uniqueSE, site_to_SE_mapping, SE_to_site_mapping)
                continue
            else:
                logger.debug("found common LFN for: " + str(allChosenSets[iterating_LFN]))
                logger.debug("%s > %s" % (str(len(_this_subset)), str(limit)))
                # else Dataset was large enough to be considered useful
                logger.debug("Generating Dataset of size: %s" % str(len(_this_subset)))
                ## Construct DiracFile here as we want to keep the above combination
                allSubSets.append([DiracFile(lfn=str(this_LFN)) for this_LFN in _this_subset])

                for lfn in _this_subset:
                    site_dict.pop(lfn)
                    allChosenSets.pop(lfn)
                    chosen_lfns.add(lfn)

        # Let's keep track of how many times we've tried this
        iterations = iterations + 1

        # Can take a while so let's not let threads become un-locked
        import Ganga.Runtime.Repository_runtime
        Ganga.Runtime.Repository_runtime.updateLocksNow()

        # If on the final run we will exit the loop after this, so let's try to clean up
        if iterations >= iterative_limit:

            if good_fraction < 0.5:
                good_fraction = good_fraction * 0.75
                iterations = 0
            elif wanted_common_site > 1:
                logger.debug("Reducing Common Site Size")
                wanted_common_site = wanted_common_site - 1
                iterations = 0
                good_fraction = 0.75
            else:
                good_fraction = good_fraction * 0.75

            logger.debug("good_fraction: %s" % str(good_fraction))

    return allSubSets
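
This is the same subset-building loop as in Example No. 19, here factored into a standalone helper: the site/SE mappings arrive as arguments and the finished subsets (lists of DiracFile objects) are returned to the caller rather than yielded.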
Example No. 24
    def getDataset(self):
        '''Gets the dataset from the bookkeeping for current path, etc.'''
        if not self.path:
            return None
        if self.type not in ['Path', 'RunsByDate', 'Run', 'Production']:
            raise GangaException('Type="%s" is not valid.' % self.type)
        if self.type != 'RunsByDate':
            if self.startDate:
                msg = 'startDate not supported for type="%s".' % self.type
                raise GangaException(msg)
            if self.endDate:
                msg = 'endDate not supported for type="%s".' % self.type
                raise GangaException(msg)
            if self.selection:
                msg = 'selection not supported for type="%s".' % self.type
                raise GangaException(msg)
        cmd = "getDataset('%s','%s','%s','%s','%s','%s')" % (
            self.path, self.dqflag, self.type, self.startDate, self.endDate,
            self.selection)
        from Ganga.GPIDev.Lib.GangaList.GangaList import GangaList
        knownLists = [tuple, list, GangaList]
        if isType(self.dqflag, knownLists):
            cmd = "getDataset('%s',%s,'%s','%s','%s','%s')" % (
                self.path, self.dqflag, self.type, self.startDate,
                self.endDate, self.selection)
        result = get_result(
            cmd,
            'BK query error.',
            credential_requirements=self.credential_requirements)

        logger.debug("Finished Running Command")

        files = []
        value = result
        if 'LFNs' in value:
            files = value['LFNs']
        if not type(files) is list:  # i.e. a dict of LFN:Metadata
            files = files.keys()

        logger.debug("Creating DiracFile objects")

        # NB: GangaObject.__createNewList didn't work here, so build the list directly
        from GangaDirac.Lib.Files.DiracFile import DiracFile

        logger.debug("Creating new list")
        new_files = [DiracFile(lfn=f) for f in files]

        logger.info("Constructing LHCbDataset")

        from GangaLHCb.Lib.LHCbDataset import LHCbDataset
        logger.debug("Imported LHCbDataset")
        ds = LHCbDataset(files=new_files, fromRef=True)

        logger.debug("Returning Dataset")

        return addProxy(ds)