Example 1
    @classmethod
    def fromDAS(cls, name, dataset, instance = 'global', prefix='root://cms-xrd-global.cern.ch/', texName = None, maxN = None, dbFile=None, overwrite=False, skipCheck = False):
        ''' Make sample from DAS. 
        '''
        # https://github.com/CERN-PH-CMG/cmg-cmssw/blob/0f1d3bf62e7ec91c2e249af1555644b7f414ab50/CMGTools/Production/python/dataset.py#L437

        maxN = maxN if maxN is not None and maxN>0 else None
        limit = maxN if maxN else 0
        DASname = dataset.rstrip('/')

        n_cache_files = 0 
        # Don't use the cache on partial queries
        if dbFile is not None and ( maxN<0 or maxN is None ):
            cache = Database(dbFile, "fileCache", ["name"]) 
            n_cache_files = cache.contains({'name':name})
        else:
            cache = None

        if n_cache_files and not overwrite:
            files = [ f["value"] for f in cache.getDicts({'name':name}) ]
            logger.info('Found sample %s in cache %s, return %i files.', name, dbFile, len(files))
        else:
#            def _dasPopen(dbs):
#                if 'LSB_JOBID' in os.environ:
#                    raise RuntimeError, "Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs
#                if 'X509_USER_PROXY' in os.environ:
#                    dbs += " --key {0} --cert {0}".format(os.environ['X509_USER_PROXY'])
#                logger.info('DAS query\t: %s',  dbs)
#                return os.popen(dbs)
#
#            sampleName = dataset.rstrip('/')
#            query, qwhat = sampleName, "dataset"
#            if "#" in sampleName: qwhat = "block"
#
#            dbs='das_client --query="file %s=%s instance=prod/%s" --limit %i'%(qwhat,query, instance, limit)
#            dbsOut = _dasPopen(dbs).readlines()
            
            if overwrite:
                cache.removeObjects({"name":name})

            def _dasPopen(dbs):
                if 'LSB_JOBID' in os.environ:
                    raise RuntimeError("Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs)
                logger.info('DAS query\t: %s',  dbs)
                return os.popen(dbs)

            query, qwhat = DASname, "dataset"
            if "#" in DASname: qwhat = "block"

            dbs='dasgoclient -query="file %s=%s instance=prod/%s" --limit %i'%(qwhat,query, instance, limit)
            dbsOut = _dasPopen(dbs).readlines()
            
            files = []
            for line in dbsOut:
                if line.startswith('/store/'):
                    line = line.rstrip()
                    filename = line
                    try:
                        if skipCheck or helpers.checkRootFile(prefix+filename):
                            files.append(filename)
                    except IOError:
                        logger.warning( "IOError for file %s. Skipping.", filename )

                    if cache is not None:
                        cache.add({"name":name}, filename, save=True)

        if limit>0: files=files[:limit]

        result = cls(name, files=[prefix+file for file in files], texName = texName)
        result.DASname = DASname
        return result
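
A minimal usage sketch for this classmethod (not part of the source): the enclosing class, its import path and the dataset below are assumptions for illustration; the call itself only needs dasgoclient on the PATH and, unless skipCheck is set, xrootd access for the per-file check.

# Hypothetical usage sketch -- class name, import path and dataset are placeholders.
from RootTools.core.Sample import Sample   # assumed import location of the enclosing class

sample = Sample.fromDAS(
    "SomeSample",                                            # arbitrary sample name, used as the cache key
    "/SomePrimaryDataset/SomeCampaign-NanoAOD/NANOAODSIM",   # illustrative DAS dataset path
    dbFile="sample_cache.sql",   # cache the file list so the DAS query runs only once
    skipCheck=True,              # skip opening every file with ROOT
)
print(sample.DASname)            # set by the classmethod above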
Example 2
    @classmethod
    def nanoAODfromDAS(cls, name, DASname, instance = 'global', redirector='root://hephyse.oeaw.ac.at/', dbFile=None, overwrite=False, treeName = "Events", maxN = None, \
            selectionString = None, weightString = None, xSection=-1,
            isData = False, color = 0, texName = None, multithreading=True, genWeight='genWeight', json=None):
        '''
        Get a nanoAOD sample from DAS; the file list and normalization can be cached locally (e.g. in an AFS-resident dbFile).
        '''
        from multiprocessing import Pool
        json_arg = json  # keep a reference to the `json` argument; the `import json` below shadows it
        import json
        maxN = maxN if maxN is not None and maxN>0 else None
        limit = maxN if maxN else 0

        n_cache_files = 0 
        # Don't use the cache on partial queries
        if dbFile is not None and ( maxN<0 or maxN is None ):
            cache = Database(dbFile, "fileCache", ["name", "DAS", "normalization"]) 
            n_cache_files = cache.contains({'name':name, 'DAS':DASname})
        else:
            cache = None


        if n_cache_files and not overwrite:
            files = [ f["value"] for f in cache.getDicts({'name':name, 'DAS':DASname}) ]
            normalization = cache.getDicts({'name':name, 'DAS':DASname})[0]["normalization"]
            
            logger.info('Found sample %s in cache %s, return %i files.', name, dbFile, len(files))

        else:
            if overwrite:
                cache.removeObjects({"name":name, 'DAS':DASname})

            def _dasPopen(dbs):
                if 'LSB_JOBID' in os.environ:
                    raise RuntimeError("Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs)
                logger.info('DAS query\t: %s',  dbs)
                return os.popen(dbs)

            sampleName = DASname.rstrip('/')
            query, qwhat = sampleName, "dataset"
            if "#" in sampleName: qwhat = "block"

            dbs='dasgoclient -query="file %s=%s instance=prod/%s" --limit %i'%(qwhat,query, instance, limit)
            dbsOut = _dasPopen(dbs).readlines()
            
            files = []
            for line in dbsOut:
                if line.startswith('/store/'):
                    line = line.rstrip()
                    filename = redirector+'/'+line
                    files.append(filename)
            
            if DASname.endswith('SIM') or not 'Run20' in DASname:
                # need to read the proper normalization for MC
                logger.info("Reading normalization. This is slow, so grab a coffee.")
                tmp_sample = cls(name=name, files=files, treeName = treeName, selectionString = selectionString, weightString = weightString,
                    isData = isData, color=color, texName = texName, xSection = xSection, normalization=1)
                normalization = tmp_sample.getYieldFromDraw('(1)', genWeight)['val']
                logger.info("Got normalization %s", normalization)
            else:
                # for data, we can just use the number of events, although no normalization is needed anyway.
                dbs='dasgoclient -query="summary %s=%s instance=prod/%s" --format=json'%(qwhat,query, instance)
                jdata = json.load(_dasPopen(dbs))['data'][0]['summary'][0]
                normalization = int(jdata['nevents'])

        if overwrite or n_cache_files<1:
            for f in files:
                if cache is not None:
                    cache.add({"name":name, 'DAS':DASname, 'normalization':str(normalization)}, f, save=True)
            
        if limit>0: files=files[:limit]
        sample = cls(name=name, files=files, treeName = treeName, selectionString = selectionString, weightString = weightString,
            isData = isData, color=color, texName = texName, xSection = xSection, normalization=float(normalization))
        sample.DAS = DASname
        sample.json = json_arg
        return sample
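
A sketch of how this variant might be called (again with assumed class name, import path and dataset). For an MC dataset the first call triggers the slow normalization draw; a second call with the same dbFile is served from the cache.

# Hypothetical usage sketch -- names and dataset are placeholders.
from RootTools.core.Sample import Sample   # assumed import location of the enclosing class

mc = Sample.nanoAODfromDAS(
    "SomeMCSample",
    "/SomePrimaryDataset/SomeCampaign-NanoAOD/NANOAODSIM",   # illustrative MC dataset (ends in 'SIM')
    redirector="root://hephyse.oeaw.ac.at/",
    dbFile="nanoAOD_cache.sql",   # file list and normalization are cached here
    xSection=1.0,                 # placeholder cross section
)
print(mc.DAS)                     # set by the classmethod above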
Example 3
    @classmethod
    def fromDPMDirectory(cls,
                         name,
                         directory,
                         prefix='root://hephyse.oeaw.ac.at/',
                         texName=None,
                         maxN=None,
                         dbFile=None,
                         overwrite=False,
                         skipCheck=False):
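        '''
        Make a sample from the files found in a directory on a DPM/xrootd storage element
        (the directory is listed with `xrdfs <prefix> ls <directory>`).
        '''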

        maxN = maxN if maxN is not None and maxN > 0 else None
        limit = maxN if maxN else 0

        n_cache_files = 0
        # Don't use the cache on partial queries
        if dbFile is not None and (maxN < 0 or maxN is None):
            cache = Database(dbFile, "fileCache", ["name"])
            n_cache_files = cache.contains({'name': name})
        else:
            cache = None

        if n_cache_files and not overwrite:
            files = [f["value"] for f in cache.getDicts({'name': name})]
            logger.info('Found sample %s in cache %s, return %i files.', name,
                        dbFile, len(files))
        else:
            if overwrite:
                cache.removeObjects({"name": name})

            def _dasPopen(dbs):
                if 'LSB_JOBID' in os.environ:
                    raise RuntimeError("Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs)
                logger.info('DAS query\t: %s', dbs)
                return os.popen(dbs)

            files = []
            dbs = 'xrdfs %s ls %s' % (prefix, directory)
            dbsOut = _dasPopen(dbs).readlines()

            for line in dbsOut:
                if line.startswith('/store/'):
                    line = line.rstrip()
                    filename = line
                    try:
                        if skipCheck or helpers.checkRootFile(prefix +
                                                              filename):
                            files.append(filename)
                    except IOError:
                        logger.warning("IOError for file %s. Skipping.",
                                       filename)

                    if cache is not None:
                        cache.add({"name": name}, filename, save=True)

        if limit > 0: files = files[:limit]

        result = cls(name,
                     files=[prefix + file for file in files],
                     texName=texName)
        result.DASname = prefix + directory.rstrip("/")
        return result
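
For completeness, a sketch of the directory-based variant; the storage directory is a placeholder and the class name and import path are assumed.

# Hypothetical usage sketch -- names and paths are placeholders.
from RootTools.core.Sample import Sample   # assumed import location of the enclosing class

sample = Sample.fromDPMDirectory(
    "myPrivateNanoAOD",
    "/store/user/someuser/nanoAOD/2018/",   # placeholder directory, listed via `xrdfs ... ls`
    prefix="root://hephyse.oeaw.ac.at/",
    skipCheck=True,                         # don't open every file with ROOT
)
print(sample.DASname)                       # here: prefix + directory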
Example 4
    @classmethod
    def nanoAODfromDAS(cls, name, DASname, instance = 'global', redirector='root://hephyse.oeaw.ac.at/', dbFile=None, overwrite=False, treeName = "Events", maxN = None, \
            selectionString = None, weightString = None, xSection=-1,
            isData = False, color = 0, texName = None, multithreading=True, genWeight='genWeight', json=None, lazy=False):
        '''
        Get a nanoAOD sample from DAS; the file list, normalization and event count can be cached locally (e.g. in an AFS-resident dbFile).
        overwrite = True    : the old cache entry is overwritten, no matter what it contains.
        overwrite = 'update': the file list and normalization are checked, and the old entry is only overwritten if they potentially changed.
        '''

        from RootTools.fwlite.Database import Database
        json_arg = json  # keep a reference to the `json` argument; the `import json` below shadows it
        import json

        maxN = maxN if maxN is not None and maxN > 0 else None
        limit = maxN if maxN else 0

        n_cache_files = 0
        # Don't use the cache on partial queries
        if dbFile is not None and (maxN < 0 or maxN is None):
            cache = Database(dbFile, "fileCache",
                             ["name", "DAS", "normalization", "nEvents"])
            n_cache_files = cache.contains({'name': name, 'DAS': DASname})
        else:
            cache = None

        # first check if there are already files in the cache
        normalizationFromCache = 0.
        if n_cache_files:
            cache_dicts = cache.getDicts({'name': name, 'DAS': DASname})
            filesFromCache = [f["value"] for f in cache_dicts]
            normalizationFromCache = cache_dicts[0]["normalization"]
            nEventsFromCache = cache_dicts[0]["nEvents"]
        else:
            filesFromCache = []

        # if we don't want to overwrite, and there's a filelist in the cache we're already done
        if n_cache_files and not overwrite:
            files = filesFromCache
            normalization = normalizationFromCache
            nEvents = nEventsFromCache

            logger.info('Found sample %s in cache %s, return %i files.', name,
                        dbFile, len(files))

        else:
            # only entered if overwrite is not set or sample not in the cache yet
            def _dasPopen(dbs):
                if 'LSB_JOBID' in os.environ:
                    raise RuntimeError("Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs)
                logger.info('DAS query\t: %s', dbs)
                return os.popen(dbs)

            sampleName = DASname.rstrip('/')
            query, qwhat = sampleName, "dataset"
            if "#" in sampleName: qwhat = "block"

            dbs = 'dasgoclient -query="file %s=%s instance=prod/%s" --limit %i' % (
                qwhat, query, instance, limit)
            dbsOut = _dasPopen(dbs).readlines()

            files = []
            for line in dbsOut:
                if line.startswith('/store/'):
                    #line = line.rstrip()
                    #filename = redirector+'/'+line
                    files.append(line.rstrip())

            if (sorted(files) == sorted(filesFromCache)) and float(
                    normalizationFromCache) > 0.0 and overwrite == 'update':
                # if the files didn't change we don't need to read the normalization again (slowest part!). If the norm was 0 previously, also get it again.
                logger.info("File list for %s didn't change. Skipping.", name)
                normalization = normalizationFromCache
                nEvents = nEventsFromCache
                logger.info('Sample %s from cache %s returned %i files.', name,
                            dbFile, len(files))

            else:
                if overwrite:
                    # remove old entry
                    cache.removeObjects({"name": name, 'DAS': DASname})
                    logger.info("Removed old DB entry.")

                if instance == 'global':
                    filesOnLocalT2 = True  # ignore locality check

                    ## check if dataset is available in local site, otherwise don't read a normalization
                    #dbs='dasgoclient -query="site %s=%s instance=prod/%s" --format=json'%(qwhat,query, instance)
                    #jdata = json.load(_dasPopen(dbs))
                    #
                    #filesOnLocalT2 = False
                    #for d in jdata['data']:
                    #    if d['site'][0]['name'] == localSite and d['site'][0].has_key('replica_fraction'):
                    #        fraction = d['site'][0]['replica_fraction']
                    #        if float(str(fraction).replace('%','')) < 100.:
                    #            filesOnLocalT2 = False
                    #            break
                    #        else:
                    #            filesOnLocalT2 = True
                else:
                    # if we produced the samples ourselves we don't need to check this
                    filesOnLocalT2 = True

                #if filesOnLocalT2:
                #    logger.info("Files are available at %s", localSite)

                if DASname.endswith('SIM') or not 'Run20' in DASname:
                    # need to read the proper normalization for MC
                    logger.info(
                        "Reading normalization. This is slow, so grab a coffee."
                    )
                    tmp_sample = cls(name=name,
                                     files=[redirector + f for f in files],
                                     treeName=treeName,
                                     selectionString=selectionString,
                                     weightString=weightString,
                                     isData=isData,
                                     color=color,
                                     texName=texName,
                                     xSection=xSection,
                                     normalization=1)
                    normalization = tmp_sample.getYieldFromDraw(
                        '(1)', genWeight)['val']
                    logger.info("Got normalization %s", normalization)
                    # still getting number of events
                    dbs = 'dasgoclient -query="summary %s=%s instance=prod/%s" --format=json' % (
                        qwhat, query, instance)
                    jdata = json.load(_dasPopen(dbs))['data'][0]['summary'][0]
                    nEvents = int(jdata['nevents'])
                else:
                    # for data, we can just use the number of events, although no normalization is needed anyway.
                    dbs = 'dasgoclient -query="summary %s=%s instance=prod/%s" --format=json' % (
                        qwhat, query, instance)
                    jdata = json.load(_dasPopen(dbs))['data'][0]['summary'][0]
                    normalization = int(jdata['nevents'])
                    nEvents = normalization

                for f in files:
                    if cache is not None:
                        cache.add(
                            {
                                "name": name,
                                'DAS': DASname,
                                'normalization': str(normalization),
                                'nEvents': nEvents
                            },
                            f,
                            save=True)

                logger.info('Found sample %s in cache %s, return %i files.',
                            name, dbFile, len(files))

        if limit > 0: files = files[:limit]
        sample = cls(name=name,
                     files=[redirector + '/' + f for f in files],
                     treeName=treeName,
                     selectionString=selectionString,
                     weightString=weightString,
                     isData=isData,
                     color=color,
                     texName=texName,
                     normalization=float(normalization),
                     xSection=xSection)
        sample.DAS = DASname
        sample.json = json_arg
        sample.nEvents = int(nEvents)
        return sample
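
The overwrite='update' mode described in the docstring can be exercised as in the sketch below (class name, import path and dataset are assumptions): the DAS file query is re-run, but the slow normalization draw is skipped whenever the cached file list is unchanged.

# Hypothetical usage sketch for the 'update' mode -- names and dataset are placeholders.
from RootTools.core.Sample import Sample   # assumed import location of the enclosing class

sample = Sample.nanoAODfromDAS(
    "SomeMCSample",
    "/SomePrimaryDataset/SomeCampaign-NanoAOD/NANOAODSIM",   # illustrative MC dataset
    dbFile="nanoAOD_cache.sql",
    overwrite="update",           # refresh the cache entry only if the DAS file list changed
)
print(sample.nEvents)             # this variant also stores the DAS event count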
Example 5
    @classmethod
    def nanoAODfromDAS(cls, name, DASname, instance = 'global', redirector='root://hephyse.oeaw.ac.at/', dbFile=None, overwrite=False, treeName = "Events", maxN = None, \
            selectionString = None, weightString = None, xSection=-1,
            isData = False, color = 0, texName = None, multithreading=True, genWeight='genWeight', json=None, localSite='T2_AT_Vienna'):
        '''
        Get a nanoAOD sample from DAS; the file list and normalization can be cached locally (e.g. in an AFS-resident dbFile).
        If overwrite is True, the old cache entry is overwritten, no matter what it contains.
        If overwrite == 'update', the file list and normalization are checked, and the old entry is only overwritten if they potentially changed.
        '''
        from multiprocessing import Pool
        from RootTools.fwlite.Database import Database
        json_arg = json  # keep a reference to the `json` argument; the `import json` below shadows it
        import json

        maxN = maxN if maxN is not None and maxN>0 else None
        limit = maxN if maxN else 0

        n_cache_files = 0 
        # Don't use the cache on partial queries
        if dbFile is not None and ( maxN<0 or maxN is None ):
            cache = Database(dbFile, "fileCache", ["name", "DAS", "normalization"]) 
            n_cache_files = cache.contains({'name':name, 'DAS':DASname})
        else:
            cache = None

        # first check if there are already files in the cache
        normalizationFromCache = 0.
        if n_cache_files:
            filesFromCache          = [ f["value"] for f in cache.getDicts({'name':name, 'DAS':DASname}) ]
            normalizationFromCache  = cache.getDicts({'name':name, 'DAS':DASname})[0]["normalization"]
        else:
            filesFromCache = []

        # if we don't want to overwrite, and there's a filelist in the cache we're already done
        if n_cache_files and not overwrite:
            files           = filesFromCache
            normalization   = normalizationFromCache
            
            logger.info('Found sample %s in cache %s, return %i files.', name, dbFile, len(files))

        else:
            # only entered if overwrite is not set or sample not in the cache yet
            def _dasPopen(dbs):
                if 'LSB_JOBID' in os.environ:
                    raise RuntimeError("Trying to do a DAS query while in a LXBatch job (env variable LSB_JOBID defined)\nquery was: %s" % dbs)
                logger.info('DAS query\t: %s',  dbs)
                return os.popen(dbs)

            sampleName = DASname.rstrip('/')
            query, qwhat = sampleName, "dataset"
            if "#" in sampleName: qwhat = "block"

            dbs='dasgoclient -query="file %s=%s instance=prod/%s" --limit %i'%(qwhat,query, instance, limit)
            dbsOut = _dasPopen(dbs).readlines()
            
            files = []
            for line in dbsOut:
                if line.startswith('/store/'):
                    #line = line.rstrip()
                    #filename = redirector+'/'+line
                    files.append(line.rstrip())
            
            if (sorted(files) == sorted(filesFromCache)) and float(normalizationFromCache) > 0.0 and overwrite=='update':
                # if the files didn't change we don't need to read the normalization again (slowest part!). If the norm was 0 previously, also get it again.
                logger.info("File list for %s didn't change. Skipping.", name)
                normalization = normalizationFromCache
                logger.info('Sample %s from cache %s returned %i files.', name, dbFile, len(files))

            else:
                if overwrite:
                    # remove old entry
                    cache.removeObjects({"name":name, 'DAS':DASname})
                    logger.info("Removed old DB entry.")

                if instance == 'global':
                    # check if dataset is available in local site, otherwise don't read a normalization
                    dbs='dasgoclient -query="site %s=%s instance=prod/%s" --format=json'%(qwhat,query, instance)
                    jdata = json.load(_dasPopen(dbs))
                    
                    filesOnLocalT2 = False
                    for d in jdata['data']:
                        if d['site'][0]['name'] == localSite and 'replica_fraction' in d['site'][0]:
                            fraction = d['site'][0]['replica_fraction']
                            if float(str(fraction).replace('%','')) < 100.:
                                filesOnLocalT2 = False
                                break
                            else:
                                filesOnLocalT2 = True
                else:
                    # if we produced the samples ourselves we don't need to check this
                    filesOnLocalT2 = True
                
                if filesOnLocalT2:
                    logger.info("Files are available at %s", localSite)

                if DASname.endswith('SIM') or not 'Run20' in DASname:
                    # need to read the proper normalization for MC
                    logger.info("Reading normalization. This is slow, so grab a coffee.")
                    tmp_sample = cls(name=name, files=[ redirector + f for f in files], treeName = treeName, selectionString = selectionString, weightString = weightString,
                        isData = isData, color=color, texName = texName, xSection = xSection, normalization=1)
                    normalization = tmp_sample.getYieldFromDraw('(1)', genWeight)['val']
                    logger.info("Got normalization %s", normalization)
                else:
                    # for data, we can just use the number of events, although no normalization is needed anyway.
                    dbs='dasgoclient -query="summary %s=%s instance=prod/%s" --format=json'%(qwhat,query, instance)
                    jdata = json.load(_dasPopen(dbs))['data'][0]['summary'][0]
                    normalization = int(jdata['nevents'])

                for f in files:
                    if cache is not None:
                        cache.add({"name":name, 'DAS':DASname, 'normalization':str(normalization)}, f, save=True)

                logger.info('Found sample %s in cache %s, return %i files.', name, dbFile, len(files))

            
        if limit>0: files=files[:limit]
        sample = cls(name=name, files=[ redirector+'/'+f for f in files], treeName = treeName, selectionString = selectionString, weightString = weightString,
            isData = isData, color=color, texName = texName, normalization=float(normalization), xSection = xSection)
        sample.DAS = DASname
        sample.json = json_arg
        return sample
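
This last variant additionally checks, for instance='global', whether the dataset is fully replicated at localSite before reading the normalization; for a privately produced dataset on a non-global DBS instance that check is skipped, as in this sketch (names and dataset are assumptions).

# Hypothetical usage sketch for a private production -- names and dataset are placeholders.
from RootTools.core.Sample import Sample   # assumed import location of the enclosing class

private = Sample.nanoAODfromDAS(
    "myPrivateSignal",
    "/MySignal/someuser-MySignal_NanoAOD-0123456789abcdef/USER",  # placeholder phys03 dataset
    instance="phys03",            # non-global instance: the local-site availability check is skipped
    dbFile="nanoAOD_cache.sql",
)
print(private.DAS)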