Example #1
def mk_secondary_lumimask(dset):

    dq = das_query("file dataset=%s instance=prod/phys03" % dset,
                   cmd='dasgoclient --dasmaps=./')
    assert 'data' in dq
    fs = [str(f['file'][0]['name']) for f in dq['data']]
    #fs = fs[:2]
    print('N files:', len(fs))

    lumis = []
    dqs = [
        das_query("lumi file=%s instance=prod/phys03" % f,
                  cmd='dasgoclient --dasmaps=./') for f in fs
    ]
    for dq in dqs:
        for data in dq['data']:
            for lumi in data['lumi'][0]['lumi_section_num']:
                lumis.append([data['lumi'][0]['run_number'], lumi])

    jsonList = LumiList(lumis=lumis)
    #print(jsonList)
    output_file = dset.split('/')[2].split('-')[1].split('_')[0]
    #print(output_file)
    jsonList.writeJSON(output_dir + output_file +
                       '_3photons_imgskim_lumi_list.json')
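
A hypothetical driver for the function above, given only as a sketch: das_query, LumiList and output_dir are assumed to be provided by the surrounding module (as in the snippet), and the dataset path is a placeholder.

# Sketch only: the dataset name below is made up; output_dir is assumed to be
# defined at module level, since mk_secondary_lumimask refers to it.
output_dir = './lumimasks/'

if __name__ == '__main__':
    mk_secondary_lumimask('/DoubleEG/someuser-Era2017B_3photons_imgskim-v1/USER')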
Example #2
    def addToDataset(self, catalog, dsetName, files):
        if dsetName in catalog:
            if self.force_:
                catalog[dsetName]["files"] = files
            else:
                self.mergeDataset(catalog[dsetName], {"files": files})
            #---Recover missing info
            if "dset_type" not in catalog[
                    dsetName] or not catalog[dsetName]["dset_type"]:
                dset_type = das_query(
                    "datatype dataset=%s instance=prod/phys03" % dsetName)
                catalog[dsetName][
                    "dset_type"] = dset_type['data'][0]['datatype'][0][
                        'data_type'] if 'data' in dset_type else None
            if ("parent_n_units" not in catalog[dsetName]
                    or catalog[dsetName]["parent_n_units"]
                    == None) and catalog[dsetName]["dset_type"] != None:
                catalog[dsetName]["parent_n_units"] = self.getParentInfo(
                    catalog[dsetName]["dset_type"], dsetName)
        else:
            #---First import
            dset_type = das_query("datatype dataset=%s instance=prod/phys03" %
                                  dsetName)
            dset_type = dset_type['data'][0]['datatype'][0][
                'data_type'] if 'data' in dset_type else None
            parent_info = self.getParentInfo(dset_type,
                                             dsetName) if dset_type else None

            catalog[dsetName] = {
                "files": files,
                "parent_n_units": parent_info,
                "dset_type": dset_type
            }
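
For orientation, a hypothetical first import with this method; importer stands for an instance of the (unnamed) catalog-manager class these methods belong to, and the dataset name and file list are placeholders.

# Sketch only: addToDataset on an empty catalog; a valid grid proxy is needed
# because the method queries DAS for the dataset type.
catalog = {}
importer.addToDataset(catalog, '/SomePrimary/someuser-campaign-v1/USER',
                      files=[{'name': '/store/user/someuser/file_1.root', 'nevents': 1000}])
# The new entry has the shape:
#   {'files': [...], 'parent_n_units': <parent nevents or nlumis>, 'dset_type': 'mc' or 'data'}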
Example #3
 def getParentInfo(self, dset_type, dsetName):
     parent_n_info = 'nlumis' if dset_type=='data' else 'nevents'
     parent_dset = das_query("parent dataset=%s instance=prod/phys03" % dsetName)['data'][0]['parent'][0]['name']
     parent_info = das_query("dataset dataset=%s instance=prod/phys03" % parent_dset)
     try:
         parent_info = parent_info['data'][-1]['dataset'][0][parent_n_info]
     except KeyError:
         try:
             parent_info = das_query("dataset dataset=%s" % parent_dset)
             parent_info = parent_info['data'][-1]['dataset'][0][parent_n_info]
         except KeyError:
             parent_info = None
     
     return parent_info
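
A hypothetical call, assuming importer is the same (unnamed) class instance: for an 'mc' dataset the parent's event count is returned, for a 'data' dataset its lumi-section count, and None if neither DAS instance reports it.

# Sketch only: the dataset path is a placeholder and a valid proxy is assumed.
parent_units = importer.getParentInfo('mc', '/SomePrimary/someuser-campaign-v1/USER')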
Example #4
 def getFilesFomDAS(self,dsetName):
     """
     Read dataset files from DAS.
     @dsetName: dataset name
     """
     ## response = das_query("https://cmsweb.cern.ch","file dataset=%s | grep file.name,file.nevents" % dsetName, 0, 0, False, self.dbs_instance_, ckey=x509(), cert=x509())
     ## response = das_query("https://cmsweb.cern.ch","file dataset=%s instance=%s | grep file.name,file.nevents" % (dsetName,self.dbs_instance_), 0, 0, False, ckey=x509(), cert=x509())
     response = das_query("file dataset=%s instance=%s | grep file.name,file.nevents" % (dsetName,self.dbs_instance_))
     
     files=[]
     for d in response["data"]:
         for jf in d["file"]:
             if "nevents" in jf:
                 files.append({ "name" : jf["name"], "nevents" : jf["nevents"] })
                 break                
     return files
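
A hypothetical usage of the method above, assuming importer is an instance of the catalog class and its dbs_instance_ attribute points at the desired DBS instance (for example prod/phys03).

# Sketch only: the dataset path is a placeholder.
files = importer.getFilesFomDAS('/SomePrimary/someuser-campaign-v1/USER')
total_events = sum(f['nevents'] for f in files)
print('%d files, %d events' % (len(files), total_events))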
Example #6
def file_from_das(dataset):
    if options.use_parent:
        format_ = dataset.split("/")[2]
        # str.replace returns a new string, so the result must be assigned back
        if 'SIM' in format_:
            dataset = dataset.replace("NANOAODSIM", "MINIAODSIM")
        else:
            dataset = dataset.replace("NANOAOD", "MINIAOD")
    response = das_query("file dataset=%s | grep file.name,file.nevents" %
                         (dataset))
    root_file_list = []
    nevents = 0
    for d in response.get("data", []):
        for jf in d["file"]:
            if "nevents" in jf:
                nevents += jf["nevents"]
                root_file_list.append({
                    "name": str(jf["name"]),
                    "nevents": int(jf["nevents"])
                })
                break
    return root_file_list, nevents
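
A hypothetical call of the helper above; options.use_parent and das_query come from the surrounding script, and the dataset path is a placeholder.

# Sketch only: list the files of a (made-up) NanoAOD dataset and count events.
root_files, total_events = file_from_das('/GluGluHToGG_M125_13TeV/RunIISummer20UL18NanoAODv9-v1/NANOAODSIM')
print(len(root_files), 'files,', total_events, 'events')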
Example #7
    def importFromDAS(self, list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: dataset names or wildcards to be used in the DAS query
        """
        #
        ret, out = commands.getstatusoutput("voms-proxy-info -e")
        if ret != 0:
            print "\n\nNo valid voms proxy found. This is needed to query DAS.\nPlease create a valid proxy running the following command:\nvoms-proxy-init -voms cms\n"
            sys.exit(-1)

        catalog = self.readCatalog()

        print "Importing from das %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                # response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % dataset, 0, 0, False, self.dbs_instance_, ckey=x509(), cert=x509())
                # response = das_query("https://cmsweb.cern.ch","dataset dataset=%s instance=%s | grep dataset.name" % (dataset, self.dbs_instance_), 0, 0, False, ckey=x509(), cert=x509())
                # response = das_query("https://cmsweb.cern.ch","dataset dataset=%s instance=%s | grep dataset.name" % (dataset, self.dbs_instance_), 0, 0, False, ckey=x509(), cert=x509())
                response = das_query(
                    "dataset dataset=%s instance=%s | grep dataset.name" %
                    (dataset, self.dbs_instance_))
                ## print response
                for d in response["data"]:
                    ## print d
                    datasets.append(d["dataset"][0]["name"])
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            self.addToDataset(catalog, dsetName, files)

        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"
Example #8
def produceJobs(dataset, totalN, jobN):
    query = das_query("file dataset=%s/NANOAODSIM" % dataset)
    if 'data' not in query:
        raise Exception(
            'Your das query has not worked properly - check your proxy is valid'
        )

    files = [each['file'][0] for each in query['data']]

    if totalN is None:
        totalN = sum([f['nevents'] for f in files])

    jobs = []

    end_request = False
    counter = 0  #count number of events assigned to jobs so far

    for f in files:
        if not end_request:
            fileN = f['nevents']
            for start in range(0, fileN, jobN):
                nEvents_to_end_request = totalN - counter
                nEvents_to_end_file = fileN - start
                possible_job_sizes = [
                    nEvents_to_end_request, nEvents_to_end_file, jobN
                ]

                job_size = min(possible_job_sizes)

                job = {"name": f['name'], "start": start, "jobsize": job_size}
                jobs.append(job)

                counter += job_size
                if counter == totalN:
                    end_request = True
                    break
        else:
            break
    return jobs
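
A hypothetical usage of the splitter above; the dataset path is a placeholder (note that produceJobs appends /NANOAODSIM itself) and a valid proxy is required for the DAS query.

# Sketch only: split the first 1,000,000 events into jobs of 50,000 events.
jobs = produceJobs('/GluGluHToGG_M125_13TeV/RunIISummer20UL17NanoAODv9-v1',
                   totalN=1000000, jobN=50000)
print(len(jobs), 'jobs covering', sum(j['jobsize'] for j in jobs), 'events')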
Example #9
    def importFromDAS(self,list_datasets):
        """
        Import datasets from DAS to the catalog.
        @list_datasets: dataset names or wildcards to be used in the DAS query
        """
        # 
        ret,out = commands.getstatusoutput("voms-proxy-info -e")
        if ret != 0:
            print "\n\nNo valid voms proxy found. This is needed to query DAS.\nPlease create a valid proxy running the following command:\nvoms-proxy-init -voms cms\n"
            sys.exit(-1)
        
        catalog = self.readCatalog()
        
        print "Importing from das %s" % list_datasets
        datasets = []
        for dataset in list_datasets:
            if "*" in dataset:
                # response = das_query("https://cmsweb.cern.ch","dataset dataset=%s | grep dataset.name" % dataset, 0, 0, False, self.dbs_instance_, ckey=x509(), cert=x509())
                # response = das_query("https://cmsweb.cern.ch","dataset dataset=%s instance=%s | grep dataset.name" % (dataset, self.dbs_instance_), 0, 0, False, ckey=x509(), cert=x509())
                # response = das_query("https://cmsweb.cern.ch","dataset dataset=%s instance=%s | grep dataset.name" % (dataset, self.dbs_instance_), 0, 0, False, ckey=x509(), cert=x509())
                response = das_query("dataset dataset=%s instance=%s | grep dataset.name" % (dataset, self.dbs_instance_))
                ## print response
                for d in response["data"]:
                    ## print d
                    datasets.append( d["dataset"][0]["name"] )
            else:
                datasets.append(dataset)

        print "Datasets to import"
        print "\n".join(datasets)
        for dsetName in datasets:
            print "Importing %s" % dsetName
            files = self.getFilesFomDAS(dsetName)
            self.addToDataset(catalog,dsetName,files)
            
        print "Writing catalog"
        self.writeCatalog(catalog)
        print "Done"
                      action="store_true",
                      help="Extract files from .txt file, specified above")
    return parser.parse_args()


(opt, args) = get_options()

# Create list of files
files = []
if opt.doFilesFromList:
    listOfFiles = open(samples[opt.sample], "r")
    for line in listOfFiles:
        files.append(line.rstrip("\n"))
else:
    das_sample = samples[opt.sample]
    for fdata in das_query("file dataset=%s" % das_sample,
                           cmd="dasgoclient --dasmaps=./")['data']:
        files.append("root://cms-xrd-global.cern.ch/%s" %
                     fdata['file'][0]['name'])

scaleWeights = [
    'NNLO_1_1_NLO_1_1', 'NNLO_1_1_NLO_1_2', 'NNLO_1_1_NLO_1_0p5',
    'NNLO_1_1_NLO_2_1', 'NNLO_1_1_NLO_2_2', 'NNLO_1_1_NLO_0p5_1',
    'NNLO_1_1_NLO_0p5_0p5', 'NNLO_2_2_NLO_1_1', 'NNLO_2_2_NLO_1_2',
    'NNLO_2_2_NLO_1_0p5', 'NNLO_2_2_NLO_2_1', 'NNLO_2_2_NLO_2_2',
    'NNLO_2_2_NLO_0p5_1', 'NNLO_2_2_NLO_0p5_0p5', 'NNLO_0p5_0p5_NLO_1_1',
    'NNLO_0p5_0p5_NLO_1_2', 'NNLO_0p5_0p5_NLO_1_0p5', 'NNLO_0p5_0p5_NLO_2_1',
    'NNLO_0p5_0p5_NLO_2_2', 'NNLO_0p5_0p5_NLO_0p5_1',
    'NNLO_0p5_0p5_NLO_0p5_0p5'
]

# Define histograms
Example #11
for gen in opt.generators.split(","):
  for year in opt.years.split(","):
    proc = "%s_%s_%s%s"%(opt.productionMode,year,gen,opt.ext)
    das_sample = samples[proc]
    if das_sample == "NA":
      print " --> (%s,%s,%s): Sample not available"%(opt.productionMode,year,gen)
      # Add column to table
      for pm in pmSplit[opt.productionMode]:
        if gen == "mg": sampleColumns = 'amc@NLO (%s)'%year
        else: sampleColumns = 'POWHEG (%s)'%year
        for b in bins[pm]: sampleColumns += " & -"
        filesout[pm].write("        %s \\\\ \\hline\n"%sampleColumns)
    else:
      # Create list of files from datasample
      files = []
      for fdata in das_query("file dataset=%s"%das_sample, cmd="dasgoclient --dasmaps=./")['data']: files.append("root://cms-xrd-global.cern.ch/%s"%fdata['file'][0]['name'])

      # Create list of dataframes
      frs = []
      _vars = ['genWeight','HTXS_stage_0','HTXS_stage1_1_cat_pTjet30GeV','HTXS_Higgs_pt','HTXS_njets30']
      for fname in files:
        f_upr = upr.open(fname)
        t = f_upr['Events']
        frs.append( t.pandas.df(_vars) )
      fr = pd.concat( frs, sort=False )
      # Add 1.2 bins
      fr['HTXS_stage1_2_cat_pTjet30GeV'] = fr.apply(lambda x: convert_to_1p2(x['HTXS_stage1_1_cat_pTjet30GeV'],x['HTXS_Higgs_pt']), axis=1)
      # If production mode is ggH, add NNLOPS reweighting
      if(opt.doNNLOPS)&(opt.productionMode == 'ggH'):
        if gen == "mg": fr['genWeight_NNLOPS'] = fr.apply(lambda x: NNLOPS_rwgt(NNLOPSWeights_amcatnlo,x['genWeight'],x['HTXS_njets30'],x['HTXS_Higgs_pt']), axis=1)
        elif gen == "powheg": fr['genWeight_NNLOPS'] = fr.apply(lambda x: NNLOPS_rwgt(NNLOPSWeights_powheg,x['genWeight'],x['HTXS_njets30'],x['HTXS_Higgs_pt']), axis=1)
Example #12
    def customize(self,process):
        self.parse()

        # keep useParent and secondaryDataset as exclusive options for the moment
        if self.options.useParentDataset and self.options.secondaryDataset != "":
            raise Exception("useParentDataset cannot be set together with a secondaryDataset")

        isFwlite = False
        hasOutput = False
        hasTFile = False
        sp_unused = ""
        if hasattr(process,"fwliteInput"):
            isFwlite = True
        if not isFwlite:
            hasOutput = hasattr(process,"out")            
            hasTFile = hasattr(process,"TFileService")
        
        if hasOutput and hasTFile:
            tfile = self.outputFile.replace(".root","_histos.root")
        else:
            tfile = self.outputFile
            
        if self.dryRun:
            import sys
            if self.dataset and self.dataset != "":
                name,xsec,totEvents,files,maxEvents,sp_unused = self.dataset
                if self.getMaxJobs:
                    print "maxJobs:%d" % ( min(len(files),self.nJobs) )                    
                if len(files) != 0:
                    if isFwlite:
                        print "hadd:%s" % self.outputFile
                    else:
                        if hasOutput:
                            print "edm:%s" % self.outputFile
                        if hasTFile or self.tfileOut:
                            print "hadd:%s" % tfile
                    ## sys.exit(0)
            else:
                sys.exit(1)
        
                
        files = self.inputFiles
        if self.dataset and self.dataset != "":
            dsetname,xsec,totEvents,files,maxEvents,sp_unused = self.dataset
            if type(xsec) == float or xsec == None:
                print 
                print "Error: cross section not found for dataset %s" % dsetname
                print
                
            putarget = None
            samplepu = None
            if self.puTarget != "":
                putarget = map(float, self.puTarget.split(","))
                
            processId = self.getProcessId(dsetname)
            self.processId = processId

            #----------

            if self.options.processIndex != None:
                self.processIndex = self.options.processIndex
            else:
                # not specified on the command line, try to take it 
                # from the cross section file, otherwise use smallest int32 as default value
                # in order not to confuse it with data (index 0)

                if isinstance(xsec, dict):
                    self.processIndex = xsec.get('itype', -0x7FFFFFFF)
                else:
                    # note that in some cases (process not defined in cross_sections.json ?)
                    # this can still be a float
                    self.processIndex = -0x7FFFFFFF

            #----------

            if isinstance(xsec, dict) and "itype" in xsec:
                for name,obj in process.__dict__.iteritems():
                    if hasattr(obj, "sampleIndex"):
                        obj.sampleIndex = xsec["itype"]

            
            isdata = self.processType == "data"
            if isdata or self.targetLumi > 0. or putarget:
                ## look for analyzers which have lumiWeight as attribute
                for name,obj in process.__dict__.iteritems():
                    
                    if hasattr(obj,"lumiWeight"):
                        if  isdata:
                            obj.lumiWeight = 1.
                        else:
                            wei = xsec["xs"]/float(totEvents)*self.targetLumi
                            wei *= xsec.get("br",1.)
                            wei *= xsec.get("kf",1.)
                            obj.lumiWeight = wei

                    if hasattr(obj,"intLumi"):
                        if isdata:
                            obj.intLumi = 0 # should not be used in final fits.
                            # setting to 0 will cause an error if someone tries to use
                            # it for normalization downstream
                        else:
                            obj.intLumi=self.targetLumi

                    if putarget and not isdata:
                        puObj = None
                        if hasattr(obj,"puReWeight"):
                            puObj = obj
                        elif hasattr(obj,"globalVariables") and hasattr(obj.globalVariables,"puReWeight"):
                            puObj = obj.globalVariables
                        if puObj:
                            if not samplepu:
#                                print dsetname
#                                print self.pu_distribs.keys()
#                                hack2017 = True
#                                found_hack2017 = False
#                                if hack2017:
                                # if self.options.PUyear=="2017":
                                #     print dsetname.split("/")[1]
                                #    # print self.pu_distribs.keys()
                                #     print self.pu_distribs_hack_2017.keys()
                                #    # matches = filter(lambda x: x == dsetname.split("/")[1],self.pu_distribs.keys())
                                #     matches = filter(lambda x: x == dsetname.split("/")[1],self.pu_distribs_hack_2017.keys())
                                #     if len(matches) == 1:
                                #         found_hack2017 = True
                                #         print "FOUND HACK2017 PILEUP DISTRIBUTION WITH KEY:",matches[0]
                                # if not found_hack2017:
                                matches = filter(lambda x: x in dsetname, self.pu_distribs.keys() )
                                print matches
                                if len(matches) > 1:
                                    print "Multiple matches, check if they're all the same"
                                    allsame = True
                                    for i in range(1,len(matches)):
                                        if self.pu_distribs[matches[0]] != self.pu_distribs[matches[i]]:
                                            allsame = False
                                    if allsame:
                                        print "They're all the same so we just take the 0th one:",matches[0]
                                        matches = [matches[0]]
                                    else:
                                        print "Not all the same... so we return to the old behavior and take an exact match, otherwise leave empty..."
                                        matches = filter(lambda x: x == dsetname, matches)
                                if len(matches) != 1:
                                    raise Exception("Could not determine sample pu distribution for reweighting. Possible matches are [%s]. Selected [%s]\n dataset: %s" % 
                                                        ( ",".join(self.pu_distribs.keys()), ",".join(matches), dsetname ) )
                                # if self.options.PUyear=="2017": samplepu = self.pu_distribs_hack_2017[matches[0]]
                                # else :
                            samplepu = self.pu_distribs[matches[0]]
                            puObj.puReWeight = True
                            puObj.puBins = cms.vdouble( map(float, samplepu.probFunctionVariable) )
                            puObj.mcPu   = samplepu.probValue
                            puObj.dataPu = cms.vdouble(putarget)
                            puObj.useTruePu = cms.bool(True)
                    
            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processId"):
                    obj.processId = str(processId)

            for name,obj in process.__dict__.iteritems():
                if hasattr(obj,"processIndex"):
                    obj.processIndex = int(self.processIndex)
                    
            lumisToSkip = None
            if isdata:
                lumisToSkip = self.samplesMan.getLumisToSkip(dsetname)
                process.source.lumisToSkip = lumisToSkip.getVLuminosityBlockRange()

            if isdata and self.lumiMask != "":
                if isFwlite:
                    sys.exit("Lumi mask not supported in FWlite",-1)

                import FWCore.PythonUtilities.LumiList as LumiList
                target = LumiList.LumiList(filename = self.lumiMask)
                if lumisToSkip: 
                    target = target.__sub__(lumisToSkip)                    
                process.source.lumisToProcess = target.getVLuminosityBlockRange()

                print process.source.lumisToProcess

        flist = []
        sflist = []

        # get the runs and lumis contained in each file of the secondary dataset
        if self.options.secondaryDataset:
            secondary_files = [fdata['file'][0]['name'] for fdata in das_query("file dataset=%s instance=prod/phys03" % self.options.secondaryDataset, 
                                                                               cmd='dasgoclient --dasmaps=./')['data']]
            runs_and_lumis = {}
            for s in secondary_files:
                runs_and_lumis[str(s)] = {data['lumi'][0]['run_number'] : data['lumi'][0]['lumi_section_num']
                                          for data in das_query("lumi file=%s instance=prod/phys03" % s, cmd='dasgoclient --dasmaps=./')['data']}

        for f in files:
            if len(f.split(":",1))>1:
                flist.append(str(f))
            else:
                flist.append(str("%s%s" % (self.filePrepend,f)))
            # keep useParent and secondaryDataset as exclusive options for the moment
            if self.options.useParentDataset:
                parent_files = das_query("parent file=%s instance=prod/phys03" % f, cmd='dasgoclient --dasmaps=./')['data']
                for parent_f in parent_files:
                    parent_f_name = str(parent_f['parent'][0]['name'])
                    sflist.append('root://cms-xrd-global.cern.ch/'+parent_f_name if 'root://' not in parent_f_name else parent_f_name)
            elif self.options.secondaryDataset != "":
                # match primary file to the corresponding secondary file(s)
                f_runs_and_lumis = {data['lumi'][0]['run_number'] : data['lumi'][0]['lumi_section_num']
                                    for data in das_query("lumi file=%s instance=prod/phys03" % f, cmd='dasgoclient --dasmaps=./')['data']}
                for s_name, s_runs_and_lumis in runs_and_lumis.items():
                    matched_runs = set(f_runs_and_lumis.keys()).intersection(s_runs_and_lumis.keys())
                    for run in matched_runs:
                        if any(lumi in f_runs_and_lumis[run] for lumi in s_runs_and_lumis[run]):
                            sflist.append(s_name)

        ## mitigate server glitches by copying the input files (microAOD) on the worker node
        if self.copyInputMicroAOD and not self.dryRun:
            commands.getstatusoutput('mkdir -p input_files/')
            for i,f in enumerate(flist):
                status, out = commands.getstatusoutput('xrdcp %s ./input_files/'%f)
                print(out)
                flocal = 'file:./input_files/'+f.split('/')[-1]
                flist[i] = flocal

        if len(flist) > 0:
            ## fwlite
            if isFwlite:
                ## process.fwliteInput.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.fwliteInput.fileNames = flist
            ## full framework
            else:
                ## process.source.fileNames.extend([ str("%s%s" % (self.filePrepend,f)) for f in  files])
                process.source.fileNames = flist
                if len(sflist) > 0:
                    process.source.secondaryFileNames = cms.untracked.vstring(sflist)
 
        ## fwlite
        if isFwlite:
            process.fwliteInput.maxEvents = self.options.maxEvents
            process.fwliteOutput.fileName = self.outputFile
        ## full framework
        else:
            process.maxEvents.input = self.options.maxEvents
            
            if hasOutput:
                process.out.fileName = self.outputFile

            if hasTFile:
                process.TFileService.fileName = tfile
    
        if self.tfileOut:
            if hasTFile:
                print "Could not run with both TFileService and custom tfileOut"
                sys.exit(-1)
            name,attr = self.tfileOut
            setattr( getattr( process, name ), attr, tfile )
            

        if self.dumpPython != "":
            from gzip import open
            pyout = open("%s.gz" % self.dumpPython,"w+")
            pyout.write( process.dumpPython() )
            pyout.close()
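
To make the secondary-dataset matching above easier to follow, here is a self-contained illustration with made-up run and lumi numbers: a secondary file is attached whenever it shares at least one lumi section with the primary file in a common run.

# Standalone sketch of the run/lumi overlap logic used in customize() above.
f_runs_and_lumis = {316239: [10, 11, 12]}                # primary file
runs_and_lumis = {
    'secondary_A.root': {316239: [12, 13]},              # shares lumi 12 -> matched
    'secondary_B.root': {316240: [1, 2]},                # different run  -> skipped
}
sflist = []
for s_name, s_runs_and_lumis in runs_and_lumis.items():
    matched_runs = set(f_runs_and_lumis).intersection(s_runs_and_lumis)
    for run in matched_runs:
        if any(lumi in f_runs_and_lumis[run] for lumi in s_runs_and_lumis[run]):
            sflist.append(s_name)
print(sflist)  # ['secondary_A.root']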