def __init__(self, treeName, outputDir, executable_addMEM, samples, era, debug, running_method,
             max_files_per_job, mem_integrations_per_job, max_mem_integrations, num_parallel_jobs):
    self.treeName = treeName
    self.outputDir = outputDir
    self.executable_addMEM = executable_addMEM
    self.channel = "2lss_1tau"
    self.mem_integrations_per_job = mem_integrations_per_job
    self.max_files_per_job = max_files_per_job
    self.max_mem_integrations = max_mem_integrations
    self.samples = samples
    self.era = era
    self.debug = debug
    assert running_method.lower() in ["sbatch", "makefile"], "Invalid running method: %s" % running_method
    self.running_method = running_method
    self.is_sbatch = False
    self.is_makefile = False
    if self.running_method.lower() == "sbatch":
        self.is_sbatch = True
    else:
        self.is_makefile = True
    self.makefile = os.path.join(self.outputDir, "Makefile_%s" % self.channel)
    self.num_parallel_jobs = num_parallel_jobs
    self.workingDir = os.getcwd()
    logging.info("Working directory is: %s" % self.workingDir)
    create_if_not_exists(self.outputDir)
    self.stdout_file = codecs.open(os.path.join(self.outputDir, "stdout_%s.log" % self.channel), 'w', 'utf-8')
    self.stderr_file = codecs.open(os.path.join(self.outputDir, "stderr_%s.log" % self.channel), 'w', 'utf-8')
    self.dirs = {}
    self.cfgFile_addMEM_original = os.path.join(self.workingDir, "addMEM_2lss_1tau_cfg.py")
    self.cfgFiles_addMEM_modified = {}
    self.logFiles_addMEM = {}
    self.sbatchFile_addMEM = os.path.join(self.outputDir, "sbatch_addMEM_%s.py" % self.channel)
    self.inputFiles = {}
    self.outputFiles = {}
    self.hadd_records = {}
    self.filesToClean = []
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or \
           sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]:
            continue
        process_name = sample_info["process_name_specific"]
        key_dir = getKey(sample_name)
        for dir_type in [DKEY_CFGS, DKEY_NTUPLES, DKEY_FINAL_NTUPLES, DKEY_LOGS, DKEY_HADD]:
            initDict(self.dirs, [key_dir, dir_type])
            self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, process_name)
    self.cvmfs_error_log = {}
def get_scratch_dir(self):
    scratch_dir = "/scratch/%s" % getpass.getuser()
    if not os.path.exists(scratch_dir):
        logging.info("Directory '%s' does not yet exist, creating it !!" % scratch_dir)
        # command_create_scratchDir is assumed to be a module-level constant holding the
        # site-specific command that creates /scratch/<user>
        run_cmd(command_create_scratchDir)
    scratch_dir = os.path.join(scratch_dir, "tthAnalysis" + "_" + date.today().isoformat())
    create_if_not_exists(scratch_dir)
    return scratch_dir
def get_scratch_dir(self):
    scratch_dir = "/scratch/%s" % getpass.getuser()
    if not os.path.exists(scratch_dir):
        logging.info("Directory '%s' does not yet exist, creating it !!" % scratch_dir)
        run_cmd(command_create_scratchDir)
    scratch_dir = os.path.join(
        scratch_dir,
        "%s_%s" % (self.analysisName, datetime.date.today().isoformat()),
    )
    create_if_not_exists(scratch_dir)
    return scratch_dir
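# A minimal usage sketch (not part of the classes above) of the scratch-directory
# naming scheme that get_scratch_dir() implements: /scratch/<user>/<analysis>_<date>.
# The analysis name "tthAnalysis" is just an illustrative default here.
import datetime
import getpass
import os

def example_scratch_path(analysis_name = "tthAnalysis"):
    # e.g. "/scratch/jdoe/tthAnalysis_2017-06-01"
    return os.path.join(
        "/scratch/%s" % getpass.getuser(),
        "%s_%s" % (analysis_name, datetime.date.today().isoformat()),
    )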
def create(self):
    create_if_not_exists(self.hadd_log_dir_path)
    if self.running_method.lower() == 'sbatch':
        create_if_not_exists(self.hadd_script_dir_path)
        createScript_sbatch_hadd(
            sbatch_script_file_name = self.hadd_script_path,
            input_file_names        = list(self.channel_info.keys()),
            output_file_name        = self.final_output_file,
            script_file_name        = self.hadd_script_path.replace('.py', '.sh'),
            log_file_name           = self.hadd_log_executable_path, # the basename will be replaced anyway
            working_dir             = None,
            waitForJobs             = True,
            auxDirName              = '',
            pool_id                 = uuid.uuid4(),
            verbose                 = False,
            max_input_files_per_job = len(self.channel_info),
            dry_run                 = self.dry_run,
            use_home                = self.use_home,
            min_file_size           = -1,
        )
        logging.info("Generated hadd config file: %s" % self.hadd_script_path)
        self.hadd_script_path = 'python %s' % self.hadd_script_path
        additional_cmds = ''
    else:
        self.hadd_script_path = 'hadd -f {} {}'.format(
            os.path.basename(self.final_output_file),
            ' '.join(list(self.channel_info.keys()))
        )
        additional_cmds = 'mv {} {}'.format(
            os.path.basename(self.final_output_file),
            self.final_output_file
        )
    with open(self.makefile_path, 'w') as makefile:
        hadd_script_cmd = '{}{}'.format(
            'sleep 60\n\t' if self.running_method.lower() == 'makefile' else '',
            self.hadd_script_path
        )
        makeFileContents = jinja2.Template(makeFileTemplate).render(
            output_file       = self.final_output_file,
            channel_info      = self.channel_info,
            hadd_script       = hadd_script_cmd,
            hadd_wrapper_log  = self.hadd_log_wrapper_path,
            additional_cmds   = additional_cmds,
            validate_channels = ' '.join(self.channels_to_validate),
            output_dir        = self.output_dir,
        )
        makefile.write(makeFileContents)
    logging.info("Created the makefile: %s" % self.makefile_path)
def create(self): """Creates all necessary config files and runs the Ntuple production -- either locally or on the batch system """ for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_prodNtuple, process_name)) inputFileList = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug) for jobId in inputFileList.keys(): key_dir = getKey(sample_name) key_file = getKey(sample_name, jobId) self.inputFiles[key_file] = inputFileList[jobId] if len(self.inputFiles[key_file]) == 0: print "Warning: ntupleFiles['%s'] = %s --> skipping job !!" % (key_file, self.inputFiles[key_file]) continue ##print "sample = '%s', jobId = %i: number of input files = %i" % (sample_name, jobId, len(self.inputFiles[key_file])) ##print self.inputFiles[key_file] assert(len(self.inputFiles[key_file]) == 1), "There is more than one input file!" self.cfgFiles_prodNtuple_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "produceNtuple_%s_%s_%i_cfg.py" % \ (self.channel, process_name, jobId)) self.outputFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_NTUPLES], "%s_%i.root" % \ (process_name, jobId)) self.logFiles_prodNtuple[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "produceNtuple_%s_%s_%i.log" % \ (self.channel, process_name, jobId)) self.createCfg_prodNtuple(self.inputFiles[key_file], self.outputFiles[key_file], self.era, self.cfgFiles_prodNtuple_modified[key_file]) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_prodNtuple) self.createScript_sbatch() logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_prodNtuple(lines_makefile) #self.addToMakefile_clean(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done")
def run(self, clean):
    record_software_state(self.sw_ver_file_cfg, self.sw_ver_file_out, DEPENDENCIES)
    target = 'all'
    if clean:
        if not os.path.isfile(self.makefile_path):
            logging.error(
                "The makefile %s is missing and therefore it's not possible to clean anything; "
                "run sync Ntuple production first!" % self.makefile_path
            )
            sys.exit(1)
        target = 'clean'
    nof_parallel_jobs = len(self.channel_info)
    make_cmd = "make -f %s -j %d %s 2>%s 1>%s" % \
        (self.makefile_path, nof_parallel_jobs, target, self.stderr_file_path, self.stdout_file_path)
    if self.running_method.lower() == "makefile":
        run_dir = re.sub('^/home', '/scratch', self.config_dir)
        create_if_not_exists(run_dir)
        make_cmd = re.sub('^make', 'make -C {}'.format(run_dir), make_cmd)
    logging.info("Running the make command: %s" % make_cmd)
    run_cmd(make_cmd)
    logging.info("All done")
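# For illustration only: the command rewriting performed by run() above, shown
# with made-up paths. When running via Makefile, the working directory is
# redirected from /home to /scratch while the Makefile itself stays put.
import re

make_cmd = "make -f /home/user/sync/Makefile -j 4 all 2>stderr.log 1>stdout.log"
run_dir = re.sub('^/home', '/scratch', '/home/user/sync')           # -> '/scratch/user/sync'
make_cmd = re.sub('^make', 'make -C {}'.format(run_dir), make_cmd)
# -> 'make -C /scratch/user/sync -f /home/user/sync/Makefile -j 4 all 2>stderr.log 1>stdout.log'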
def create(self):
    """Creates all necessary config files and runs the Ntuple production -- either locally or on the batch system
    """
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        is_mc = (sample_info["type"] == "mc")
        if is_mc and process_name not in self.pileup_histograms:
            raise ValueError("Missing PU distribution for %s in file %s" % (process_name, self.pileup))
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable, process_name))
        inputFileList = generateInputFileList(sample_info, self.max_files_per_job)
        key_dir = getKey(sample_name)
        subDirs = list(map(
            lambda y: os.path.join(self.dirs[key_dir][DKEY_NTUPLES], '%04d' % y),
            set(map(lambda x: x // 1000, inputFileList.keys()))
        ))
        for subDir in subDirs:
            create_if_not_exists(subDir)
        for jobId in inputFileList.keys():
            key_file = getKey(sample_name, jobId)
            self.inputFiles[key_file] = inputFileList[jobId]
            if len(self.inputFiles[key_file]) == 0:
                logging.warning("ntupleFiles['%s'] = %s --> skipping job !!" % (key_file, self.inputFiles[key_file]))
                continue
            self.cfgFiles_prodNtuple_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS],
                "produceNtuple_%s_%i_cfg.py" % (process_name, jobId))
            self.outputFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_NTUPLES],
                "%04d" % (jobId // 1000), "tree_%i.root" % jobId)
            self.logFiles_prodNtuple[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS],
                "produceNtuple_%s_%i.log" % (process_name, jobId))
            hlt_paths = sample_info["hlt_paths"] if not is_mc else []
            hlt_cuts = list(Triggers(self.era).triggers_flat) if self.preselection_cuts["applyHLTcut"] else []
            jobOptions = {
                'inputFiles'       : self.inputFiles[key_file],
                'cfgFile_modified' : self.cfgFiles_prodNtuple_modified[key_file],
                'outputFile'       : self.outputFiles[key_file],
                'is_mc'            : is_mc,
                'random_seed'      : jobId,
                'process_name'     : process_name,
                'category_name'    : sample_info["sample_category"],
                'triggers'         : hlt_paths,
                'HLTcuts'          : hlt_cuts,
            }
            self.createCfg_prodNtuple(jobOptions)
    num_jobs = 0
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable)
        num_jobs = self.createScript_sbatch()
        logging.info("Generated %i job(s)" % num_jobs)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_prodNtuple(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
    return num_jobs
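# Sketch of the ntuple bucketing used in create() above: job IDs are grouped into
# subdirectories of at most 1000 output files each, named '0000', '0001', ...
# (assumes integer job IDs, as generateInputFileList produces).
job_ids = [1, 999, 1000, 2500]
buckets = sorted(set(job_id // 1000 for job_id in job_ids))
print(["%04d" % bucket for bucket in buckets])   # prints ['0000', '0001', '0002']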
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for key in self.dirs.keys():
        for dir_type in self.dirs[key].keys():
            create_if_not_exists(self.dirs[key][dir_type])
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or \
           sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]:
            continue
        process_name = sample_info["process_name_specific"]
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
        (secondary_files, primary_store, secondary_store) = self.initializeInputFileIds(sample_name, sample_info)
        is_mc = (sample_info["type"] == "mc")
        lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"]
        sample_category = sample_info["sample_category"]
        triggers = sample_info["triggers"]
        for central_or_shift in self.central_or_shifts:
            for jobId in range(len(self.inputFileIds[sample_name])):
                if central_or_shift != "central" and not is_mc:
                    continue
                inputFiles = generate_input_list(self.inputFileIds[sample_name][jobId], secondary_files,
                    primary_store, secondary_store, self.debug)
                key_dir = getKey(sample_name)
                key_file = getKey(sample_name, central_or_shift, jobId)
                self.cfgFiles_analyze_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS],
                    "analyze_%s_%s_%s_%i_cfg.py" % (self.channel, process_name, central_or_shift, jobId))
                self.histogramFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_HIST],
                    "%s_%s_%i.root" % (process_name, central_or_shift, jobId))
                self.logFiles_analyze[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS],
                    "analyze_%s_%s_%s_%i.log" % (self.channel, process_name, central_or_shift, jobId))
                self.createCfg_analyze(
                    inputFiles, self.histogramFiles[key_file], sample_category, triggers,
                    self.lepton_selection, self.hadTau_selection, is_mc, central_or_shift,
                    lumi_scale, self.cfgFiles_analyze_modified[key_file])
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.createScript_sbatch()
    logging.info("Creating configuration files for executing 'prepareDatacards'")
    for histogramToFit in self.histograms_to_fit:
        self.createCfg_prep_dcard(histogramToFit)
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_clean(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        for mode in self.modes:
            key_dir = getKey(process_name, mode)
            for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES, DKEY_SYNC]:
                if dir_type == DKEY_SYNC and not self.do_sync:
                    continue
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                    self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                        "_".join([mode]), process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                        "_".join([mode]), process_name)
    for dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC]:
        if dir_type == DKEY_SYNC and not self.do_sync:
            continue
        initDict(self.dirs, [dir_type])
        if dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
    ##print "self.dirs = ", self.dirs
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)
    for mode in self.modes:
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
            sample_category = sample_info["sample_category"]
            is_mc = (sample_info["type"] == "mc")
            for central_or_shift in self.central_or_shifts:
                inputFileList = inputFileLists[sample_name]
                for jobId in inputFileList.keys():
                    # build config files for executing analysis code
                    key_dir = getKey(process_name, mode)
                    key_analyze_job = getKey(process_name, mode, central_or_shift, jobId)
                    ntupleFiles = inputFileList[jobId]
                    if len(ntupleFiles) == 0:
                        logging.warning("No input ntuples for %s --> skipping job !!" % key_analyze_job)
                        continue
                    self.jobOptions_analyze[key_analyze_job] = {
                        'ntupleFiles'      : ntupleFiles,
                        'cfgFile_modified' : os.path.join(self.dirs[key_dir][DKEY_CFGS],
                            "analyze_%s_%s_%s_%s_%i_cfg.py" % (self.channel, process_name, mode, central_or_shift, jobId)),
                        'histogramFile'    : os.path.join(self.dirs[key_dir][DKEY_HIST],
                            "%s_%s_%s_%i.root" % (process_name, mode, central_or_shift, jobId)),
                        'logFile'          : os.path.join(self.dirs[key_dir][DKEY_LOGS],
                            "analyze_%s_%s_%s_%s_%i.log" % (self.channel, process_name, mode, central_or_shift, jobId)),
                        'sample_category'  : sample_category,
                        'mode'             : mode,
                        'lepton_selection' : self.lepton_selection,
                        'hadTau_selection' : self.hadTau_selection,
                        'SVfit4tau_logM_wMassConstraint_MarkovChain'  : self.SVfit4tau_logM_wMassConstraint_MarkovChain,
                        'SVfit4tau_logM_woMassConstraint_MarkovChain' : self.SVfit4tau_logM_woMassConstraint_MarkovChain,
                        'SVfit4tau_logM_wMassConstraint_VAMP'         : self.SVfit4tau_logM_wMassConstraint_VAMP,
                        'use_HIP_mitigation_mediumMuonId' : False,
                        'is_mc'            : is_mc,
                        'central_or_shift' : central_or_shift,
                        'lumi_scale'       : 1.,
                        'apply_genWeight'  : sample_info["genWeight"] if (is_mc and "genWeight" in sample_info) else False,
                    }
                    self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job])
                    # initialize input and output file names for hadd_stage1
                    key_hadd_stage1 = getKey(process_name, mode)
                    if not key_hadd_stage1 in self.inputFiles_hadd_stage1:
                        self.inputFiles_hadd_stage1[key_hadd_stage1] = []
                    self.inputFiles_hadd_stage1[key_hadd_stage1].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                    self.outputFile_hadd_stage1[key_hadd_stage1] = os.path.join(self.dirs[DKEY_HIST],
                        "histograms_harvested_stage1_%s_%s_%s.root" % (self.channel, process_name, mode))
                    # initialize input and output file names for hadd_stage2
                    key_hadd_stage2 = getKey()
                    if not key_hadd_stage2 in self.inputFiles_hadd_stage2:
                        self.inputFiles_hadd_stage2[key_hadd_stage2] = []
                    self.inputFiles_hadd_stage2[key_hadd_stage2].append(self.outputFile_hadd_stage1[key_hadd_stage1])
                    self.outputFile_hadd_stage2[key_hadd_stage2] = os.path.join(self.dirs[DKEY_HIST],
                        "histograms_harvested_stage2_%s.root" % self.channel)
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
    return self.num_jobs
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        key_dir = getKey(process_name)
        for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES, DKEY_SYNC]:
            initDict(self.dirs, [key_dir, dir_type])
            if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, process_name)
            else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, process_name)
    for dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC]:
        initDict(self.dirs, [dir_type])
        if dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)
    for apply_jetSmearing in self.apply_jetSmearing_options:
        jetSmearingLabel = "jetSmearingEnabled" if apply_jetSmearing else "jetSmearingDisabled"
        for apply_metSmearing in self.apply_metSmearing_options:
            metSmearingLabel = "metSmearingEnabled" if apply_metSmearing else "metSmearingDisabled"
            for sample_name, sample_info in self.samples.items():
                if not sample_info["use_it"]:
                    continue
                process_name = sample_info["process_name_specific"]
                isSignal = "signal" in process_name
                logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
                sample_category = sample_info["sample_category"]
                inputFileList = inputFileLists[sample_name]
                if process_name == "signal_ggf_nonresonant_node_sm_hh_2b2v":
                    numJobsPerFile = 500
                elif process_name == "signal_ggf_nonresonant_cHHH1_hh_2b2v":
                    numJobsPerFile = 100
                elif process_name == "TTJets_DiLept":
                    numJobsPerFile = 50
                elif process_name == "TTJets_DiLept_ext1":
                    numJobsPerFile = 50
                elif process_name == "TTTo2L2Nu":
                    numJobsPerFile = 10
                else:
                    raise ValueError("Invalid sample: %s" % process_name)
                numJobs = numJobsPerFile * len(inputFileList.keys())
                for jobId in range(1, numJobs + 1):
                    ntupleId = ((jobId - 1) // numJobsPerFile) + 1
                    maxSelEvents = 500
                    skipSelEvents = maxSelEvents * ((jobId - 1) % numJobsPerFile)
                    # build config files for executing analysis code
                    key_dir = getKey(process_name)
                    key_analyze_job = getKey(process_name, jetSmearingLabel, metSmearingLabel, jobId)
                    ntupleFiles = inputFileList[ntupleId]
                    if len(ntupleFiles) == 0:
                        logging.warning("No input ntuples for %s --> skipping job !!" % key_analyze_job)
                        continue
                    cfgFile_modified_path = os.path.join(self.dirs[key_dir][DKEY_CFGS],
                        "analyze_%s_%s_%s_%s_%i_cfg.py" % (self.channel, process_name, jetSmearingLabel, metSmearingLabel, jobId))
                    histogramFile_path = os.path.join(self.dirs[key_dir][DKEY_HIST],
                        "analyze_%s_%s_%s_%s_%i.root" % (self.channel, process_name, jetSmearingLabel, metSmearingLabel, jobId))
                    logFile_path = os.path.join(self.dirs[key_dir][DKEY_LOGS],
                        "analyze_%s_%s_%s_%s_%i.log" % (self.channel, process_name, jetSmearingLabel, metSmearingLabel, jobId))
                    rleOutputFile_path = os.path.join(self.dirs[key_dir][DKEY_RLES],
                        "rle_%s_%s_%s_%s_%i.txt" % (self.channel, process_name, jetSmearingLabel, metSmearingLabel, jobId)) \
                        if self.select_rle_output else ""
                    self.jobOptions_analyze[key_analyze_job] = {
                        'ntupleFiles'              : ntupleFiles,
                        'cfgFile_modified'         : cfgFile_modified_path,
                        'histogramFile'            : histogramFile_path,
                        'logFile'                  : logFile_path,
                        'selEventsFileName_output' : rleOutputFile_path,
                        'apply_jetSmearing'        : apply_jetSmearing,
                        'apply_metSmearing'        : apply_metSmearing,
                        'maxSelEvents'             : maxSelEvents,
                        'skipSelEvents'            : skipSelEvents,
                    }
                    self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info)
                    # initialize input and output file names for hadd_stage1
                    key_hadd_stage1 = getKey(process_name, jetSmearingLabel, metSmearingLabel)
                    if not key_hadd_stage1 in self.inputFiles_hadd_stage1:
                        self.inputFiles_hadd_stage1[key_hadd_stage1] = []
                    self.inputFiles_hadd_stage1[key_hadd_stage1].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                    self.outputFile_hadd_stage1[key_hadd_stage1] = os.path.join(self.dirs[DKEY_HIST],
                        "histograms_harvested_stage1_%s_%s_%s_%s.root" % (self.channel, process_name, jetSmearingLabel, metSmearingLabel))
                    # add output files of hadd_stage1 to list of input files for hadd_stage2
                    key_hadd_stage2 = getKey("")
                    if not key_hadd_stage2 in self.inputFiles_hadd_stage2:
                        self.inputFiles_hadd_stage2[key_hadd_stage2] = []
                    self.inputFiles_hadd_stage2[key_hadd_stage2].append(self.outputFile_hadd_stage1[key_hadd_stage1])
                    self.outputFile_hadd_stage2[key_hadd_stage2] = os.path.join(self.dirs[DKEY_HIST],
                        "histograms_harvested_stage2_%s.root" % self.channel)
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.targets.extend(self.outputFile_hadd_stage2.values())
    self.createMakefile(lines_makefile)
    logging.info("Done")
    return self.num_jobs
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or \
           sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]:
            continue
        process_name = sample_info["process_name_specific"]
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
        (secondary_files, primary_store, secondary_store) = self.initializeInputFileIds(sample_name, sample_info)
        is_mc = (sample_info["type"] == "mc")
        lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"]
        sample_category = sample_info["sample_category"]
        triggers = sample_info["triggers"]
        apply_trigger_bits = (is_mc and self.era == "2016" and sample_info["reHLT"]) or not is_mc
        for hadTau_selection in self.hadTau_selections:
            for hadTau_frWeight in ["enabled", "disabled"]:
                if hadTau_frWeight == "enabled" and not hadTau_selection.startswith("Fakeable"):
                    continue
                hadTau_selection_and_frWeight = get_hadTau_selection_and_frWeight(hadTau_selection, hadTau_frWeight)
                for hadTau_genMatch in self.hadTau_genMatches:
                    for hadTau_charge_selection in self.hadTau_charge_selections:
                        for central_or_shift in self.central_or_shifts:
                            for jobId in range(len(self.inputFileIds[sample_name])):
                                if hadTau_genMatch != "all" and not is_mc:
                                    continue
                                if hadTau_genMatch == "all" and is_mc:
                                    continue
                                if central_or_shift != "central" and not (hadTau_selection.startswith("Tight") and hadTau_charge_selection == "OS"):
                                    continue
                                if central_or_shift != "central" and not is_mc:
                                    continue
                                if hadTau_selection == "Fakeable_mcClosure" and not hadTau_frWeight == "enabled":
                                    continue
                                if central_or_shift.startswith("CMS_ttHl_thu_shape_ttH") and sample_category != "signal":
                                    continue
                                if central_or_shift.startswith("CMS_ttHl_thu_shape_ttW") and sample_category != "TTW":
                                    continue
                                if central_or_shift.startswith("CMS_ttHl_thu_shape_ttZ") and sample_category != "TTZ":
                                    continue
                                sample_category_and_genMatch = sample_category + hadTau_genMatch
                                key_dir = getKey(sample_name, hadTau_selection, hadTau_frWeight, hadTau_charge_selection)
                                key_file = getKey(sample_name, hadTau_selection, hadTau_frWeight, hadTau_genMatch,
                                                  hadTau_charge_selection, central_or_shift, jobId)
                                self.ntupleFiles[key_file] = generate_input_list(self.inputFileIds[sample_name][jobId],
                                    secondary_files, primary_store, secondary_store, self.debug)
                                self.cfgFiles_analyze_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS],
                                    "analyze_%s_%s_%s_%s_%s_%s_%i_cfg.py" % (self.channel, process_name, hadTau_selection_and_frWeight,
                                    hadTau_genMatch, hadTau_charge_selection, central_or_shift, jobId))
                                self.histogramFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_HIST],
                                    "%s_%s_%s_%s_%s_%i.root" % (process_name, hadTau_selection_and_frWeight, hadTau_genMatch,
                                    hadTau_charge_selection, central_or_shift, jobId))
                                self.logFiles_analyze[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS],
                                    "analyze_%s_%s_%s_%s_%s_%s_%i.log" % (self.channel, process_name, hadTau_selection_and_frWeight,
                                    hadTau_genMatch, hadTau_charge_selection, central_or_shift, jobId))
                                self.createCfg_analyze(self.ntupleFiles[key_file], self.histogramFiles[key_file],
                                    sample_category, self.era, triggers, hadTau_selection, hadTau_genMatch,
                                    self.apply_hadTauGenMatching, hadTau_frWeight, hadTau_charge_selection,
                                    is_mc, central_or_shift, lumi_scale, apply_trigger_bits,
                                    self.cfgFiles_analyze_modified[key_file])
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.createScript_sbatch()
    logging.info("Creating configuration files for executing 'addBackgrounds'")
    process_names = []
    process_names.extend(self.nonfake_backgrounds)
    process_names.extend(["signal", "ttH_htt", "ttH_hww", "ttH_hzz"])
    # sum non-fake contributions for each MC sample separately
    # input processes: TT2t0e0m0j, TT1t1e0m0j, TT1t0e1m0j, TT0t2e0m0j, TT0t1e1m0j, TT0t0e2m0j; TTW2t0e0m0j,...
    # output processes: TT; ...
    for process_name in process_names:
        for hadTau_selection in self.hadTau_selections:
            for hadTau_frWeight in self.hadTau_frWeights:
                if hadTau_frWeight == "enabled" and not hadTau_selection.startswith("Fakeable"):
                    continue
                hadTau_selection_and_frWeight = get_hadTau_selection_and_frWeight(hadTau_selection, hadTau_frWeight)
                for hadTau_charge_selection in self.hadTau_charge_selections:
                    key = getKey(process_name, hadTau_selection, hadTau_frWeight, hadTau_charge_selection)
                    self.histogramFile_addBackgrounds[key] = os.path.join(self.outputDir, DKEY_HIST,
                        "addBackgrounds_%s_%s_%s_%s.root" % (self.channel, process_name, hadTau_selection_and_frWeight, hadTau_charge_selection))
                    self.cfgFile_addBackgrounds_modified[key] = os.path.join(self.outputDir, DKEY_CFGS,
                        "addBackgrounds_%s_%s_%s_%s_cfg.py" % (self.channel, process_name, hadTau_selection_and_frWeight, hadTau_charge_selection))
                    histogramDir = getHistogramDir(hadTau_selection, hadTau_frWeight, hadTau_charge_selection)
                    processes_input = ["%s%s" % (process_name, genMatch) for genMatch in self.hadTau_genMatches_nonfakes]
                    self.process_output_addBackgrounds[key] = process_name
                    self.createCfg_addBackgrounds(self.histogramFile_hadd_stage1, self.histogramFile_addBackgrounds[key],
                        self.cfgFile_addBackgrounds_modified[key], [histogramDir], processes_input,
                        self.process_output_addBackgrounds[key])
    # sum fake contributions for the total of all MC samples
    # input processes: TT1t0e0m1j, TT0t1e0m1j, TT0t0e1m1j, TT0t0e0m2j; TTW1t0e0m1j,...
    # output process: fakes_mc
    for hadTau_selection in self.hadTau_selections:
        for hadTau_frWeight in self.hadTau_frWeights:
            if hadTau_frWeight == "enabled" and not hadTau_selection.startswith("Fakeable"):
                continue
            hadTau_selection_and_frWeight = get_hadTau_selection_and_frWeight(hadTau_selection, hadTau_frWeight)
            for hadTau_charge_selection in self.hadTau_charge_selections:
                key = getKey(hadTau_selection, hadTau_frWeight, hadTau_charge_selection)
                self.histogramFile_addBackgrounds[key] = os.path.join(self.outputDir, DKEY_HIST,
                    "addBackgrounds_%s_fakes_mc_%s_%s.root" % (self.channel, hadTau_selection_and_frWeight, hadTau_charge_selection))
                self.cfgFile_addBackgrounds_modified[key] = os.path.join(self.outputDir, DKEY_CFGS,
                    "addBackgrounds_%s_fakes_mc_%s_%s_cfg.py" % (self.channel, hadTau_selection_and_frWeight, hadTau_charge_selection))
                histogramDir = getHistogramDir(hadTau_selection, hadTau_frWeight, hadTau_charge_selection)
                processes_input = []
                for process_name in self.nonfake_backgrounds:
                    for genMatch in self.hadTau_genMatches_fakes:
                        processes_input.append("%s%s" % (process_name, genMatch))
                self.process_output_addBackgrounds[key] = "fakes_mc"
                self.createCfg_addBackgrounds(self.histogramFile_hadd_stage1, self.histogramFile_addBackgrounds[key],
                    self.cfgFile_addBackgrounds_modified[key], [histogramDir], processes_input,
                    self.process_output_addBackgrounds[key])
    logging.info("Creating configuration files for executing 'addBackgroundFakes'")
    for hadTau_charge_selection in self.hadTau_charge_selections:
        key = getKey("fakes_data", hadTau_charge_selection)
        self.histogramFile_addFakes[key] = os.path.join(self.outputDir, DKEY_HIST,
            "addBackgroundJetToTauFakes_%s_%s.root" % (self.channel, hadTau_charge_selection))
        self.cfgFile_addFakes_modified[key] = os.path.join(self.outputDir, DKEY_CFGS,
            "addBackgroundJetToTauFakes_%s_%s_cfg.py" % (self.channel, hadTau_charge_selection))
        category_signal = "0l_2tau_%s_Tight" % hadTau_charge_selection
        category_sideband = "0l_2tau_%s_Fakeable_wFakeRateWeights" % hadTau_charge_selection
        self.createCfg_addFakes(self.histogramFile_hadd_stage1_5, self.histogramFile_addFakes[key],
            self.cfgFile_addFakes_modified[key], category_signal, category_sideband)
    logging.info("Creating configuration files for executing 'prepareDatacards'")
    for histogramToFit in self.histograms_to_fit:
        self.createCfg_prep_dcard(histogramToFit)
    logging.info("Creating configuration files for executing 'makePlots'")
    self.createCfg_makePlots()
    if "SS" in self.hadTau_charge_selections:
        self.createCfg_makePlots(self.histogramDir_prep_dcard_SS, "SS")
    if "Fakeable_mcClosure" in self.hadTau_selections:
        self.createCfg_makePlots_mcClosure()
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.addToMakefile_make_plots_mcClosure(lines_makefile)
    self.addToMakefile_clean(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
def __init__(self, configDir, outputDir, executable_prodNtuple, cfgFile_prodNtuple, samples,
             max_files_per_job, era, preselection_cuts, leptonSelection, hadTauSelection,
             debug, running_method, version, num_parallel_jobs, pool_id = ''):
    self.configDir = configDir
    self.outputDir = outputDir
    self.executable_prodNtuple = executable_prodNtuple
    self.max_num_jobs = 200000
    self.samples = samples
    self.max_files_per_job = max_files_per_job
    self.era = era
    self.preselection_cuts = preselection_cuts
    self.leptonSelection = leptonSelection
    self.hadTauSelection = hadTauSelection
    self.debug = debug
    assert running_method.lower() in ["sbatch", "makefile"], "Invalid running method: %s" % running_method
    self.running_method = running_method
    self.is_sbatch = False
    self.is_makefile = False
    if self.running_method.lower() == "sbatch":
        self.is_sbatch = True
    else:
        self.is_makefile = True
    self.makefile = os.path.join(self.configDir, "Makefile_prodNtuple")
    self.num_parallel_jobs = num_parallel_jobs
    self.pool_id = pool_id if pool_id else uuid.uuid4()
    self.workingDir = os.getcwd()
    logging.info("Working directory is: %s" % self.workingDir)
    self.version = version
    create_if_not_exists(self.configDir)
    create_if_not_exists(self.outputDir)
    self.stdout_file = codecs.open(os.path.join(self.configDir, "stdout_prodNtuple.log"), 'w', 'utf-8')
    self.stderr_file = codecs.open(os.path.join(self.configDir, "stderr_prodNtuple.log"), 'w', 'utf-8')
    self.dirs = {}
    self.cfgFile_prodNtuple_original = os.path.join(self.workingDir, cfgFile_prodNtuple)
    self.cfgFiles_prodNtuple_modified = {}
    self.logFiles_prodNtuple = {}
    self.sbatchFile_prodNtuple = os.path.join(self.configDir, "sbatch_prodNtuple.py")
    self.inputFiles = {}
    self.outputFiles = {}
    self.filesToClean = []
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        key_dir = getKey(sample_name)
        for dir_type in [DKEY_CFGS, DKEY_NTUPLES, DKEY_LOGS]:
            initDict(self.dirs, [key_dir, dir_type])
            if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, process_name)
            else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, process_name)
    for dir_type in [DKEY_CFGS, DKEY_LOGS]:
        initDict(self.dirs, [dir_type])
        if dir_type in [DKEY_CFGS, DKEY_NTUPLES, DKEY_LOGS]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type)
    ##print "self.dirs = ", self.dirs
    self.cvmfs_error_log = {}
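# Illustration (with hypothetical key and path names) of the two-level self.dirs
# layout built in the constructor above: per-sample entries map a sample key to a
# {dir_type: path} dictionary, while top-level entries hold shared paths; cfg and
# log files live under configDir, the (large) ntuples under outputDir:
#
#   self.dirs[getKey('/TTJets/...')][DKEY_CFGS]    -> <configDir>/<DKEY_CFGS>/<process_name>
#   self.dirs[getKey('/TTJets/...')][DKEY_NTUPLES] -> <outputDir>/<DKEY_NTUPLES>/<process_name>
#   self.dirs[DKEY_CFGS]                           -> <configDir>/<DKEY_CFGS>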
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or \
           sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]:
            continue
        process_name = sample_info["process_name_specific"]
        for charge_selection in self.charge_selections:
            key_dir = getKey(process_name, charge_selection)
            for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES]:
                initDict(self.dirs, [key_dir, dir_type])
                if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                    self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                        "_".join([charge_selection]), process_name)
                else:
                    self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                        "_".join([charge_selection]), process_name)
    for dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT]:
        initDict(self.dirs, [dir_type])
        if dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
    ##print "self.dirs = ", self.dirs
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or \
           sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug)
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or \
           sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]:
            continue
        process_name = sample_info["process_name_specific"]
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
        is_mc = (sample_info["type"] == "mc")
        lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"]
        apply_genWeight = sample_info["apply_genWeight"] if (is_mc and "apply_genWeight" in sample_info.keys()) else False
        sample_category = sample_info["sample_category"]
        triggers = sample_info["triggers"]
        apply_trigger_bits = (is_mc and (self.era == "2015" or (self.era == "2016" and sample_info["reHLT"]))) or not is_mc
        for charge_selection in self.charge_selections:
            for central_or_shift in self.central_or_shifts:
                inputFileList = inputFileLists[sample_name]
                for jobId in inputFileList.keys():
                    if central_or_shift != "central" and not is_mc:
                        continue
                    if central_or_shift.startswith("CMS_ttHl_thu_shape_ttH") and sample_category != "signal":
                        continue
                    if central_or_shift.startswith("CMS_ttHl_thu_shape_ttW") and sample_category != "TTW":
                        continue
                    if central_or_shift.startswith("CMS_ttHl_thu_shape_ttZ") and sample_category != "TTZ":
                        continue
                    # build config files for executing analysis code
                    key_dir = getKey(process_name, charge_selection)
                    key_analyze_job = getKey(process_name, charge_selection, central_or_shift, jobId)
                    ntupleFiles = inputFileList[jobId]
                    if len(ntupleFiles) == 0:
                        logging.warning("ntupleFiles['%s'] = %s --> skipping job !!" % (key_analyze_job, ntupleFiles))
                        continue
                    self.jobOptions_analyze[key_analyze_job] = {
                        'ntupleFiles'      : ntupleFiles,
                        'cfgFile_modified' : os.path.join(self.dirs[key_dir][DKEY_CFGS],
                            "analyze_%s_%s_%s_%s_%i_cfg.py" % (self.channel, process_name, charge_selection, central_or_shift, jobId)),
                        'histogramFile'    : os.path.join(self.dirs[key_dir][DKEY_HIST],
                            "%s_%s_%s_%i.root" % (process_name, charge_selection, central_or_shift, jobId)),
                        'logFile'          : os.path.join(self.dirs[key_dir][DKEY_LOGS],
                            "analyze_%s_%s_%s_%s_%i.log" % (self.channel, process_name, charge_selection, central_or_shift, jobId)),
                        'sample_category'  : sample_category,
                        'triggers'         : sample_info["triggers"],
                        'charge_selection' : charge_selection,
                        'jet_minPt'        : self.jet_minPt,
                        'jet_maxPt'        : self.jet_maxPt,
                        'jet_minAbsEta'    : self.jet_minAbsEta,
                        'jet_maxAbsEta'    : self.jet_maxAbsEta,
                        'hadTau_selection_denominator' : self.hadTau_selection_denominator,
                        'hadTau_selections_numerator'  : self.hadTau_selections_numerator,
                        'absEtaBins'       : self.absEtaBins,
                        ##'use_HIP_mitigation_mediumMuonId' : sample_info["use_HIP_mitigation_mediumMuonId"],
                        'use_HIP_mitigation_mediumMuonId' : True,
                        'is_mc'            : is_mc,
                        'central_or_shift' : central_or_shift,
                        'lumi_scale'       : 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"],
                        'apply_genWeight'  : sample_info["genWeight"] if (is_mc and "genWeight" in sample_info.keys()) else False,
                        'apply_trigger_bits' : (is_mc and (self.era == "2015" or (self.era == "2016" and sample_info["reHLT"]))) or not is_mc,
                    }
                    self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job])
                    # initialize input and output file names for hadd_stage1
                    key_hadd_stage1 = getKey(process_name, charge_selection)
                    if not key_hadd_stage1 in self.inputFiles_hadd_stage1:
                        self.inputFiles_hadd_stage1[key_hadd_stage1] = []
                    self.inputFiles_hadd_stage1[key_hadd_stage1].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                    self.outputFile_hadd_stage1[key_hadd_stage1] = os.path.join(self.dirs[DKEY_HIST],
                        "histograms_harvested_stage1_%s_%s_%s.root" % (self.channel, process_name, charge_selection))
                    # initialize input and output file names for hadd_stage2
                    key_hadd_stage2 = getKey(charge_selection)
                    if not key_hadd_stage2 in self.inputFiles_hadd_stage2:
                        self.inputFiles_hadd_stage2[key_hadd_stage2] = []
                    self.inputFiles_hadd_stage2[key_hadd_stage2].append(self.outputFile_hadd_stage1[key_hadd_stage1])
                    self.outputFile_hadd_stage2[key_hadd_stage2] = os.path.join(self.dirs[DKEY_HIST],
                        "histograms_harvested_stage2_%s_%s.root" % (self.channel, charge_selection))
    logging.info("Creating configuration files for executing 'comp_jetToTauFakeRate'")
    for charge_selection in self.charge_selections:
        key_comp_jetToTauFakeRate_job = getKey(charge_selection)
        key_hadd_stage2 = getKey(charge_selection)
        self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job] = {
            'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2],
            'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS], "comp_jetToTauFakeRate_%s_cfg.py" % charge_selection),
            'outputFile'       : os.path.join(self.dirs[DKEY_HIST], "comp_jetToTauFakeRate_%s.root" % charge_selection),
            'logFile'          : os.path.join(self.dirs[DKEY_LOGS], "comp_jetToTauFakeRate_%s.log" % charge_selection),
            'looseRegion'      : "jetToTauFakeRate_%s/denominator/" % charge_selection,
            'tightRegion'      : "jetToTauFakeRate_%s/numerator/" % charge_selection,
            'absEtaBins'       : self.absEtaBins,
            'ptBins'           : self.ptBins,
        }
        self.createCfg_comp_jetToTauFakeRate(self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job])
        self.targets.append(self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job]['outputFile'])
    logging.info("Creating configuration files to run 'makePlots'")
    for charge_selection in self.charge_selections:
        key_makePlots_job = getKey(charge_selection)
        key_hadd_stage2 = getKey(charge_selection)
        self.jobOptions_make_plots[key_makePlots_job] = {
            'executable'       : self.executable_make_plots,
            'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2],
            'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
            'outputFile'       : os.path.join(self.dirs[DKEY_PLOT], "makePlots_%s.png" % self.channel),
            'histogramDir'     : "jetToTauFakeRate_%s" % charge_selection,
            'label'            : None,
            'make_plots_backgrounds' : ["TT", "TTW", "TTZ", "EWK", "Rares"],
        }
        self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
        self.cfgFile_make_plots = self.cfgFile_make_plots_denominator
        for absEtaBin in ["absEtaLt1_5", "absEta1_5to9_9"]:
            key_makePlots_job = getKey(charge_selection, absEtaBin, "denominator")
            key_hadd_stage2 = getKey(charge_selection)
            self.jobOptions_make_plots[key_makePlots_job] = {
                'executable'       : self.executable_make_plots,
                'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2],
                'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS],
                    "makePlots_%s_%s_denominator_%s_cfg.py" % (self.channel, charge_selection, absEtaBin)),
                'outputFile'       : os.path.join(self.dirs[DKEY_PLOT],
                    "makePlots_%s_%s_denominator_%s.png" % (self.channel, charge_selection, absEtaBin)),
                'histogramDir'     : "jetToTauFakeRate_%s/denominator/%s" % (charge_selection, absEtaBin),
                'label'            : None,
                'make_plots_backgrounds' : ["TT", "TTW", "TTZ", "EWK", "Rares"],
            }
            self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
            for hadTau_selection_numerator in self.hadTau_selections_numerator:
                key_makePlots_job = getKey(charge_selection, absEtaBin, "numerator", hadTau_selection_numerator)
                key_hadd_stage2 = getKey(charge_selection)
                self.jobOptions_make_plots[key_makePlots_job] = {
                    'executable'       : self.executable_make_plots,
                    'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2],
                    'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS],
                        "makePlots_%s_%s_numerator_%s_%s_cfg.py" % (self.channel, charge_selection, hadTau_selection_numerator, absEtaBin)),
                    'outputFile'       : os.path.join(self.dirs[DKEY_PLOT],
                        "makePlots_%s_%s_numerator_%s_%s.png" % (self.channel, charge_selection, hadTau_selection_numerator, absEtaBin)),
                    'histogramDir'     : "jetToTauFakeRate_%s/numerator/%s/%s" % (charge_selection, hadTau_selection_numerator, absEtaBin),
                    'label'            : None,
                    'make_plots_backgrounds' : ["TT", "TTW", "TTZ", "EWK", "Rares"],
                }
                self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_comp_jetToTauFakeRate)
        self.sbatchFile_comp_jetToTauFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_comp_jetToTauFakeRate.py")
        self.createScript_sbatch(self.executable_comp_jetToTauFakeRate, self.sbatchFile_comp_jetToTauFakeRate,
                                 self.jobOptions_comp_jetToTauFakeRate)
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_comp_jetToTauFakeRate(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
def create(self):
    """Creates all necessary config files and runs the reference genWeight computation -- either locally or on the batch system
    """
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info['use_it']:
            continue
        process_name = sample_info["process_name_specific"]
        is_mc = (sample_info["type"] == "mc")
        if not is_mc:
            continue
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable, process_name))
        inputFileList_map = generateInputFileList(sample_info, 1)
        key_dir = getKey(process_name)
        key_file = getKey(process_name)
        self.inputFiles[key_file] = list(itertools.chain(*inputFileList_map.values()))
        if len(self.inputFiles[key_file]) == 0:
            logging.warning("'%s' = %s --> skipping job !!" % (key_file, self.inputFiles[key_file]))
            continue
        outputFile = os.path.join(self.dirs[key_dir][DKEY_RESULTS], "%s.txt" % process_name)
        self.outputFiles[key_file] = outputFile
        if os.path.isfile(outputFile):
            logging.info('File {} already exists --> skipping job'.format(outputFile))
            continue
        self.cfgFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "refGenWeight_%s_cfg.txt" % process_name)
        self.logFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "refGenWeight_%s.log" % process_name)
        self.scriptFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "refGenWeight_%s_cfg.sh" % process_name)
        self.plotFiles[key_file] = ' '.join([
            os.path.join(self.dirs[key_dir][DKEY_PLOTS], "refGenWeight_%s.%s" % (process_name, extension))
            for extension in ['pdf', 'png']
        ])
        self.jobOptions_sbatch[key_file] = {
            'inputFiles'   : self.inputFiles[key_file],
            'cfgFile_path' : self.cfgFiles[key_file],
            'cmdParams'    : "-i {} -o {} -p {} -v".format(
                self.cfgFiles[key_file],
                self.outputFiles[key_file],
                self.plotFiles[key_file],
            ),
            'outputFile'   : self.outputFiles[key_file],
            'logFile'      : self.logFiles[key_file],
            'scriptFile'   : self.scriptFiles[key_file],
        }
        self.createCfg(self.jobOptions_sbatch[key_file])
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable)
        self.num_jobs['refGenWeight'] += self.createScript_sbatch(self.executable, self.sbatchFile, self.jobOptions_sbatch)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile(lines_makefile)
    self.addToMakefile_final(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done")
    return self.num_jobs
def create(self):
    """Creates all necessary config files and runs the MEM -- either locally or on the batch system
    """
    statistics = {}
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or \
           sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]:
            continue
        if not os.path.exists(sample_info['local_paths'][0]['path']):
            logging.warning("Skipping sample {sample_name}".format(sample_name = sample_name))
            continue
        process_name = sample_info["process_name_specific"]
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_addMEM, process_name))
        inputFileList = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug)
        # typically, the analysis ends here and starts looping, b/c the smallest unit of work
        # processes at least one file; we need, however, to split the file into event ranges
        # in such a way that each job performs mem_integrations_per_job MEM integrations
        #
        # so what we are going to do is to open each set of files in inputFileList, read the
        # variable requestMEM_2lss_1tau and gather the event ranges such that each event range
        # performs up to mem_integrations_per_job integrations per job
        memEvtRangeDict = self.memJobList(inputFileList)
        for jobId in memEvtRangeDict.keys():
            key_dir = getKey(sample_name)
            key_file = getKey(sample_name, jobId)
            self.inputFiles[key_file] = memEvtRangeDict[jobId]['input_fileset']
            # there should always be a job
            assert len(self.inputFiles[key_file]) > 0, "No input files for job %s !!" % key_file
            #TODO: is this assertion really needed? in principle, no ...
            assert len(self.inputFiles[key_file]) == 1, "There is more than one input file!"
            self.cfgFiles_addMEM_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS],
                "addMEM_%s_%s_%i_cfg.py" % (self.channel, process_name, jobId))
            self.outputFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_NTUPLES],
                "%s_%i.root" % (process_name, jobId))
            self.logFiles_addMEM[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS],
                "addMEM_%s_%s_%i.log" % (self.channel, process_name, jobId))
            self.createCfg_addMEM(
                self.inputFiles[key_file],
                memEvtRangeDict[jobId]['event_range'][0],
                memEvtRangeDict[jobId]['event_range'][1],
                self.outputFiles[key_file],
                self.era,
                self.cfgFiles_addMEM_modified[key_file],
            )
            # associate the output file with the fileset_id
            fileset_id = memEvtRangeDict[jobId]['fileset_id']
            hadd_output = os.path.join(self.dirs[key_dir][DKEY_FINAL_NTUPLES], '%s_%i.root' % ('tree', fileset_id))
            if hadd_output not in self.hadd_records:
                self.hadd_records[hadd_output] = {}
                self.hadd_records[hadd_output]['output_files'] = []
            self.hadd_records[hadd_output]['fileset_id'] = fileset_id
            self.hadd_records[hadd_output]['output_files'].append(self.outputFiles[key_file])
            #self.filesToClean.append(self.outputFiles[key_file])
        # let's sum the number of integrations per sample
        nofEntriesMap = {}
        for v in memEvtRangeDict.values():
            if v['fileset_id'] not in nofEntriesMap:
                nofEntriesMap[v['fileset_id']] = v['nof_entries']
        statistics[process_name] = {
            'nof_int'         : sum([entry['nof_int'] for entry in memEvtRangeDict.values()]),
            'nof_entries'     : sum(nofEntriesMap.values()),
            'nof_jobs'        : len(memEvtRangeDict),
            'nof_events_pass' : sum([entry['nof_events_pass'] for entry in memEvtRangeDict.values()]),
            'nof_int_pass'    : sum([entry['nof_int_pass'] for entry in memEvtRangeDict.values()]),
        }
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addMEM)
        self.createScript_sbatch()
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_addMEM(lines_makefile)
    self.addToMakefile_hadd(lines_makefile)
    self.createMakefile(lines_makefile)
    ws_len = max([len(kk) + 1 for kk in statistics.keys()])
    total_nof_integrations_sum = sum(x['nof_int'] for x in statistics.values())
    total_nof_entries = sum(x['nof_entries'] for x in statistics.values())
    total_nof_integrations_avg = float(total_nof_integrations_sum) / total_nof_entries
    total_nof_jobs = sum(x['nof_jobs'] for x in statistics.values())
    total_nof_pass = sum(x['nof_events_pass'] for x in statistics.values())
    total_nof_int_pass_avg = float(sum(x['nof_int_pass'] for x in statistics.values())) / total_nof_pass
    for k, v in statistics.items():
        print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d (%.2f%%) evt pass; %.2f int/evt pass)' %
              (k, ' ' * (ws_len - len(k)), v['nof_int'], v['nof_entries'], v['nof_jobs'],
               float(v['nof_int']) / v['nof_entries'], v['nof_events_pass'],
               (100 * float(v['nof_events_pass']) / v['nof_entries']),
               float(v['nof_int_pass']) / v['nof_events_pass']))
    print('%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d evt pass; %.2f int/evt pass)' %
          ('total', ' ' * (ws_len - len('total')), total_nof_integrations_sum, total_nof_entries,
           total_nof_jobs, total_nof_integrations_avg, total_nof_pass, total_nof_int_pass_avg))
    if total_nof_integrations_sum > self.max_mem_integrations:
        logging.error("Will not start the jobs (max nof integrations exceeded)!")
        return False
    else:
        logging.info("Done")
        return True
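# A simplified sketch (not the repository's actual memJobList implementation) of
# the event-range splitting described in the comment above: pack consecutive
# events into ranges so that each job performs at most integrations_per_job MEM
# integrations. The per-event integration counts would come from the
# requestMEM_2lss_1tau branch.
def split_into_ranges(nof_int_per_event, integrations_per_job):
    ranges, start, running = [], 0, 0
    for idx, nof_int in enumerate(nof_int_per_event):
        if running + nof_int > integrations_per_job and idx > start:
            ranges.append((start, idx))   # half-open range [start, idx)
            start, running = idx, 0
        running += nof_int
    ranges.append((start, len(nof_int_per_event)))
    return ranges

# e.g. split_into_ranges([3, 0, 5, 4, 2], 6) -> [(0, 2), (2, 3), (3, 5)]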
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        is_mc = (sample_info["type"] == "mc")
        logging.info("Building dictionaries for sample %s..." % process_name)
        for lepton_selection in self.lepton_selections:
            central_or_shift_extensions = ["", "hadd", "addBackgrounds"]
            central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts
            for central_or_shift_or_dummy in central_or_shifts_extended:
                process_name_extended = [process_name, "hadd"]
                for process_name_or_dummy in process_name_extended:
                    if central_or_shift_or_dummy in ["hadd", "addBackgrounds"] and process_name_or_dummy in ["hadd"]:
                        continue
                    if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions:
                        if not is_mc:
                            continue
                        if not self.accept_central_or_shift(central_or_shift_or_dummy, sample_info):
                            continue
                    key_dir = getKey(process_name_or_dummy, lepton_selection, central_or_shift_or_dummy)
                    for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES]:
                        initDict(self.dirs, [key_dir, dir_type])
                        if dir_type in [DKEY_CFGS, DKEY_LOGS]:
                            self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                                "_".join([lepton_selection]), process_name_or_dummy, central_or_shift_or_dummy)
                        else:
                            self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                                "_".join([lepton_selection]), process_name_or_dummy)
    for subdirectory in ["prepareDatacards"]:
        key_dir = getKey(subdirectory)
        for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT]:
            initDict(self.dirs, [key_dir, dir_type])
            if dir_type in [DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
            else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT]:
        initDict(self.dirs, [dir_type])
        if dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_COMBINE_OUTPUT]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
    numDirectories = 0
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            numDirectories += len(self.dirs[key])
        else:
            numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
            numDirectories_created += len(self.dirs[key])
        else:
            create_if_not_exists(self.dirs[key])
            numDirectories_created += 1
        while 100 * numDirectories_created >= frac * numDirectories:
            logging.info(" %i%% completed" % frac)
            frac += 1
    logging.info("Done.")
    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)
    for lepton_selection in self.lepton_selections:
        for sample_name, sample_info in self.samples.items():
            if not sample_info["use_it"]:
                continue
            process_name = sample_info["process_name_specific"]
            logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
            is_mc = (sample_info["type"] == "mc")
            inputFileList = inputFileLists[sample_name]
            for central_or_shift in self.central_or_shifts:
                if central_or_shift != "central" and not is_mc:
                    continue
                # build config files for executing analysis code
                key_analyze_dir = getKey(process_name, lepton_selection, central_or_shift)
                for jobId in inputFileList.keys():
                    analyze_job_tuple = (process_name, lepton_selection, central_or_shift, jobId)
                    key_analyze_job = getKey(*analyze_job_tuple)
                    ntupleFiles = inputFileList[jobId]
                    if len(ntupleFiles) == 0:
                        logging.warning("No input ntuples for %s --> skipping job !!" % key_analyze_job)
                        continue
                    cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
                    logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple)
                    rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                        if self.select_rle_output else ""
                    histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple)
                    self.jobOptions_analyze[key_analyze_job] = {
                        'ntupleFiles'              : ntupleFiles,
                        'cfgFile_modified'         : cfgFile_modified_path,
                        'histogramFile'            : histogramFile_path,
                        'logFile'                  : logFile_path,
                        'selEventsFileName_output' : rleOutputFile_path,
                        'leptonSelection'          : lepton_selection,
                        'applyFakeRateWeights'     : "disabled",
                        'central_or_shift'         : central_or_shift,
                    }
                    self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info)
                    # initialize input and output file names for hadd_stage1
                    key_hadd_stage1_dir = getKey(process_name, lepton_selection)
                    hadd_stage1_job_tuple = (process_name, lepton_selection)
                    key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
                    if not key_hadd_stage1_job in self.inputFiles_hadd_stage1.keys():
                        self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                    self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                    self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST],
                        "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple)
                    # initialize input and output file names for hadd_stage2
                    key_hadd_stage2_dir = getKey("hadd", lepton_selection)
                    key_hadd_stage2_job = getKey(lepton_selection)
                    if not key_hadd_stage2_job in self.inputFiles_hadd_stage2.keys():
                        self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
                    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])
                    self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST],
                        "hadd_stage2_%s.root" % lepton_selection)
    logging.info("Creating configuration files to run 'prepareDatacards'")
    self.prep_dcard_processesToCopy = list(self.prep_dcard_processesToCopy)
    self.prep_dcard_signals = list(self.prep_dcard_signals)
    for histogramToFit in self.histograms_to_fit:
        key_hadd_stage2_job = getKey("Tight")
        key_prep_dcard_dir = getKey("prepareDatacards")
        prep_dcard_job_tuple = (self.channel, histogramToFit)
        key_prep_dcard_job = getKey(histogramToFit)
        datacardFile = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple)
        self.jobOptions_prep_dcard[key_prep_dcard_job] = {
            'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2_job],
            'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple),
            'datacardFile'     : datacardFile,
            'histogramDir'     : self.histogramDir_prep_dcard,
            'histogramToFit'   : histogramToFit,
            'label'            : None,
        }
        self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])
        jobOptions_makefile = copy.deepcopy(self.jobOptions_postFit)
        jobOptions_makefile['fit_result'] = os.path.join(self.dirs[DKEY_COMBINE_OUTPUT],
            'fit_{}'.format(histogramToFit), jobOptions_makefile['target'])
        jobOptions_makefile['hadd_stage2'] = self.outputFile_hadd_stage2[key_hadd_stage2_job]
        jobOptions_makefile['prepare_datacard'] = datacardFile
        jobOptions_makefile['data_datacard'] = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD],
            "prepareDatacards_data_%s_%s.root" % prep_dcard_job_tuple)
        jobOptions_makefile['pseudodata_datacard'] = os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD],
            "prepareDatacards_pseudodata_%s_%s.root" % prep_dcard_job_tuple)
        jobOptions_makefile['makefile'] = os.path.join(self.dirs[DKEY_COMBINE_OUTPUT], 'Makefile_{}'.format(histogramToFit))
        jobOptions_makefile['stdout'] = os.path.join(self.dirs[DKEY_COMBINE_OUTPUT], 'stdout_{}.log'.format(histogramToFit))
        self.createCfg_postFit(jobOptions_makefile)
    self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile, make_dependency = "phony_hadd_stage1")
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_postFit(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done.")
    return self.num_jobs
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) is_mc = (sample_info["type"] == "mc") lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"] apply_genWeight = sample_info["apply_genWeight"] if (is_mc and "apply_genWeight" in sample_info.keys()) else False sample_category = sample_info["sample_category"] triggers = sample_info["triggers"] apply_trigger_bits = (is_mc and (self.era == "2015" or (self.era == "2016" and sample_info["reHLT"]))) or not is_mc for lepton_and_hadTau_selection in self.lepton_and_hadTau_selections: lepton_selection = lepton_and_hadTau_selection hadTau_selection = lepton_and_hadTau_selection if self.applyFakeRateWeights == "2lepton": hadTau_selection = "Tight" hadTau_selection = "|".join([ hadTau_selection, self.hadTau_selection_part2 ]) for lepton_and_hadTau_frWeight in self.lepton_and_hadTau_frWeights: if lepton_and_hadTau_frWeight == "enabled" and not lepton_and_hadTau_selection.startswith("Fakeable"): continue if lepton_and_hadTau_selection == "Fakeable_mcClosure" and not lepton_and_hadTau_frWeight == "enabled": continue lepton_and_hadTau_selection_and_frWeight = get_lepton_and_hadTau_selection_and_frWeight(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight) for lepton_charge_selection in self.lepton_charge_selections: for central_or_shift in self.central_or_shifts: inputFileList = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug) for jobId in inputFileList.keys(): if central_or_shift != "central" and not (lepton_and_hadTau_selection.startswith("Tight") and lepton_charge_selection == "SS"): continue if central_or_shift != "central" and not is_mc: continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttH") and sample_category != "signal": continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttW") and sample_category != "TTW": continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttZ") and sample_category != "TTZ": continue key_dir = getKey(sample_name, lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection) key_file = getKey(sample_name, lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection, central_or_shift, jobId) self.ntupleFiles[key_file] = inputFileList[jobId] if len(self.ntupleFiles[key_file]) == 0: print "Warning: ntupleFiles['%s'] = %s --> skipping job !!" 
% (key_file, self.ntupleFiles[key_file]) continue self.cfgFiles_analyze_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%s_%i_cfg.py" % \ (self.channel, process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection, central_or_shift, jobId)) self.histogramFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%s_%s_%i.root" % \ (process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection, central_or_shift, jobId)) self.logFiles_analyze[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%s_%i.log" % \ (self.channel, process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection, central_or_shift, jobId)) self.rleOutputFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%s_%i.txt" % \ (self.channel, process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection, central_or_shift, jobId)) if self.select_rle_output else "" applyFakeRateWeights = self.applyFakeRateWeights if lepton_and_hadTau_frWeight == "disabled": applyFakeRateWeights = "disabled" self.createCfg_analyze(self.ntupleFiles[key_file], self.histogramFiles[key_file], sample_category, self.era, triggers, lepton_selection, self.apply_leptonGenMatching, lepton_charge_selection, hadTau_selection, self.apply_hadTauGenMatching, applyFakeRateWeights, is_mc, central_or_shift, lumi_scale, apply_genWeight, apply_trigger_bits, self.cfgFiles_analyze_modified[key_file], self.rleOutputFiles[key_file]) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch() for key in self.histogramFiles.keys(): self.inputFiles_hadd_stage1.append(self.histogramFiles[key]) logging.info("Creating configuration files for executing 'addBackgrounds'") process_names = [] process_names.extend(self.nonfake_backgrounds) process_names.extend([ "signal", "ttH_htt", "ttH_hww", "ttH_hzz" ]) # sum non-fake contributions for each MC sample separately # input processes: TT2l0j,... # output processes: TT; ... 
for process_name in process_names: for lepton_and_hadTau_selection in self.lepton_and_hadTau_selections: for lepton_and_hadTau_frWeight in self.lepton_and_hadTau_frWeights: if lepton_and_hadTau_frWeight == "enabled" and not lepton_and_hadTau_selection.startswith("Fakeable"): continue if lepton_and_hadTau_selection == "Fakeable_mcClosure" and not lepton_and_hadTau_frWeight == "enabled": continue lepton_and_hadTau_selection_and_frWeight = get_lepton_and_hadTau_selection_and_frWeight(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight) for lepton_charge_selection in self.lepton_charge_selections: key = getKey(process_name, lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection) self.histogramFile_addBackgrounds[key] = os.path.join(self.outputDir, DKEY_HIST, "addBackgrounds_%s_%s_%s_%s.root" % \ (self.channel, process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection)) self.cfgFile_addBackgrounds_modified[key] = os.path.join(self.outputDir, DKEY_CFGS, "addBackgrounds_%s_%s_%s_%s_cfg.py" % \ (self.channel, process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection)) histogramDir = getHistogramDir(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection) processes_input = [ "%s%s" % (process_name, genMatch) for genMatch in self.lepton_and_hadTau_genMatches_nonfakes ] # CV: treat fakes in ttH signal events as "signal", not as "background" ##if process_name in [ "signal", "ttH_htt", "ttH_hww", "ttH_hzz" ]: ## processes_input.extend([ "%s%s" % (process_name, genMatch) for genMatch in self.lepton_and_hadTau_genMatches_fakes ]) self.process_output_addBackgrounds[key] = process_name self.createCfg_addBackgrounds(self.histogramFile_hadd_stage1, self.histogramFile_addBackgrounds[key], self.cfgFile_addBackgrounds_modified[key], [ histogramDir ], processes_input, self.process_output_addBackgrounds[key]) # sum fake contributions for each MC sample separately # input processes: TT1l1j,TT0l2j,... # output processes: fakes_TT; ... 
for process_name in process_names: for lepton_and_hadTau_selection in self.lepton_and_hadTau_selections: for lepton_and_hadTau_frWeight in self.lepton_and_hadTau_frWeights: if lepton_and_hadTau_frWeight == "enabled" and not lepton_and_hadTau_selection.startswith("Fakeable"): continue if lepton_and_hadTau_selection == "Fakeable_mcClosure" and not lepton_and_hadTau_frWeight == "enabled": continue lepton_and_hadTau_selection_and_frWeight = get_lepton_and_hadTau_selection_and_frWeight(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight) for lepton_charge_selection in self.lepton_charge_selections: key = getKey("fakes_%s" % process_name, lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection) self.histogramFile_addBackgrounds[key] = os.path.join(self.outputDir, DKEY_HIST, "addBackgrounds_%s_fakes_%s_%s_%s.root" % \ (self.channel, process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection)) self.cfgFile_addBackgrounds_modified[key] = os.path.join(self.outputDir, DKEY_CFGS, "addBackgrounds_%s_fakes_%s_%s_%s_cfg.py" % \ (self.channel, process_name, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection)) histogramDir = getHistogramDir(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection) processes_input = [ "%s%s" % (process_name, genMatch) for genMatch in self.lepton_and_hadTau_genMatches_fakes ] self.process_output_addBackgrounds[key] = "fakes_%s" % process_name self.createCfg_addBackgrounds(self.histogramFile_hadd_stage1, self.histogramFile_addBackgrounds[key], self.cfgFile_addBackgrounds_modified[key], [ histogramDir ], processes_input, self.process_output_addBackgrounds[key]) # sum fake contributions for the total of all MC samples # input processes: TT1l1j,TT0l2j,... 
# output process: fakes_mc for lepton_and_hadTau_selection in self.lepton_and_hadTau_selections: for lepton_and_hadTau_frWeight in self.lepton_and_hadTau_frWeights: if lepton_and_hadTau_frWeight == "enabled" and not lepton_and_hadTau_selection.startswith("Fakeable"): continue if lepton_and_hadTau_selection == "Fakeable_mcClosure" and not lepton_and_hadTau_frWeight == "enabled": continue lepton_and_hadTau_selection_and_frWeight = get_lepton_and_hadTau_selection_and_frWeight(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight) for lepton_charge_selection in self.lepton_charge_selections: key = getKey(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection) self.histogramFile_addBackgrounds[key] = os.path.join(self.outputDir, DKEY_HIST, "addBackgrounds_%s_fakes_mc_%s_%s.root" % \ (self.channel, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection)) self.cfgFile_addBackgrounds_modified[key] = os.path.join(self.outputDir, DKEY_CFGS, "addBackgrounds_%s_fakes_mc_%s_%s_cfg.py" % \ (self.channel, lepton_and_hadTau_selection_and_frWeight, lepton_charge_selection)) histogramDir = getHistogramDir(lepton_and_hadTau_selection, lepton_and_hadTau_frWeight, lepton_charge_selection) processes_input = [] for process_name in self.nonfake_backgrounds: for genMatch in self.lepton_and_hadTau_genMatches_fakes: processes_input.append("%s%s" % (process_name, genMatch)) self.process_output_addBackgrounds[key] = "fakes_mc" self.createCfg_addBackgrounds(self.histogramFile_hadd_stage1, self.histogramFile_addBackgrounds[key], self.cfgFile_addBackgrounds_modified[key], [ histogramDir ], processes_input, self.process_output_addBackgrounds[key]) logging.info("Creating configuration files for executing 'addBackgroundLeptonFakes'") for lepton_charge_selection in self.lepton_charge_selections: key = getKey("fakes_data", lepton_charge_selection) self.histogramFile_addFakes[key] = os.path.join(self.outputDir, DKEY_HIST, "addBackgroundLeptonFakes_%s_%s.root" % \ (self.channel, lepton_charge_selection)) self.cfgFile_addFakes_modified[key] = os.path.join(self.outputDir, DKEY_CFGS, "addBackgroundLeptonFakes_%s_%s_cfg.py" % \ (self.channel, lepton_charge_selection)) category_signal = "2lss_1tau_%s_Tight" % lepton_charge_selection category_sideband = "2lss_1tau_%s_Fakeable_wFakeRateWeights" % lepton_charge_selection self.createCfg_addFakes(self.histogramFile_hadd_stage1_5, self.histogramFile_addFakes[key], self.cfgFile_addFakes_modified[key], category_signal, category_sideband) logging.info("Creating configuration files for executing 'addBackgroundLeptonFlips'") self.createCfg_addFlips(self.histogramFile_hadd_stage1_5, self.histogramFile_addFlips, self.cfgFile_addFlips_modified) logging.info("Creating configuration files for executing 'prepareDatacards'") for histogramToFit in self.histograms_to_fit: self.createCfg_prep_dcard(histogramToFit) logging.info("Creating configuration files for executing 'makePlots'") self.createCfg_makePlots() if "OS" in self.lepton_charge_selections: make_plots_backgrounds = self.make_plots_backgrounds if "flips_data" in make_plots_backgrounds: make_plots_backgrounds.remove("flips_data") self.createCfg_makePlots(self.histogramDir_prep_dcard_OS, "OS", make_plots_backgrounds) if "Fakeable_mcClosure" in self.lepton_and_hadTau_selections: self.createCfg_makePlots_mcClosure() self.inputFiles_hadd_stage2 = [ self.histogramFile_hadd_stage1_5 ] + self.histogramFile_addFakes.values() + [ self.histogramFile_addFlips ] logging.info("Creating Makefile") lines_makefile = [] 
self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_backgrounds_from_data(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) self.addToMakefile_make_plots(lines_makefile) self.addToMakefile_make_plots_mcClosure(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done")
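The three addBackgrounds blocks above differ only in which gen-match suffixes they sum and into which output process the sum is written. A minimal sketch of the expansion described by the "input processes: TT1l1j,TT0l2j,... / output process: fakes_mc" comments; the background and suffix lists here are hypothetical stand-ins for self.nonfake_backgrounds and self.lepton_and_hadTau_genMatches_fakes:

# Hypothetical stand-ins; the list comprehension mirrors the loop in the code above.
nonfake_backgrounds = ['TT', 'TTW', 'TTZ']
genMatches_fakes = ['1l1j', '0l2j']
processes_input = [
    "%s%s" % (process_name, genMatch)
    for process_name in nonfake_backgrounds
    for genMatch in genMatches_fakes
]
# -> ['TT1l1j', 'TT0l2j', 'TTW1l1j', 'TTW0l2j', 'TTZ1l1j', 'TTZ0l2j'],
# all summed into the single output process 'fakes_mc'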
def create(self): for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] central_or_shifts_extended = [""] central_or_shifts_extended.extend(self.central_or_shifts) central_or_shifts_extended.extend( ["hadd", "copyHistograms", "addBackgrounds"]) for central_or_shift_or_dummy in central_or_shifts_extended: process_name_extended = [process_name, "hadd"] for process_name_or_dummy in process_name_extended: if process_name_or_dummy in [ "hadd" ] and central_or_shift_or_dummy != "": continue if central_or_shift_or_dummy in [ "hadd", "copyHistograms", "addBackgrounds" ] and process_name_or_dummy in ["hadd"]: continue key_dir = getKey(process_name_or_dummy, central_or_shift_or_dummy) for dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_RLES, DKEY_SYNC ]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS]: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, self.channel, process_name_or_dummy, central_or_shift_or_dummy) else: self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, process_name_or_dummy, central_or_shift_or_dummy) for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]: key_dir = getKey(subdirectory) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_DCRD, DKEY_PLOT ]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT]: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, self.channel, subdirectory) else: self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, subdirectory) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_SYNC, DKEY_HADD_RT ]: initDict(self.dirs, [dir_type]) if dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_HADD_RT]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) numDirectories_created = 0 frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100 * numDirectories_created >= frac * numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList( sample_info, self.max_files_per_job) for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) inputFileList = inputFileLists[sample_name] for jobId in inputFileList.keys(): for central_or_shift in self.central_or_shifts: logging.info(" ... 
for systematic uncertainty %s" % central_or_shift) key_analyze_dir = getKey(process_name, central_or_shift) analyze_job_tuple = (process_name, central_or_shift, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: print("Warning: no ntupleFiles --> skipping job !!") continue syncOutput = os.path.join( self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s.root' % (self.channel, central_or_shift)) syncOutputTree = self.output_tree if central_or_shift == "central" else os.path.join( central_or_shift, self.output_tree) self.inputFiles_sync['sync'].append(syncOutput) cfgFile_modified_path = os.path.join( self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join( self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%i.log" % analyze_job_tuple) self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles': ntupleFiles, 'cfgFile_modified': cfgFile_modified_path, 'histogramFile': '', 'logFile': logFile_path, 'syncTree': syncOutputTree, 'syncOutput': syncOutput, 'syncRLE': self.rle_select if self.rle_select and '%s' not in self.rle_select else '', 'useNonNominal': self.use_nonnominal, } self.createCfg_analyze( self.jobOptions_analyze[key_analyze_job], sample_info) logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.sbatchFile_analyze = os.path.join( self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_syncNtuple(lines_makefile) outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel) self.outputFile_sync['sync'] = outputFile_sync_path self.addToMakefile_hadd_sync(lines_makefile) self.targets.extend(self.phoniesToAdd) self.createMakefile(lines_makefile) logging.info("Done") return self.num_jobs
def create(self): """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system """ for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info['use_it']: continue process_name = sample_info["process_name_specific"] is_mc = (sample_info["type"] == "mc") if not is_mc: continue logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable, process_name)) inputFileList = generateInputFileList(sample_info, self.max_files_per_job) key_dir = getKey(process_name) outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO], "%s.root" % process_name) self.outputFiles[process_name] = { 'inputFiles': [], 'outputFile': outputFile, } if os.path.isfile(outputFile) and tools_is_file_ok( outputFile, min_file_size=2000): logging.info('File {} already exists --> skipping job'.format( outputFile)) continue for jobId in inputFileList.keys(): key_file = getKey(sample_name, jobId) self.inputFiles[key_file] = inputFileList[jobId] if len(self.inputFiles[key_file]) == 0: logging.warning("'%s' = %s --> skipping job !!" % (key_file, self.inputFiles[key_file])) continue self.cfgFiles_projection[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "project_%s_%i_cfg.txt" % (process_name, jobId)) self.outputFiles_tmp[key_file] = os.path.join( self.dirs[key_dir][DKEY_HISTO_TMP], "histogram_%i.root" % jobId) self.logFiles_projection[key_file] = os.path.join( self.dirs[key_dir][DKEY_LOGS], "project_%s_%i.log" % (process_name, jobId)) self.scriptFiles_projection[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "project_%s_%i_cfg.sh" % (process_name, jobId)) projection_module = self.projection_module if projection_module == "count": projection_module = "countHistogramAll" if sample_name.startswith('/TTTo'): projection_module += "CompTopRwgt" elif sample_info['sample_category'].startswith('ttH'): projection_module += "CompHTXS" elif isSplitByNlheJet(process_name): projection_module += "SplitByLHENjet" elif isSplitByNlheHT(process_name): projection_module += "SplitByLHEHT" elif isSplitByNlheJetHT(process_name, sample_name): projection_module += "SplitByLHENjetHT" self.jobOptions_sbatch[key_file] = { 'histName': process_name, 'inputFiles': self.inputFiles[key_file], 'cfgFile_path': self.cfgFiles_projection[key_file], 'outputFile': self.outputFiles_tmp[key_file], 'logFile': self.logFiles_projection[key_file], 'scriptFile': self.scriptFiles_projection[key_file], 'projection_module': projection_module, } if self.projection_module != 'puHist': if process_name not in self.ref_genWeights: raise RuntimeError( "Unable to find reference LHE weight for process %s" % process_name) self.jobOptions_sbatch[key_file]['ref_genWeight'] = self.ref_genWeights[process_name] self.createCfg_project(self.jobOptions_sbatch[key_file]) self.outputFiles[process_name]['inputFiles'].append( self.outputFiles_tmp[key_file]) if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable) self.num_jobs['project'] += self.createScript_sbatch( self.executable, self.sbatchFile_projection, self.jobOptions_sbatch) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_project(lines_makefile) self.addToMakefile_hadd(lines_makefile) if self.plot: self.addToMakefile_plot(lines_makefile) 
self.addToMakefile_finalHadd(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done") return self.num_jobs
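The projection-module routing buried in the loop above is easier to read as a pure function. This is only a restatement of the branching for clarity; the helper predicates (isSplitByNlheJet, isSplitByNlheHT, isSplitByNlheJetHT) are the ones used in the source, and the logic is otherwise unchanged:

def resolve_projection_module(projection_module, sample_name, sample_category, process_name):
    # Anything other than the generic "count" module is passed through untouched.
    if projection_module != "count":
        return projection_module
    module = "countHistogramAll"
    if sample_name.startswith('/TTTo'):
        module += "CompTopRwgt"
    elif sample_category.startswith('ttH'):
        module += "CompHTXS"
    elif isSplitByNlheJet(process_name):
        module += "SplitByLHENjet"
    elif isSplitByNlheHT(process_name):
        module += "SplitByLHEHT"
    elif isSplitByNlheJetHT(process_name, sample_name):
        module += "SplitByLHENjetHT"
    return module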
def submitJob(self, inputFiles, executable, cfgFile, outputFilePath, outputFiles, logFile=None, skipIfOutputFileExists=False): """Submits one job to the batch system: writes an sbatch wrapper script for it and executes the submission command """ # derive the name of the script for executing the job from the cfg file name # (must be done before the log-file fallback below, which is based on it) script_file = cfgFile.replace(".py", ".sh") script_file = script_file.replace("_cfg", "") # raise if logfile missing if not logFile: if not self.logFileDir: raise ValueError( "Please call 'setLogFileDir' before calling 'submitJob' !!") logFile = os.path.join(self.logFileDir, os.path.basename( script_file).replace(".sh", ".log")) # if any of the output files exists, return (Margus: BUG? Because only # that file should be skipped, not all?) if skipIfOutputFileExists: for outputFile in outputFiles: if os.path.exists(os.path.join(outputFilePath, outputFile)): print "output file = '%s' exists --> skipping !!" % os.path.join(outputFilePath, outputFile) return if not self.workingDir: raise ValueError( "Please call 'setWorkingDir' before calling 'submitJob' !!") # create scratch dir scratchDir = "/scratch/%s" % getpass.getuser() if not os.path.exists(scratchDir): print "Directory '%s' does not yet exist, creating it !!" % scratchDir run_cmd(command_create_scratchDir) scratchDir = os.path.join( scratchDir, "tthAnalysis" + "_" + date.today().isoformat()) create_if_not_exists(scratchDir) # derive the wrapper and executable log file names and render the job script wrapper_log_file = logFile.replace('.log', '_wrapper.log') executable_log_file = logFile.replace('.log', '_executable.log') command = "%s --partition=%s --output=%s %s" % ( self.command_submit, self.queue, wrapper_log_file, script_file) script = jinja2.Template(job_template).render( working_dir = self.workingDir, scratch_dir = scratchDir, exec_name = executable, cfg_file = cfgFile, inputFiles = " ".join(inputFiles), outputDir = outputFilePath, outputFiles = " ".join(outputFiles), wrapper_log_file = wrapper_log_file, executable_log_file = executable_log_file, RUNNING_COMMAND = command ) print "writing sbatch script file = '%s'" % script_file with codecs.open(script_file, "w", "utf-8") as f: f.write(script) print "<submitJob>: command = %s" % command run_cmd_output = run_cmd(command) print "run_cmd_output: %s" % run_cmd_output # sbatch replies with "Submitted batch job <id>", so the last token is the job id ret_val = run_cmd_output.split()[-1] print "ret_val: %s" % ret_val job_id = ret_val # print " jobId = %s" % jobId self.jobIds.append(job_id)
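For reference, the job-id extraction at the end of submitJob relies on the usual single-line reply of sbatch. A tiny self-contained sketch of that parsing step:

def parse_sbatch_job_id(run_cmd_output):
    # sbatch normally replies with e.g. "Submitted batch job 1234567\n";
    # the job id is the last whitespace-separated token.
    return run_cmd_output.split()[-1]

assert parse_sbatch_job_id("Submitted batch job 1234567\n") == "1234567"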
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for key in self.dirs.keys(): for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) is_mc = (sample_info["type"] == "mc") lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"] sample_category = sample_info["sample_category"] triggers = sample_info["triggers"] apply_trigger_bits = (is_mc and (self.era == "2015" or (self.era == "2016" and sample_info["reHLT"]))) or not is_mc for lepton_selection in self.lepton_selections: for central_or_shift in self.central_or_shifts: inputFileList = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug) for jobId in inputFileList.keys(): if central_or_shift != "central" and not (lepton_selection == "Tight"): continue if central_or_shift != "central" and not is_mc: continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttH") and sample_category != "signal": continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttW") and sample_category != "TTW": continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttZ") and sample_category != "TTZ": continue key_dir = getKey(sample_name, lepton_selection) key_file = getKey(sample_name, lepton_selection, central_or_shift, jobId) self.ntupleFiles[key_file] = inputFileList[jobId] self.cfgFiles_analyze_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % \ (self.channel, process_name, lepton_selection, central_or_shift, jobId)) self.histogramFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%s_%i.root" % \ (process_name, lepton_selection, central_or_shift, jobId)) self.logFiles_analyze[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % \ (self.channel, process_name, lepton_selection, central_or_shift, jobId)) self.rleOutputFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%i.txt" % \ (self.channel, process_name, lepton_selection, central_or_shift, jobId)) if self.select_rle_output else "" self.createCfg_analyze(self.ntupleFiles[key_file], self.histogramFiles[key_file], sample_category, self.era, triggers, lepton_selection, is_mc, central_or_shift, lumi_scale, apply_trigger_bits, self.cfgFiles_analyze_modified[key_file], self.rleOutputFiles[key_file]) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch() print self.sbatchFile_analyze #logging.info("Creating configuration files for executing 'addBackgroundLeptonFakes'") #self.createCfg_addFakes(self.histogramFile_hadd_stage1, self.histogramFile_addFakes, self.cfgFile_addFakes_modified) #logging.info("Creating configuration files for executing 'addBackgroundLeptonFlips'") #self.createCfg_addFlips(self.histogramFile_hadd_stage1, self.histogramFile_addFlips, self.cfgFile_addFlips_modified) logging.info("Creating configuration files for executing 'prepareDatacards'") for histogramToFit in self.histograms_to_fit: 
self.createCfg_prep_dcard(histogramToFit) lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_backgrounds_from_data(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done")
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for key in self.dirs.keys(): for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) ( secondary_files, primary_store, secondary_store ) = self.initializeInputFileIds(sample_name, sample_info) is_mc = (sample_info["type"] == "mc") lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"] sample_category = sample_info["sample_category"] triggers = sample_info["triggers"] for central_or_shift in self.central_or_shifts: for jobId in range(len(self.inputFileIds[sample_name])): if central_or_shift != "central" and not is_mc: continue inputFiles = generate_input_list(self.inputFileIds[sample_name][jobId], secondary_files, primary_store, secondary_store, self.debug) key_dir = getKey(sample_name) key_file = getKey(sample_name, central_or_shift, jobId) self.cfgFiles_analyze_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % \ (self.channel, process_name, central_or_shift, jobId)) self.histogramFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%i.root" % \ (process_name, central_or_shift, jobId)) self.logFiles_analyze[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % \ (self.channel, process_name, central_or_shift, jobId)) self.createCfg_analyze(inputFiles, self.histogramFiles[key_file], sample_category, triggers, self.lepton_selection, self.hadTau_selection, is_mc, central_or_shift, lumi_scale, self.cfgFiles_analyze_modified[key_file]) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch() logging.info("Creating configuration files for executing 'prepareDatacards'") for histogramToFit in self.histograms_to_fit: self.createCfg_prep_dcard(histogramToFit) lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_backgrounds_from_data(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) self.addToMakefile_clean(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done")
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for key in self.dirs.keys(): for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in ["additional_signal_overlap", "background_data_estimate"]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % ( self.executable_analyze, process_name)) (secondary_files, primary_store, secondary_store) = self.initializeInputFileIds( sample_name, sample_info) is_mc = (sample_info["type"] == "mc") lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info[ "xsection"] * self.lumi / sample_info["nof_events"] sample_category = sample_info["sample_category"] triggers = sample_info["triggers"] for lepton_selection in self.lepton_selections: key_dir = getKey(sample_name, lepton_selection) for central_or_shift in self.central_or_shifts: if self.select_root_output: rootOutputSingleFile = os.path.join(self.dirs[key_dir][DKEY_ROOT], "out_%s_%s_%s_%s.root" % (self.channel, process_name, lepton_selection, central_or_shift)) self.rootOutputAux[rootOutputSingleFile] = os.path.join(self.dirs[key_dir][DKEY_ROOT], "out_%s_%s_%s_%s_*.root" % (self.channel, process_name, lepton_selection, central_or_shift)) for jobId in range(len(self.inputFileIds[sample_name])): if central_or_shift != "central" and not is_mc: continue key_dir = getKey(sample_name, lepton_selection) key_file = getKey(sample_name, lepton_selection, central_or_shift, jobId) inputFiles = generate_input_list(self.inputFileIds[sample_name][jobId], secondary_files, primary_store, secondary_store, self.debug) self.ntupleFiles[key_file] = inputFiles self.cfgFiles_analyze_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % (self.channel, process_name, lepton_selection, central_or_shift, jobId)) self.histogramFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%s_%i.root" % (process_name, lepton_selection, central_or_shift, jobId)) self.logFiles_analyze[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % (self.channel, process_name, lepton_selection, central_or_shift, jobId)) self.rleOutputFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%i.txt" % (self.channel, process_name, lepton_selection, central_or_shift, jobId)) if self.select_rle_output else "" self.rootOutputFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_ROOT], "out_%s_%s_%s_%s_%i.root" % (self.channel, process_name, lepton_selection, central_or_shift, jobId)) if self.select_root_output else "" self.createCfg_analyze(inputFiles, self.histogramFiles[key_file], sample_category, self.era, triggers, lepton_selection, is_mc, central_or_shift, lumi_scale, self.cfgFiles_analyze_modified[key_file], self.rleOutputFiles[key_file], self.rootOutputFiles[key_file] ) if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch() lines_makefile = [] self.addToMakefile_analyze(lines_makefile) # # TODO hackfix (Margus) self.datacardFiles['this_value_is_useless_and_not_used_but_somehow_it_is_important_should_be_fixed'] = 'hadd_stage1' self.createMakefile(lines_makefile) logging.info("Done")
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] key_dir = getKey(process_name) for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS]: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, self.channel, "", process_name) else: self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, "", process_name) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: initDict(self.dirs, [dir_type]) if dir_type in [DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_HADD_RT]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) ##print "self.dirs = ", self.dirs for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList( sample_name, sample_info, self.max_files_per_job, self.debug) for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") is_signal = (sample_category == "signal") for central_or_shift in self.central_or_shifts: inputFileList = inputFileLists[sample_name] for jobId in inputFileList.keys(): if central_or_shift != "central" and not is_mc: continue if central_or_shift.startswith( "CMS_ttHl_thu_shape_ttH" ) and sample_category != "signal": continue if central_or_shift.startswith( "CMS_ttHl_thu_shape_ttW" ) and sample_category != "TTW": continue if central_or_shift.startswith( "CMS_ttHl_thu_shape_ttZ" ) and sample_category != "TTZ": continue # build config files for executing analysis code key_dir = getKey(process_name) key_analyze_job = getKey(process_name, central_or_shift, jobId) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: print "Warning: ntupleFiles['%s'] = %s --> skipping job !!" 
% (key_analyze_job, ntupleFiles) continue self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles' : ntupleFiles, 'cfgFile_modified' : os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % \ (self.channel, process_name, central_or_shift, jobId)), 'histogramFile' : os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%i.root" % \ (process_name, central_or_shift, jobId)), 'logFile' : os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % \ (self.channel, process_name, central_or_shift, jobId)), 'rleOutputFile' : os.path.join(self.dirs[key_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % \ (self.channel, process_name, central_or_shift, jobId)) if self.select_rle_output else "", 'sample_category' : sample_category, 'triggers' : sample_info["triggers"], 'hadTau_selection' : self.hadTau_selection_part2, ##'use_HIP_mitigation_mediumMuonId' : sample_info["use_HIP_mitigation_mediumMuonId"], 'use_HIP_mitigation_mediumMuonId' : True, 'is_mc' : is_mc, 'central_or_shift' : central_or_shift, 'lumi_scale' : 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"], 'apply_genWeight' : sample_info["genWeight"] if (is_mc and "genWeight" in sample_info.keys()) else False, 'apply_trigger_bits' : (is_mc and (self.era == "2015" or (self.era == "2016" and sample_info["reHLT"]))) or not is_mc } self.createCfg_analyze( self.jobOptions_analyze[key_analyze_job]) # initialize input and output file names for hadd_stage1 key_hadd_stage1 = getKey(process_name) if not key_hadd_stage1 in self.inputFiles_hadd_stage1.keys( ): self.inputFiles_hadd_stage1[key_hadd_stage1] = [] self.inputFiles_hadd_stage1[key_hadd_stage1].append( self.jobOptions_analyze[key_analyze_job] ['histogramFile']) self.outputFile_hadd_stage1[key_hadd_stage1] = os.path.join(self.dirs[DKEY_HIST], "histograms_harvested_stage1_%s_%s.root" % \ (self.channel, process_name)) # initialize input and output file names for hadd_stage2 key_hadd_stage1 = getKey(process_name) key_hadd_stage2 = getKey("all") if not key_hadd_stage2 in self.inputFiles_hadd_stage2.keys(): self.inputFiles_hadd_stage2[key_hadd_stage2] = [] self.inputFiles_hadd_stage2[key_hadd_stage2].append( self.outputFile_hadd_stage1[key_hadd_stage1]) self.outputFile_hadd_stage2[key_hadd_stage2] = os.path.join(self.dirs[DKEY_HIST], "histograms_harvested_stage2_%s.root" % \ (self.channel)) logging.info("Creating configuration files to run 'prepareDatacards'") for evtSelection in self.evtSelections: for histogramToFit in self.histograms_to_fit: key_prep_dcard_job = getKey(evtSelection, histogramToFit) key_hadd_stage2 = getKey("all") self.jobOptions_prep_dcard[key_prep_dcard_job] = { 'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2], 'cfgFile_modified': os.path.join( self.dirs[DKEY_CFGS], "prepareDatacards_%s_%s_%s_cfg.py" % (self.channel, evtSelection, histogramToFit)), 'datacardFile': os.path.join( self.dirs[DKEY_DCRD], "prepareDatacards_%s_%s_%s.root" % (self.channel, evtSelection, histogramToFit)), 'histogramDir': "_".join([self.histogramDir_prep_dcard, evtSelection]), 'histogramToFit': histogramToFit, 'label': None } self.createCfg_prep_dcard( self.jobOptions_prep_dcard[key_prep_dcard_job]) logging.info("Creating configuration files to run 'makePlots'") for evtSelection in self.evtSelections: key_makePlots_job = getKey(evtSelection) key_hadd_stage2 = getKey("all") self.jobOptions_make_plots[key_makePlots_job] = { 'executable': self.executable_make_plots, 'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2], 
'cfgFile_modified': os.path.join( self.dirs[DKEY_CFGS], "makePlots_%s_%s_cfg.py" % (self.channel, evtSelection)), 'outputFile': os.path.join( self.dirs[DKEY_PLOT], "makePlots_%s_%s.png" % (self.channel, evtSelection)), 'histogramDir': "_".join([self.histogramDir_prep_dcard, evtSelection]), 'label': evtSelection, 'make_plots_backgrounds': self.make_plots_backgrounds } self.createCfg_makePlots( self.jobOptions_make_plots[key_makePlots_job]) if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.sbatchFile_analyze = os.path.join( self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.createScript_sbatch() logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) self.addToMakefile_make_plots(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done")
def __init__(self, treeName, outputDir, cfgDir, executable_addMEM, samples, era, debug, running_method, max_files_per_job, mem_integrations_per_job, max_mem_integrations, num_parallel_jobs, leptonSelection, hadTauSelection, isForBDTtraining, channel, pool_id=''): self.treeName = treeName self.outputDir = outputDir self.cfgDir = cfgDir self.executable_addMEM = executable_addMEM self.mem_integrations_per_job = mem_integrations_per_job self.max_files_per_job = max_files_per_job self.max_mem_integrations = max_mem_integrations self.samples = samples self.era = era self.debug = debug self.channel = channel self.leptonSelection = leptonSelection self.hadTauSelection = hadTauSelection self.hadTauDefinition = self.hadTauSelection.split('|')[0] self.hadTauWorkingPoint = self.hadTauSelection.split('|')[1] self.maxPermutations_branchName = "maxPermutations_addMEM_%s_lep%s_tau%s_%s" % ( self.channel, self.leptonSelection, self.hadTauDefinition, self.hadTauWorkingPoint, ) self.isForBDTtraining = isForBDTtraining if running_method.lower() not in ["sbatch", "makefile"]: raise ValueError("Invalid running method: %s" % running_method) self.running_method = running_method self.is_sbatch = False self.is_makefile = False if self.running_method.lower() == "sbatch": self.is_sbatch = True else: self.is_makefile = True self.makefile = os.path.join(self.cfgDir, "Makefile_%s" % self.channel) self.num_parallel_jobs = num_parallel_jobs self.pool_id = pool_id if pool_id else uuid.uuid4() self.workingDir = os.getcwd() logging.info("Working directory is: {workingDir}".format( workingDir=self.workingDir)) for dirPath in [self.outputDir, self.cfgDir]: create_if_not_exists(dirPath) self.stdout_file = codecs.open( os.path.join(self.cfgDir, "stdout_%s.log" % self.channel), 'w', 'utf-8') self.stderr_file = codecs.open( os.path.join(self.cfgDir, "stderr_%s.log" % self.channel), 'w', 'utf-8') self.dirs = {} self.samples = samples self.cfgFiles_addMEM_modified = {} self.shFiles_addMEM_modified = {} self.logFiles_addMEM = {} self.sbatchFile_addMEM = os.path.join( self.cfgDir, "sbatch_addMEM_%s.py" % self.channel) self.inputFiles = {} self.outputFiles = {} self.hadd_records = {} self.filesToClean = [] for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or \ sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] key_dir = getKey(sample_name) for dir_type in [DKEY_NTUPLES, DKEY_FINAL_NTUPLES]: initDict(self.dirs, [key_dir, dir_type]) self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, process_name) for dir_type in [DKEY_CFGS, DKEY_LOGS, DKEY_HADD, DKEY_HADD_RT]: initDict(self.dirs, [key_dir, dir_type]) self.dirs[key_dir][dir_type] = os.path.join( self.cfgDir, dir_type, self.channel, process_name) self.cvmfs_error_log = {}
def create(self): """Creates all necessary config files and runs the PU profile production -- either locally or on the batch system """ for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info['use_it']: continue process_name = sample_info["process_name_specific"] is_mc = (sample_info["type"] == "mc") if not is_mc: continue logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable, process_name)) inputFileList = generateInputFileList(sample_info, self.max_files_per_job) key_dir = getKey(process_name) outputFile = os.path.join(self.dirs[key_dir][DKEY_HISTO], "%s.root" % process_name) if os.path.isfile(outputFile) and tools_is_file_ok( outputFile, min_file_size=2000): logging.info('File {} already exists --> skipping job'.format( outputFile)) continue self.outputFiles[process_name] = { 'inputFiles': [], 'outputFile': outputFile } for jobId in inputFileList.keys(): key_file = getKey(sample_name, jobId) self.inputFiles[key_file] = inputFileList[jobId] if len(self.inputFiles[key_file]) == 0: logging.warning( "ntupleFiles['%s'] = %s --> skipping job !!" % (key_file, self.inputFiles[key_file])) continue self.cfgFiles_puProfile[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "puProfile_%s_%i_cfg.txt" % (process_name, jobId)) self.outputFiles_tmp[key_file] = os.path.join( self.dirs[key_dir][DKEY_HISTO_TMP], "histogram_%i.root" % jobId) self.logFiles_puProfile[key_file] = os.path.join( self.dirs[key_dir][DKEY_LOGS], "puProfile_%s_%i.log" % (process_name, jobId)) self.scriptFiles_puProfile[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "puProfile_%s_%i_cfg.sh" % (process_name, jobId)) self.jobOptions_sbatch[key_file] = { 'histName': process_name, 'inputFiles': self.inputFiles[key_file], 'cfgFile_path': self.cfgFiles_puProfile[key_file], 'outputFile': self.outputFiles_tmp[key_file], 'logFile': self.logFiles_puProfile[key_file], 'scriptFile': self.scriptFiles_puProfile[key_file], } self.createCfg_puProfile(self.jobOptions_sbatch[key_file]) self.outputFiles[process_name]['inputFiles'].append( self.outputFiles_tmp[key_file]) if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable) self.num_jobs['puProfile'] += self.createScript_sbatch( self.executable, self.sbatchFile_puProfile, self.jobOptions_sbatch) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_puProfile(lines_makefile) self.addToMakefile_hadd(lines_makefile) self.addToMakefile_plot(lines_makefile) self.addToMakefile_finalHadd(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done") return self.num_jobs
def create(self): """Creates all necessary config files and runs the MEM -- either locally or on the batch system """ statistics = {} for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) # read the file in, sample-by-sample # build the dictionary recursively # add rle file also to generated cfg files # print integrations per job as well! # consider more than 1 file per job -- the jobs are split by MEM integration anyways for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or \ sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue if not os.path.exists(sample_info['local_paths'][0]['path']): logging.warning("Skipping sample {sample_name}".format( sample_name=sample_name)) continue process_name = sample_info["process_name_specific"] logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable_addMEM, process_name)) is_mc = (sample_info["type"] == "mc") inputFileList = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug) # typically, the analysis ends here and starts looping b/c the smallest unit of work processes # at least one file; we need, however, to split the file into event ranges in such a way that # each job performs mem_integrations_per_job MEM integrations # so what we are going to do is to open each set of files in inputFileList, read the variable # requestMEM_*l_*tau and try to gather the event ranges such that each event range # performs up to mem_integrations_per_job integrations per job memEvtRangeDict = self.memJobList(inputFileList) for jobId in memEvtRangeDict.keys(): key_dir = getKey(sample_name) key_file = getKey(sample_name, jobId) self.inputFiles[key_file] = memEvtRangeDict[jobId][ 'input_fileset'] # there should always be at least one input file per job assert len(self.inputFiles[key_file]) > 0, "No input files for job %s !!" % key_file #assert(len(self.inputFiles[key_file]) == 1), "There is more than one input file!" self.cfgFiles_addMEM_modified[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i_cfg.py" % (self.channel, process_name, jobId)) self.shFiles_addMEM_modified[key_file] = os.path.join( self.dirs[key_dir][DKEY_CFGS], "addMEM_%s_%s_%i.sh" % (self.channel, process_name, jobId)) self.outputFiles[key_file] = os.path.join( self.dirs[key_dir][DKEY_NTUPLES], "%s_%i.root" % (process_name, jobId)) self.logFiles_addMEM[key_file] = os.path.join( self.dirs[key_dir][DKEY_LOGS], "addMEM_%s_%s_%i.log" % (self.channel, process_name, jobId)) self.createCfg_addMEM( self.inputFiles[key_file], memEvtRangeDict[jobId]['event_range'][0], memEvtRangeDict[jobId]['event_range'][1], self.outputFiles[key_file], self.era, is_mc, self.cfgFiles_addMEM_modified[key_file], ) # associate the output file with the fileset_id #UPDATE: ONE OUTPUT FILE PER SAMPLE! 
fileset_id = memEvtRangeDict[jobId]['fileset_id'] hadd_output_dir = os.path.join( self.dirs[key_dir][DKEY_FINAL_NTUPLES], '%04d' % (fileset_id // 1000)) if not os.path.exists(hadd_output_dir): os.makedirs(hadd_output_dir) hadd_output = os.path.join( hadd_output_dir, '%s_%i.root' % ('tree', fileset_id) # UPDATE: ADDED #hadd_output_dir, "tree.root" # UPDATE: REMOVED ) if hadd_output not in self.hadd_records: self.hadd_records[hadd_output] = {} self.hadd_records[hadd_output]['output_files'] = [] self.hadd_records[hadd_output]['fileset_id'] = fileset_id self.hadd_records[hadd_output]['output_files'].append( self.outputFiles[key_file]) self.hadd_records[hadd_output]['process_name'] = process_name #self.filesToClean.append(self.outputFiles[key_file]) # let's sum the number of integrations per sample nofEntriesMap = {} for v in memEvtRangeDict.values(): if v['fileset_id'] not in nofEntriesMap: nofEntriesMap[v['fileset_id']] = { 'nof_entries': v['nof_entries'], } statistics[process_name] = { 'nof_int': sum([entry['nof_int'] for entry in memEvtRangeDict.values()]), 'nof_entries': sum([entry['nof_entries'] for entry in nofEntriesMap.values()]), 'nof_events_pass': sum([ entry['nof_events_pass'] for entry in memEvtRangeDict.values() ]), 'nof_int_pass': sum([ entry['nof_int_pass'] for entry in memEvtRangeDict.values() ]), 'nof_zero': sum([entry['nof_zero'] for entry in memEvtRangeDict.values()]), 'nof_jobs': len(memEvtRangeDict), } if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_addMEM) self.createScript_sbatch() logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_addMEM(lines_makefile) self.addToMakefile_hadd(lines_makefile) self.createMakefile(lines_makefile) ws_len = max([len(kk) + 1 for kk in statistics.keys()]) total_nof_integrations_sum = sum(x['nof_int'] for x in statistics.values()) total_nof_entries = sum(x['nof_entries'] for x in statistics.values()) total_nof_zero_int = sum(x['nof_zero'] for x in statistics.values()) total_nof_jobs = sum(x['nof_jobs'] for x in statistics.values()) total_nof_pass = sum(x['nof_events_pass'] for x in statistics.values()) total_nof_int_pass_avg = float( sum(x['nof_int_pass'] for x in statistics.values())) / total_nof_pass total_nof_integrations_avg = float( total_nof_integrations_sum) / total_nof_entries total_nof_int_per_job = float( total_nof_integrations_sum) / total_nof_jobs for k, v in statistics.iteritems(): if v['nof_entries'] == 0: int_per_event = 0. evt_pass = 0. else: int_per_event = float(v['nof_int']) / v['nof_entries'] evt_pass = (100 * float(v['nof_events_pass']) / v['nof_entries']) if v['nof_events_pass'] == 0: nof_int_pass = 0. 
else: nof_int_pass = float(v['nof_int_pass']) / v['nof_events_pass'] print( '%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d (%.2f%%) evt pass; %.2f int/evt pass; %d evt 0int)' % ( k, ' ' * (ws_len - len(k)), v['nof_int'], v['nof_entries'], v['nof_jobs'], int_per_event, v['nof_events_pass'], evt_pass, nof_int_pass, v['nof_zero'], )) print( '%s%s: %d (%d entries; %d jobs; %.2f int/evt; %d evt pass; %.2f int/evt pass; ' '%.2f int/job pass; %d evt 0int)' % ( 'total', ' ' * (ws_len - len('total')), total_nof_integrations_sum, total_nof_entires, total_nof_jobs, total_nof_integrations_avg, total_nof_pass, total_nof_int_pass_avg, total_nof_int_per_job, total_nof_zero_int, )) if self.max_mem_integrations > 0 and total_nof_integrations_sum > self.max_mem_integrations: logging.error( "Will not start the jobs (max nof integrations exceeded)!") return False else: logging.info("Done") return True
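# --- Illustrative sketch (not part of the workflow) ---------------------------
# The comments in create() above describe splitting the input files into event
# ranges so that each job performs up to mem_integrations_per_job MEM
# integrations. A minimal model of that splitting logic, assuming a flat list
# of per-event integration counts; 'nof_int_per_event' and 'max_int_per_job'
# are hypothetical stand-ins for the values read from the requestMEM_*l_*tau
# branch and for self.mem_integrations_per_job:
def split_into_event_ranges(nof_int_per_event, max_int_per_job):
    ranges = []  # list of half-open event ranges [first, last)
    start, acc = 0, 0
    for idx, nof_int in enumerate(nof_int_per_event):
        # close the current range once adding this event would exceed the budget;
        # the idx > start guard keeps a single over-budget event in its own range
        if acc + nof_int > max_int_per_job and idx > start:
            ranges.append((start, idx))
            start, acc = idx, 0
        acc += nof_int
    if start < len(nof_int_per_event):
        ranges.append((start, len(nof_int_per_event)))
    return ranges

# e.g. split_into_event_ranges([2, 2, 2, 0, 3], 5) -> [(0, 2), (2, 5)]
# (4 integrations in the first job, 5 in the second, both within the budget)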
def create(self):
    """Creates all necessary config files and runs the Ntuple production -- either locally or on the batch system
    """
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])

    self.inputFileIds = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        process_name = sample_info["process_name_specific"]
        is_mc = (sample_info["type"] == "mc")
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_prodNtuple, process_name))

        inputFileList = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug)
        key_dir = getKey(sample_name)

        # group the output ntuples into subdirectories of at most 1000 jobs each
        subDirs = list(map(
            lambda y: os.path.join(self.dirs[key_dir][DKEY_NTUPLES], '%04d' % y),
            set(map(lambda x: x // 1000, inputFileList.keys())),
        ))
        for subDir in subDirs:
            create_if_not_exists(subDir)
        for jobId in inputFileList.keys():
            key_file = getKey(sample_name, jobId)

            self.inputFiles[key_file] = inputFileList[jobId]
            if len(self.inputFiles[key_file]) == 0:
                logging.warning("ntupleFiles['%s'] = %s --> skipping job !!" % (key_file, self.inputFiles[key_file]))
                continue
            self.cfgFiles_prodNtuple_modified[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_CFGS], "produceNtuple_%s_%i_cfg.py" % (process_name, jobId))
            self.outputFiles[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_NTUPLES], "%04d" % (jobId // 1000), "tree_%i.root" % jobId)
            self.logFiles_prodNtuple[key_file] = os.path.join(
                self.dirs[key_dir][DKEY_LOGS], "produceNtuple_%s_%i.log" % (process_name, jobId))
            jobOptions = {
                'inputFiles'       : self.inputFiles[key_file],
                'cfgFile_modified' : self.cfgFiles_prodNtuple_modified[key_file],
                'outputFile'       : self.outputFiles[key_file],
                ##'use_HIP_mitigation_mediumMuonId' : sample_info["use_HIP_mitigation_mediumMuonId"],
                'use_HIP_mitigation_mediumMuonId' : True,
                'is_mc'            : is_mc,
                'random_seed'      : jobId,
            }
            self.createCfg_prodNtuple(jobOptions)

    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_prodNtuple)
        self.createScript_sbatch()

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_prodNtuple(lines_makefile)
    #self.addToMakefile_clean(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done")
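# --- Illustrative sketch (not part of the workflow) ---------------------------
# Both this class and the addMEM workflow above bucket job outputs into
# '%04d'-named subdirectories holding at most 1000 entries each, so no single
# directory accumulates an unbounded number of ROOT files. A hypothetical
# helper mirroring the '%04d' % (jobId // 1000) expression used above:
def ntuple_subdir(job_id):
    # 1000 jobs per bucket; '%04d' pads to four digits ('0000', '0001', ...)
    return '%04d' % (job_id // 1000)

assert ntuple_subdir(999) == '0000'
assert ntuple_subdir(1234) == '0001'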
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        sample_category = sample_info["sample_category"]
        is_mc = (sample_info["type"] == "mc")
        process_name = sample_info["process_name_specific"]

        logging.info("Building dictionaries for sample %s..." % process_name)
        for lepton_selection in self.lepton_selections:
            for lepton_frWeight in self.lepton_frWeights:
                if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
                    continue
                if lepton_frWeight == "disabled" and lepton_selection not in [ "Tight", "forBDTtraining" ]:
                    continue
                lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)
                for leptonChargeSelection in self.leptonChargeSelections:
                    central_or_shift_extensions = [ "", "hadd", "addBackgrounds" ]
                    central_or_shift_dedicated = self.central_or_shifts if self.runTHweights(sample_info) else self.central_or_shifts_external
                    central_or_shifts_extended = central_or_shift_extensions + central_or_shift_dedicated
                    for central_or_shift_or_dummy in central_or_shifts_extended:
                        process_name_extended = [ process_name, "hadd" ]
                        for process_name_or_dummy in process_name_extended:
                            if central_or_shift_or_dummy in [ "hadd", "addBackgrounds" ] and process_name_or_dummy in [ "hadd" ]:
                                continue
                            if central_or_shift_or_dummy not in central_or_shift_extensions and not self.accept_systematics(
                                central_or_shift_or_dummy, is_mc, lepton_selection, leptonChargeSelection, sample_info
                            ):
                                continue
                            key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, leptonChargeSelection, central_or_shift_or_dummy)
                            for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES, DKEY_SYNC ]:
                                if dir_type == DKEY_SYNC and not self.do_sync:
                                    continue
                                initDict(self.dirs, [ key_dir, dir_type ])
                                if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                                    self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel,
                                        "_".join([ lepton_selection_and_frWeight, leptonChargeSelection ]), process_name_or_dummy, central_or_shift_or_dummy)
                                else:
                                    self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel,
                                        "_".join([ lepton_selection_and_frWeight, leptonChargeSelection ]), process_name_or_dummy)
    for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]:
        key_dir = getKey(subdirectory)
        for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
            initDict(self.dirs, [ key_dir, dir_type ])
            if dir_type in [ DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory)
            else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]:
        if dir_type == DKEY_SYNC and not self.do_sync:
            continue
        initDict(self.dirs, [ dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)

    numDirectories = 0
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            numDirectories += len(self.dirs[key])
        else:
            numDirectories += 1
    logging.info("Creating directory structure (numDirectories = %i)" % numDirectories)
    numDirectories_created = 0
    frac = 1
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
            numDirectories_created += len(self.dirs[key])
        else:
            create_if_not_exists(self.dirs[key])
            numDirectories_created += 1
        # log the progress in steps of one percent
        while 100 * numDirectories_created >= frac * numDirectories:
            logging.info(" %i%% completed" % frac)
            frac += 1
    logging.info("Done.")

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job)

    for lepton_selection in self.lepton_selections:
        hadTau_selection = "Tight"
        hadTau_selection = "|".join([ hadTau_selection, self.hadTau_selection_part2 ])

        electron_selection = lepton_selection
        muon_selection = lepton_selection
        if lepton_selection == "forBDTtraining":
            electron_selection = "Loose"
            muon_selection = "Loose"
        elif lepton_selection == "Fakeable_mcClosure_e":
            electron_selection = "Fakeable"
            muon_selection = "Tight"
        elif lepton_selection == "Fakeable_mcClosure_m":
            electron_selection = "Tight"
            muon_selection = "Fakeable"

        for lepton_frWeight in self.lepton_frWeights:
            if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"):
                continue
            if lepton_frWeight == "disabled" and lepton_selection not in [ "Tight", "forBDTtraining" ]:
                continue
            lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight)

            for leptonChargeSelection in self.leptonChargeSelections:
                for sample_name, sample_info in self.samples.items():
                    if not sample_info["use_it"]:
                        continue
                    process_name = sample_info["process_name_specific"]
                    logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
                    inputFileList = inputFileLists[sample_name]
                    sample_category = sample_info["sample_category"]
                    is_mc = (sample_info["type"] == "mc")
                    use_th_weights = self.runTHweights(sample_info)

                    central_or_shift_dedicated = self.central_or_shifts if use_th_weights else self.central_or_shifts_external
                    for central_or_shift in central_or_shift_dedicated:
                        if not self.accept_systematics(central_or_shift, is_mc, lepton_selection, leptonChargeSelection, sample_info):
                            continue

                        central_or_shifts_local = []
                        if central_or_shift == "central" and not use_th_weights:
                            for central_or_shift_local in self.central_or_shifts_internal:
                                if self.accept_systematics(central_or_shift_local, is_mc, lepton_selection, leptonChargeSelection, sample_info):
                                    central_or_shifts_local.append(central_or_shift_local)

                        logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift))

                        # build config files for executing analysis code
                        key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, leptonChargeSelection, central_or_shift)
                        for jobId in inputFileList.keys():
                            analyze_job_tuple = (process_name, lepton_selection_and_frWeight, leptonChargeSelection, central_or_shift, jobId)
                            key_analyze_job = getKey(*analyze_job_tuple)
                            ntupleFiles = inputFileList[jobId]
                            if len(ntupleFiles) == 0:
                                logging.warning("No input ntuples for %s --> skipping job !!" % (key_analyze_job))
                                continue

                            cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % analyze_job_tuple)
                            logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % analyze_job_tuple)
                            rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%s_%i.txt" % analyze_job_tuple) \
                                                 if self.select_rle_output else ""
                            histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%s_%i.root" % analyze_job_tuple)
                            applyFakeRateWeights = self.applyFakeRateWeights \
                                if self.isBDTtraining or lepton_selection.find("Tight") == -1 \
                                else "disabled"

                            self.jobOptions_analyze[key_analyze_job] = {
                                'ntupleFiles'              : ntupleFiles,
                                'cfgFile_modified'         : cfgFile_modified_path,
                                'histogramFile'            : histogramFile_path,
                                'logFile'                  : logFile_path,
                                'selEventsFileName_output' : rleOutputFile_path,
                                'electronSelection'        : electron_selection,
                                'muonSelection'            : muon_selection,
                                'apply_leptonGenMatching'  : self.apply_leptonGenMatching,
                                'leptonChargeSelection'    : leptonChargeSelection,
                                'applyFakeRateWeights'     : applyFakeRateWeights,
                                'hadTauSelection'          : hadTau_selection,
                                'central_or_shift'         : central_or_shift,
                                'central_or_shifts_local'  : central_or_shifts_local,
                                'fillGenEvtHistograms'     : True,
                                'apply_hlt_filter'         : self.hlt_filter,
                                'selectBDT'                : self.isBDTtraining,
                            }
                            self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection)

                            # initialize input and output file names for hadd_stage1
                            key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight, leptonChargeSelection)
                            hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight, leptonChargeSelection)
                            key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple)
                            if not key_hadd_stage1_job in self.inputFiles_hadd_stage1:
                                self.inputFiles_hadd_stage1[key_hadd_stage1_job] = []
                            self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
                            self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(
                                self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s_%s.root" % hadd_stage1_job_tuple)

                    if self.isBDTtraining:
                        continue

                    # add output files of hadd_stage1 to list of input files for hadd_stage1_5
                    key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight, leptonChargeSelection)
                    key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight, leptonChargeSelection)
                    hadd_stage1_5_job_tuple = (lepton_selection_and_frWeight, leptonChargeSelection)
                    key_hadd_stage1_5_job = getKey(*hadd_stage1_5_job_tuple)
                    if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5:
                        self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = []
                    self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job])
                    self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(
                        self.dirs[key_hadd_stage1_5_dir][DKEY_HIST], "hadd_stage1_5_%s_%s.root" % hadd_stage1_5_job_tuple)

                if self.isBDTtraining:
                    continue

                # sum fake background contributions for the total of all MC samples
                # input processes: TT_fake, TTW_fake, TTWW_fake, ...
                # output process: fakes_mc
                key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, leptonChargeSelection)
                key_addBackgrounds_dir = getKey("addBackgrounds")
                addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight, leptonChargeSelection)
                key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple)
                sample_categories = []
                sample_categories.extend(self.nonfake_backgrounds)
                processes_input = []
                for sample_category in sample_categories:
                    processes_input.append("%s_fake" % sample_category)
                self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = {
                    'inputFile'        : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
                    'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple),
                    'outputFile'       : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_fakes_tuple),
                    'logFile'          : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_fakes_tuple),
                    'categories'       : [ getHistogramDir(lepton_selection, lepton_frWeight, leptonChargeSelection) ],
                    'processes_input'  : processes_input,
                    'process_output'   : "fakes_mc",
                }
                self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes])

                # sum conversion background contributions for the total of all MC samples
                # input processes: TT_Convs, TTW_Convs, TTWW_Convs, ...
                # output process: Convs
                addBackgrounds_job_Convs_tuple = ("Convs", lepton_selection_and_frWeight, leptonChargeSelection)
                key_addBackgrounds_job_Convs = getKey(*addBackgrounds_job_Convs_tuple)
                processes_input = []
                for sample_category in self.convs_backgrounds:
                    processes_input.append("%s_Convs" % sample_category)
                self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs] = {
                    'inputFile'        : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
                    'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_Convs_tuple),
                    'outputFile'       : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_Convs_tuple),
                    'logFile'          : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s.log" % addBackgrounds_job_Convs_tuple),
                    'categories'       : [ getHistogramDir(lepton_selection, lepton_frWeight, leptonChargeSelection) ],
                    'processes_input'  : processes_input,
                    'process_output'   : "Convs",
                }
                self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs])

                # sum signal contributions from HH->4tau ("tttt"), HH->2W2tau ("wwtt"), and HH->4W ("wwww"),
                # separately for "nonfake" and "fake" contributions
                genMatch_categories = [ "nonfake", "fake" ]
                for genMatch_category in genMatch_categories:
                    for signal_base, signal_input in self.signal_io.items():
                        addBackgrounds_job_signal_tuple = (lepton_selection_and_frWeight, leptonChargeSelection, signal_base, genMatch_category)
                        key_addBackgrounds_job_signal = getKey(*addBackgrounds_job_signal_tuple)
                        if key_addBackgrounds_job_signal in self.jobOptions_addBackgrounds_sum.keys():
                            continue
                        processes_input = signal_input
                        process_output = signal_base
                        if genMatch_category == "fake":
                            processes_input = [ process_input + "_fake" for process_input in processes_input ]
                            process_output += "_fake"
                        self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_signal] = {
                            'inputFile'        : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
                            'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_%s_cfg.py" % addBackgrounds_job_signal_tuple),
                            'outputFile'       : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s_%s.root" % addBackgrounds_job_signal_tuple),
                            'logFile'          : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s_%s_%s.log" % addBackgrounds_job_signal_tuple),
                            'categories'       : [ getHistogramDir(lepton_selection, lepton_frWeight, leptonChargeSelection) ],
                            'processes_input'  : processes_input,
                            'process_output'   : process_output,
                        }
                        self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_signal])

                        key_hadd_stage2_job = getKey(lepton_selection_and_frWeight, leptonChargeSelection)
                        if not key_hadd_stage2_job in self.inputFiles_hadd_stage2:
                            self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
                        if lepton_selection == "Tight":
                            self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_signal]['outputFile'])

                # initialize input and output file names for hadd_stage2
                key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight, leptonChargeSelection)
                key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight, leptonChargeSelection)
                hadd_stage2_job_tuple = (lepton_selection_and_frWeight, leptonChargeSelection)
                key_hadd_stage2_job = getKey(*hadd_stage2_job_tuple)
                if not key_hadd_stage2_job in self.inputFiles_hadd_stage2:
                    self.inputFiles_hadd_stage2[key_hadd_stage2_job] = []
                if lepton_selection == "Tight":
                    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'])
                    self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_Convs]['outputFile'])
                self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job])
                self.outputFile_hadd_stage2[key_hadd_stage2_job] = os.path.join(
                    self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s_%s.root" % hadd_stage2_job_tuple)

    if self.isBDTtraining:
        if self.is_sbatch:
            logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
            self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
            self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
        logging.info("Creating Makefile")
        lines_makefile = []
        self.addToMakefile_analyze(lines_makefile)
        self.addToMakefile_hadd_stage1(lines_makefile)
        self.targets.extend(self.phoniesToAdd)
        self.addToMakefile_validate(lines_makefile)
        self.createMakefile(lines_makefile)
        logging.info("Done.")
        return self.num_jobs

    logging.info("Creating configuration files to run 'addBackgroundFakes'")
    for leptonChargeSelection in self.leptonChargeSelections:
        key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled"), leptonChargeSelection)
        key_addFakes_dir = getKey("addBackgroundLeptonFakes")
        key_addFakes_job = getKey("data_fakes", leptonChargeSelection)
        category_sideband = "hh_4l_%s_Fakeable_wFakeRateWeights" % leptonChargeSelection
        self.jobOptions_addFakes[key_addFakes_job] = {
            'inputFile'         : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job],
            'cfgFile_modified'  : os.path.join(self.dirs[key_addFakes_dir][DKEY_CFGS], "addBackgroundLeptonFakes_%s_cfg.py" % leptonChargeSelection),
            'outputFile'        : os.path.join(self.dirs[key_addFakes_dir][DKEY_HIST], "addBackgroundLeptonFakes_%s.root" % leptonChargeSelection),
            'logFile'           : os.path.join(self.dirs[key_addFakes_dir][DKEY_LOGS], "addBackgroundLeptonFakes_%s.log" % leptonChargeSelection),
            'category_signal'   : "hh_4l_%s_Tight" % leptonChargeSelection,
            'category_sideband' : category_sideband,
        }
        self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job])
        key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), leptonChargeSelection)
        self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile'])

    logging.info("Creating configuration files to run 'prepareDatacards'")
    for histogramToFit in self.histograms_to_fit:
        key_prep_dcard_dir = getKey("prepareDatacards")
        if "OS" in self.leptonChargeSelections:
            key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
            prep_dcard_job_tuple = (self.channel, "OS", histogramToFit)
            key_prep_dcard_job = getKey("OS", histogramToFit)
            self.jobOptions_prep_dcard[key_prep_dcard_job] = {
                'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2_job],
                'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_%s_cfg.py" % prep_dcard_job_tuple),
                'datacardFile'     : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s_%s.root" % prep_dcard_job_tuple),
                'histogramDir'     : self.histogramDir_prep_dcard,
                'histogramToFit'   : histogramToFit,
                'label'            : '4l',
            }
            self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])
        if "SS" in self.leptonChargeSelections:
            key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
            prep_dcard_job_tuple = (self.channel, "SS", histogramToFit)
            key_prep_dcard_job = getKey("SS", histogramToFit)
            self.jobOptions_prep_dcard[key_prep_dcard_job] = {
                'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2_job],
                'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_%s_cfg.py" % prep_dcard_job_tuple),
                'datacardFile'     : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s_%s.root" % prep_dcard_job_tuple),
                'histogramDir'     : self.histogramDir_prep_dcard_SS,
                'histogramToFit'   : histogramToFit,
                'label'            : '4l SS',
            }
            self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job])

        # add shape templates for the following systematic uncertainties:
        #  - 'CMS_ttHl_Clos_norm_e'
        #  - 'CMS_ttHl_Clos_shape_e'
        #  - 'CMS_ttHl_Clos_norm_m'
        #  - 'CMS_ttHl_Clos_shape_m'
        for leptonChargeSelection in self.leptonChargeSelections:
            key_prep_dcard_job = getKey(leptonChargeSelection, histogramToFit)
            key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), leptonChargeSelection)
            key_add_syst_fakerate_dir = getKey("addSystFakeRates")
            add_syst_fakerate_job_tuple = (self.channel, leptonChargeSelection, histogramToFit)
            key_add_syst_fakerate_job = getKey(leptonChargeSelection, histogramToFit)
            self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = {
                'inputFile'            : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'],
                'cfgFile_modified'     : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_%s_cfg.py" % add_syst_fakerate_job_tuple),
                'outputFile'           : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s_%s.root" % add_syst_fakerate_job_tuple),
                'category'             : self.channel,
                'histogramToFit'       : histogramToFit,
                'plots_outputFileName' : os.path.join(self.dirs[DKEY_PLOT], "addSystFakeRates.png"),
            }
            histogramDir_nominal = None
            if leptonChargeSelection == "OS":
                histogramDir_nominal = self.histogramDir_prep_dcard
            elif leptonChargeSelection == "SS":
                histogramDir_nominal = self.histogramDir_prep_dcard_SS
            else:
                raise ValueError("Invalid parameter 'leptonChargeSelection' = %s !!" % leptonChargeSelection)
            for lepton_type in [ 'e', 'm' ]:
                lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type
                if lepton_mcClosure not in self.lepton_selections:
                    continue
                lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled")
                key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight, leptonChargeSelection)
                histogramDir_mcClosure = self.mcClosure_dir['%s_%s' % (lepton_mcClosure, leptonChargeSelection)]
                self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({
                    'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections,
                    'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job],
                    'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit),
                    'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'],
                    'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit),
                })
            self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job])

    logging.info("Creating configuration files to run 'makePlots'")
    key_makePlots_dir = getKey("makePlots")
    if "OS" in self.leptonChargeSelections:
        key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
        key_makePlots_job = getKey("OS")
        self.jobOptions_make_plots[key_makePlots_job] = {
            'executable'             : self.executable_make_plots,
            'inputFile'              : self.outputFile_hadd_stage2[key_hadd_stage2_job],
            'cfgFile_modified'       : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel),
            'outputFile'             : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel),
            'histogramDir'           : self.histogramDir_prep_dcard,
            'label'                  : '4l',
            'make_plots_backgrounds' : self.make_plots_backgrounds,
        }
        self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
    if "SS" in self.leptonChargeSelections:
        key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "SS")
        key_makePlots_job = getKey("SS")
        self.jobOptions_make_plots[key_makePlots_job] = {
            'executable'             : self.executable_make_plots,
            'inputFile'              : self.outputFile_hadd_stage2[key_hadd_stage2_job],
            'cfgFile_modified'       : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_SS_cfg.py" % self.channel),
            'outputFile'             : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s_SS.png" % self.channel),
            'histogramDir'           : self.histogramDir_prep_dcard_SS,
            'label'                  : "4l SS",
            'make_plots_backgrounds' : self.make_plots_backgrounds,
        }
        self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job])
    if "Fakeable_mcClosure" in self.lepton_selections: #TODO
        key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled"), "OS")
        key_makePlots_job = getKey("Fakeable_mcClosure", "OS")
        self.jobOptions_make_plots[key_makePlots_job] = {
            'executable'       : self.executable_make_plots_mcClosure,
            'inputFile'        : self.outputFile_hadd_stage2[key_hadd_stage2_job],
            'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_mcClosure_%s_cfg.py" % self.channel),
            'outputFile'       : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_mcClosure_%s.png" % self.channel),
        }
        self.createCfg_makePlots_mcClosure(self.jobOptions_make_plots[key_makePlots_job])

    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds)
        self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel)
        self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds)
        self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel)
        self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum)
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes)
        self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel)
        self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes)

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.addToMakefile_backgrounds_from_data(lines_makefile)
    self.addToMakefile_hadd_stage2(lines_makefile)
    self.addToMakefile_prep_dcard(lines_makefile)
    self.addToMakefile_add_syst_fakerate(lines_makefile)
    self.addToMakefile_make_plots(lines_makefile)
    self.addToMakefile_validate(lines_makefile)
    self.createMakefile(lines_makefile)
    logging.info("Done.")
    return self.num_jobs
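# --- Illustrative sketch (not part of the workflow) ---------------------------
# The create() method above wires a multi-level hadd chain: per-sample analyze
# outputs are merged in hadd_stage1, the stage1 outputs of one (selection,
# charge) combination plus the addBackgrounds sums then feed hadd_stage1_5 and
# hadd_stage2. A stripped-down model of the accumulation pattern (the file and
# key names below are illustrative, not the framework's actual values):
def register_input(inputs, key, input_file):
    # mirror of the "if key not in dict: dict[key] = []" idiom used above
    inputs.setdefault(key, []).append(input_file)

inputs_hadd_stage1, inputs_hadd_stage2 = {}, {}
for process in ('TT', 'TTW'):
    for job_id in (1, 2):
        register_input(inputs_hadd_stage1, process, 'analyze_%s_%i.root' % (process, job_id))
for process in inputs_hadd_stage1:
    register_input(inputs_hadd_stage2, 'Tight_OS', 'hadd_stage1_%s.root' % process)
# inputs_hadd_stage2 -> {'Tight_OS': ['hadd_stage1_TT.root', 'hadd_stage1_TTW.root']}
# (ordering may vary with the dict implementation)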
            'label'    : inclusive_sample,
        })
    for sample_entry in samples.values():
        sample_name = sample_entry['process_name_specific']
        if sample_name not in valid_samples_to_sum_flat and sample_entry['type'] != 'data':
            valid_samples_to_sum.append({
                'xs'       : sample_entry['xsection'],
                'samples'  : [ sample_name ],
                'nentries' : { sample_name : sample_entry['nof_tree_events'] },
                'label'    : sample_name,
            })

    if not os.path.isdir(args.output):
        if not args.force:
            raise ValueError('Use -f/--force to create output directory %s' % args.output)
        create_if_not_exists(args.output)

    for valid_samples in valid_samples_to_sum:
        input_files = {
            sample_name : {
                'input'    : pattern.format(sample_name = sample_name),
                'nentries' : valid_samples['nentries'][sample_name],
            } for sample_name in valid_samples['samples']
        }
        output_files = [
            os.path.join(args.output, '%s.%s' % (valid_samples['label'], ext)) for ext in args.extension
        ]
        expected_neff = lumi * valid_samples['xs']
        plot(input_files, output_files, valid_samples['label'], expected_neff, args.mode)
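# --- Illustrative sketch (not part of the script) ------------------------------
# expected_neff above is the effective number of events expected for a given
# integrated luminosity: N_eff = L * sigma. A quick sanity check of that
# normalisation with made-up numbers:
lumi_example = 41.5e3  # integrated luminosity in /pb (hypothetical value)
xs_example = 0.5       # cross section in pb (hypothetical value)
print(lumi_example * xs_example)  # -> 20750.0 expected effective events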
def create(self):
    """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system
    """
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
            continue
        process_name = sample_info["process_name_specific"]
        key_dir = getKey(process_name)
        for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]:
            initDict(self.dirs, [ key_dir, dir_type ])
            if dir_type in [ DKEY_CFGS, DKEY_LOGS ]:
                self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, process_name)
            else:
                self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, process_name)
    for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
        initDict(self.dirs, [ dir_type ])
        if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]:
            self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel)
        else:
            self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel)
    ##print "self.dirs = ", self.dirs
    for key in self.dirs.keys():
        if type(self.dirs[key]) == dict:
            for dir_type in self.dirs[key].keys():
                create_if_not_exists(self.dirs[key][dir_type])
        else:
            create_if_not_exists(self.dirs[key])

    inputFileLists = {}
    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
            continue
        logging.info("Checking input files for sample %s" % sample_info["process_name_specific"])
        inputFileLists[sample_name] = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug)

    for sample_name, sample_info in self.samples.items():
        if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]:
            continue
        process_name = sample_info["process_name_specific"]
        logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name))
        sample_category = sample_info["sample_category"]
        is_mc = (sample_info["type"] == "mc")
        inputFileList = inputFileLists[sample_name]
        for jobId in inputFileList.keys():
            ##print "processing sample %s: jobId = %i" % (process_name, jobId)
            # build config files for executing analysis code
            key_dir = getKey(process_name)
            key_analyze_job = getKey(process_name, jobId)
            ntupleFiles = inputFileList[jobId]
            if len(ntupleFiles) == 0:
                logging.warning("ntupleFiles['%s'] = %s --> skipping job !!" % (key_analyze_job, ntupleFiles))
                continue
            self.jobOptions_analyze[key_analyze_job] = {
                'ntupleFiles'       : ntupleFiles,
                'cfgFile_modified'  : os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%i_cfg.py" % (self.channel, process_name, jobId)),
                'histogramFile'     : os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%i.root" % (process_name, jobId)),
                'logFile'           : os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%i.log" % (self.channel, process_name, jobId)),
                'sample_category'   : sample_category,
                'hadTau_selection'  : self.hadTau_selection,
                'use_HIP_mitigation_mediumMuonId' : True,
                'is_mc'             : is_mc,
                'lumi_scale'        : 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"],
                'apply_genWeight'   : sample_info["genWeight"] if (is_mc and "genWeight" in sample_info) else False,
                'selectBDT'         : True,
                'changeBranchNames' : self.changeBranchNames,
            }
            self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job])

            # initialize input and output file names for hadd_stage1
            key_hadd_stage1 = getKey(process_name)
            if not key_hadd_stage1 in self.inputFiles_hadd_stage1:
                self.inputFiles_hadd_stage1[key_hadd_stage1] = []
            self.inputFiles_hadd_stage1[key_hadd_stage1].append(self.jobOptions_analyze[key_analyze_job]['histogramFile'])
            self.outputFile_hadd_stage1[key_hadd_stage1] = os.path.join(
                self.dirs[DKEY_HIST], "histograms_harvested_stage1_%s_%s.root" % (self.channel, process_name))
            self.targets.append(self.outputFile_hadd_stage1[key_hadd_stage1])

    if self.is_sbatch:
        logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze)
        self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel)
        self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze)

    logging.info("Creating Makefile")
    lines_makefile = []
    self.addToMakefile_analyze(lines_makefile)
    self.addToMakefile_hadd_stage1(lines_makefile)
    self.createMakefile(lines_makefile)

    logging.info("Done")
    return self.num_jobs
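# --- Illustrative sketch (not part of the workflow) ---------------------------
# The 'lumi_scale' entry above normalises MC to data: each event is weighted
# by xs * L / N_generated, so the full sample integrates to the expected
# yield. A hypothetical mirror of that expression, with made-up numbers:
def lumi_scale(xsection, lumi, nof_events):
    # per-event weight = cross section [pb] * integrated luminosity [/pb] / generated events
    return xsection * lumi / nof_events

# xs = 0.2 pb, L = 41.5e3 /pb, 1e6 generated events -> 0.0083 per event,
# i.e. the whole sample scales to 0.2 * 41.5e3 = 8300 expected events
assert abs(lumi_scale(0.2, 41.5e3, 1e6) - 0.0083) < 1e-12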
def create(self): """Creates all necessary config files and runs the complete analysis workfow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] for lepton_selection in self.lepton_selections: for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) central_or_shifts_extended = [ "" ] central_or_shifts_extended.extend(self.central_or_shifts) central_or_shifts_extended.extend([ "hadd", "addBackgrounds" ]) for central_or_shift_or_dummy in central_or_shifts_extended: process_name_extended = [ process_name, "hadd" ] for process_name_or_dummy in process_name_extended: key_dir = getKey(process_name_or_dummy, lepton_selection_and_frWeight, central_or_shift_or_dummy) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_RLES, DKEY_SYNC ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, "_".join([ lepton_selection_and_frWeight ]), process_name_or_dummy, central_or_shift_or_dummy) for subdirectory in [ "addBackgrounds", "addBackgroundLeptonFakes", "prepareDatacards", "addSystFakeRates", "makePlots" ]: key_dir = getKey(subdirectory) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_ROOT, DKEY_DCRD, DKEY_PLOT ]: initDict(self.dirs, [ key_dir, dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_LOGS ]: self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, subdirectory) else: self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, subdirectory) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT, DKEY_SYNC ]: initDict(self.dirs, [ dir_type ]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) numDirectories_created = 0; frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100*numDirectories_created >= frac*numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) 
inputFileLists[sample_name] = generateInputFileList(sample_info, self.max_files_per_job) mcClosure_regex = re.compile('Fakeable_mcClosure_(?P<type>m|e)_wFakeRateWeights') for lepton_selection in self.lepton_selections: electron_selection = lepton_selection muon_selection = lepton_selection hadTauVeto_selection = "Tight" hadTauVeto_selection = "|".join([ hadTauVeto_selection, self.hadTauVeto_selection_part2 ]) if lepton_selection == "Fakeable_mcClosure_e": electron_selection = "Fakeable" muon_selection = "Tight" elif lepton_selection == "Fakeable_mcClosure_m": electron_selection = "Tight" muon_selection = "Fakeable" for lepton_frWeight in self.lepton_frWeights: if lepton_frWeight == "enabled" and not lepton_selection.startswith("Fakeable"): continue if lepton_frWeight == "disabled" and not lepton_selection in [ "Tight" ]: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_selection, lepton_frWeight) for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") is_signal = (sample_category == "signal") for central_or_shift in self.central_or_shifts: inputFileList = inputFileLists[sample_name] for jobId in inputFileList.keys(): if central_or_shift != "central": isFR_shape_shift = (central_or_shift in systematics.FR_all) if not ((lepton_selection == "Fakeable" and isFR_shape_shift) or lepton_selection == "Tight"): continue if not is_mc and not isFR_shape_shift: continue if central_or_shift in systematics.LHE().ttH and sample_category != "signal": continue if central_or_shift in systematics.LHE().ttW and sample_category != "TTW": continue if central_or_shift in systematics.LHE().ttZ and sample_category != "TTZ": continue if central_or_shift in systematics.DYMCReweighting and not is_dymc_reweighting(sample_name): continue logging.info(" ... for '%s' and systematic uncertainty option '%s'" % (lepton_selection_and_frWeight, central_or_shift)) # build config files for executing analysis code key_analyze_dir = getKey(process_name, lepton_selection_and_frWeight, central_or_shift) analyze_job_tuple = (process_name, lepton_selection_and_frWeight, central_or_shift, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning("No input ntuples for %s --> skipping job !!" 
% (key_analyze_job)) continue syncOutput = '' syncTree = '' syncRequireGenMatching = True if self.do_sync: mcClosure_match = mcClosure_regex.match(lepton_selection_and_frWeight) if lepton_selection_and_frWeight == 'Tight': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_SR.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_SR' % self.channel syncRequireGenMatching = True elif lepton_selection_and_frWeight == 'Fakeable_wFakeRateWeights': syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_Fake.root' % (self.channel, central_or_shift)) syncTree = 'syncTree_%s_Fake' % self.channel elif mcClosure_match: mcClosure_type = mcClosure_match.group('type') syncOutput = os.path.join(self.dirs[key_analyze_dir][DKEY_SYNC], '%s_%s_mcClosure_%s.root' % (self.channel, central_or_shift, mcClosure_type)) syncTree = 'syncTree_%s_mcClosure_%s' % (self.channel, mcClosure_type) else: continue if syncTree and central_or_shift != "central": syncTree = os.path.join(central_or_shift, syncTree) syncRLE = '' if self.do_sync and self.rle_select: syncRLE = self.rle_select % syncTree if not os.path.isfile(syncRLE): logging.warning("Input RLE file for the sync is missing: %s; skipping the job" % syncRLE) continue if syncOutput: self.inputFiles_sync['sync'].append(syncOutput) cfgFile_modified_path = os.path.join(self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple) rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \ if self.select_rle_output else "" histogramFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple) self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles' : ntupleFiles, 'cfgFile_modified' : cfgFile_modified_path, 'histogramFile' : histogramFile_path, 'logFile' : logFile_path, 'selEventsFileName_output' : rleOutputFile_path, 'electronSelection' : electron_selection, 'muonSelection' : muon_selection, 'apply_leptonGenMatching' : self.apply_leptonGenMatching, 'hadTauSelection_veto' : hadTauVeto_selection, 'applyFakeRateWeights' : self.applyFakeRateWeights if not lepton_selection == "Tight" else "disabled", 'central_or_shift' : central_or_shift, 'syncOutput' : syncOutput, 'syncTree' : syncTree, 'syncRLE' : syncRLE, 'syncRequireGenMatching' : syncRequireGenMatching, 'useNonNominal' : self.use_nonnominal, 'apply_hlt_filter' : self.hlt_filter, } self.createCfg_analyze(self.jobOptions_analyze[key_analyze_job], sample_info, lepton_selection) # initialize input and output file names for hadd_stage1 key_hadd_stage1_dir = getKey(process_name, lepton_selection_and_frWeight) hadd_stage1_job_tuple = (process_name, lepton_selection_and_frWeight) key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple) if not key_hadd_stage1_job in self.inputFiles_hadd_stage1: self.inputFiles_hadd_stage1[key_hadd_stage1_job] = [] self.inputFiles_hadd_stage1[key_hadd_stage1_job].append(self.jobOptions_analyze[key_analyze_job]['histogramFile']) self.outputFile_hadd_stage1[key_hadd_stage1_job] = os.path.join(self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple) if self.do_sync: continue if is_mc: logging.info("Creating configuration files to run 'addBackgrounds' for sample %s" % process_name) sample_categories = [ sample_category ] if is_signal: sample_categories = [ "signal", "ttH", 
"ttH_htt", "ttH_hww", "ttH_hzz", "ttH_hmm", "ttH_hzg" ] for sample_category in sample_categories: # sum non-fake and fake contributions for each MC sample separately genMatch_categories = [ "nonfake", "conversions", "fake" ] for genMatch_category in genMatch_categories: key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight) key_addBackgrounds_dir = getKey(process_name, lepton_selection_and_frWeight, "addBackgrounds") addBackgrounds_job_tuple = None processes_input = None process_output = None if genMatch_category == "nonfake": # sum non-fake contributions for each MC sample separately # input processes: TT3l0g0j,... # output processes: TT; ... if sample_category in [ "signal" ]: lepton_genMatches = [] lepton_genMatches.extend(self.lepton_genMatches_nonfakes) lepton_genMatches.extend(self.lepton_genMatches_conversions) lepton_genMatches.extend(self.lepton_genMatches_fakes) processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in lepton_genMatches ] elif sample_category in [ "ttH" ]: lepton_genMatches = [] lepton_genMatches.extend(self.lepton_genMatches_nonfakes) lepton_genMatches.extend(self.lepton_genMatches_conversions) processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in lepton_genMatches ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in lepton_genMatches ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_nonfakes ] process_output = sample_category addBackgrounds_job_tuple = (process_name, sample_category, lepton_selection_and_frWeight) elif genMatch_category == "conversions": # sum fake contributions for each MC sample separately # input processes: TT2l1g0j, TT1l2g0j, TT0l3g0j; ... # output processes: TT_conversion; ... if sample_category in [ "signal" ]: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ] elif sample_category in [ "ttH" ]: processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_conversions ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_conversions ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_conversions ] process_output = "%s_conversion" % sample_category addBackgrounds_job_tuple = (process_name, "%s_conversion" % sample_category, lepton_selection_and_frWeight) elif genMatch_category == "fake": # sum fake contributions for each MC sample separately # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l2g1j, TT0l1g2j, TT0l0g3j; ... # output processes: TT_fake; ... 
if sample_category in [ "signal" ]: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ] elif sample_category in [ "ttH" ]: processes_input = [] processes_input.extend([ "%s%s" % ("ttH_htt", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hww", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hzz", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hzg", genMatch) for genMatch in self.lepton_genMatches_fakes ]) processes_input.extend([ "%s%s" % ("ttH_hmm", genMatch) for genMatch in self.lepton_genMatches_fakes ]) else: processes_input = [ "%s%s" % (sample_category, genMatch) for genMatch in self.lepton_genMatches_fakes ] process_output = "%s_fake" % sample_category addBackgrounds_job_tuple = (process_name, "%s_fake" % sample_category, lepton_selection_and_frWeight) if processes_input: logging.info(" ...for genMatch option = '%s'" % genMatch_category) key_addBackgrounds_job = getKey(*addBackgrounds_job_tuple) cfgFile_modified = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_%s_cfg.py" % addBackgrounds_job_tuple) outputFile = os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s_%s.root" % addBackgrounds_job_tuple) self.jobOptions_addBackgrounds[key_addBackgrounds_job] = { 'inputFile' : self.outputFile_hadd_stage1[key_hadd_stage1_job], 'cfgFile_modified' : cfgFile_modified, 'outputFile' : outputFile, 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], os.path.basename(cfgFile_modified).replace("_cfg.py", ".log")), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ], 'processes_input' : processes_input, 'process_output' : process_output } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds[key_addBackgrounds_job]) # initialize input and output file names for hadd_stage1_5 key_hadd_stage1_5_dir = getKey("hadd", lepton_selection_and_frWeight) key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight) if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = [] self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.jobOptions_addBackgrounds[key_addBackgrounds_job]['outputFile']) self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job] = os.path.join(self.dirs[key_hadd_stage1_5_dir][DKEY_HIST], "hadd_stage1_5_%s.root" % lepton_selection_and_frWeight) # add output files of hadd_stage1 for data to list of input files for hadd_stage1_5 if not is_mc: key_hadd_stage1_job = getKey(process_name, lepton_selection_and_frWeight) key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight) if not key_hadd_stage1_5_job in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job] = [] self.inputFiles_hadd_stage1_5[key_hadd_stage1_5_job].append(self.outputFile_hadd_stage1[key_hadd_stage1_job]) if self.do_sync: continue # sum fake background contributions for the total of all MC sample # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l3j, TT0l3j, TT0l3j, TT0l3j; ... 
# output process: fakes_mc key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight) key_addBackgrounds_dir = getKey("addBackgrounds") addBackgrounds_job_fakes_tuple = ("fakes_mc", lepton_selection_and_frWeight) key_addBackgrounds_job_fakes = getKey(*addBackgrounds_job_fakes_tuple) sample_categories = [] sample_categories.extend(self.nonfake_backgrounds) sample_categories.extend([ "signal" ]) processes_input = [] for sample_category in sample_categories: processes_input.append("%s_fake" % sample_category) self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_fakes_tuple), 'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_fakes_tuple), 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_fakes_tuple), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ], 'processes_input' : processes_input, 'process_output' : "fakes_mc" } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]) # sum conversion background contributions for the total of all MC sample # input processes: TT2l0g1j, TT1l1g1j, TT1l0g2j, TT0l3j, TT0l3j, TT0l3j, TT0l3j; ... # output process: conversions addBackgrounds_job_conversions_tuple = ("conversions", lepton_selection_and_frWeight) key_addBackgrounds_job_conversions = getKey(*addBackgrounds_job_conversions_tuple) sample_categories = [] sample_categories.extend(self.nonfake_backgrounds) sample_categories.extend([ "signal" ]) processes_input = [] for sample_category in sample_categories: processes_input.append("%s_conversion" % sample_category) self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_CFGS], "addBackgrounds_%s_%s_cfg.py" % addBackgrounds_job_conversions_tuple), 'outputFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_HIST], "addBackgrounds_%s_%s.root" % addBackgrounds_job_conversions_tuple), 'logFile' : os.path.join(self.dirs[key_addBackgrounds_dir][DKEY_LOGS], "addBackgrounds_%s_%s.log" % addBackgrounds_job_conversions_tuple), 'categories' : [ getHistogramDir(lepton_selection, lepton_frWeight) ], 'processes_input' : processes_input, 'process_output' : "conversions" } self.createCfg_addBackgrounds(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions]) # initialize input and output file names for hadd_stage2 key_hadd_stage1_5_job = getKey(lepton_selection_and_frWeight) key_hadd_stage2_dir = getKey("hadd", lepton_selection_and_frWeight) key_hadd_stage2_job = getKey(lepton_selection_and_frWeight) if not key_hadd_stage2_job in self.inputFiles_hadd_stage2: self.inputFiles_hadd_stage2[key_hadd_stage2_job] = [] if lepton_selection == "Tight": self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile']) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_conversions]['outputFile']) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job]) self.outputFile_hadd_stage2[key_hadd_stage2_job] = 
os.path.join(self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s.root" % lepton_selection_and_frWeight) if self.do_sync: if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.createScript_sbatch_syncNtuple(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_syncNtuple(lines_makefile) outputFile_sync_path = os.path.join(self.outputDir, DKEY_SYNC, '%s.root' % self.channel) self.outputFile_sync['sync'] = outputFile_sync_path self.targets.append(outputFile_sync_path) self.addToMakefile_hadd_sync(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs logging.info("Creating configuration files to run 'addBackgroundFakes'") key_hadd_stage1_5_job = getKey(get_lepton_selection_and_frWeight("Fakeable", "enabled")) key_addFakes_job = getKey("fakes_data") category_sideband = "ttZctrl_Fakeable_wFakeRateWeights" self.jobOptions_addFakes[key_addFakes_job] = { 'inputFile' : self.outputFile_hadd_stage1_5[key_hadd_stage1_5_job], 'cfgFile_modified' : os.path.join(self.dirs[DKEY_CFGS], "addBackgroundLeptonFakes_cfg.py"), 'outputFile' : os.path.join(self.dirs[DKEY_HIST], "addBackgroundLeptonFakes.root"), 'logFile' : os.path.join(self.dirs[DKEY_LOGS], "addBackgroundLeptonFakes.log"), 'category_signal' : "ttZctrl_Tight", 'category_sideband' : category_sideband } self.createCfg_addFakes(self.jobOptions_addFakes[key_addFakes_job]) key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled")) self.inputFiles_hadd_stage2[key_hadd_stage2_job].append(self.jobOptions_addFakes[key_addFakes_job]['outputFile']) logging.info("Creating configuration files to run 'prepareDatacards'") for histogramToFit in self.histograms_to_fit: key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled")) key_prep_dcard_dir = getKey("prepareDatacards") prep_dcard_job_tuple = (self.channel, histogramToFit) key_prep_dcard_job = getKey(histogramToFit) self.jobOptions_prep_dcard[key_prep_dcard_job] = { 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_CFGS], "prepareDatacards_%s_%s_cfg.py" % prep_dcard_job_tuple), 'datacardFile' : os.path.join(self.dirs[key_prep_dcard_dir][DKEY_DCRD], "prepareDatacards_%s_%s.root" % prep_dcard_job_tuple), 'histogramDir' : self.histogramDir_prep_dcard, 'histogramToFit' : histogramToFit, 'label' : None } self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job]) # add shape templates for the following systematic uncertainties: # - 'CMS_ttHl_Clos_norm_e' # - 'CMS_ttHl_Clos_shape_e' # - 'CMS_ttHl_Clos_norm_m' # - 'CMS_ttHl_Clos_shape_m' key_prep_dcard_job = getKey(histogramToFit) key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled")) key_add_syst_fakerate_dir = getKey("addSystFakeRates") add_syst_fakerate_job_tuple = (self.channel, histogramToFit) key_add_syst_fakerate_job = getKey(histogramToFit) self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job] = { 'inputFile' : self.jobOptions_prep_dcard[key_prep_dcard_job]['datacardFile'], 'cfgFile_modified' : os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_CFGS], "addSystFakeRates_%s_%s_cfg.py" % add_syst_fakerate_job_tuple), 'outputFile' : 
os.path.join(self.dirs[key_add_syst_fakerate_dir][DKEY_DCRD], "addSystFakeRates_%s_%s.root" % add_syst_fakerate_job_tuple), 'category' : self.channel, 'histogramToFit' : histogramToFit, 'plots_outputFileName' : os.path.join(self.dirs[DKEY_PLOT], "addSystFakeRates.png") } histogramDir_nominal = self.histogramDir_prep_dcard for lepton_type in [ 'e', 'm' ]: lepton_mcClosure = "Fakeable_mcClosure_%s" % lepton_type if lepton_mcClosure not in self.lepton_selections: continue lepton_selection_and_frWeight = get_lepton_selection_and_frWeight(lepton_mcClosure, "enabled") key_addBackgrounds_job_fakes = getKey("fakes_mc", lepton_selection_and_frWeight) histogramDir_mcClosure = self.mcClosure_dir[lepton_mcClosure] self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job].update({ 'add_Clos_%s' % lepton_type : ("Fakeable_mcClosure_%s" % lepton_type) in self.lepton_selections, 'inputFile_nominal_%s' % lepton_type : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'histogramName_nominal_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_nominal, histogramToFit), 'inputFile_mcClosure_%s' % lepton_type : self.jobOptions_addBackgrounds_sum[key_addBackgrounds_job_fakes]['outputFile'], 'histogramName_mcClosure_%s' % lepton_type : "%s/sel/evt/fakes_mc/%s" % (histogramDir_mcClosure, histogramToFit) }) self.createCfg_add_syst_fakerate(self.jobOptions_add_syst_fakerate[key_add_syst_fakerate_job]) logging.info("Creating configuration files to run 'makePlots'") key_hadd_stage2_job = getKey(get_lepton_selection_and_frWeight("Tight", "disabled")) key_makePlots_dir = getKey("makePlots") key_makePlots_job = getKey('') self.jobOptions_make_plots[key_makePlots_job] = { 'executable' : self.executable_make_plots, 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel), 'outputFile' : os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel), 'histogramDir' : self.histogramDir_prep_dcard, 'label' : "t#bar{t}Z control region", 'make_plots_backgrounds' : self.make_plots_backgrounds } self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job]) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.sbatchFile_analyze = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addBackgrounds) self.sbatchFile_addBackgrounds = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_%s.py" % self.channel) self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds, self.jobOptions_addBackgrounds) self.sbatchFile_addBackgrounds_sum = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_sum_%s.py" % self.channel) self.createScript_sbatch(self.executable_addBackgrounds, self.sbatchFile_addBackgrounds_sum, self.jobOptions_addBackgrounds_sum) logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_addFakes) self.sbatchFile_addFakes = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_addFakes_%s.py" % self.channel) self.createScript_sbatch(self.executable_addFakes, self.sbatchFile_addFakes, self.jobOptions_addFakes) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_analyze(lines_makefile) 
self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_backgrounds_from_data(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) self.addToMakefile_add_syst_fakerate(lines_makefile) self.addToMakefile_make_plots(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs
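# A minimal sketch of the behaviour assumed for the getKey()/initDict()
# helpers used throughout these create() methods; both are defined outside
# this excerpt, so the implementations below are illustrations inferred from
# the call sites, not the actual code.
def _sketch_getKey(*args):
    # build a flat dictionary key by joining the non-empty pieces with '_'
    return '_'.join(str(arg) for arg in args if str(arg))

def _sketch_initDict(dictionary, keys):
    # create the nested dictionaries along `keys`, so that a subsequent
    # dictionary[k1][k2] = value assignment cannot raise a KeyError
    level = dictionary
    for key in keys:
        level = level.setdefault(key, {})
    return dictionary
# usage sketch: _sketch_initDict(dirs, [key_dir, dir_type]) followed by
# dirs[key_dir][dir_type] = os.path.join(outputDir, dir_type, channel, process_name)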
if output_filenames: if len(basedirs) != len(output_filenames): raise ValueError( "The number of output file names (%d) does not coincide w/ the number of " "input base directories (%d); either drop the list of output filenames or " "provide the exact same number of output file names as input base directories" % \ (len(output_filenames), len(basedirs)) ) for output_filename in output_filenames: output_dir = os.path.dirname(output_filename) if not os.path.isdir(output_dir): logging.debug('Directory %s does not exist' % output_dir) if args.force: create_if_not_exists(output_dir) else: raise ValueError( 'Use -f/--force to create the output directory %s' % output_dir) for basedir_idx, basedir in enumerate(basedirs): logging.info('Finding sum of weights for %s' % basedir) filenames = get_filelist(basedir) weight_map = collections.OrderedDict() for filename in filenames: assert (exists(filename)) neg_weights = collections.OrderedDict() logging.debug('Processing %s ...' % filename) events = Events(filename)
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for key in self.dirs.keys(): for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info("Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) is_mc = (sample_info["type"] == "mc") lumi_scale = 1. if not (self.use_lumi and is_mc) else sample_info["xsection"] * self.lumi / sample_info["nof_events"] apply_genWeight = sample_info["apply_genWeight"] if (is_mc and "apply_genWeight" in sample_info.keys()) else False sample_category = sample_info["sample_category"] triggers = sample_info["triggers"] apply_trigger_bits = (is_mc and (self.era == "2015" or (self.era == "2016" and sample_info["reHLT"]))) or not is_mc for charge_selection in self.charge_selections: for central_or_shift in self.central_or_shifts: inputFileList = generateInputFileList(sample_name, sample_info, self.max_files_per_job, self.debug) for jobId in inputFileList.keys(): if central_or_shift != "central" and not is_mc: continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttH") and sample_category != "signal": continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttW") and sample_category != "TTW": continue if central_or_shift.startswith("CMS_ttHl_thu_shape_ttZ") and sample_category != "TTZ": continue key_dir = getKey(sample_name, charge_selection) key_file = getKey(sample_name, charge_selection, central_or_shift, jobId) self.ntupleFiles[key_file] = inputFileList[jobId] self.cfgFiles_analyze_modified[key_file] = os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % \ (self.channel, process_name, charge_selection, central_or_shift, jobId)) self.histogramFiles[key_file] = os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%s_%i.root" % \ (process_name, charge_selection, central_or_shift, jobId)) self.logFiles_analyze[key_file] = os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % \ (self.channel, process_name, charge_selection, central_or_shift, jobId)) self.createCfg_analyze(self.ntupleFiles[key_file], self.histogramFiles[key_file], sample_category, self.era, triggers, charge_selection, self.jet_minPt, self.jet_maxPt, self.jet_minAbsEta, self.jet_maxAbsEta, self.hadTau_selections, self.absEtaBins, is_mc, central_or_shift, lumi_scale, apply_genWeight, apply_trigger_bits, self.cfgFiles_analyze_modified[key_file]) if self.is_sbatch: logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch() logging.info("Creating configuration files for executing 'comp_jetToTauFakeRate'") for charge_selection in self.charge_selections: self.histogramFile_comp_jetToTauFakeRate[charge_selection] = os.path.join( self.outputDir, DKEY_HIST, "comp_jetToTauFakeRate_%s.root" % charge_selection) self.histogramDir_numerator[charge_selection] = "jetToTauFakeRate_%s/numerator/" % charge_selection self.histogramDir_denominator[charge_selection] = "jetToTauFakeRate_%s/denominator/" % charge_selection self.cfgFile_comp_jetToTauFakeRate_modified[charge_selection] = os.path.join( self.outputDir, DKEY_CFGS, "comp_jetToTauFakeRate_%s_cfg.py" % charge_selection)
self.createCfg_comp_jetToTauFakeRate(self.histogramFile_hadd_stage1, self.histogramFile_comp_jetToTauFakeRate[charge_selection], self.histogramDir_denominator[charge_selection], self.histogramDir_numerator[charge_selection], self.absEtaBins, self.ptBins, self.cfgFile_comp_jetToTauFakeRate_modified[charge_selection]) lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_comp_jetToTauFakeRate(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.")
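# The job loops above all apply the same three skip rules; a compact sketch
# of that filter as a standalone predicate (the prefix-to-category map is
# copied from the startswith() checks in the code, the function name is
# illustrative):
_SKETCH_THU_SHAPE_CATEGORY = {
    "CMS_ttHl_thu_shape_ttH": "signal",
    "CMS_ttHl_thu_shape_ttW": "TTW",
    "CMS_ttHl_thu_shape_ttZ": "TTZ",
}

def _sketch_accept_central_or_shift(central_or_shift, is_mc, sample_category):
    # data samples are only processed for the nominal ("central") shift
    if central_or_shift != "central" and not is_mc:
        return False
    # theory-shape shifts only apply to the matching sample category
    for prefix, category in _SKETCH_THU_SHAPE_CATEGORY.items():
        if central_or_shift.startswith(prefix) and sample_category != category:
            return False
    return True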
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] key_dir = getKey(process_name) for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS]: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, self.channel, process_name) else: self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, process_name) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: initDict(self.dirs, [dir_type]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) numDirectories_created = 0 frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100 * numDirectories_created >= frac * numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList( sample_info, self.max_files_per_job) for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) inputFileList = inputFileLists[sample_name] for jobId in inputFileList.keys(): ##print "processing sample %s: jobId = %i" % (process_name, jobId) # build config files for executing analysis code key_analyze_dir = getKey(process_name) analyze_job_tuple = (process_name, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning( "No input ntuples for %s --> skipping job !!"
% (key_analyze_job)) continue cfgFile_modified_path = os.path.join( self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join( self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%i.log" % analyze_job_tuple) histogramFile_path = os.path.join( self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%i.root" % analyze_job_tuple) self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles': ntupleFiles, 'cfgFile_modified': cfgFile_modified_path, 'histogramFile': histogramFile_path, 'histogramDir': 'analyze_hadTopTagger', 'logFile': logFile_path, 'hadTauSelection': self.hadTau_selection, 'lumiScale': 1., 'selectBDT': True, } self.createCfg_analyze( self.jobOptions_analyze[key_analyze_job], sample_info) # initialize input and output file names for hadd_stage1 key_hadd_stage1_dir = getKey(process_name) key_hadd_stage1_job = getKey(process_name) if not key_hadd_stage1_job in self.inputFiles_hadd_stage1: self.inputFiles_hadd_stage1[key_hadd_stage1_job] = [] self.inputFiles_hadd_stage1[key_hadd_stage1_job].append( self.jobOptions_analyze[key_analyze_job]['histogramFile']) self.outputFile_hadd_stage1[ key_hadd_stage1_job] = os.path.join( self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s.root" % process_name) self.targets.append( self.outputFile_hadd_stage1[key_hadd_stage1_job]) self.sbatchFile_analyze = os.path.join( self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info("Creating Makefile") lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs
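# generateInputFileList() is not part of this excerpt; judging from the call
# sites it splits a sample's ntuple files into jobs of at most
# max_files_per_job files, keyed by a 1-based job id. A sketch under that
# assumption:
def _sketch_generateInputFileList(input_files, max_files_per_job):
    inputFileList = {}
    for offset in range(0, len(input_files), max_files_per_job):
        jobId = offset // max_files_per_job + 1
        inputFileList[jobId] = input_files[offset:offset + max_files_per_job]
    return inputFileList
# e.g. _sketch_generateInputFileList(['f1.root', 'f2.root', 'f3.root'], 2)
# yields {1: ['f1.root', 'f2.root'], 2: ['f3.root']}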
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] # for charge_selection in self.charge_selections: ## NO CHARGE SELECTION NEEDED HERE # key_dir = getKey(process_name, charge_selection) ## NO CHARGE SELECTION NEEDED HERE key_dir = getKey(process_name) for dir_type in [DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS]: # self.dirs[key_dir][dir_type] = os.path.join(self.configDir, dir_type, self.channel, # "_".join([ charge_selection ]), process_name) ## NO CHARGE SELECTION NEEDED HERE self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, self.channel, process_name) else: # self.dirs[key_dir][dir_type] = os.path.join(self.outputDir, dir_type, self.channel, # "_".join([ charge_selection ]), process_name) ## NO CHARGE SELECTION NEEDED HERE self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, process_name) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_HIST, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: initDict(self.dirs, [dir_type]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_HADD_RT ]: ## DKEY_PLOT TO BE ADDED LATER self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) logging.info("self.dirs = %s" % self.dirs) for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) else: create_if_not_exists(self.dirs[key]) inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList( sample_name, sample_info, self.max_files_per_job, self.debug) self.inputFileIds = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"] or sample_info["sample_category"] in [ "additional_signal_overlap", "background_data_estimate" ]: continue process_name = sample_info["process_name_specific"] logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) is_mc = (sample_info["type"] == "mc") lumi_scale = 1.
if not (self.use_lumi and is_mc) else sample_info[ "xsection"] * self.lumi / sample_info["nof_events"] apply_genWeight = sample_info["apply_genWeight"] if ( is_mc and "apply_genWeight" in sample_info.keys()) else False sample_category = sample_info["sample_category"] triggers = sample_info["triggers"] apply_trigger_bits = ( is_mc and (self.era == "2015" or (self.era == "2016" and sample_info["reHLT"]))) or not is_mc # for charge_selection in self.charge_selections: ## NO CHARGE SELECTION NEEDED HERE for central_or_shift in self.central_or_shifts: inputFileList = inputFileLists[sample_name] for jobId in inputFileList.keys(): if central_or_shift != "central" and not is_mc: continue if central_or_shift.startswith( "CMS_ttHl_thu_shape_ttH" ) and sample_category != "signal": continue if central_or_shift.startswith( "CMS_ttHl_thu_shape_ttW" ) and sample_category != "TTW": continue if central_or_shift.startswith( "CMS_ttHl_thu_shape_ttZ" ) and sample_category != "TTZ": continue # build config files for executing analysis code # key_dir = getKey(process_name, charge_selection) ## NO CHARGE SELECTION NEEDED HERE key_dir = getKey(process_name) # key_analyze_job = getKey(process_name, charge_selection, central_or_shift, jobId) ## NO CHARGE SELECTION NEEDED HERE key_analyze_job = getKey(process_name, central_or_shift, jobId) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning("ntupleFiles['%s'] = %s --> skipping job !!" % (key_analyze_job, ntupleFiles)) continue self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles' : ntupleFiles, # 'cfgFile_modified' : os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%s_%i_cfg.py" % \ # (self.channel, process_name, charge_selection, central_or_shift, jobId)), ## NO CHARGE SELECTION NEEDED HERE # 'histogramFile' : os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%s_%i.root" % \ # (process_name, charge_selection, central_or_shift, jobId)), ## NO CHARGE SELECTION NEEDED HERE # 'logFile' : os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%s_%i.log" % \ # (self.channel, process_name, charge_selection, central_or_shift, jobId)), ## NO CHARGE SELECTION NEEDED HERE 'cfgFile_modified' : os.path.join(self.dirs[key_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % \ (self.channel, process_name, central_or_shift, jobId)), 'histogramFile' : os.path.join(self.dirs[key_dir][DKEY_HIST], "%s_%s_%i.root" % \ (process_name, central_or_shift, jobId)), 'logFile' : os.path.join(self.dirs[key_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % \ (self.channel, process_name, central_or_shift, jobId)), 'sample_category' : sample_category, 'triggers' : sample_info["triggers"], # 'charge_selection' : charge_selection, ## NO CHARGE SELECTION NEEDED HERE # 'jet_minPt' : self.jet_minPt, # 'jet_maxPt' : self.jet_maxPt, # 'jet_minAbsEta' : self.jet_minAbsEta, # 'jet_maxAbsEta' : self.jet_maxAbsEta, # 'hadTau_selections' : self.hadTau_selections, 'absEtaBins_e' : self.absEtaBins_e, 'absEtaBins_mu' : self.absEtaBins_mu, 'absPtBins_e' : self.absPtBins_e, 'absPtBins_mu' : self.absPtBins_mu, ##'use_HIP_mitigation_mediumMuonId' : sample_info["use_HIP_mitigation_mediumMuonId"], 'use_HIP_mitigation_mediumMuonId' : True, 'is_mc' : is_mc, 'central_or_shift' : central_or_shift, 'lumi_scale' : lumi_scale,
'apply_genWeight' : apply_genWeight, 'apply_trigger_bits' : apply_trigger_bits, } self.createCfg_analyze( self.jobOptions_analyze[key_analyze_job]) # initialize input and output file names for hadd_stage1 # key_hadd_stage1 = getKey(process_name, charge_selection) key_hadd_stage1 = getKey( process_name) ## NO CHARGE SELECTION NEEDED HERE if not key_hadd_stage1 in self.inputFiles_hadd_stage1: self.inputFiles_hadd_stage1[key_hadd_stage1] = [] self.inputFiles_hadd_stage1[key_hadd_stage1].append( self.jobOptions_analyze[key_analyze_job] ['histogramFile']) # self.outputFile_hadd_stage1[key_hadd_stage1] = os.path.join(self.dirs[DKEY_HIST], "histograms_harvested_stage1_%s_%s_%s.root" % \ # (self.channel, process_name, charge_selection)) ## NO CHARGE SELECTION NEEDED HERE self.outputFile_hadd_stage1[key_hadd_stage1] = os.path.join(self.dirs[DKEY_HIST], "histograms_harvested_stage1_%s_%s.root" % \ (self.channel, process_name)) # initialize input and output file names for hadd_stage1_5 key_hadd_stage1_5 = getKey('') if not key_hadd_stage1_5 in self.inputFiles_hadd_stage1_5: self.inputFiles_hadd_stage1_5[key_hadd_stage1_5] = [] for key_hadd_stage1 in self.outputFile_hadd_stage1.keys(): self.inputFiles_hadd_stage1_5[key_hadd_stage1_5].append( self.outputFile_hadd_stage1[key_hadd_stage1]) self.outputFile_hadd_stage1_5[key_hadd_stage1_5] = os.path.join( self.dirs[DKEY_HIST], "histograms_harvested_stage1_5.root") ## Creating configuration files to run 'addBackgrounds_LeptonFakeRate' [stage 1.5] key_addBackgrounds_job = getKey('') self.jobOptions_addBackgrounds_LeptonFakeRate[ key_addBackgrounds_job] = { 'inputFile': self.outputFile_hadd_stage1_5[key_hadd_stage1_5], 'cfgFile_modified': os.path.join( self.dirs[DKEY_CFGS], os.path.basename( self.cfgFile_addBackgrounds_LeptonFakeRate)), 'outputFile': os.path.join(self.dirs[DKEY_HIST], "addBackground_LeptonFakeRate.root"), 'logFile': os.path.join( self.dirs[DKEY_LOGS], os.path.basename( self.cfgFile_addBackgrounds_LeptonFakeRate.replace( "_cfg.py", ".log"))), } self.createCfg_addBackgrounds_LeptonFakeRate( self.
jobOptions_addBackgrounds_LeptonFakeRate[key_addBackgrounds_job]) # initialize input and output file names for hadd_stage2 # key_hadd_stage2 = getKey(charge_selection) ## NO CHARGE SELECTION NEEDED HERE # if not key_hadd_stage2 in self.inputFiles_hadd_stage2: ## NO CHARGE SELECTION NEEDED HERE # self.inputFiles_hadd_stage2[key_hadd_stage2] = [] ## NO CHARGE SELECTION NEEDED HERE # self.inputFiles_hadd_stage2[key_hadd_stage2].append(self.outputFile_hadd_stage1[key_hadd_stage1]) ## NO CHARGE SELECTION NEEDED HERE # self.outputFile_hadd_stage2[key_hadd_stage2] = os.path.join(self.dirs[DKEY_HIST], "histograms_harvested_stage2_%s_%s.root" % \ ## NO CHARGE SELECTION NEEDED HERE # (self.channel, charge_selection)) ## NO CHARGE SELECTION NEEDED HERE key_hadd_stage2 = getKey('') if not key_hadd_stage2 in self.inputFiles_hadd_stage2: self.inputFiles_hadd_stage2[key_hadd_stage2] = [] for key_hadd_stage1_5 in self.outputFile_hadd_stage1_5.keys(): self.inputFiles_hadd_stage2[key_hadd_stage2].append( self.outputFile_hadd_stage1_5[key_hadd_stage1_5]) self.inputFiles_hadd_stage2[key_hadd_stage2].append( self.jobOptions_addBackgrounds_LeptonFakeRate[ key_addBackgrounds_job]['outputFile']) self.outputFile_hadd_stage2[key_hadd_stage2] = os.path.join( self.dirs[DKEY_HIST], "histograms_harvested_stage2.root") if self.prep_dcard: processesToCopy = [] signals = [] logging.info( "Creating configuration files to run 'prepareDatacards_LeptonFakeRate'" ) for process in self.prep_dcard_signals: signals.append(process) self.prep_dcard_signals = signals for process in self.prep_dcard_processesToCopy: processesToCopy.append(process) self.prep_dcard_processesToCopy = processesToCopy for histogramToFit in self.histograms_to_fit: key_prep_dcard_job = getKey(histogramToFit) self.jobOptions_prep_dcard[key_prep_dcard_job] = { 'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2], 'cfgFile_modified': os.path.join( self.dirs[DKEY_CFGS], "prepareDatacards_LeptonFakeRate_%s_cfg.py" % (histogramToFit)), 'datacardFile': os.path.join(self.dirs[DKEY_DCRD], "prepareDatacards_%s.root" % (histogramToFit)), 'histogramDir': (self.histogramDir_prep_dcard), 'histogramToFit': histogramToFit, 'label': None } # self.createCfg_prep_dcard(self.jobOptions_prep_dcard[key_prep_dcard_job]) ## DEF LINE self.createCfg_prep_dcard_LeptonFakeRate( self.jobOptions_prep_dcard[key_prep_dcard_job]) if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.sbatchFile_analyze = os.path.join( self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) # self.createScript_sbatch() self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) self.sbatchFile_addBackgrounds_LeptonFakeRate = os.path.join( self.dirs[DKEY_SCRIPTS], "sbatch_addBackgrounds_LeptonFakeRate_%s.py" % self.channel) self.createScript_sbatch( self.executable_addBackgrounds_LeptonFakeRate, self.sbatchFile_addBackgrounds_LeptonFakeRate, self.jobOptions_addBackgrounds_LeptonFakeRate) # logging.info("Creating script for submitting '%s' jobs to batch system" % self.executable_comp_jetToTauFakeRate) # self.sbatchFile_comp_jetToTauFakeRate = os.path.join(self.dirs[DKEY_SCRIPTS], "sbatch_comp_jetToTauFakeRate.py") # self.createScript_sbatch(self.executable_comp_jetToTauFakeRate, self.sbatchFile_comp_jetToTauFakeRate, self.jobOptions_comp_jetToTauFakeRate) #### FAKE RATE COMP BLOCK COMMENTED OUT ######################## # logging.info("Creating configuration files for executing 
'comp_jetToTauFakeRate'") # for charge_selection in self.charge_selections: # key_comp_jetToTauFakeRate_job = getKey(charge_selection) # key_hadd_stage2 = getKey(charge_selection) # self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job] = { # 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2], # 'cfgFile_modified' : os.path.join( # self.dirs[DKEY_CFGS], "comp_jetToTauFakeRate_%s_cfg.py" % charge_selection), # 'outputFile' : os.path.join( # self.dirs[DKEY_HIST], "comp_jetToTauFakeRate_%s.root" % charge_selection), # 'looseRegion' : "jetToTauFakeRate_%s/denominator/" % charge_selection, # 'tightRegion' : "jetToTauFakeRate_%s/numerator/" % charge_selection, # 'absEtaBins' : self.absEtaBins, # 'ptBins' : self.ptBins # } # self.createCfg_comp_jetToTauFakeRate(self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job]) # self.targets.append(self.jobOptions_comp_jetToTauFakeRate[key_comp_jetToTauFakeRate_job]['outputFile']) # logging.info("Creating configuration files to run 'makePlots'") # for charge_selection in self.charge_selections: # key_makePlots_job = getKey(charge_selection) # key_hadd_stage2 = getKey(charge_selection) # self.jobOptions_make_plots[key_makePlots_job] = { # 'executable' : self.executable_make_plots, # 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2], # 'cfgFile_modified' : os.path.join( # self.dirs[DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel), # 'outputFile' : os.path.join( # self.dirs[DKEY_PLOT], "makePlots_%s.png" % self.channel), # 'histogramDir' : "jetToTauFakeRate_%s" % charge_selection, # 'label' : None, # 'make_plots_backgrounds' : [ "TT", "TTW", "TTZ", "EWK", "Rares" ], # } # self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job]) # self.cfgFile_make_plots = self.cfgFile_make_plots_denominator # for absEtaBin in [ "absEtaLt1_5", "absEta1_5to9_9" ]: # key_makePlots_job = getKey(charge_selection, absEtaBin, "denominator") # key_hadd_stage2 = getKey(charge_selection) # self.jobOptions_make_plots[key_makePlots_job] = { # 'executable' : self.executable_make_plots, # 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2], # 'cfgFile_modified' : os.path.join( # self.dirs[DKEY_CFGS], "makePlots_%s_%s_denominator_%s_cfg.py" % (self.channel, charge_selection, absEtaBin)), # 'outputFile' : os.path.join( # self.dirs[DKEY_PLOT], "makePlots_%s_%s_denominator_%s.png" % (self.channel, charge_selection, absEtaBin)), # 'histogramDir' : "jetToTauFakeRate_%s/denominator/%s" % (charge_selection, absEtaBin), # 'label' : None, # 'make_plots_backgrounds' : [ "TT", "TTW", "TTZ", "EWK", "Rares" ], # } # self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job]) # for hadTau_selection in self.hadTau_selections: # key_makePlots_job = getKey(charge_selection, absEtaBin, "numerator", hadTau_selection) # key_hadd_stage2 = getKey(charge_selection) # self.jobOptions_make_plots[key_makePlots_job] = { # 'executable' : self.executable_make_plots, # 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2], # 'cfgFile_modified' : os.path.join( # self.dirs[DKEY_CFGS], "makePlots_%s_%s_numerator_%s_%s_cfg.py" % (self.channel, charge_selection, hadTau_selection, absEtaBin)), # 'outputFile' : os.path.join( # self.dirs[DKEY_PLOT], "makePlots_%s_%s_numerator_%s_%s.png" % (self.channel, charge_selection, hadTau_selection, absEtaBin)), # 'histogramDir' : "jetToTauFakeRate_%s/numerator/%s/%s" % (charge_selection, hadTau_selection, absEtaBin), # 'label' : None, # 'make_plots_backgrounds' : [ "TT", "TTW", "TTZ", "EWK", "Rares" ], # 
} # self.createCfg_makePlots(self.jobOptions_make_plots[key_makePlots_job]) ######################################################### lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) # self.addToMakefile_hadd_stage1_5(lines_makefile) self.addToMakefile_backgrounds_from_data(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile) self.addToMakefile_prep_dcard(lines_makefile) # self.addToMakefile_comp_jetToTauFakeRate(lines_makefile) ## TO BE IMPLEMENTED LATER # self.addToMakefile_make_plots(lines_makefile) ## TO BE IMPLEMENTED LATER self.targets = [ outputFile for outputFile in self.outputFile_hadd_stage2.values() ] self.createMakefile(lines_makefile) logging.info("Done")
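# The bookkeeping above wires up a simple fan-in: the per-sample hadd_stage1
# outputs are merged into a single stage1_5 file, and stage2 merges that
# file with the addBackground_LeptonFakeRate output. A minimal sketch of the
# resulting merge plan (file names are illustrative):
def _sketch_hadd_plan(stage1_outputs, addBackground_output):
    plan = {}
    plan["histograms_harvested_stage1_5.root"] = list(stage1_outputs)
    plan["histograms_harvested_stage2.root"] = [
        "histograms_harvested_stage1_5.root",
        addBackground_output,
    ]
    # each entry maps one hadd output file to its list of input files
    return plan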
root_file.Close() return True def hadd(input_files, output_file): cmd_str = 'hadd -f %s %s' % (output_file, ' '.join(input_files)) stdout, stderr = run_cmd(cmd_str, do_not_log=True, return_stderr=True) if not stdout or stderr: raise RuntimeError('Error: %s' % stderr) output_root_dir = os.path.expanduser('~/sandbox/stitch_samples/root_files') output_plot_dir = os.path.expanduser('~/sandbox/stitch_samples/plots') for output_dir in [output_root_dir, output_plot_dir]: create_if_not_exists(output_dir) for sample_set_to_stich in samples_to_stitch: binning_keys = filter(lambda key: key != 'inclusive', sample_set_to_stich.keys()) sample_list = [] for key in sample_set_to_stich: if key == 'inclusive': sample_list.extend(sample_set_to_stich[key]['samples']) else: for binned_sample in sample_set_to_stich[key]: sample_list.extend(binned_sample['samples']) assert (len(sample_list) == len(set(sample_list))) binning = {}
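# A short usage sketch for the hadd() helper defined above (file names are
# illustrative): ROOT's hadd merges the histograms/trees of all inputs into
# the output file, and -f overwrites an existing output.
# hadd(['chunk_1.root', 'chunk_2.root'], 'merged.root')
# A RuntimeError is raised when the command produced no stdout or wrote
# anything to stderr.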
def __init__( self, configDir, outputDir, cfgFile_prodNtuple, samples, max_files_per_job, era, preselection_cuts, leptonSelection, hadTauWP, check_output_files, running_method, version, num_parallel_jobs, pileup, golden_json, dry_run, isDebug, gen_matching_by_index, use_nonnominal, use_home, skip_tools_step, verbose=False, pool_id='', ): self.configDir = configDir self.outputDir = outputDir self.max_num_jobs = 200000 self.samples = samples self.max_files_per_job = max_files_per_job self.era = era self.preselection_cuts = preselection_cuts self.leptonSelection = leptonSelection self.hadTauWP = hadTauWP self.check_output_files = check_output_files self.verbose = verbose self.dry_run = dry_run self.isDebug = isDebug self.gen_matching_by_index = gen_matching_by_index self.use_nonnominal = use_nonnominal self.use_home = use_home self.pileup = pileup self.golden_json = golden_json if running_method.lower() not in ["sbatch", "makefile"]: raise ValueError("Invalid running method: %s" % running_method) if not os.path.isfile(self.pileup): raise ValueError('No such file: %s' % self.pileup) self.pileup_histograms = get_pileup_histograms(self.pileup) if not os.path.isfile(self.golden_json): raise ValueError('No such file: %s' % self.golden_json) self.running_method = running_method self.is_sbatch = self.running_method.lower() == "sbatch" self.is_makefile = not self.is_sbatch self.makefile = os.path.join(self.configDir, "Makefile_prodNtuple") self.num_parallel_jobs = num_parallel_jobs self.skip_tools_step = skip_tools_step self.pool_id = pool_id if pool_id else uuid.uuid4() self.workingDir = os.getcwd() logging.info("Working directory is: %s" % self.workingDir) self.template_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src', 'tthAnalysis', 'HiggsToTauTau', 'test', 'templates') logging.info("Templates directory is: %s" % self.template_dir) self.version = version self.samples = samples create_if_not_exists(self.configDir) create_if_not_exists(self.outputDir) self.stdout_file_path = os.path.join(self.configDir, "stdout_prodNtuple.log") self.stderr_file_path = os.path.join(self.configDir, "stderr_prodNtuple.log") self.sw_ver_file_cfg = os.path.join(self.configDir, "VERSION_prodNtuple.log") self.sw_ver_file_out = os.path.join(self.outputDir, "VERSION_prodNtuple.log") self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out = get_log_version( (self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out)) self.cfgFile_prodNtuple_original = os.path.join( self.template_dir, cfgFile_prodNtuple) self.sbatchFile_prodNtuple = os.path.join(self.configDir, "sbatch_prodNtuple.py") self.cfgFiles_prodNtuple_modified = {} self.logFiles_prodNtuple = {} self.inputFiles = {} self.outputFiles = {} self.filesToClean = [] self.dirs = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] key_dir = getKey(sample_name) for dir_type in [DKEY_CFGS, DKEY_NTUPLES, DKEY_LOGS]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS]: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, process_name) else: self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, process_name) for dir_type in [DKEY_CFGS, DKEY_LOGS]: initDict(self.dirs, [dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type) self.cvmfs_error_log 
= {} self.executable = "produceNtuple.sh"
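# get_log_version() is assumed to return non-clobbering variants of the
# given log paths, so re-running the setup does not overwrite earlier logs.
# One plausible scheme, as a sketch (not the actual implementation):
import os

def _sketch_get_log_version(paths):
    versioned = []
    for path in paths:
        candidate, version = path, 0
        while os.path.exists(candidate):
            version += 1
            candidate = "%s.%i" % (path, version)
        versioned.append(candidate)
    return tuple(versioned)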
def create(self): """Creates all necessary config files and runs the complete analysis workflow -- either locally or on the batch system """ for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] sample_category = sample_info["sample_category"] is_mc = (sample_info["type"] == "mc") logging.info("Building dictionaries for sample %s..." % process_name) for charge_selection in self.charge_selections: central_or_shift_extensions = ["", "hadd", "addBackgrounds"] central_or_shifts_extended = central_or_shift_extensions + self.central_or_shifts for central_or_shift_or_dummy in central_or_shifts_extended: process_name_extended = [process_name, "hadd"] for process_name_or_dummy in process_name_extended: if central_or_shift_or_dummy in [ "hadd" ] and process_name_or_dummy in ["hadd"]: continue if central_or_shift_or_dummy != "central" and central_or_shift_or_dummy not in central_or_shift_extensions: if not is_mc: continue if not self.accept_central_or_shift( central_or_shift_or_dummy, sample_info): continue key_dir = getKey(process_name_or_dummy, charge_selection, central_or_shift_or_dummy) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_RLES ]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS]: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, self.channel, "_".join([charge_selection]), process_name_or_dummy, central_or_shift_or_dummy) else: self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, "_".join([charge_selection]), process_name_or_dummy) for subdirectory in ["comp_jetToTauFakeRate", "makePlots"]: key_dir = getKey(subdirectory) for dir_type in [ DKEY_CFGS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT ]: initDict(self.dirs, [key_dir, dir_type]) if dir_type in [DKEY_CFGS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT]: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, self.channel, subdirectory) else: self.dirs[key_dir][dir_type] = os.path.join( self.outputDir, dir_type, self.channel, subdirectory) for dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_HIST, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: initDict(self.dirs, [dir_type]) if dir_type in [ DKEY_CFGS, DKEY_SCRIPTS, DKEY_LOGS, DKEY_DCRD, DKEY_PLOT, DKEY_HADD_RT ]: self.dirs[dir_type] = os.path.join(self.configDir, dir_type, self.channel) else: self.dirs[dir_type] = os.path.join(self.outputDir, dir_type, self.channel) numDirectories = 0 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: numDirectories += len(self.dirs[key]) else: numDirectories += 1 logging.info("Creating directory structure (numDirectories = %i)" % numDirectories) numDirectories_created = 0 frac = 1 for key in self.dirs.keys(): if type(self.dirs[key]) == dict: for dir_type in self.dirs[key].keys(): create_if_not_exists(self.dirs[key][dir_type]) numDirectories_created += len(self.dirs[key]) else: create_if_not_exists(self.dirs[key]) numDirectories_created = numDirectories_created + 1 while 100 * numDirectories_created >= frac * numDirectories: logging.info(" %i%% completed" % frac) frac = frac + 1 logging.info("Done.") inputFileLists = {} for sample_name, sample_info in self.samples.items(): if not sample_info["use_it"]: continue logging.info("Checking input files for sample %s" % sample_info["process_name_specific"]) inputFileLists[sample_name] = generateInputFileList( sample_info, self.max_files_per_job) self.inputFileIds = {} for sample_name, sample_info in
self.samples.items(): if not sample_info["use_it"]: continue process_name = sample_info["process_name_specific"] inputFileList = inputFileLists[sample_name] logging.info( "Creating configuration files to run '%s' for sample %s" % (self.executable_analyze, process_name)) is_mc = (sample_info["type"] == "mc") sample_category = sample_info["sample_category"] for charge_selection in self.charge_selections: for central_or_shift in self.central_or_shifts: if central_or_shift != "central" and not is_mc: continue if not self.accept_central_or_shift( central_or_shift, sample_info): continue # build config files for executing analysis code key_analyze_dir = getKey(process_name, charge_selection, central_or_shift) for jobId in inputFileList.keys(): analyze_job_tuple = (process_name, charge_selection, central_or_shift, jobId) key_analyze_job = getKey(*analyze_job_tuple) ntupleFiles = inputFileList[jobId] if len(ntupleFiles) == 0: logging.warning( "No input ntuples for %s --> skipping job !!" % (key_analyze_job)) continue cfgFile_modified_path = os.path.join( self.dirs[key_analyze_dir][DKEY_CFGS], "analyze_%s_%s_%s_%i_cfg.py" % analyze_job_tuple) logFile_path = os.path.join( self.dirs[key_analyze_dir][DKEY_LOGS], "analyze_%s_%s_%s_%i.log" % analyze_job_tuple) histogramFile_path = os.path.join( self.dirs[key_analyze_dir][DKEY_HIST], "analyze_%s_%s_%s_%i.root" % analyze_job_tuple) rleOutputFile_path = os.path.join(self.dirs[key_analyze_dir][DKEY_RLES], "rle_%s_%s_%s_%i.txt" % analyze_job_tuple) \ if self.select_rle_output else "" self.jobOptions_analyze[key_analyze_job] = { 'ntupleFiles': ntupleFiles, 'cfgFile_modified': cfgFile_modified_path, 'histogramFile': histogramFile_path, 'logFile': logFile_path, 'chargeSelection': charge_selection, 'jet_minPt': self.jet_minPt, 'jet_maxPt': self.jet_maxPt, 'jet_minAbsEta': self.jet_minAbsEta, 'jet_maxAbsEta': self.jet_maxAbsEta, 'hadTau_selection_tight': self.hadTau_selection_tight, 'hadTauSelection_denominator': self.hadTau_selection_denominator, 'hadTauSelections_numerator': self.hadTau_selections_numerator, 'trigMatchingOptions': self.trigMatchingOptions, 'selEventsFileName_output': rleOutputFile_path, 'absEtaBins': self.absEtaBins, 'decayModes': self.decayModes, 'central_or_shift': central_or_shift, 'central_or_shifts_local': [], 'apply_hlt_filter': self.hlt_filter, } self.createCfg_analyze( self.jobOptions_analyze[key_analyze_job], sample_info) # initialize input and output file names for hadd_stage1 key_hadd_stage1_dir = getKey(process_name, charge_selection) hadd_stage1_job_tuple = (process_name, charge_selection) key_hadd_stage1_job = getKey(*hadd_stage1_job_tuple) if not key_hadd_stage1_job in self.inputFiles_hadd_stage1: self.inputFiles_hadd_stage1[ key_hadd_stage1_job] = [] self.inputFiles_hadd_stage1[ key_hadd_stage1_job].append( self.jobOptions_analyze[key_analyze_job] ['histogramFile']) self.outputFile_hadd_stage1[ key_hadd_stage1_job] = os.path.join( self.dirs[key_hadd_stage1_dir][DKEY_HIST], "hadd_stage1_%s_%s.root" % hadd_stage1_job_tuple) # initialize input and output file names for hadd_stage2 key_hadd_stage1_job = getKey(process_name, charge_selection) key_hadd_stage2_dir = getKey("hadd", charge_selection) key_hadd_stage2_job = getKey(charge_selection) if not key_hadd_stage2_job in self.inputFiles_hadd_stage2: self.inputFiles_hadd_stage2[key_hadd_stage2_job] = [] self.inputFiles_hadd_stage2[key_hadd_stage2_job].append( self.outputFile_hadd_stage1[key_hadd_stage1_job]) self.outputFile_hadd_stage2[ key_hadd_stage2_job] = os.path.join( 
self.dirs[key_hadd_stage2_dir][DKEY_HIST], "hadd_stage2_%s.root" % charge_selection) logging.info( "Creating configuration files for executing 'comp_jetToTauFakeRate'" ) for charge_selection in self.charge_selections: charge_key = "comp_%s" % charge_selection self.comp_input_files[charge_key] = [] for trigMatchingOption in self.trigMatchingOptions: key_hadd_stage2_job = getKey(charge_selection) key_comp_jetToTauFakeRate_dir = getKey("comp_jetToTauFakeRate") key_comp_jetToTauFakeRate_job = getKey(charge_selection, trigMatchingOption) self.jobOptions_comp_jetToTauFakeRate[ key_comp_jetToTauFakeRate_job] = { 'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified': os.path.join( self.dirs[DKEY_CFGS], "comp_jetToTauFakeRate_%s_%s_cfg.py" % (charge_selection, trigMatchingOption)), 'outputFile': os.path.join( self.dirs[DKEY_HIST], "comp_jetToTauFakeRate_%s_%s.root" % (charge_selection, trigMatchingOption)), 'logFile': os.path.join( self.dirs[DKEY_LOGS], "comp_jetToTauFakeRate_%s_%s.log" % (charge_selection, trigMatchingOption)), 'looseRegion': "jetToTauFakeRate_%s_%s/denominator/" % (charge_selection, trigMatchingOption), 'tightRegion': "jetToTauFakeRate_%s_%s/numerator/" % (charge_selection, trigMatchingOption), 'absEtaBins': self.absEtaBins, 'ptBins': self.ptBins, 'decayModes': self.decayModes, 'hadTauSelections': self.hadTau_selections_numerator, 'trigMatchingOption': trigMatchingOption, 'plots_outputFileName': os.path.join( self.dirs[key_comp_jetToTauFakeRate_dir] [DKEY_PLOT], "comp_jetToTauFakeRate_%s.png" % trigMatchingOption) } self.createCfg_comp_jetToTauFakeRate( self.jobOptions_comp_jetToTauFakeRate[ key_comp_jetToTauFakeRate_job]) comp_output = self.jobOptions_comp_jetToTauFakeRate[ key_comp_jetToTauFakeRate_job]['outputFile'] self.targets.append(comp_output) self.comp_input_files[charge_key].append(comp_output) self.comp_output_files[charge_key] = os.path.join( self.dirs[DKEY_HIST], "comp_jetToTauFakeRate_%s.root" % charge_selection) logging.info("Creating configuration files to run 'makePlots'") for charge_selection in self.charge_selections: key_hadd_stage2_job = getKey(charge_selection) key_makePlots_dir = getKey("makePlots") key_makePlots_job = getKey(charge_selection) self.jobOptions_make_plots[key_makePlots_job] = { 'executable': self.executable_make_plots, 'inputFile': self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified': os.path.join(self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_cfg.py" % self.channel), 'outputFile': os.path.join(self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s.png" % self.channel), 'histogramDir': "jetToTauFakeRate_%s" % charge_selection, 'label': None, 'make_plots_backgrounds': self.make_plots_backgrounds } self.createCfg_makePlots( self.jobOptions_make_plots[key_makePlots_job]) for trigMatchingOption in self.trigMatchingOptions: self.cfgFile_make_plots = self.cfgFile_make_plots_denominator for absEtaBin in ["absEtaLt1_5", "absEta1_5to9_9"]: key_hadd_stage2_job = getKey(charge_selection) key_makePlots_job = getKey(charge_selection, trigMatchingOption, absEtaBin, "denominator") self.jobOptions_make_plots[key_makePlots_job] = { 'executable' : self.executable_make_plots, 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join( self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_%s_%s_denominator_%s_cfg.py" % \ (self.channel, charge_selection, trigMatchingOption, absEtaBin)), 'outputFile' : os.path.join( self.dirs[key_makePlots_dir][DKEY_PLOT], 
"makePlots_%s_%s_%s_denominator_%s.png" % (self.channel, charge_selection, trigMatchingOption, absEtaBin)), 'histogramDir' : "jetToTauFakeRate_%s_%s/denominator/%s" % (charge_selection, trigMatchingOption, absEtaBin), 'label' : None, 'make_plots_backgrounds' : self.make_plots_backgrounds } self.createCfg_makePlots( self.jobOptions_make_plots[key_makePlots_job]) for hadTau_selection_numerator in self.hadTau_selections_numerator: key_hadd_stage2_job = getKey(charge_selection) key_makePlots_job = getKey(charge_selection, trigMatchingOption, absEtaBin, "numerator", hadTau_selection_numerator) self.jobOptions_make_plots[key_makePlots_job] = { 'executable' : self.executable_make_plots, 'inputFile' : self.outputFile_hadd_stage2[key_hadd_stage2_job], 'cfgFile_modified' : os.path.join( self.dirs[key_makePlots_dir][DKEY_CFGS], "makePlots_%s_%s_%s_numerator_%s_%s_cfg.py" % \ (self.channel, charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin)), 'outputFile' : os.path.join( self.dirs[key_makePlots_dir][DKEY_PLOT], "makePlots_%s_%s_%s_numerator_%s_%s.png" % \ (self.channel, charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin)), 'histogramDir' : "jetToTauFakeRate_%s_%s/numerator/%s/%s" % (charge_selection, trigMatchingOption, hadTau_selection_numerator, absEtaBin), 'label' : None, 'make_plots_backgrounds' : self.make_plots_backgrounds } self.createCfg_makePlots( self.jobOptions_make_plots[key_makePlots_job]) self.sbatchFile_analyze = os.path.join( self.dirs[DKEY_SCRIPTS], "sbatch_analyze_%s.py" % self.channel) self.sbatchFile_comp_jetToTauFakeRate = os.path.join( self.dirs[DKEY_SCRIPTS], "sbatch_comp_jetToTauFakeRate.py") if self.is_sbatch: logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_analyze) self.createScript_sbatch_analyze(self.executable_analyze, self.sbatchFile_analyze, self.jobOptions_analyze) logging.info( "Creating script for submitting '%s' jobs to batch system" % self.executable_comp_jetToTauFakeRate) self.createScript_sbatch(self.executable_comp_jetToTauFakeRate, self.sbatchFile_comp_jetToTauFakeRate, self.jobOptions_comp_jetToTauFakeRate) lines_makefile = [] self.addToMakefile_analyze(lines_makefile) self.addToMakefile_hadd_stage1(lines_makefile) self.addToMakefile_hadd_stage2(lines_makefile, make_dependency="phony_hadd_stage1", max_mem='4096M') self.addToMakefile_comp_jetToTauFakeRate(lines_makefile) self.addToMakefile_comp_hadd(lines_makefile) self.addToMakefile_make_plots(lines_makefile) self.createMakefile(lines_makefile) logging.info("Done.") return self.num_jobs
def __init__( self, configDir, outputDir, output_file, executable, projection_module, samples, max_files_per_job, era, plot, check_output_files, running_method, num_parallel_jobs, pool_id='', verbose=False, dry_run=False, use_home=False, submission_cmd=None, ): self.configDir = configDir self.outputDir = outputDir self.executable = executable self.projection_module = projection_module self.max_num_jobs = 200000 self.samples = samples self.max_files_per_job = max_files_per_job self.era = era self.plot = plot self.check_output_files = check_output_files self.verbose = verbose self.dry_run = dry_run self.use_home = use_home if running_method.lower() not in ["sbatch", "makefile"]: raise ValueError("Invalid running method: %s" % running_method) self.running_method = running_method self.is_sbatch = self.running_method.lower() == "sbatch" self.is_makefile = not self.is_sbatch self.makefile = os.path.join( self.configDir, "Makefile_{}".format(self.projection_module)) self.num_parallel_jobs = num_parallel_jobs self.pool_id = pool_id if pool_id else uuid.uuid4() self.workingDir = os.getcwd() logging.info("Working directory is: %s" % self.workingDir) self.template_dir = os.path.join(os.getenv('CMSSW_BASE'), 'src', 'tthAnalysis', 'HiggsToTauTau', 'test', 'templates') logging.info("Templates directory is: %s" % self.template_dir) create_if_not_exists(self.configDir) create_if_not_exists(self.outputDir) self.output_file = os.path.join(self.outputDir, output_file) self.stdout_file_path = os.path.join( self.configDir, "stdout_{}.log".format(self.projection_module)) self.stderr_file_path = os.path.join( self.configDir, "stderr_{}.log".format(self.projection_module)) self.sw_ver_file_cfg = os.path.join( self.configDir, "VERSION_{}.log".format(self.projection_module)) self.sw_ver_file_out = os.path.join( self.outputDir, "VERSION_{}.log".format(self.projection_module)) self.submission_out = os.path.join(self.configDir, "SUBMISSION.log") self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out = get_log_version( (self.stdout_file_path, self.stderr_file_path, self.sw_ver_file_cfg, self.sw_ver_file_out, self.submission_out)) check_submission_cmd(self.submission_out, submission_cmd) self.sbatchFile_projection = os.path.join( self.configDir, "sbatch_{}.py".format(self.projection_module)) self.cfgFiles_projection = {} self.logFiles_projection = {} self.scriptFiles_projection = {} self.jobOptions_sbatch = {} self.inputFiles = {} self.outputFiles_tmp = {} self.outputFiles = {} self.phoniesToAdd = [] self.filesToClean = [] self.targets = [] self.makefile_target = "sbatch_{}".format(self.projection_module) self.dirs = {} all_dirs = [ DKEY_CFGS, DKEY_HISTO_TMP, DKEY_HISTO, DKEY_PLOTS, DKEY_LOGS, DKEY_SCRIPTS, DKEY_HADD_RT ] cfg_dirs = [ DKEY_CFGS, DKEY_LOGS, DKEY_PLOTS, DKEY_SCRIPTS, DKEY_HADD_RT ] ref_genWeightsFile = os.path.join( os.environ['CMSSW_BASE'], 'src', 'tthAnalysis', 'HiggsToTauTau', 'data', 'refGenWeight_{}.txt'.format(self.era)) self.ref_genWeights = load_refGenWeightsFromFile( ref_genWeightsFile) if projection_module != 'puHist' else {} for sample_name, sample_info in self.samples.items(): if not sample_info['use_it']: continue process_name = sample_info["process_name_specific"] key_dir = getKey(process_name) for dir_type in all_dirs: if dir_type == DKEY_PLOTS: continue initDict(self.dirs, [key_dir, dir_type]) if dir_type in cfg_dirs: self.dirs[key_dir][dir_type] = os.path.join( self.configDir, dir_type, process_name) else: self.dirs[key_dir][dir_type] 
= os.path.join( self.outputDir, dir_type, process_name) for dir_type in cfg_dirs: initDict(self.dirs, [dir_type]) self.dirs[dir_type] = os.path.join(self.configDir, dir_type) self.cvmfs_error_log = {} self.num_jobs = { 'hadd': 0, 'project': 0, 'plot': 0, }
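# load_refGenWeightsFromFile() is assumed to map a process name to its
# reference genWeight, read from refGenWeight_<era>.txt. The file format
# sketched here (one whitespace-separated "process value" pair per line)
# is an assumption, not the documented format:
def _sketch_load_refGenWeights(path):
    ref_genWeights = {}
    with open(path) as ref_file:
        for line in ref_file:
            parts = line.split()
            if len(parts) == 2:
                ref_genWeights[parts[0]] = float(parts[1])
    return ref_genWeights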
def __init__(self, outputDir, executable_analyze, channel, central_or_shifts, max_files_per_job, era, use_lumi, lumi, debug, running_method, num_parallel_jobs, histograms_to_fit, executable_prep_dcard="prepareDatacards", executable_make_plots="makePlots"): self.outputDir = outputDir self.executable_analyze = executable_analyze self.channel = channel self.central_or_shifts = central_or_shifts self.max_files_per_job = max_files_per_job self.max_num_jobs = 20000 self.era = era self.use_lumi = use_lumi self.lumi = lumi self.debug = debug assert(running_method.lower() in [ "sbatch", "makefile"]), "Invalid running method: %s" % running_method self.running_method = running_method self.is_sbatch = False self.is_makefile = False if self.running_method.lower() == "sbatch": self.is_sbatch = True else: self.is_makefile = True self.makefile = os.path.join( self.outputDir, "Makefile_%s" % self.channel) self.num_parallel_jobs = num_parallel_jobs self.histograms_to_fit = histograms_to_fit self.executable_prep_dcard = executable_prep_dcard self.prep_dcard_processesToCopy = [ "data_obs", "TT", "TTW", "TTZ", "EWK", "Rares"] self.prep_dcard_signals = ["ttH_hww", "ttH_hzz", "ttH_htt"] self.executable_make_plots = executable_make_plots self.workingDir = os.getcwd() print "Working directory is: " + self.workingDir create_if_not_exists(self.outputDir) self.stdout_file = codecs.open(os.path.join( self.outputDir, "stdout_%s.log" % self.channel), 'w', 'utf-8') self.stderr_file = codecs.open(os.path.join( self.outputDir, "stderr_%s.log" % self.channel), 'w', 'utf-8') self.dirs = {} self.samples = {} self.cfgFiles_analyze_modified = {} self.logFiles_analyze = {} self.sbatchFile_analyze = os.path.join( self.outputDir, "sbatch_analyze_%s.py" % self.channel) self.ntupleFiles = {} self.histogramFiles = {} self.inputFiles_hadd_stage1 = [] self.histogramFile_hadd_stage1 = os.path.join( self.outputDir, DKEY_HIST, "histograms_harvested_stage1_%s.root" % self.channel) self.inputFiles_hadd_stage1_5 = [] self.histogramFile_hadd_stage1_5 = os.path.join( self.outputDir, DKEY_HIST, "histograms_harvested_stage1_5_%s.root" % self.channel) self.inputFiles_hadd_stage2 = [] self.histogramFile_hadd_stage2 = os.path.join( self.outputDir, DKEY_HIST, "histograms_harvested_stage2_%s.root" % self.channel) self.datacardFiles = {} self.cfgFile_prep_dcard_original = os.path.join( self.workingDir, "prepareDatacards_cfg.py") self.cfgFile_prep_dcard_modified = {} self.histogramDir_prep_dcard = None self.make_plots_backgrounds = ["TT", "TTW", "TTZ", "EWK", "Rares"] self.make_plots_signal = "signal" self.cfgFile_make_plots_original = os.path.join( self.workingDir, "makePlots_cfg.py") self.cfgFiles_make_plots_modified = [] self.filesToClean = [] self.rleOutputFiles = {} self.rootOutputFiles = {} self.rootOutputAux = {} if era == '2015': self.triggers_1e = ['HLT_BIT_HLT_Ele23_WPLoose_Gsf_v'] self.triggers_2e = [ 'HLT_BIT_HLT_Ele17_Ele12_CaloIdL_TrackIdL_IsoVL_DZ_v'] self.triggers_1mu = [ 'HLT_BIT_HLT_IsoMu20_v', 'HLT_BIT_HLT_IsoTkMu20_v'] self.triggers_2mu = ['HLT_BIT_HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_v', 'HLT_BIT_HLT_Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL_DZ_v'] self.triggers_1e1mu = ['HLT_BIT_HLT_Mu17_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_v', 'HLT_BIT_HLT_Mu8_TrkIsoVVL_Ele17_CaloIdL_TrackIdL_IsoVL_v'] elif era == '2016': # CV: HLT_Ele25_WPTight_Gsf_v* was prescaled during part of 2016 # Runs B-D, so use HLT_Ele27_eta2p1_WPLoose_Gsf_v in addition self.triggers_1e = ['HLT_BIT_HLT_Ele25_WPTight_Gsf_v', 'HLT_BIT_HLT_Ele27_eta2p1_WPLoose_Gsf_v'] 
self.triggers_2e = [ 'HLT_BIT_HLT_Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ_v'] self.triggers_1mu = [ 'HLT_BIT_HLT_IsoMu22_v', 'HLT_BIT_HLT_IsoTkMu22_v'] self.triggers_2mu = ['HLT_BIT_HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_v', 'HLT_BIT_HLT_Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL_DZ_v'] self.triggers_1e1mu = ['HLT_BIT_HLT_Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_v', 'HLT_BIT_HLT_Mu8_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_v'] self.triggers_2tau = [ 'HLT_BIT_HLT_DoubleMediumIsoPFTau35_Trk1_eta2p1_Reg_v*'] else: raise ValueError( "Invalid Configuration parameter 'era' = %s !!" % era) self.cvmfs_error_log = {}
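# The per-era trigger branches above can equivalently be read as a lookup
# table keyed by era and trigger class; a sketch of that structure (only a
# subset of the trigger names from the code is repeated here):
_SKETCH_TRIGGERS = {
    '2015': {
        '1e': ['HLT_BIT_HLT_Ele23_WPLoose_Gsf_v'],
        '1mu': ['HLT_BIT_HLT_IsoMu20_v', 'HLT_BIT_HLT_IsoTkMu20_v'],
    },
    '2016': {
        '1e': ['HLT_BIT_HLT_Ele25_WPTight_Gsf_v', 'HLT_BIT_HLT_Ele27_eta2p1_WPLoose_Gsf_v'],
        '1mu': ['HLT_BIT_HLT_IsoMu22_v', 'HLT_BIT_HLT_IsoTkMu22_v'],
    },
}

def _sketch_triggers(era, trigger_class):
    if era not in _SKETCH_TRIGGERS:
        raise ValueError("Invalid Configuration parameter 'era' = %s !!" % era)
    return _SKETCH_TRIGGERS[era][trigger_class]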