def combine(self, datacards_cbs, datacards_workspaces, datacards_poi_ranges=None, n_processes=1, *args):
	if datacards_poi_ranges is None:
		datacards_poi_ranges = {}
	tmp_args = " ".join(args)

	chunks = [[None, None]]
	if "{CHUNK}" in tmp_args and "--points" in tmp_args:
		splited_args = tmp_args.split()
		n_points = int(splited_args[splited_args.index("--points") + 1])
		n_points_per_chunk = 199
		chunks = [[chunk*n_points_per_chunk, (chunk+1)*n_points_per_chunk-1] for chunk in xrange(n_points/n_points_per_chunk+1)]

	commands = []
	for index, (chunk_min, chunk_max) in enumerate(chunks):
		commands.extend([[
				"combine -m {MASS} {POI_RANGE} {ARGS} {WORKSPACE} {CHUNK_POINTS}".format(
						MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
						POI_RANGE="--rMin {RMIN} --rMax {RMAX}" if datacard in datacards_poi_ranges else "",
						ARGS=tmp_args.format(CHUNK=str(index), RMIN="{RMIN}", RMAX="{RMAX}"),
						WORKSPACE=workspace,
						CHUNK_POINTS="" if (chunk_min is None) or (chunk_max is None) else "--firstPoint {CHUNK_MIN} --lastPoint {CHUNK_MAX}".format(
								CHUNK_MIN=chunk_min,
								CHUNK_MAX=chunk_max
						)
				).format(RMIN=datacards_poi_ranges.get(datacard, ["", ""])[0], RMAX=datacards_poi_ranges.get(datacard, ["", ""])[1]),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])

	tools.parallelize(_call_command, commands, n_processes=n_processes)
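# The snippets in this collection hand their command lists to tools.parallelize together
# with a _call_command worker that is not shown here. Below is a minimal sketch of what
# such a worker could look like, assuming it receives either a plain command string or a
# [command, working_directory] pair (both forms are built in the functions in this file);
# only the name and call pattern come from the source, the body is an assumption.
import logging
import subprocess

log = logging.getLogger(__name__)

def _call_command(command):
	# accept both a plain command string and a [command, working_directory] pair (assumption)
	cwd = None
	if isinstance(command, (list, tuple)):
		command, cwd = command[0], command[1]
	log.debug(command)
	# the commands are assembled as single shell strings, hence shell=True
	return subprocess.call(command, shell=True, cwd=cwd)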
def main(): parser = argparse.ArgumentParser( description="Run multiple commands in parallel.", parents=[logger.loggingParser]) parser.add_argument( "commands", help= "Commands to be executed on a batch system. They can also be piped into this program.", nargs="*", default=[]) parser.add_argument( "-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) # prepare commands if (len(args.commands) == 0) and (not sys.stdin.isatty()): args.commands.extend(sys.stdin.read().strip().split("\n")) tools.parallelize(run_command, args.commands, n_processes=args.n_processes, description=os.path.basename(sys.argv[0]))
def submission(base_paths, n_processes=1):
	# retrieve and prepare input files
	filenames_per_sample_per_pipeline = {}
	for base_path in base_paths:
		stdout_directories, stderr_directories = tools.subprocessCall(shlex.split("gfal-ls " + base_path))
		tmp_filenames_per_sample_per_pipeline = tools.parallelize(
				get_filenames,
				[[base_path, sample] for sample in stdout_directories.decode().strip().split("\n")],
				n_processes=n_processes,
				description="Retrieving inputs"
		)
		for item in tmp_filenames_per_sample_per_pipeline:
			for sample, filenames_per_pipeline in item.iteritems():
				for pipeline, tmp_filenames in filenames_per_pipeline.iteritems():
					# key by the actual pipeline name instead of the literal string "pipeline"
					filenames_per_sample_per_pipeline.setdefault(sample, {}).setdefault(pipeline, []).extend(tmp_filenames)

	configs, jobfiles = build_configs(filenames_per_sample_per_pipeline)

	# submit tasks
	submit_args = []
	for config, jobfile in zip(configs, jobfiles):
		submit_args.append([config, jobfile])
	tools.parallelize(submit, submit_args, n_processes=1, description="Submitting crab tasks")
def annotate_trees(self, datacards_workspaces, root_filename, value_regex_list, value_replacements=None, n_processes=1, values_tree_files=None, *args): if value_replacements is None: value_replacements = {} if values_tree_files is None: values_tree_files = {} commands = [] for datacard, workspace in datacards_workspaces.iteritems(): float_values = [] found_match = False for value_regex in value_regex_list: search_result = re.search(value_regex, workspace) if not search_result is None: value = search_result.groups()[0] float_values.append(float(value_replacements.get(value, value))) found_match = True else: float_values.append(-999.0) if found_match: files = os.path.join(os.path.dirname(workspace), root_filename) values_tree_files.setdefault(tuple(float_values), []).extend(glob.glob(files)) commands.append("annotate-trees.py {FILES} --values {VALUES} {ARGS}".format( FILES=files, VALUES=" ".join([str(value) for value in float_values]), ARGS=" ".join(args) )) tools.parallelize(_call_command, commands, n_processes=n_processes, description="annotate-trees.py") return values_tree_files
def postfit_shapes_fromworkspace(self, datacards_cbs, datacards_workspaces, s_fit_only=False, n_processes=1, *args, **kwargs):
	# read the Higgs mass directly; looping over kwargs.items() overwrote the value on every
	# iteration and left it undefined when no keyword arguments were passed
	higgs_mass = kwargs.get("higgs_mass", "0")
	commands = []
	datacards_postfit_shapes = {}
	fit_type_list = kwargs.get("fit_type_list", ["fit_s", "fit_b"])
	if s_fit_only:
		fit_type_list.remove("fit_b")

	for fit_type in fit_type_list:
		commands.extend(["PostFitShapesFromWorkspace --postfit -w {WORKSPACE} -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}".format(
				WORKSPACE=datacards_workspaces[datacard],
				DATACARD=datacard,
				OUTPUT=os.path.splitext(datacard)[0]+"_"+fit_type+".root",
				MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
				FIT_RESULT=os.path.join(os.path.dirname(datacard), kwargs.get("fit_result", "fitDiagnostics.root")+":"+fit_type),
				ARGS=" ".join(args)
		) for datacard, cb in datacards_cbs.iteritems()])

		datacards_postfit_shapes.setdefault(fit_type, {}).update({
				datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root"
		for datacard, cb in datacards_cbs.iteritems()})

	tools.parallelize(_call_command, commands, n_processes=n_processes, description="PostFitShapesFromWorkspace")

	return datacards_postfit_shapes
def main(): parser = argparse.ArgumentParser(description="Merge Artus outputs per nick name.", parents=[logger.loggingParser]) parser.add_argument("project_dir", help="Artus Project directory containing the files \"output/*/*.root\" to merge") parser.add_argument("-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") parser.add_argument("--output-dir", help="Directory to store merged files. Default: Same as project_dir.") args = parser.parse_args() logger.initLogger(args) output_dirs = glob.glob(os.path.join(args.project_dir, "output/*")) nick_names = [nick for nick in [output_dir[output_dir.rfind("/")+1:] for output_dir in output_dirs] if not ".tar.gz" in nick] outputs_per_nick = {nick : glob.glob(os.path.join(args.project_dir, "output", nick, "*.root")) for nick in nick_names} outputs_per_nick = {nick : files for nick, files in outputs_per_nick.iteritems() if len(files) > 0} commands = [] for nick_name, output_files in pi.ProgressIterator(outputs_per_nick.iteritems(), length=len(outputs_per_nick), description="Merging Artus outputs"): merged_dir = os.path.join(args.project_dir if(args.output_dir == None) else args.output_dir, "merged", nick_name) if not os.path.exists(merged_dir): os.makedirs(merged_dir) commands.append("hadd -f %s %s" % (os.path.join(merged_dir, nick_name+".root"), " ".join(output_files))) tools.parallelize(_call_command, commands, n_processes=args.n_processes)
def postfit_shapes(self, datacards_cbs, s_fit_only=False, n_processes=1, *args):
	commands = []
	datacards_postfit_shapes = {}
	fit_type_list = ["fit_s", "fit_b"]
	if s_fit_only:
		fit_type_list.remove("fit_b")

	for fit_type in fit_type_list:
		commands.extend(["PostFitShapes --postfit -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}".format(
				DATACARD=datacard,
				OUTPUT=os.path.splitext(datacard)[0]+"_"+fit_type+".root",
				MASS=[mass for mass in cb.mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
				FIT_RESULT=os.path.join(os.path.dirname(datacard), "mlfit.root:"+fit_type),
				ARGS=" ".join(args)
		) for datacard, cb in datacards_cbs.iteritems()])

		datacards_postfit_shapes.setdefault(fit_type, {}).update({
				datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root"
		for datacard, cb in datacards_cbs.iteritems()})

	tools.parallelize(_call_command, commands, n_processes=n_processes)

	return datacards_postfit_shapes
def annotate_trees(self, datacards_workspaces, root_filename, value_regex, value_replacements=None, n_processes=1, *args):
	if value_replacements is None:
		value_replacements = {}

	commands = []
	for datacard, workspace in datacards_workspaces.iteritems():
		search_result = re.search(value_regex, workspace)
		if not search_result is None:
			value = search_result.groups()[0]
			float_value = float(value_replacements.get(value, value))
			commands.append("annotate-trees.py {FILES} --values {VALUE} {ARGS}".format(
					FILES=os.path.join(os.path.dirname(workspace), root_filename),
					VALUE=float_value,
					ARGS=" ".join(args)
			))

	tools.parallelize(_call_command, commands, n_processes=n_processes)
def postfit_shapes_fromworkspace(self, datacards_cbs, datacards_workspaces, s_fit_only=False, n_processes=1, *args, **kwargs): higgs_mass = kwargs.get("higgs_mass", 125) commands = [] datacards_postfit_shapes = {} fit_type_list = kwargs.get("fit_type_list", ["fit_s", "fit_b"]) if s_fit_only: fit_type_list.remove("fit_b") for fit_type in fit_type_list: commands.extend(["PostFitShapesFromWorkspace --postfit -w {WORKSPACE} -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}".format( WORKSPACE=datacards_workspaces[datacard], DATACARD=datacard, OUTPUT=os.path.splitext(datacard)[0]+"_"+fit_type+".root", MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses? FIT_RESULT=os.path.join(os.path.dirname(datacard), kwargs.get("fit_result", "fitDiagnostics.root")+":"+fit_type), ARGS=" ".join(args) ) for datacard, cb in datacards_cbs.iteritems()]) datacards_postfit_shapes.setdefault(fit_type, {}).update({ datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root" for datacard, cb in datacards_cbs.iteritems()}) tools.parallelize(_call_command, commands, n_processes=n_processes, description="PostFitShapesFromWorkspace") return datacards_postfit_shapes
def main(): parser = argparse.ArgumentParser( description="Merge Artus outputs per nick name.", parents=[logger.loggingParser]) parser.add_argument( "project_dir", help= "Artus Project directory containing the files \"output/*/*.root\" to merge" ) parser.add_argument( "-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") parser.add_argument( "--output-dir", help="Directory to store merged files. Default: Same as project_dir.") args = parser.parse_args() logger.initLogger(args) output_dirs = glob.glob(os.path.join(args.project_dir, "output/*")) nick_names = [ nick for nick in [output_dir[output_dir.rfind("/") + 1:] for output_dir in output_dirs] if not ".tar.gz" in nick ] outputs_per_nick = { nick: glob.glob(os.path.join(args.project_dir, "output", nick, "*.root")) for nick in nick_names } # drop potentially existing SvfitCaches from the filelist for nick, files in outputs_per_nick.iteritems(): outputs_per_nick[nick] = [ file for file in files if ("SvfitCache" not in file) ] outputs_per_nick = { nick: files for nick, files in outputs_per_nick.iteritems() if len(files) > 0 } commands = [] for nick_name, output_files in pi.ProgressIterator( outputs_per_nick.iteritems(), length=len(outputs_per_nick), description="Merging Artus outputs"): merged_dir = os.path.join( args.project_dir if (args.output_dir == None) else args.output_dir, "merged", nick_name) if not os.path.exists(merged_dir): os.makedirs(merged_dir) commands.append("hadd.py -a \" -f\" -t %s \"%s\"" % (os.path.join( merged_dir, nick_name + ".root"), " ".join(output_files))) tools.parallelize(_call_command, commands, n_processes=args.n_processes)
def text2workspace(self, datacards_cbs, n_processes=1, *args):
	commands = ["text2workspace.py -m {MASS} {ARGS} {DATACARD} -o {OUTPUT}".format(
			MASS=[mass for mass in cb.mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
			ARGS=" ".join(args),
			DATACARD=datacard,
			OUTPUT=os.path.splitext(datacard)[0]+".root"
	) for datacard, cb in datacards_cbs.iteritems()]

	tools.parallelize(_call_command, commands, n_processes=n_processes)

	return {datacard : os.path.splitext(datacard)[0]+".root" for datacard in datacards_cbs.keys()}
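# A hypothetical driver showing how the helpers above chain together: text2workspace()
# produces the {datacard: workspace} mapping that combine() consumes. The object name
# `datacards` and the combine arguments are illustrative, not taken from the source.
datacards_workspaces = datacards.text2workspace(datacards_cbs, 4)
datacards.combine(
		datacards_cbs,
		datacards_workspaces,
		None,  # no explicit POI ranges
		4,     # n_processes
		"-M MultiDimFit --algo grid --points 396 -n Chunk{CHUNK}"  # {CHUNK} together with --points triggers the chunked scan
)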
def main(): parser = argparse.ArgumentParser(description="Run multiple commands in parallel.", parents=[logger.loggingParser]) parser.add_argument("commands", help="Commands to be executed on a batch system. They can also be piped into this program.", nargs="*", default=[]) parser.add_argument("-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) # prepare commands if (len(args.commands) == 0) and (not sys.stdin.isatty()): args.commands.extend(sys.stdin.read().strip().split("\n")) tools.parallelize(run_command, args.commands, n_processes=args.n_processes, description=os.path.basename(sys.argv[0]))
def print_pulls(self, datacards_cbs, n_processes=1, *args, **kwargs): commands = [] for pulls_format, file_format in zip(["latex", "text"], ["tex", "txt"]): for all_nuissances in [False, True]: commands.extend([[ "execute-command.py \"python $CMSSW_BASE/src/HiggsAnalysis/CombinedLimit/test/diffNuisances.py -f {FORMAT} {ALL} {PLOT} {ARGS} {FIT_RESULT}\" --log-file {LOG_FILE}".format( FORMAT=pulls_format, ALL=("-a" if all_nuissances else ""), PLOT="-g "+("" if all_nuissances else "largest_")+"pulls.root", ARGS=" ".join(args), FIT_RESULT=os.path.join(os.path.dirname(datacard), kwargs.get("fit_result", "fitDiagnostics.root")), LOG_FILE=("" if all_nuissances else "largest_")+"pulls."+file_format ), os.path.dirname(datacard) ] for datacard in datacards_cbs.keys()]) tools.parallelize(_call_command, commands, n_processes=n_processes, description="diffNuisances.py")
def print_pulls(self, datacards_cbs, n_processes=1, *args): commands = [] for pulls_format, file_format in zip(["latex", "text"], ["tex", "txt"]): for all_nuissances in [False, True]: commands.extend([[ "execute-command.py \"python $CMSSW_BASE/src/HiggsAnalysis/CombinedLimit/test/diffNuisances.py -f {FORMAT} {ALL} {PLOT} {ARGS} {FIT_RESULT}\" --log-file {LOG_FILE}".format( FORMAT=pulls_format, ALL=("-a" if all_nuissances else ""), PLOT="-g "+("" if all_nuissances else "largest_")+"pulls.root", ARGS=" ".join(args), FIT_RESULT=os.path.join(os.path.dirname(datacard), "mlfit.root"), LOG_FILE=("" if all_nuissances else "largest_")+"pulls."+file_format ), os.path.dirname(datacard) ] for datacard in datacards_cbs.keys()]) tools.parallelize(_call_command, commands, n_processes=n_processes)
def text2workspace(self, datacards_cbs, n_processes=1, *args, **kwargs): physics_model = re.search("(-P|--physics-model)[\s=\"\']*\S*:(?P<physics_model>\S*)[\"\']?\s", " ".join(args)) if physics_model is None: physics_model = {} else: physics_model = physics_model.groupdict() higgs_mass = kwargs.get("higgs_mass", 125) commands = ["text2workspace.py -m {MASS} {ARGS} {DATACARD} -o {OUTPUT}".format( MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses? ARGS=" ".join(args), DATACARD=datacard, OUTPUT=os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root" ) for datacard, cb in datacards_cbs.iteritems()] tools.parallelize(_call_command, commands, n_processes=n_processes, description="text2workspace.py") return {datacard : os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root" for datacard in datacards_cbs.keys()}
def annotate_trees(self, datacards_workspaces, root_filename, value_regex, value_replacements=None, n_processes=1, *args): if value_replacements is None: value_replacements = {} commands = [] for datacard, workspace in datacards_workspaces.iteritems(): search_result = re.search(value_regex, workspace) if not search_result is None: value = search_result.groups()[0] float_value = float(value_replacements.get(value, value)) commands.append("annotate-trees.py {FILES} --values {VALUE} {ARGS}".format( FILES=os.path.join(os.path.dirname(workspace), root_filename), VALUE=float_value, ARGS=" ".join(args) )) tools.parallelize(_call_command, commands, n_processes=n_processes)
def main(): parser = argparse.ArgumentParser(description="Convert CSV files to ROOT files.", parents=[logger.loggingParser]) parser.add_argument("files", nargs="+", help="CSV Files.") parser.add_argument("--variable-lists", nargs="+", default=[""], help="Variable lists (in case the CSV has no header), e.g. var1:var2:... [Default: %(default)s]") parser.add_argument("-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) if len(args.variable_lists) == 1: args.variable_lists = args.variable_lists * len(args.files) tools.parallelize(csv2root, zip(args.files, args.variable_lists), n_processes=args.n_processes, description="Converting")
def merge_local(args): outputs_per_nick = folders_to_merge(args) if (args.project_subdir != None): # keep only single path outputs_per_nick = { args.project_subdir: outputs_per_nick[args.project_subdir] } # drop potentially existing SvfitCaches from the filelist for nick, files in outputs_per_nick.iteritems(): outputs_per_nick[nick] = [ file for file in files if ("SvfitCache" not in file) ] outputs_per_nick = { nick: files for nick, files in outputs_per_nick.iteritems() if len(files) > 0 } hadd_arguments = [] for nick_name, output_files in pi.ProgressIterator( outputs_per_nick.iteritems(), length=len(outputs_per_nick), description="Merging Artus outputs"): merged_dir = os.path.join( args.project_dir[0] if (args.output_dir == None) else args.output_dir, "merged", nick_name) if not os.path.exists(merged_dir): os.makedirs(merged_dir) target_filename = os.path.join(merged_dir, nick_name + ".root") if (args.project_subdir != None): target_filename = "merged.root" hadd_arguments.append({ "target_file": target_filename, "source_files": output_files, "hadd_args": " -f ", "max_files": 500 }) tools.parallelize(hadd2, hadd_arguments, n_processes=args.n_processes, description="Merging Artus outputs")
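# hadd2 is referenced above but not defined in these snippets. A minimal sketch of a
# compatible worker, assuming tools.parallelize passes each argument dict as a single
# argument and that max_files exists to keep individual hadd calls below a file limit;
# everything except the dict keys is an assumption.
import os
import subprocess

def hadd2(kwargs):
	# one of the dicts built above:
	# {"target_file": ..., "source_files": [...], "hadd_args": " -f ", "max_files": 500}
	target_file = kwargs["target_file"]
	source_files = kwargs["source_files"]
	hadd_args = kwargs.get("hadd_args", "")
	max_files = kwargs.get("max_files", 500)

	# merge in chunks so that a single hadd call never sees more than max_files inputs,
	# then merge the intermediate chunk files into the final target
	intermediate_files = []
	for index, start in enumerate(range(0, len(source_files), max_files)):
		chunk = source_files[start:start+max_files]
		intermediate_file = "%s.chunk%d.root" % (os.path.splitext(target_file)[0], index)
		subprocess.call("hadd %s %s %s" % (hadd_args, intermediate_file, " ".join(chunk)), shell=True)
		intermediate_files.append(intermediate_file)

	subprocess.call("hadd %s %s %s" % (hadd_args, target_file, " ".join(intermediate_files)), shell=True)
	for intermediate_file in intermediate_files:
		os.remove(intermediate_file)
	return target_file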
def text2workspace(self, datacards_cbs, n_processes=1, *args, **kwargs):
	physics_model = re.search("(-P|--physics-model)[\s=\"\']*\S*:(?P<physics_model>\S*)[\"\']?\s", " ".join(args))
	if physics_model is None:
		physics_model = {}
	else:
		physics_model = physics_model.groupdict()

	# read the Higgs mass directly; the previous loop over kwargs.items() overwrote the
	# value on every iteration and failed for empty kwargs
	higgs_mass = kwargs.get("higgs_mass", "0")

	commands = ["text2workspace.py -m {MASS} {ARGS} {DATACARD} -o {OUTPUT}".format(
			MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
			ARGS=" ".join(args),
			DATACARD=datacard,
			OUTPUT=os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root"
	) for datacard, cb in datacards_cbs.iteritems()]

	tools.parallelize(_call_command, commands, n_processes=n_processes, description="text2workspace.py")

	return {datacard : os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root" for datacard in datacards_cbs.keys()}
def plot1DScan(self, datacards_cbs, datacards_workspaces, poi, n_processes=1, *args, **kwargs): tmp_args = "".join(args) higgs_mass = kwargs.get("higgs_mass", 125) for datacard, workspace in datacards_workspaces.iteritems(): if not os.path.exists(os.path.join(os.path.dirname(workspace), "plots/")): os.makedirs(os.path.join(os.path.dirname(workspace), "plots/")) commandsPlot = [] commandsPlot.extend([[ "$CMSSW_BASE/src/CombineHarvester/CombineTools/scripts/plot1DScan.py --POI {POI} --output={OUTPUT} {ARGS} higgsCombine{NAME}.MultiDimFit.mH{MASS}.root".format( OUTPUT="nll", MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, POI=poi, NAME="Test", ARGS=tmp_args.format() ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)")
def plot1DScan(self, datacards_cbs, datacards_workspaces, poi, n_processes=1, *args, **kwargs): tmp_args = "".join(args) for key, value in kwargs.items(): higgs_mass = value if "higgs_mass" in key else "0" for datacard, workspace in datacards_workspaces.iteritems(): if not os.path.exists(os.path.join(os.path.dirname(workspace), "plots/")): os.makedirs(os.path.join(os.path.dirname(workspace), "plots/")) commandsPlot = [] commandsPlot.extend([[ "$CMSSW_BASE/src/CombineHarvester/CombineTools/scripts/plot1DScan.py --POI {POI} higgsCombine.MultiDimFit.mH{MASS}.root".format( MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, POI=poi, ARGS=tmp_args.format() ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)")
def nuisance_impacts(self, datacards_cbs, datacards_workspaces, n_processes=1, *args, **kwargs): tmp_args = " ".join(args) higgs_mass = kwargs.get("higgs_mass", 125) commandsInitialFit = [] commandsInitialFit.extend([[ "combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} --robustFit 1 --doInitialFit --allPars {ARGS}".format( MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, ARGS=tmp_args.format(), WORKSPACE=workspace ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) commandsFits = [] commandsFits.extend([[ "combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} --robustFit 1 --doFits --parallel {NPROCS} --allPars {ARGS}".format( MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, ARGS=tmp_args.format(), WORKSPACE=workspace, NPROCS=n_processes ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) commandsOutput = [] commandsOutput.extend([[ "combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} -o impacts.json --parallel {NPROCS} --allPars {ARGS}".format( MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, ARGS=tmp_args.format(), WORKSPACE=workspace, NPROCS=n_processes ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) commandsPlot = [] commandsPlot.extend([[ "plotImpacts.py -i {INPUT} -o {OUTPUT}".format( INPUT="impacts.json", OUTPUT="plots/nuisance_impacts" ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) tools.parallelize(_call_command, commandsInitialFit, n_processes=n_processes, description="combineTool.py (initial fits)") tools.parallelize(_call_command, commandsFits, n_processes=1, description="combineTool.py (fits)") tools.parallelize(_call_command, commandsOutput, n_processes=1, description="combineTool.py (outputs)") tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)")
def hypotestresulttree(self, datacards_cbs, n_processes=1, rvalue="1", poiname="x"): commands = [] hypotestresulttree = {} #for fit_type in fit_type_list: commands.extend(["root -q -b \"HiggsAnalysis/KITHiggsToTauTau/scripts/hypoTestResultTree.cxx(\\\"{INPUT}\\\",\\\"{OUTPUT}\\\",{MASS},{RVALUE},\\\"{POINAME}\\\")\"".format( INPUT=os.path.join(os.path.dirname(datacard),"higgsCombine.HybridNew.mH{angle}.root".format(angle = [mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")), OUTPUT=os.path.join(os.path.dirname(datacard), "higgsCombine.HybridNew.mH{angle}_qmu.root".format(angle =[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")), MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0", # TODO: maybe there are more masses? RVALUE= str(rvalue), POINAME=str(poiname) #ARGS=", ".join(args) ) for datacard, cb in datacards_cbs.iteritems()]) #datacards_postfit_shapes.setdefault(fit_type, {}).update({ # datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root" #for datacard, cb in datacards_cbs.iteritems()}) tools.parallelize(_call_command, commands, n_processes=n_processes, description="hypoTestResultTree.cxx") return {datacard : os.path.join(os.path.dirname(datacard), "higgsCombine.HybridNew.mH{angle}_qmu.root".format(angle =[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")) for datacard in datacards_cbs.keys()}
def submission(base_paths, n_processes=1): # retrieve and prepare input files filenames_per_sample_per_pipeline = {} for base_path in base_paths: stdout_directories, stderr_directories = tools.subprocessCall(shlex.split("gfal-ls " + base_path)) tmp_filenames_per_sample_per_pipeline = tools.parallelize( get_filenames, [[base_path, sample] for sample in stdout_directories.decode().strip().split("\n")], n_processes=n_processes, description="Retrieving inputs" ) for item in tmp_filenames_per_sample_per_pipeline: for sample, filenames_per_pipeline in item.iteritems(): for pipeline, tmp_filenames in filenames_per_pipeline.iteritems(): filenames_per_sample_per_pipeline.setdefault(sample, {}).setdefault("pipeline", []).extend(tmp_filenames) configs, jobfiles = build_configs(filenames_per_sample_per_pipeline) # submit tasks submit_args = [] for config, jobfile in zip(configs, jobfiles): submit_args.append([config, jobfile]) tools.parallelize(submit, submit_args, n_processes=1, description="Submitting crab tasks")
def postfit_shapes(self, datacards_cbs, s_fit_only=False, n_processes=1, *args): commands = [] datacards_postfit_shapes = {} fit_type_list = ["fit_s", "fit_b"] if s_fit_only: fit_type_list.remove("fit_b") for fit_type in fit_type_list: commands.extend(["PostFitShapes --postfit -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}".format( DATACARD=datacard, OUTPUT=os.path.splitext(datacard)[0]+"_"+fit_type+".root", MASS=[mass for mass in cb.mass_set() if mass != "*"][0], # TODO: maybe there are more masses? FIT_RESULT=os.path.join(os.path.dirname(datacard), "mlfit.root:"+fit_type), ARGS=" ".join(args) ) for datacard, cb in datacards_cbs.iteritems()]) datacards_postfit_shapes.setdefault(fit_type, {}).update({ datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root" for datacard, cb in datacards_cbs.iteritems()}) tools.parallelize(_call_command, commands, n_processes=n_processes) return datacards_postfit_shapes
def merge_local(args): outputs_per_nick = folders_to_merge(args) if(args.project_subdir != None): # keep only single path outputs_per_nick = { args.project_subdir : outputs_per_nick[args.project_subdir] } # drop potentially existing SvfitCaches from the filelist for nick, files in outputs_per_nick.iteritems(): outputs_per_nick[nick] = [file for file in files if ("SvfitCache" not in file)] outputs_per_nick = {nick : files for nick, files in outputs_per_nick.iteritems() if len(files) > 0} hadd_arguments = [] for nick_name, output_files in pi.ProgressIterator(outputs_per_nick.iteritems(), length=len(outputs_per_nick), description="Merging Artus outputs"): merged_dir = os.path.join(args.project_dir[0] if(args.output_dir == None) else args.output_dir, "merged", nick_name) if not os.path.exists(merged_dir): os.makedirs(merged_dir) target_filename = os.path.join(merged_dir, nick_name+".root") if(args.project_subdir != None): target_filename = "merged.root" hadd_arguments.append({"target_file": target_filename, "source_files": output_files, "hadd_args" : " -f -v 0 ", "max_files" : 500}) tools.parallelize(hadd2, hadd_arguments, n_processes=args.n_processes, description="Merging Artus outputs")
def main(): parser = argparse.ArgumentParser( description= "Collect matching trees from input files into one output tree", parents=[logger.loggingParser]) parser.add_argument( "-i", "--input-dirs", help= "Input directories = crab project directories containing the subdirectories with crab tasks", nargs="+") parser.add_argument( "-o", "--output-dir", default=None, help= "Local output directory. [Default: subdir \"results\" in first input directory]" ) parser.add_argument( "-d", "--dcache-target", default=None, help= "Directory on dCache (srm) where the files should be copied to. [Default: %(default)s]" ) parser.add_argument( "--input-trees", nargs="+", default=["svfitCache"], help="Paths of input SVfit cache trees. [Default: %(default)s]") parser.add_argument( "--output-tree", default="svfitCache", help="Name of output SVfit cache tree. [Default: %(default)s]") parser.add_argument( "--previous-cache", default="", help= "Path to a previous cache which will be merged. [Default: %(default)s]" ) parser.add_argument( "--dcache", type=bool, default=False, help="Read&Write from and to desy dcache[Default: %(default)s]") parser.add_argument( "--no-run", default=False, action="store_true", help="Do not run but only print dict [Default: %(default)s]") parser.add_argument( "-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) if args.output_dir is None: args.output_dir = os.path.join(args.input_dirs[0], "results") # get paths to crab outputs #max_n_jobs = 8000 #max_n_retrieve = 500 get_crab_outputs_args = [] for input_dir in args.input_dirs: #for jobid_start in xrange(1, max_n_jobs, max_n_retrieve): # jobid_end = jobid_start + max_n_retrieve - 1 # get_crab_outputs_args.append([input_dir, "{jobid_start}-{jobid_end}".format(jobid_start=jobid_start, jobid_end=jobid_end)]) get_crab_outputs_args.append([input_dir, "1-10"]) tar_files = tools.parallelize(_get_crab_outputs, get_crab_outputs_args, max(args.n_processes, 2), description="crab getoutput --dump") tar_files = tools.flattenList(tar_files) # download and un-tar download_untar_args = [[tar_file, args.output_dir] for tar_file in tar_files] tools.parallelize(_download_untar, download_untar_args, args.n_processes, description="download and un-tar crab outputs") root_files = glob.glob(os.path.join(args.output_dir, "*.root")) # TODO: maybe add more root files from -i arguments, that did not need to be un-tared root_files_per_sample_nick = {} for root_file in root_files: basename = os.path.basename(root_file) sample_nick = basename[:basename.index("_job_")] root_files_per_sample_nick.setdefault(sample_nick, []).append(root_file) merged_output_dir = os.path.join(args.output_dir, "merged") if not os.path.exists(merged_output_dir): os.makedirs(merged_output_dir) merge_outputs_args = [[ os.path.join(merged_output_dir, sample_nick + ".root"), tmp_root_files, "-f" ] for sample_nick, tmp_root_files in root_files_per_sample_nick.iteritems()] tools.parallelize(_merge_outputs, merge_outputs_args, args.n_processes, description="merging") if args.dcache_target: dcache_copy_commands = [ "gfal-copy -v -f -r " + merged_output_dir + " " + args.dcache_target ] tools.parallelize(_call_command, dcache_copy_commands, args.n_processes, description="copying to dCache") rm_commands = ["rm " + root_file for root_file in root_files] if args.dcache_target: rm_commands.extend([ "rm " + os.path.join(merged_output_dir, sample_nick + ".root") for sample_nick in 
root_files_per_sample_nick.keys() ]) tools.parallelize(_call_command, rm_commands, args.n_processes, description="deleting temporary files") log.info("\nJSON configuration for Artus:\n") config_output_dir = args.dcache_target if args.dcache_target else merged_output_dir for src, dst in filename_replacements.iteritems(): config_output_dir = config_output_dir.replace(src, dst) for sample_nick in sorted(root_files_per_sample_nick.keys()): log.info("\"" + sample_nick + "\" : \"" + os.path.join(config_output_dir, sample_nick + ".root") + "\",")
files_dict[file_name][channel]["weights"].append(w) files_dict[file_name][channel]["ntuples"].append( n2) #if n not in files_dict[file_name][channel]["ntuples"]: jsonTools.JsonDict(files_dict).save(os.path.join(output_dir, "ReduceFiles.json"), indent=4) give_away_list = [] for filename, item in files_dict.iteritems(): item["in_dir"] = input_dir item["out_dir"] = output_dir give_away_list.append({filename: item}) aTools.parallelize(reduce_file, give_away_list, n_processes=args.n_processes) #for aufruf in give_away_list: #reduce_file(aufruf) #for filename, item in files_dict.iteritems(): ##setting file paths #input_file_path = os.path.join(input_dir, os.path.join(filename.replace(".root", ""), filename)) #output_file_path = os.path.join(output_dir, os.path.join(filename.replace(".root", ""), filename)) #if not os.path.exists(os.path.join(output_dir, filename.replace(".root", ""))): #os.makedirs(os.path.join(output_dir, filename.replace(".root", ""))) ##debugging to check which files are processed #log.debug("Reduce input from file:") #log.debug(input_file_path) #if os.path.isfile(input_file_path):
def main(): parser = argparse.ArgumentParser(description="Collect matching trees from input files into one output tree", parents=[logger.loggingParser]) parser.add_argument("-i", "--input-dirs", help="Input directories = crab project directories containing the subdirectories with crab tasks", nargs="+") parser.add_argument("-o", "--output-dir", default=None, help="Local output directory. [Default: subdir \"results\" in first input directory]") parser.add_argument("-d", "--dcache-target", default=None, help="Directory on dCache (srm) where the files should be copied to. [Default: %(default)s]") parser.add_argument("--input-trees", nargs="+", default=["svfitCache"], help="Paths of input SVfit cache trees. [Default: %(default)s]") parser.add_argument("--output-tree", default="svfitCache", help="Name of output SVfit cache tree. [Default: %(default)s]") parser.add_argument("--previous-cache", default="", help="Path to a previous cache which will be merged. [Default: %(default)s]") parser.add_argument("--dcache", type=bool, default=False, help="Read&Write from and to desy dcache[Default: %(default)s]") parser.add_argument("--no-run", default=False, action="store_true", help="Do not run but only print dict [Default: %(default)s]") parser.add_argument("-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) if args.output_dir is None: args.output_dir = os.path.join(args.input_dirs[0], "results") tar_files = [] for input_dir in args.input_dirs: tar_files.extend(glob.glob(os.path.join(input_dir, "*/results/*.tar"))) tar_files.extend(glob.glob(os.path.join(input_dir, "results/*.tar"))) tar_commands = ["tar -x -f "+tar_file+" -C "+args.output_dir+" --overwrite" for tar_file in tar_files] tools.parallelize(_call_command, tar_commands, args.n_processes, description="un-tar crab outputs") root_files = glob.glob(os.path.join(args.output_dir, "*.root")) # TODO: maybe add more root files from -i arguments, that did not need to be un-tared root_files_per_sample_nick = {} for root_file in root_files: basename = os.path.basename(root_file) sample_nick = basename[:basename.index("_job_")] root_files_per_sample_nick.setdefault(sample_nick, []).append(root_file) merged_output_dir = os.path.join(args.output_dir, "merged") if not os.path.exists(merged_output_dir): os.makedirs(merged_output_dir) hadd_commands = ["hadd.py "+(" ".join(tmp_root_files))+" -t "+os.path.join(merged_output_dir, sample_nick+".root")+" -a \" -f -v 0\"" for sample_nick, tmp_root_files in root_files_per_sample_nick.iteritems()] tools.parallelize(_call_command, hadd_commands, args.n_processes, description="merging") if args.dcache_target: dcache_copy_commands = ["gfal-copy -f -r "+merged_output_dir+" "+args.dcache_target] tools.parallelize(_call_command, dcache_copy_commands, args.n_processes, description="copying to dCache") rm_commands = ["rm "+root_file for root_file in root_files] if args.dcache_target: rm_commands.extend(["rm "+os.path.join(merged_output_dir, sample_nick+".root") for sample_nick in root_files_per_sample_nick.keys()]) tools.parallelize(_call_command, rm_commands, args.n_processes, description="deleting temporary files") log.info("\nJSON configuration for Artus:\n") config_output_dir = args.dcache_target if args.dcache_target else merged_output_dir for src, dst in filename_replacements.iteritems(): config_output_dir = config_output_dir.replace(src, dst) for sample_nick in sorted(root_files_per_sample_nick.keys()): 
log.info("\""+sample_nick+"\" : \""+os.path.join(config_output_dir, sample_nick+".root")+"\",")
def combine(self, datacards_cbs, datacards_workspaces, datacards_poi_ranges=None, n_processes=1, *args, **kwargs): if datacards_poi_ranges is None: datacards_poi_ranges = {} tmp_args = " ".join(args) higgs_mass = kwargs.get("higgs_mass", 125) chunks = [[None, None]] if "{CHUNK}" in tmp_args and "--points" in tmp_args: splited_args = tmp_args.split() n_points = int(splited_args[splited_args.index("--points") + 1]) n_points_per_chunk = 199 chunks = [[chunk*n_points_per_chunk, (chunk+1)*n_points_per_chunk-1] for chunk in xrange(n_points/n_points_per_chunk+1)] method = re.search("(-M|--method)[\s=\"\']*(?P<method>\w*)[\"\']?\s", tmp_args) if not method is None: method = method.groupdict()["method"] name = re.search("(-n|--name)[\s=\"\']*(?P<name>\w*)[\"\']?\s", tmp_args) if not name is None: name = name.groupdict()["name"] split_stat_syst_uncs = kwargs.get("split_stat_syst_uncs", False) if split_stat_syst_uncs and (method is None): log.error("Uncertainties are not split into stat. and syst. components, since the method for combine is unknown!") split_stat_syst_uncs = False if split_stat_syst_uncs and (not "MultiDimFit" in method): log.error("Uncertainties are not split into stat. and syst. components. This is only supported for the MultiDimFit method!") split_stat_syst_uncs = False split_stat_syst_uncs_options = [""] split_stat_syst_uncs_names = [""] if split_stat_syst_uncs: split_stat_syst_uncs_options = [ "--saveWorkspace", "--snapshotName {method} -w w".format(method=method), "--snapshotName {method} -w w --freezeNuisanceGroups syst_plus_bbb".format(method=method, uncs="{uncs}"), #DBUG TEST!!!!!!!!!18.1.2017 --freezeNuisances ] split_stat_syst_uncs_names = [ "Workspace", "TotUnc", "StatUnc", ] for split_stat_syst_uncs_index, (split_stat_syst_uncs_option, split_stat_syst_uncs_name) in enumerate(zip(split_stat_syst_uncs_options, split_stat_syst_uncs_names)): prepared_tmp_args = None new_name = None if split_stat_syst_uncs: new_name = ("" if name is None else name) + split_stat_syst_uncs_name if name is None: prepared_tmp_args = tmp_args + " -n " + new_name else: prepared_tmp_args = copy.deepcopy(tmp_args) prepared_tmp_args = re.sub("(--algo)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2 "+("none" if split_stat_syst_uncs_index == 0 else "\\3")+"\\4", prepared_tmp_args) prepared_tmp_args = re.sub("(-n|--name)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2"+new_name+"\\4", prepared_tmp_args) else: prepared_tmp_args = tmp_args prepared_tmp_args = re.sub("-n -n", "-n", prepared_tmp_args) commands = [] for chunk_index, (chunk_min, chunk_max) in enumerate(chunks): commands.extend([[ "combine -m {MASS} {POI_RANGE} {ARGS} {CHUNK_POINTS} {SPLIT_STAT_SYST_UNCS} {WORKSPACE}".format( MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses? 
POI_RANGE="--rMin {RMIN} --rMax {RMAX}" if datacard in datacards_poi_ranges else "", ARGS=prepared_tmp_args.format(CHUNK=str(chunk_index), RMIN="{RMIN}", RMAX="{RMAX}"), CHUNK_POINTS = "" if (chunk_min is None) or (chunk_max is None) else "--firstPoint {CHUNK_MIN} --lastPoint {CHUNK_MAX}".format( CHUNK_MIN=chunk_min, CHUNK_MAX=chunk_max ), SPLIT_STAT_SYST_UNCS=split_stat_syst_uncs_option.format(uncs=",".join(kwargs.get("additional_freeze_nuisances", [])+datacards_cbs[datacard].syst_name_set())), WORKSPACE="-d "+workspace ).format(RMIN=datacards_poi_ranges.get(datacard, ["", ""])[0], RMAX=datacards_poi_ranges.get(datacard, ["", ""])[1]), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) tools.parallelize(_call_command, commands, n_processes=n_processes, description="combine") if split_stat_syst_uncs and (split_stat_syst_uncs_index == 0): # replace workspaces by saved versions from the first fit including the postfit nuisance parameter values for datacard, workspace in datacards_workspaces.iteritems(): datacards_workspaces[datacard] = glob.glob(os.path.join(os.path.dirname(workspace), "higgsCombine"+new_name+"."+method+".*.root"))[0]
def multi_plots(self, list_of_config_dicts, list_of_args_strings, n_processes=1, n_fast_plots=None): config_dicts = list_of_config_dicts if isinstance(list_of_config_dicts, collections.Iterable) and not isinstance(list_of_config_dicts, basestring) else [list_of_config_dicts] args_strings = list_of_args_strings if isinstance(list_of_args_strings, collections.Iterable) and not isinstance(list_of_args_strings, basestring) else [list_of_args_strings] # fill up missing list elements by repeating previous ones n_plots = max(len(config_dicts), len(args_strings)) if (len(config_dicts) > 1) and (len(config_dicts) < n_plots): log.warning("Too few config dicts specified!") if (len(args_strings) > 1) and (len(args_strings) < n_plots): log.warning("Too few argument lists specified!") config_dicts = (config_dicts*n_plots)[:n_plots] args_strings = (args_strings*n_plots)[:n_plots] harry_args = [] for config_dict, args_string in zip(config_dicts, args_strings): if config_dict is None: harry_args.append(None) else: config_dict["comment"] = " ".join(sys.argv) if "json_defaults" in config_dict: json_defaults_dict = jsonTools.JsonDict(config_dict["json_defaults"]).doIncludes().doComments() config_dict.pop("json_defaults") json_defaults_dict.update(config_dict) config_dict = json_defaults_dict harry_args.append("--json-defaults \"%s\"" % jsonTools.JsonDict(config_dict).toString(indent=None).replace("\"", "'")) if not args_string is None: if harry_args[-1] is None: harry_args[-1] = args_string else: harry_args[-1] += (" "+args_string) if config_dict is None: harry_args[-1] += (" --comment " + (" ".join(sys.argv))) if not n_fast_plots is None: harry_args = harry_args[:n_fast_plots] # multi processing of multiple plots output_filenames = [] failed_plots = [] if len(harry_args) > 1 and n_processes > 1: log.info("Creating {:d} plots in {:d} processes".format(len(harry_args), min(n_processes, len(harry_args)))) results = tools.parallelize(pool_plot, zip([self]*len(harry_args), harry_args), n_processes) tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip(*([result for result in results if not result is None and result != (None,)])) output_filenames = [output_filename for output_filename in tmp_output_filenames if not output_filename is None] failed_plots = [(failed_plot, error_message) for failed_plot, error_message in zip(tmp_failed_plots, tmp_error_messages) if not failed_plot is None] # single processing of multiple plots elif len(harry_args) > 1: log.info("Creating {:d} plots".format(len(harry_args))) for harry_args in harry_args: try: output_filenames.append(self.plot(harry_args)) except SystemExit as e: failed_plots.append((harry_args, None)) except Exception as e: log.info(str(e)) failed_plots.append((harry_args, None)) # single plot elif len(harry_args) > 0: output_filenames.append(self.plot(harry_args[0])) if len(failed_plots) > 0: log.error("%d failed plots:" % len(failed_plots)) for failed_plot in failed_plots: log.info("\n"+tools.get_colored_string("Failed plot:", color='red')) log.info("\t%s" % failed_plot[0]) if failed_plot[1] is not None: log.info(tools.get_indented_text(" ", tools.get_colored_string("Traceback for this plot:", color='red')+"\n" + failed_plot[1])) return output_filenames
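# pool_plot is used above but not defined in these snippets. A minimal sketch of a worker
# that matches the way the results are unpacked there (a (plotter, harry_args) pair in,
# a (output_filename, failed_args, traceback) triple out); the body is an assumption.
import traceback

def pool_plot(plotter_and_args):
	# one element of zip([self]*n, harry_args)
	plotter, harry_args = plotter_and_args
	if harry_args is None:
		return (None,)
	try:
		return (plotter.plot(harry_args), None, None)
	except SystemExit:
		return (None, harry_args, traceback.format_exc())
	except Exception:
		return (None, harry_args, traceback.format_exc())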
def multi_plots(self, list_of_config_dicts, list_of_args_strings, n_processes=1, n_fast_plots=None, batch=None): config_dicts = list_of_config_dicts if isinstance(list_of_config_dicts, collections.Iterable) and not isinstance(list_of_config_dicts, basestring) else [list_of_config_dicts] args_strings = list_of_args_strings if isinstance(list_of_args_strings, collections.Iterable) and not isinstance(list_of_args_strings, basestring) else [list_of_args_strings] # fill up missing list elements by repeating previous ones n_plots = max(len(config_dicts), len(args_strings)) if (len(config_dicts) > 1) and (len(config_dicts) < n_plots): log.warning("Too few config dicts specified!") if (len(args_strings) > 1) and (len(args_strings) < n_plots): log.warning("Too few argument lists specified!") config_dicts = (config_dicts*n_plots)[:n_plots] args_strings = (args_strings*n_plots)[:n_plots] if n_processes>1: for i in range(len(args_strings)): args_strings[i] += (" --hide-progressbar ") self.harry_args = [] for config_dict, args_string in zip(config_dicts, args_strings): if config_dict is None: self.harry_args.append(None) else: config_dict["comment"] = " ".join(sys.argv) if not batch is None: config_dict["dry_run"] = True if "json_defaults" in config_dict: json_defaults_dict = jsonTools.JsonDict(config_dict["json_defaults"]).doIncludes().doComments() config_dict.pop("json_defaults") json_defaults_dict.update(config_dict) config_dict = json_defaults_dict self.harry_args.append("--json-defaults \"%s\"" % jsonTools.JsonDict(config_dict).toString(indent=None).replace("\"", "'")) if not args_string is None: if self.harry_args[-1] is None: self.harry_args[-1] = args_string else: self.harry_args[-1] += (" "+args_string) if config_dict is None: self.harry_args[-1] += (" --comment " + (" ".join(sys.argv))) if not batch is None: self.harry_args[-1] += " --dry-run" if not n_fast_plots is None: self.harry_args = self.harry_args[:n_fast_plots] n_plots = len(self.harry_args) self.harry_cores = [None]*n_plots # multi processing of multiple plots output_filenames = [] failed_plots = [] if (n_plots > 1) and (n_processes > 1): log.info("Creating {:d} plots in {:d} processes".format(n_plots, min(n_processes, n_plots))) results = tools.parallelize(pool_plot, zip([self]*n_plots, range(n_plots)), n_processes, description="Plotting") tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip(*([result for result in results if not result is None and result != (None,)])) output_filenames = [output_filename for output_filename in tmp_output_filenames if not output_filename is None] failed_plots = [(failed_plot, error_message) for failed_plot, error_message in zip(tmp_failed_plots, tmp_error_messages) if not failed_plot is None] # single processing of multiple plots elif n_plots > 1: log.info("Creating {:d} plots".format(n_plots)) for plot_index in xrange(n_plots): try: output_filenames.append(self.plot(plot_index)) except SystemExit as e: failed_plots.append((self.harry_args[plot_index], None)) except Exception as e: log.info(str(e)) failed_plots.append((self.harry_args[plot_index], None)) # single plot elif n_plots > 0: output_filenames.append(self.plot(0)) # batch submission if (not (batch is None)) and (len(failed_plots) < n_plots): try: os.makedirs(os.path.expandvars("$HP_WORK_BASE")) except OSError: if not os.path.isdir(os.path.expandvars("$HP_WORK_BASE")): raise workdir = tempfile.mkdtemp(prefix="harry_work_"+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")+"_", dir=os.path.expandvars("$HP_WORK_BASE")) main_config 
= "" with open(os.path.expandvars("$CMSSW_BASE/src/Artus/HarryPlotter/data/grid-control_base_config.conf"), "r") as main_config_file: main_config = main_config_file.read() backend_config = "" with open(os.path.expandvars("$CMSSW_BASE/src/Artus/Configuration/data/grid-control_backend_" + batch + ".conf"), "r") as backend_config_file: backend_config = backend_config_file.read() final_config = string.Template(main_config).safe_substitute( cmsswbase=os.path.expandvars("$CMSSW_BASE"), hpworkbase=os.path.expandvars("$HP_WORK_BASE"), cwd=os.getcwd(), jsonconfigs="\n\t"+("\n\t".join([item[0] for item in output_filenames])), executable=self.standalone_executable, workdir=workdir, backend=backend_config ) final_config_filename = workdir+".conf" with open(final_config_filename, "w") as final_config_file: final_config_file.write(final_config) command = "go.py " + final_config_filename log.info(command) logger.subprocessCall(shlex.split(command)) if len(failed_plots) > 0: log.error("%d failed plots:" % len(failed_plots)) for failed_plot in failed_plots: log.info("\n"+tools.get_colored_string("Failed plot:", color='red')) log.info("\t%s" % failed_plot[0]) if failed_plot[1] is not None: try: log.info(tools.get_indented_text(" ", tools.get_colored_string("Traceback for this plot:", color='red')+"\n" + failed_plot[1])) except: log.info("\t Traceback for this plot: \n" + failed_plot[1]) return output_filenames
def main(): parser = argparse.ArgumentParser( description= "Collect matching trees from input files into one output tree", parents=[logger.loggingParser]) parser.add_argument( "-i", "--input", help= "Input directory with merged Artus outputs including Svit Cache files") parser.add_argument("-o", "--output", default="svfitCache.root", help="Output ROOT file. [Default: %(default)s]") parser.add_argument( "--input-trees", nargs="+", default=["svfitCache"], help="Paths of input SVfit cache trees. [Default: %(default)s]") parser.add_argument( "--output-tree", default="svfitCache", help="Name of output SVfit cache tree. [Default: %(default)s]") parser.add_argument( "--previous-cache", default="", help= "Path to a previous cache which will be merged. [Default: %(default)s]" ) parser.add_argument( "--dcache", type=bool, default=False, help="Read&Write from and to desy dcache[Default: %(default)s]") parser.add_argument( "--no-run", default=False, action="store_true", help="Do not run but only print dict [Default: %(default)s]") parser.add_argument( "-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) merge_commands = [] copy_commands = [] config_file = [] ls_command = "gfal-ls %s" % (srm(args.output)) retCode = logger.subprocessCall(ls_command.split()) if (retCode != 0): mkdir_command = "gfal-mkdir %s" % (srm(args.output)) log.info("Creating " + srm(args.output)) logger.subprocessCall(mkdir_command.split()) tmpdir = tempfile.mkdtemp(suffix='', prefix='tmp', dir="/tmp") #dir=os.getcwd()) if not args.dcache: if not args.no_run: for input in glob.glob(args.input + "/*/*.root"): output = tmpdir input_trees = args.input_trees output_trees = args.output_tree config = jsonTools.JsonDict(input) pipelines = config.get("Pipelines", {}).keys() # extract names without the leading channel pipelines = [ "_".join(pipeline.split("_")[1:]) for pipeline in pipelines ] pipelines = list(set(pipelines)) pipelines = [x for x in pipelines if x != ''] merge_commands = [] for pipeline in pipelines: out_filename = os.path.join( output, pipeline, "svfitCache_" + os.path.basename(input)) if not os.path.exists(os.path.dirname(out_filename)): os.makedirs(os.path.dirname(out_filename)) pipeline_input_trees = [ pipeline + "/" + input_tree for input_tree in input_trees ] merged_tree_name = treemerge.treemerge( [input], pipeline_input_trees, out_filename, output_trees, match_input_tree_names=True) log.info("SVfit cache trees collected in \"%s\"." 
% merged_tree_name) if args.previous_cache: # check for all available files in previous_cache previous_caches = glob.glob(args.previous_cache + "*/*.root") previous_cachefiles = [ "/".join(cache.split("/")[-2:]) for cache in previous_caches ] for cachefile in previous_cachefiles: current = os.path.join(output, cachefile) previous = os.path.join(args.previous_cache, cachefile) if not os.path.exists(os.path.dirname(current)): os.makedirs(os.path.dirname(current)) if os.path.exists(current): merge_commands.append("mv %s %s_tmp.root " % (current, current)) merge_commands.append( "hadd -f -f6 %s %s_tmp.root %s " % (current, current, previous)) merge_commands.append("rm %s_tmp.root " % (current)) else: merge_commands.append("hadd -f -f6 %s %s" % (current, previous)) tools.parallelize(_call_command, merge_commands, args.n_processes, description="merging") # move to output-directory copy_command = "gfal-copy -r file:///%s %s" % (output, srm(args.output)) logger.subprocessCall(copy_command.split()) # print c&p summary current_caches = glob.glob(args.output + "*/*.root") nicknames = list( set([ os.path.basename(cache).split(".")[0].replace( "svfitCache_", "") for cache in current_caches ])) for nick in sorted(nicknames): config_file.append( '\t\t\t"%s" : "%s",' % (nick, xrd(args.output) + "/svfitCache_" + nick + ".root")) else: input_dirs = glob.glob(args.input + "/*/*/*") untar_commands = [ "tar xf %s -C %s" % (file, tmpdir) for input_dir in input_dirs for file in glob.glob(input_dir + "/*.tar*") ] if not args.no_run: tools.parallelize(_call_command, untar_commands, args.n_processes, description="unpacking") regex = re.compile(".*/(.*)_job_[0-9]+_SvfitCache.._(.*?)[0-9]+.root") matches = [(regex.match(file).groups(), file) for file in glob.glob(tmpdir + "/*.root")] dirs = {} # go through matches and create nested dict {'sample' : {'Pipeline' : [files]}} for match in matches: if match[0][0] not in dirs: dirs[match[0][0]] = {} if match[0][1] not in dirs[match[0][0]]: dirs[match[0][0]][match[0][1]] = [] dirs[match[0][0]][match[0][1]].append(match[1]) for sample in dirs: for pipeline in dirs[sample]: # create folders as needed if not os.path.exists(tmpdir + "/" + pipeline): os.makedirs(tmpdir + "/" + pipeline) previous_cache_file = "" if args.previous_cache: if os.path.isfile(args.previous_cache + "/" + pipeline + "/svfitCache_" + sample + ".root"): previous_cache_file = args.previous_cache + "/" + pipeline + "/svfitCache_" + sample + ".root" tmp_filename = tmpdir + "/" + pipeline + "/svfitCache_" + sample + ".root" out_filename = args.output + "/" + pipeline + "/svfitCache_" + sample + ".root" merge_commands.append( "hadd -f %s %s %s" % (tmp_filename, " ".join( dirs[sample][pipeline]), previous_cache_file)) copy_commands.append("gfal-copy -f file:///%s %s" % (tmp_filename, srm(out_filename))) config_file.append( '"%s" : "%s",' % (sample, xrd(args.output) + "/svfitCache_" + sample + ".root")) if not args.no_run: tools.parallelize(_call_command, merge_commands, args.n_processes, description="merging") tools.parallelize(_call_command, copy_commands, args.n_processes, description="copying") shutil.rmtree(tmpdir) log.info("done. Artus SvfitCacheFile settings: ") for entry in config_file: log.info(entry)
#parser.add_argument("-S", "--Samples", nargs="+", default=["ggh", "qqh"], #help="Samples to be compared [Default: %(default)s]") #parser.add_argument("-o", "--output-dir", #default="./", #help="path to output file. [Default: %(default)s]") args = parser.parse_args() #clean argument input-files inputs = [] for entry in args.input_files: inputs.append(entry.strip(',').strip('"')) if len(inputs) > 1: filenames = inputs else: if os.path.isdir(inputs[0]): filenames = glob.glob(os.path.join(inputs[0], "*", "*.root")) else: filenames = inputs #ntuple_strings = ["mt_jecUncDown_tauEsNom/ntuple","mt_jecUncNom_tauEsDown/ntuple","mt_jecUncNom_tauEsNom/ntuple","mt_jecUncNom_tauEsUp/ntuple","mt_jecUncUp_tauEsNom/ntuple"] #training_logs = [jsonTools.JsonDict("TrainingLog.json")] training_logs = [] for element in args.training_logs: training_logs.append(jsonTools.JsonDict(element)) for channel in args.channels: args_list = [] for element in filenames: args_list.append( [element, training_logs, channel, args.calc_Training_BDT]) aTools.parallelize(file_wrapper, args_list, args.j)
#if log.isEnabledFor(logging.DEBUG): # import pprint # pprint.pprint(plot_configs) # delete existing output files tmp_output_files = list(set([os.path.join(config["output_dir"], config["filename"]+".root") for config in plot_configs[:args.n_plots[0]]])) for output_file in tmp_output_files: if os.path.exists(output_file): os.remove(output_file) log.debug("Removed file \""+output_file+"\" before it is recreated again.") output_files = list(set(output_files)) # create input histograms with HarryPlotter higgsplot.HiggsPlotter(list_of_config_dicts=plot_configs, list_of_args_strings=[args.args], n_processes=args.n_processes, n_plots=args.n_plots[0]) if args.n_plots[0] != 0: tools.parallelize(_call_command, hadd_commands, n_processes=args.n_processes) debug_plot_configs = [] for output_file in (output_files if not args.for_dcsync else merged_output_files): debug_plot_configs.extend(plotconfigs.PlotConfigs().all_histograms(output_file, plot_config_template={"markers":["E"], "colors":["#FF0000"]})) higgsplot.HiggsPlotter(list_of_config_dicts=debug_plot_configs, list_of_args_strings=[args.args], n_processes=args.n_processes, n_plots=args.n_plots[1]) # update CombineHarvester with the yields and shapes datacards.extract_shapes( os.path.join(args.output_dir, input_root_filename_template.replace("$", "")), bkg_histogram_name_template, sig_histogram_name_template, bkg_syst_histogram_name_template, sig_syst_histogram_name_template, update_systematics=True ) # add bin-by-bin uncertainties
def multi_plots(self, list_of_config_dicts, list_of_args_strings, n_processes=1, n_fast_plots=None): config_dicts = list_of_config_dicts if isinstance(list_of_config_dicts, collections.Iterable) and not isinstance(list_of_config_dicts, basestring) else [list_of_config_dicts] args_strings = list_of_args_strings if isinstance(list_of_args_strings, collections.Iterable) and not isinstance(list_of_args_strings, basestring) else [list_of_args_strings] # fill up missing list elements by repeating previous ones n_plots = max(len(config_dicts), len(args_strings)) if (len(config_dicts) > 1) and (len(config_dicts) < n_plots): log.warning("Too few config dicts specified!") if (len(args_strings) > 1) and (len(args_strings) < n_plots): log.warning("Too few argument lists specified!") config_dicts = (config_dicts*n_plots)[:n_plots] args_strings = (args_strings*n_plots)[:n_plots] harry_args = [] for config_dict, args_string in zip(config_dicts, args_strings): if config_dict is None: harry_args.append(None) else: config_dict["comment"] = " ".join(sys.argv) if "json_defaults" in config_dict: json_defaults_dict = jsonTools.JsonDict(config_dict["json_defaults"]).doIncludes().doComments() config_dict.pop("json_defaults") json_defaults_dict.update(config_dict) config_dict = json_defaults_dict harry_args.append("--json-defaults \"%s\"" % jsonTools.JsonDict(config_dict).toString(indent=None).replace("\"", "'")) if not args_string is None: if harry_args[-1] is None: harry_args[-1] = args_string else: harry_args[-1] += (" "+args_string) if config_dict is None: harry_args[-1] += (" --comment " + (" ".join(sys.argv))) if not n_fast_plots is None: harry_args = harry_args[:n_fast_plots] # multi processing of multiple plots output_filenames = [] failed_plots = [] if len(harry_args) > 1 and n_processes > 1: log.info("Creating {:d} plots in {:d} processes".format(len(harry_args), min(n_processes, len(harry_args)))) results = tools.parallelize(pool_plot, zip([self]*len(harry_args), harry_args), n_processes) tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip(*([result for result in results if not result is None and result != (None,)])) output_filenames = [output_filename for output_filename in tmp_output_filenames if not output_filename is None] failed_plots = [(failed_plot, error_message) for failed_plot, error_message in zip(tmp_failed_plots, tmp_error_messages) if not failed_plot is None] # single processing of multiple plots elif len(harry_args) > 1: log.info("Creating {:d} plots".format(len(harry_args))) for harry_args in harry_args: try: output_filenames.append(self.plot(harry_args)) except SystemExit as e: failed_plots.append((harry_args, None)) except Exception as e: log.info(str(e)) failed_plots.append((harry_args, None)) # single plot elif len(harry_args) > 0: output_filenames.append(self.plot(harry_args[0])) if len(failed_plots) > 0: log.error("%d failed plots:" % len(failed_plots)) for failed_plot in failed_plots: log.info("\n"+tools.get_colored_string("Failed plot:", color='red')) log.info("\t%s" % failed_plot[0]) if failed_plot[1] is not None: try: log.info(tools.get_indented_text(" ", tools.get_colored_string("Traceback for this plot:", color='red')+"\n" + failed_plot[1])) except: log.info("\t Traceback for this plot: \n" + failed_plot[1]) return output_filenames
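# multi_plots pads the shorter of the two input lists by cyclic repetition so
# that every config dict is paired with an args string; the idiom
# (lst * n_plots)[:n_plots] used above behaves like this:
config_dicts = [{"x_expressions": "pt_1"}]
args_strings = ["--formats pdf", "--formats png", "--formats eps"]
n_plots = max(len(config_dicts), len(args_strings))
config_dicts = (config_dicts * n_plots)[:n_plots]
args_strings = (args_strings * n_plots)[:n_plots]
assert config_dicts == [{"x_expressions": "pt_1"}] * 3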
def combine(self, datacards_cbs, datacards_workspaces, datacards_poi_ranges=None, n_processes=1, *args, **kwargs): if datacards_poi_ranges is None: datacards_poi_ranges = {} tmp_args = " ".join(args) higgs_mass = kwargs.get("higgs_mass", "0") chunks = [[None, None]] if "{CHUNK}" in tmp_args and "--points" in tmp_args: splited_args = tmp_args.split() n_points = int(splited_args[splited_args.index("--points") + 1]) n_points_per_chunk = 199 chunks = [[chunk*n_points_per_chunk, (chunk+1)*n_points_per_chunk-1] for chunk in xrange(n_points/n_points_per_chunk+1)] method = re.search("(-M|--method)[\s=\"\']*(?P<method>\w*)[\"\']?\s", tmp_args) if not method is None: method = method.groupdict()["method"] name = re.search("(-n|--name)[\s=\"\']*(?P<name>\w*)[\"\']?\s", tmp_args) if not name is None: name = name.groupdict()["name"] split_stat_syst_uncs = kwargs.get("split_stat_syst_uncs", False) if split_stat_syst_uncs and (method is None): log.error("Uncertainties are not split into stat. and syst. components, since the method for combine is unknown!") split_stat_syst_uncs = False if split_stat_syst_uncs and (not "MultiDimFit" in method): log.error("Uncertainties are not split into stat. and syst. components. This is only supported for the MultiDimFit method!") split_stat_syst_uncs = False split_stat_syst_uncs_options = [""] split_stat_syst_uncs_names = [""] if split_stat_syst_uncs: split_stat_syst_uncs_options = [ "--saveWorkspace", "--snapshotName {method} -w w".format(method=method), "--snapshotName {method} -w w --freezeNuisanceGroups syst_plus_bbb".format(method=method, uncs="{uncs}"), # debug alternative (18.1.2017): --freezeNuisances ] split_stat_syst_uncs_names = [ "Workspace", "TotUnc", "StatUnc", ] for split_stat_syst_uncs_index, (split_stat_syst_uncs_option, split_stat_syst_uncs_name) in enumerate(zip(split_stat_syst_uncs_options, split_stat_syst_uncs_names)): prepared_tmp_args = None new_name = None if split_stat_syst_uncs: new_name = ("" if name is None else name) + split_stat_syst_uncs_name if name is None: prepared_tmp_args = tmp_args + " -n " + new_name else: prepared_tmp_args = copy.deepcopy(tmp_args) prepared_tmp_args = re.sub("(--algo)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2 "+("none" if split_stat_syst_uncs_index == 0 else "\\3")+"\\4", prepared_tmp_args) prepared_tmp_args = re.sub("(-n|--name)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2"+new_name+"\\4", prepared_tmp_args) else: prepared_tmp_args = tmp_args prepared_tmp_args = re.sub("-n -n", "-n", prepared_tmp_args) commands = [] for chunk_index, (chunk_min, chunk_max) in enumerate(chunks): commands.extend([[ "combine -m {MASS} {POI_RANGE} {ARGS} {CHUNK_POINTS} {SPLIT_STAT_SYST_UNCS} {WORKSPACE}".format( MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses? 
POI_RANGE="--rMin {RMIN} --rMax {RMAX}" if datacard in datacards_poi_ranges else "", ARGS=prepared_tmp_args.format(CHUNK=str(chunk_index), RMIN="{RMIN}", RMAX="{RMAX}"), CHUNK_POINTS = "" if (chunk_min is None) or (chunk_max is None) else "--firstPoint {CHUNK_MIN} --lastPoint {CHUNK_MAX}".format( CHUNK_MIN=chunk_min, CHUNK_MAX=chunk_max ), SPLIT_STAT_SYST_UNCS=split_stat_syst_uncs_option.format(uncs=",".join(kwargs.get("additional_freeze_nuisances", [])+datacards_cbs[datacard].syst_name_set())), WORKSPACE="-d "+workspace ).format(RMIN=datacards_poi_ranges.get(datacard, ["", ""])[0], RMAX=datacards_poi_ranges.get(datacard, ["", ""])[1]), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) tools.parallelize(_call_command, commands, n_processes=n_processes, description="combine") if split_stat_syst_uncs and (split_stat_syst_uncs_index == 0): # replace workspaces by saved versions from the first fit including the postfit nuisance parameter values for datacard, workspace in datacards_workspaces.iteritems(): datacards_workspaces[datacard] = glob.glob(os.path.join(os.path.dirname(workspace), "higgsCombine"+new_name+"."+method+".*.root"))[0]
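# For likelihood scans, combine() splits the requested --points into chunks of
# at most 199 points and issues one combine call per chunk via --firstPoint /
# --lastPoint. The chunk boundaries computed above look like this, e.g. for a
# scan with 500 points:
n_points = 500
n_points_per_chunk = 199
chunks = [
    [chunk * n_points_per_chunk, (chunk + 1) * n_points_per_chunk - 1]
    for chunk in range(n_points // n_points_per_chunk + 1)
]
assert chunks == [[0, 198], [199, 397], [398, 596]]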
config["parameters_list"] = [] #print "Open RootFile\t", os.path.join(folder, file_list[0]) infile = ROOT.TFile(os.path.join(folder, file_list[0]), "READ") intree = infile.Get("TestTree") for branch in intree.GetListOfBranches(): branch_name = branch.GetName() if not "class" in branch_name and not "weight" in branch_name: config["parameters_list"].append(branch.GetName()) infile.Close() config["request_nick"] = container[2]+"_signal" config["nicks"] = [container[2]+"_signal"] config["weights"]= ["(classID==1)"] channel, category_string, requested_sample = config["channel"], config["category"], config["request_nick"] config["storage_name_extension"] = os.path.join(storage_name_extension, channel, category_string, requested_sample) plot_configs.append(copy.deepcopy(config)) if not os.path.exists(config["storage_name_extension"]): os.makedirs(config["storage_name_extension"]) config["request_nick"] = container[2]+"_bkg" config["nicks"] = [container[2]+"_bkg"] config["weights"]= ["(classID==0)"] channel, category_string, requested_sample = config["channel"], config["category"], config["request_nick"] config["storage_name_extension"] = os.path.join(storage_name_extension, channel, category_string, requested_sample) plot_configs.append(copy.deepcopy(config)) if not os.path.exists(config["storage_name_extension"]): os.makedirs(config["storage_name_extension"]) aTools.parallelize(calculate_partial_correlation, plot_configs, n_processes=args.n_processes)
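# The same config dict is reused and mutated for the signal and background
# variants, so copy.deepcopy() is needed before appending it to plot_configs;
# otherwise every appended entry would reflect only the last state of the dict:
import copy

config = {"request_nick": "em_signal"}
appended_by_reference, appended_by_copy = [], []
appended_by_reference.append(config)
appended_by_copy.append(copy.deepcopy(config))
config["request_nick"] = "em_bkg"
assert appended_by_reference[0]["request_nick"] == "em_bkg"  # mutated together with config
assert appended_by_copy[0]["request_nick"] == "em_signal"    # snapshot preserved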
def multi_plots(self, list_of_config_dicts, list_of_args_strings, n_processes=1, n_fast_plots=None, batch=None): config_dicts = list_of_config_dicts if isinstance( list_of_config_dicts, collections.Iterable) and not isinstance( list_of_config_dicts, basestring) else [list_of_config_dicts] args_strings = list_of_args_strings if isinstance( list_of_args_strings, collections.Iterable) and not isinstance( list_of_args_strings, basestring) else [list_of_args_strings] # fill up missing list elements by repeating previous ones n_plots = max(len(config_dicts), len(args_strings)) if (len(config_dicts) > 1) and (len(config_dicts) < n_plots): log.warning("Too few config dicts specified!") if (len(args_strings) > 1) and (len(args_strings) < n_plots): log.warning("Too few argument lists specified!") config_dicts = (config_dicts * n_plots)[:n_plots] args_strings = (args_strings * n_plots)[:n_plots] if n_processes > 1: for i in range(len(args_strings)): args_strings[i] += (" --hide-progressbar ") self.harry_args = [] for config_dict, args_string in zip(config_dicts, args_strings): if config_dict is None: self.harry_args.append(None) else: config_dict["comment"] = " ".join(sys.argv) if not batch is None: config_dict["dry_run"] = True if "json_defaults" in config_dict: json_defaults_dict = jsonTools.JsonDict( config_dict["json_defaults"]).doIncludes().doComments( ) config_dict.pop("json_defaults") json_defaults_dict.update(config_dict) config_dict = json_defaults_dict self.harry_args.append( "--json-defaults \"%s\"" % jsonTools.JsonDict(config_dict).toString( indent=None).replace("\"", "'")) if not args_string is None: if self.harry_args[-1] is None: self.harry_args[-1] = args_string else: self.harry_args[-1] += (" " + args_string) if config_dict is None: self.harry_args[-1] += (" --comment " + (" ".join(sys.argv))) if not batch is None: self.harry_args[-1] += " --dry-run" if not n_fast_plots is None: self.harry_args = self.harry_args[:n_fast_plots] n_plots = len(self.harry_args) self.harry_cores = [None] * n_plots # multi processing of multiple plots output_filenames = [] failed_plots = [] if (n_plots > 1) and (n_processes > 1): log.info("Creating {:d} plots in {:d} processes".format( n_plots, min(n_processes, n_plots))) results = tools.parallelize(pool_plot, zip([self] * n_plots, range(n_plots)), n_processes, description="Plotting") tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip( *([ result for result in results if not result is None and result != (None, ) ])) output_filenames = [ output_filename for output_filename in tmp_output_filenames if not output_filename is None ] failed_plots = [(failed_plot, error_message) for failed_plot, error_message in zip( tmp_failed_plots, tmp_error_messages) if not failed_plot is None] # single processing of multiple plots elif n_plots > 1: log.info("Creating {:d} plots".format(n_plots)) for plot_index in xrange(n_plots): try: output_filenames.append(self.plot(plot_index)) except SystemExit as e: failed_plots.append((self.harry_args[plot_index], None)) except Exception as e: log.info(str(e)) failed_plots.append((self.harry_args[plot_index], None)) # single plot elif n_plots > 0: output_filenames.append(self.plot(0)) # batch submission if (not (batch is None)) and (len(failed_plots) < n_plots): workdir = tempfile.mkdtemp( prefix="harry_work_" + datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") + "_") main_config = "" with open( os.path.expandvars( "$CMSSW_BASE/src/Artus/HarryPlotter/data/grid-control_base_config.conf" ), "r") as main_config_file: 
main_config = main_config_file.read() backend_config = "" with open( os.path.expandvars( "$CMSSW_BASE/src/Artus/Configuration/data/grid-control_backend_" + batch + ".conf"), "r") as backend_config_file: backend_config = backend_config_file.read() final_config = string.Template(main_config).safe_substitute( cmsswbase=os.path.expandvars("$CMSSW_BASE"), hpworkbase=os.path.expandvars("$HP_WORK_BASE"), cwd=os.getcwd(), jsonconfigs="\n\t" + ("\n\t".join([item[0] for item in output_filenames])), executable=self.standalone_executable, workdir=workdir, backend=backend_config) final_config_filename = workdir + ".conf" with open(final_config_filename, "w") as final_config_file: final_config_file.write(final_config) command = "go.py " + final_config_filename log.info(command) logger.subprocessCall(shlex.split(command)) if len(failed_plots) > 0: log.error("%d failed plots:" % len(failed_plots)) for failed_plot in failed_plots: log.info("\n" + tools.get_colored_string("Failed plot:", color='red')) log.info("\t%s" % failed_plot[0]) if failed_plot[1] is not None: try: log.info( tools.get_indented_text( " ", tools.get_colored_string( "Traceback for this plot:", color='red') + "\n" + failed_plot[1])) except: log.info("\t Traceback for this plot: \n" + failed_plot[1]) return output_filenames
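# The batch branch fills the grid-control template via
# string.Template.safe_substitute(), which replaces known $placeholders and
# leaves unknown ones untouched instead of raising KeyError. Reduced sketch
# with a made-up template string (the real template is read from
# $CMSSW_BASE/src/Artus/HarryPlotter/data/grid-control_base_config.conf):
import string

main_config = "[global]\nworkdir = $workdir\nexecutable = $executable\n$backend"
final_config = string.Template(main_config).safe_substitute(
    workdir="/tmp/harry_work_2017-01-18_12-00_abc123",
    executable="higgsplot.py",
)
assert "$backend" in final_config          # not substituted, kept verbatim
assert "higgsplot.py" in final_config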
#help="Second Categories, can be specified multiple times. Several categories specified at once will be concatenated with or[Default: %(default)s]") #parser.add_argument("-S", "--Samples", nargs="+", default=["ggh", "qqh"], #help="Samples to be compared [Default: %(default)s]") #parser.add_argument("-o", "--output-dir", #default="./", #help="path to output file. [Default: %(default)s]") args = parser.parse_args() #clean argument input-files inputs = [] for entry in args.input_files: inputs.append(entry.strip(',').strip('"')) if len(inputs)>1: filenames = inputs else: if os.path.isdir(inputs[0]): filenames = glob.glob(os.path.join(inputs[0], "*", "*.root")) else: filenames = inputs #ntuple_strings = ["mt_jecUncDown_tauEsNom/ntuple","mt_jecUncNom_tauEsDown/ntuple","mt_jecUncNom_tauEsNom/ntuple","mt_jecUncNom_tauEsUp/ntuple","mt_jecUncUp_tauEsNom/ntuple"] #training_logs = [jsonTools.JsonDict("TrainingLog.json")] training_logs = [] for element in args.training_logs: training_logs.append(jsonTools.JsonDict(element)) for channel in args.channels: args_list = [] for element in filenames: args_list.append([element, training_logs, channel, args.calc_Training_BDT]) aTools.parallelize(file_wrapper, args_list, args.j)
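# aTools.parallelize(file_wrapper, args_list, args.j) dispatches one entry of
# args_list per call. file_wrapper is defined elsewhere in this script; the
# sketch below is hypothetical and only documents the calling convention
# implied by how args_list is assembled above (the body is a placeholder):
def file_wrapper(arguments):
    # same order as in args_list: [filename, training_logs, channel, calc_Training_BDT]
    filename, training_logs, channel, calc_training_bdt = arguments
    # ... open the ROOT file, evaluate the trainings listed in training_logs
    # for this channel, optionally re-evaluating the training BDT ...
    return filename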
"combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} --robustFit 1 --minimizerTolerance 0.1 --minimizerStrategy 0 --minimizerAlgoForMinos Minuit2,migrad --output impacts.json --parallel {NPROCS} --allPars {ARGS}".format( MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, ARGS=tmp_args.format(), WORKSPACE=workspace, NPROCS=n_processes ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) commandsPlot = [] commandsPlot.extend([[ "plotImpacts.py -i {INPUT} -o {OUTPUT}".format( INPUT="impacts.json", OUTPUT="plots/nuisance_impacts" ), os.path.dirname(workspace) ] for datacard, workspace in datacards_workspaces.iteritems()]) tools.parallelize(_call_command, commandsInitialFit, n_processes=n_processes, description="combineTool.py (initial fits)") tools.parallelize(_call_command, commandsFits, n_processes=1, description="combineTool.py (fits)") tools.parallelize(_call_command, commandsOutput, n_processes=1, description="combineTool.py (outputs)") tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)") def auto_rebin(self, bin_threshold = 1.0, rebin_mode = 0): rebin = ch.AutoRebin() rebin.SetBinThreshold(bin_threshold) rebin.SetRebinMode(rebin_mode) rebin.SetPerformRebin(True) rebin.SetVerbosity(0) rebin.Rebin(self.cb, self.cb)
# # Delete existing output files # output_files = list(set([os.path.join(config["output_dir"], config["filename"]+".root") for config in plot_configs[:args.n_plots[0]]])) # for output_file in output_files: # if os.path.exists(output_file): # os.remove(output_file) # log.debug("Removed file \""+output_file+"\" before it is recreated again.") # Create input histograms with HarryPlotter if "inputs" in args.steps: higgsplot.HiggsPlotter(list_of_config_dicts=plot_configs, n_processes=args.n_processes, n_plots=args.n_plots[0]) if args.n_plots[0] != 0: tools.parallelize(_call_command, hadd_commands, n_processes=args.n_processes) # Update CombineHarvester with the yields and shapes datacards.extract_shapes(os.path.join( args.output_dir, input_root_filename_template.replace("$", "")), bkg_histogram_name_template, sig_histogram_name_template, bkg_syst_histogram_name_template, sig_syst_histogram_name_template, update_systematics=True) # Add bin-by-bin uncertainties if not args.no_bbb_uncs: datacards.add_bin_by_bin_uncertainties( processes=datacards.cb.cp().backgrounds().process_set() +
def main(): parser = argparse.ArgumentParser(description="Collect matching trees from input files into one output tree", parents=[logger.loggingParser]) parser.add_argument("-i", "--input-dirs", help="Input directories = crab project directories containing the subdirectories with crab tasks", nargs="+") parser.add_argument("-o", "--output-dir", default=None, help="Local output directory. [Default: subdir \"results\" in first input directory]") parser.add_argument("-d", "--dcache-target", default=None, help="Directory on dCache (srm) where the files should be copied to. [Default: %(default)s]") parser.add_argument("--input-trees", nargs="+", default=["svfitCache"], help="Paths of input SVfit cache trees. [Default: %(default)s]") parser.add_argument("--output-tree", default="svfitCache", help="Name of output SVfit cache tree. [Default: %(default)s]") parser.add_argument("--previous-cache", default="", help="Path to a previous cache which will be merged. [Default: %(default)s]") parser.add_argument("--dcache", type=bool, default=False, help="Read&Write from and to desy dcache[Default: %(default)s]") parser.add_argument("--no-run", default=False, action="store_true", help="Do not run but only print dict [Default: %(default)s]") parser.add_argument("-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) if args.output_dir is None: args.output_dir = os.path.join(args.input_dirs[0], "results") # get paths to crab outputs #max_n_jobs = 8000 #max_n_retrieve = 500 get_crab_outputs_args = [] for input_dir in args.input_dirs: #for jobid_start in xrange(1, max_n_jobs, max_n_retrieve): # jobid_end = jobid_start + max_n_retrieve - 1 # get_crab_outputs_args.append([input_dir, "{jobid_start}-{jobid_end}".format(jobid_start=jobid_start, jobid_end=jobid_end)]) get_crab_outputs_args.append([input_dir, "1-10"]) tar_files = tools.parallelize(_get_crab_outputs, get_crab_outputs_args, max(args.n_processes, 2), description="crab getoutput --dump") tar_files = tools.flattenList(tar_files) # download and un-tar download_untar_args = [[tar_file, args.output_dir] for tar_file in tar_files] tools.parallelize(_download_untar, download_untar_args, args.n_processes, description="download and un-tar crab outputs") root_files = glob.glob(os.path.join(args.output_dir, "*.root")) # TODO: maybe add more root files from -i arguments, that did not need to be un-tared root_files_per_sample_nick = {} for root_file in root_files: basename = os.path.basename(root_file) sample_nick = basename[:basename.index("_job_")] root_files_per_sample_nick.setdefault(sample_nick, []).append(root_file) merged_output_dir = os.path.join(args.output_dir, "merged") if not os.path.exists(merged_output_dir): os.makedirs(merged_output_dir) merge_outputs_args = [[os.path.join(merged_output_dir, sample_nick+".root"), tmp_root_files, "-f"] for sample_nick, tmp_root_files in root_files_per_sample_nick.iteritems()] tools.parallelize(_merge_outputs, merge_outputs_args, args.n_processes, description="merging") if args.dcache_target: dcache_copy_commands = ["gfal-copy -v -f -r "+merged_output_dir+" "+args.dcache_target] tools.parallelize(_call_command, dcache_copy_commands, args.n_processes, description="copying to dCache") rm_commands = ["rm "+root_file for root_file in root_files] if args.dcache_target: rm_commands.extend(["rm "+os.path.join(merged_output_dir, sample_nick+".root") for sample_nick in root_files_per_sample_nick.keys()]) 
tools.parallelize(_call_command, rm_commands, args.n_processes, description="deleting temporary files") log.info("\nJSON configuration for Artus:\n") config_output_dir = args.dcache_target if args.dcache_target else merged_output_dir for src, dst in filename_replacements.iteritems(): config_output_dir = config_output_dir.replace(src, dst) for sample_nick in sorted(root_files_per_sample_nick.keys()): log.info("\""+sample_nick+"\" : \""+os.path.join(config_output_dir, sample_nick+".root")+"\",")
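# The per-sample grouping above relies on the crab naming scheme
# <sample_nick>_job_<N>_... for the downloaded ROOT files; the grouping logic
# in isolation (file names are invented):
import os

root_files = [
    "results/GluGluHToTauTauM125_job_1_SvfitCache.root",
    "results/GluGluHToTauTauM125_job_2_SvfitCache.root",
    "results/DYJetsToLLM50_job_1_SvfitCache.root",
]
root_files_per_sample_nick = {}
for root_file in root_files:
    basename = os.path.basename(root_file)
    sample_nick = basename[:basename.index("_job_")]
    root_files_per_sample_nick.setdefault(sample_nick, []).append(root_file)
assert sorted(root_files_per_sample_nick.keys()) == ["DYJetsToLLM50", "GluGluHToTauTauM125"]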
def main(): parser = argparse.ArgumentParser(description="Collect matching trees from input files into one output tree", parents=[logger.loggingParser]) parser.add_argument("-i", "--input", help="Input directory with merged Artus outputs including Svit Cache files") parser.add_argument("-o", "--output", default="svfitCache.root", help="Output ROOT file. [Default: %(default)s]") parser.add_argument("--input-trees", nargs="+", default=["svfitCache"], help="Paths of input SVfit cache trees. [Default: %(default)s]") parser.add_argument("--output-tree", default="svfitCache", help="Name of output SVfit cache tree. [Default: %(default)s]") parser.add_argument("--previous-cache", default="", help="Path to a previous cache which will be merged. [Default: %(default)s]") parser.add_argument("--dcache", type=bool, default=False, help="Read&Write from and to desy dcache[Default: %(default)s]") parser.add_argument("--no-run", default=False, action="store_true", help="Do not run but only print dict [Default: %(default)s]") parser.add_argument("-n", "--n-processes", type=int, default=1, help="Number of (parallel) processes. [Default: %(default)s]") args = parser.parse_args() logger.initLogger(args) merge_commands = [] copy_commands = [] config_file = [] ls_command = "gfal-ls %s" %(srm(args.output)) retCode = logger.subprocessCall(ls_command.split()) if(retCode != 0): mkdir_command = "gfal-mkdir %s" %(srm(args.output)) log.info("Creating " + srm(args.output)) logger.subprocessCall(mkdir_command.split()) tmpdir = tempfile.mkdtemp(suffix='', prefix='tmp', dir="/tmp") #dir=os.getcwd()) if not args.dcache: if not args.no_run: for input in glob.glob(args.input + "/*/*.root"): output = tmpdir input_trees = args.input_trees output_trees = args.output_tree config = jsonTools.JsonDict(input) pipelines = config.get("Pipelines", {}).keys() # extract names without the leading channel pipelines = ["_".join(pipeline.split("_")[1:]) for pipeline in pipelines] pipelines = list(set(pipelines)) pipelines = [x for x in pipelines if x != ''] merge_commands = [] for pipeline in pipelines: out_filename = os.path.join(output, pipeline, "svfitCache_" + os.path.basename(input)) if not os.path.exists(os.path.dirname(out_filename)): os.makedirs(os.path.dirname(out_filename)) pipeline_input_trees = [pipeline+"/"+input_tree for input_tree in input_trees] merged_tree_name = treemerge.treemerge( [input], pipeline_input_trees, out_filename, output_trees, match_input_tree_names=True ) log.info("SVfit cache trees collected in \"%s\"." 
% merged_tree_name) if args.previous_cache: # check for all available files in previous_cache previous_caches = glob.glob(args.previous_cache + "*/*.root") previous_cachefiles = [ "/".join(cache.split("/")[-2:]) for cache in previous_caches ] for cachefile in previous_cachefiles: current = os.path.join(output, cachefile) previous = os.path.join(args.previous_cache, cachefile) if not os.path.exists(os.path.dirname(current)): os.makedirs(os.path.dirname(current)) if os.path.exists(current): merge_commands.append("mv %s %s_tmp.root "%(current, current)) merge_commands.append("hadd -f -f6 %s %s_tmp.root %s "%(current, current, previous)) merge_commands.append("rm %s_tmp.root "%(current)) else: merge_commands.append("hadd -f -f6 %s %s"%(current, previous)) tools.parallelize(_call_command, merge_commands, args.n_processes, description="merging") # move to output-directory copy_command = "gfal-copy -r file:///%s %s" % (output, srm(args.output) ) logger.subprocessCall(copy_command.split()) # print c&p summary current_caches = glob.glob(args.output + "*/*.root") nicknames = list(set([ os.path.basename(cache).split(".")[0].replace("svfitCache_", "") for cache in current_caches ])) for nick in sorted(nicknames): config_file.append('\t\t\t"%s" : "%s",' % (nick, xrd(args.output) + "/svfitCache_" + nick + ".root")) else: input_dirs = glob.glob(args.input + "/*/*/*") untar_commands = ["tar xf %s -C %s"%(file,tmpdir) for input_dir in input_dirs for file in glob.glob(input_dir + "/*.tar*")] if not args.no_run: tools.parallelize(_call_command, untar_commands, args.n_processes, description="unpacking") regex=re.compile(".*/(.*)_job_[0-9]+_SvfitCache.._(.*?)[0-9]+.root") matches = [(regex.match(file).groups(),file) for file in glob.glob(tmpdir+"/*.root")] dirs = {} # go through matches and create nested dict {'sample' : {'Pipeline' : [files]}} for match in matches: if match[0][0] not in dirs: dirs[match[0][0]] = {} if match[0][1] not in dirs[match[0][0]]: dirs[match[0][0]][match[0][1]] = [] dirs[match[0][0]][match[0][1]].append(match[1]) for sample in dirs: for pipeline in dirs[sample]: # create folders as needed if not os.path.exists(tmpdir + "/" + pipeline): os.makedirs(tmpdir + "/" + pipeline) previous_cache_file = "" if args.previous_cache: if os.path.isfile(args.previous_cache + "/" + pipeline + "/svfitCache_" + sample + ".root"): previous_cache_file = args.previous_cache + "/" + pipeline + "/svfitCache_" + sample + ".root" tmp_filename = tmpdir + "/" + pipeline + "/svfitCache_" + sample + ".root" out_filename = args.output + "/" + pipeline + "/svfitCache_" + sample + ".root" merge_commands.append("hadd -f %s %s %s"%(tmp_filename, " ".join(dirs[sample][pipeline]), previous_cache_file)) copy_commands.append("gfal-copy -f file:///%s %s" % (tmp_filename, srm(out_filename) )) config_file.append('"%s" : "%s",' % (sample, xrd(args.output) + "/svfitCache_" + sample + ".root")) if not args.no_run: tools.parallelize(_call_command, merge_commands, args.n_processes, description="merging") tools.parallelize(_call_command, copy_commands, args.n_processes, description="copying") shutil.rmtree(tmpdir) log.info("done. Artus SvfitCacheFile settings: ") for entry in config_file: log.info(entry)
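# In the dcache branch the unpacked job outputs are grouped into a nested
# {sample: {pipeline: [files]}} dict using the file-name regex above. The same
# grouping applied to two invented file names:
import re

regex = re.compile(".*/(.*)_job_[0-9]+_SvfitCache.._(.*?)[0-9]+.root")
files = [
    "/tmp/tmpXY/GluGluHToTauTauM125_job_1_SvfitCacheMT_nominal1.root",
    "/tmp/tmpXY/GluGluHToTauTauM125_job_2_SvfitCacheMT_nominal2.root",
]
dirs = {}
for filename in files:
    sample, pipeline = regex.match(filename).groups()
    dirs.setdefault(sample, {}).setdefault(pipeline, []).append(filename)
assert dirs["GluGluHToTauTauM125"]["nominal"] == files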
files_dict[file_name][channel]["ntuples"].append(n) for front, back in itertools.product(args.replacements+["-.;+"], repeat = 2): n2 = n.replace(args.rs, front, 1).replace(args.rs, back,1).replace("-.;+", args.rs) if w not in files_dict[file_name][channel]["weights"] or n2 not in files_dict[file_name][channel]["ntuples"]: files_dict[file_name][channel]["weights"].append(w) files_dict[file_name][channel]["ntuples"].append(n2) #if n not in files_dict[file_name][channel]["ntuples"]: jsonTools.JsonDict(files_dict).save(os.path.join(output_dir, "ReduceFiles.json"), indent=4) give_away_list = [] for filename, item in files_dict.iteritems(): item["in_dir"] = input_dir item["out_dir"] = output_dir give_away_list.append({filename:item}) aTools.parallelize(reduce_file, give_away_list, n_processes=args.n_processes) #for aufruf in give_away_list: #reduce_file(aufruf)
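# The itertools.product loop above builds all ordered pairs of replacement
# strings for the first two occurrences of the separator args.rs inside an
# ntuple path; the sentinel "-.;+" temporarily marks "keep this occurrence"
# and is turned back into args.rs at the end. Illustration with stand-in
# values (args.rs = "Nom" and args.replacements = ["Up", "Down"] are assumptions):
import itertools

rs = "Nom"
replacements = ["Up", "Down"]
n = "mt_jecUncNom_tauEsNom/ntuple"
variants = set()
for front, back in itertools.product(replacements + ["-.;+"], repeat=2):
    variants.add(n.replace(rs, front, 1).replace(rs, back, 1).replace("-.;+", rs))
assert "mt_jecUncUp_tauEsDown/ntuple" in variants
assert "mt_jecUncNom_tauEsUp/ntuple" in variants  # first occurrence kept via the sentinel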