Example No. 1
	def combine(self, datacards_cbs, datacards_workspaces, datacards_poi_ranges=None, n_processes=1, *args):
		if datacards_poi_ranges is None:
			datacards_poi_ranges = {}
		tmp_args = " ".join(args)

		chunks = [[None, None]]
		if "{CHUNK}" in tmp_args and "--points" in tmp_args:
			splited_args = tmp_args.split()
			n_points = int(splited_args[splited_args.index("--points") + 1])
			n_points_per_chunk = 199
			chunks = [[chunk*n_points_per_chunk, (chunk+1)*n_points_per_chunk-1] for chunk in xrange(n_points/n_points_per_chunk+1)]

		commands = []
		for index, (chunk_min, chunk_max) in enumerate(chunks):
			commands.extend([[
					"combine -m {MASS} {POI_RANGE} {ARGS} {WORKSPACE} {CHUNK_POINTS}".format(
							MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
							POI_RANGE="--rMin {RMIN} --rMax {RMAX}" if datacard in datacards_poi_ranges else "",
							ARGS=tmp_args.format(CHUNK=str(index), RMIN="{RMIN}", RMAX="{RMAX}"),
							WORKSPACE=workspace,
							CHUNK_POINTS = "" if (chunk_min is None) or (chunk_max is None) else "--firstPoint {CHUNK_MIN} --lastPoint {CHUNK_MAX}".format(
									CHUNK_MIN=chunk_min,
									CHUNK_MAX=chunk_max
							)
					).format(RMIN=datacards_poi_ranges.get(datacard, ["", ""])[0], RMAX=datacards_poi_ranges.get(datacard, ["", ""])[1]),
					os.path.dirname(workspace)
			] for datacard, workspace in datacards_workspaces.iteritems()])

		tools.parallelize(_call_command, commands, n_processes=n_processes)
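This and most of the following examples hand their work to tools.parallelize together with a helper called _call_command, whose definition is not part of the snippets; the commands lists hold either plain shell command strings or [command, working_directory] pairs. A minimal sketch of what such a helper could look like (an assumption, not the original implementation):

import logging
import subprocess

log = logging.getLogger(__name__)

def _call_command(command):
    # Hypothetical helper, not the original implementation: `command` is either
    # a shell command string or a [command, working_directory] pair, matching
    # how the `commands` lists in these examples are filled.
    cwd = None
    if isinstance(command, list):
        command, cwd = command
    log.debug(command)
    exit_code = subprocess.call(command, shell=True, cwd=cwd)
    if exit_code != 0:
        log.error("Command failed with exit code %d: %s", exit_code, command)
    return exit_code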
Example No. 2
def main():

    parser = argparse.ArgumentParser(
        description="Run multiple commands in parallel.",
        parents=[logger.loggingParser])

    parser.add_argument(
        "commands",
        help=
        "Commands to be executed on a batch system. They can also be piped into this program.",
        nargs="*",
        default=[])
    parser.add_argument(
        "-n",
        "--n-processes",
        type=int,
        default=1,
        help="Number of (parallel) processes. [Default: %(default)s]")

    args = parser.parse_args()
    logger.initLogger(args)

    # prepare commands
    if (len(args.commands) == 0) and (not sys.stdin.isatty()):
        args.commands.extend(sys.stdin.read().strip().split("\n"))

    tools.parallelize(run_command,
                      args.commands,
                      n_processes=args.n_processes,
                      description=os.path.basename(sys.argv[0]))
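All of these snippets rely on tools.parallelize(function, arguments_list, n_processes=..., description=...) to fan the work out over worker processes. Its implementation is not shown anywhere in this collection; a minimal stand-in under that assumption might be:

import multiprocessing

def parallelize(function, arguments_list, n_processes=1, description=None):
    # Hypothetical stand-in for tools.parallelize: apply `function` to every
    # entry of `arguments_list`, optionally with a pool of worker processes.
    # `description` presumably labels a progress display in the real helper;
    # it is ignored in this sketch.
    if n_processes <= 1:
        return [function(arguments) for arguments in arguments_list]
    pool = multiprocessing.Pool(processes=n_processes)
    try:
        return pool.map(function, arguments_list)
    finally:
        pool.close()
        pool.join()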
def submission(base_paths, n_processes=1):

    # retrieve and prepare input files
    filenames_per_sample_per_pipeline = {}
    for base_path in base_paths:
        stdout_directories, stderr_directories = tools.subprocessCall(
            shlex.split("gfal-ls " + base_path))
        tmp_filenames_per_sample_per_pipeline = tools.parallelize(
            get_filenames,
            [[base_path, sample]
             for sample in stdout_directories.decode().strip().split("\n")],
            n_processes=n_processes,
            description="Retrieving inputs")
        for item in tmp_filenames_per_sample_per_pipeline:
            for sample, filenames_per_pipeline in item.iteritems():
                for pipeline, tmp_filenames in filenames_per_pipeline.iteritems(
                ):
                    filenames_per_sample_per_pipeline.setdefault(
                        sample, {}).setdefault("pipeline",
                                               []).extend(tmp_filenames)
    configs, jobfiles = build_configs(filenames_per_sample_per_pipeline)

    # submit tasks
    submit_args = []
    for config, jobfile in zip(configs, jobfiles):
        submit_args.append([config, jobfile])
    tools.parallelize(submit,
                      submit_args,
                      n_processes=1,
                      description="Submitting crab tasks")
	def annotate_trees(self, datacards_workspaces, root_filename, value_regex_list, value_replacements=None, n_processes=1, values_tree_files=None, *args):
		if value_replacements is None:
			value_replacements = {}

		if values_tree_files is None:
			values_tree_files = {}

		commands = []
		for datacard, workspace in datacards_workspaces.iteritems():
			float_values = []
			found_match = False
			for value_regex in value_regex_list:
				search_result = re.search(value_regex, workspace)
				if not search_result is None:
					value = search_result.groups()[0]
					float_values.append(float(value_replacements.get(value, value)))
					found_match = True
				else:
					float_values.append(-999.0)

			if found_match:
				files = os.path.join(os.path.dirname(workspace), root_filename)
				values_tree_files.setdefault(tuple(float_values), []).extend(glob.glob(files))

				commands.append("annotate-trees.py {FILES} --values {VALUES} {ARGS}".format(
						FILES=files,
						VALUES=" ".join([str(value) for value in float_values]),
						ARGS=" ".join(args)
				))

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="annotate-trees.py")
		return values_tree_files
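The values that annotate_trees writes into the ROOT trees are extracted from the workspace path with re.search, taking the first capture group of each regex and optionally remapping it through value_replacements; regexes without a match fall back to -999.0. Schematically, for a hypothetical path and regex:

import re

workspace = "/output/datacards/mH125/workspace.root"  # hypothetical path
value_regex = r"mH(\d+)"                               # hypothetical regex with one capture group
search_result = re.search(value_regex, workspace)
if search_result is not None:
    value = search_result.groups()[0]   # "125"
    float_value = float(value)          # 125.0, optionally remapped via value_replacements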
	def postfit_shapes_fromworkspace(self, datacards_cbs, datacards_workspaces, s_fit_only=False, n_processes=1, *args, **kwargs):
		for key, value in kwargs.items():
			higgs_mass = value if "higgs_mass" in key else "0"	
				
		commands = []
		datacards_postfit_shapes = {}
		fit_type_list = kwargs.get("fit_type_list", ["fit_s", "fit_b"])
		if s_fit_only:
			fit_type_list.remove("fit_b")

		for fit_type in fit_type_list:
			commands.extend(["PostFitShapesFromWorkspace --postfit -w {WORKSPACE} -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}".format(
					WORKSPACE=datacards_workspaces[datacard],
					DATACARD=datacard,
					OUTPUT=os.path.splitext(datacard)[0]+"_"+fit_type+".root",
					MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
					FIT_RESULT=os.path.join(os.path.dirname(datacard), kwargs.get("fit_result", "fitDiagnostics.root")+":"+fit_type),
					ARGS=" ".join(args)
			) for datacard, cb in datacards_cbs.iteritems()])

			datacards_postfit_shapes.setdefault(fit_type, {}).update({
					datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root"
			for datacard, cb in datacards_cbs.iteritems()})

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="PostFitShapesFromWorkspace")

		return datacards_postfit_shapes
Example No. 6
def main():
	
	parser = argparse.ArgumentParser(description="Merge Artus outputs per nick name.", parents=[logger.loggingParser])

	parser.add_argument("project_dir", help="Artus Project directory containing the files \"output/*/*.root\" to merge")
	parser.add_argument("-n", "--n-processes", type=int, default=1,
	                    help="Number of (parallel) processes. [Default: %(default)s]")
	parser.add_argument("--output-dir", help="Directory to store merged files. Default: Same as project_dir.")
	args = parser.parse_args()
	logger.initLogger(args)
	output_dirs = glob.glob(os.path.join(args.project_dir, "output/*"))
	nick_names = [nick for nick in [output_dir[output_dir.rfind("/")+1:] for output_dir in output_dirs] if not ".tar.gz" in nick]
	outputs_per_nick = {nick : glob.glob(os.path.join(args.project_dir, "output", nick, "*.root")) for nick in nick_names}
	outputs_per_nick = {nick : files for nick, files in outputs_per_nick.iteritems() if len(files) > 0}
	
	commands = []
	for nick_name, output_files in pi.ProgressIterator(outputs_per_nick.iteritems(),
	                                                   length=len(outputs_per_nick),
	                                                   description="Merging Artus outputs"):
		merged_dir = os.path.join(args.project_dir if(args.output_dir == None) else args.output_dir, "merged", nick_name)
		if not os.path.exists(merged_dir):
			os.makedirs(merged_dir)
	
		commands.append("hadd -f %s %s" % (os.path.join(merged_dir, nick_name+".root"), " ".join(output_files)))
	
	tools.parallelize(_call_command, commands, n_processes=args.n_processes)
Example No. 7
    def postfit_shapes(self,
                       datacards_cbs,
                       s_fit_only=False,
                       n_processes=1,
                       *args):
        commands = []
        datacards_postfit_shapes = {}
        fit_type_list = ["fit_s", "fit_b"]
        if s_fit_only:
            fit_type_list.remove("fit_b")

        for fit_type in fit_type_list:
            commands.extend([
                "PostFitShapes --postfit -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}"
                .format(
                    DATACARD=datacard,
                    OUTPUT=os.path.splitext(datacard)[0] + "_" + fit_type +
                    ".root",
                    MASS=[mass for mass in cb.mass_set() if mass != "*"
                          ][0],  # TODO: maybe there are more masses?
                    FIT_RESULT=os.path.join(os.path.dirname(datacard),
                                            "mlfit.root:" + fit_type),
                    ARGS=" ".join(args))
                for datacard, cb in datacards_cbs.iteritems()
            ])

            datacards_postfit_shapes.setdefault(fit_type, {}).update({
                datacard:
                os.path.splitext(datacard)[0] + "_" + fit_type + ".root"
                for datacard, cb in datacards_cbs.iteritems()
            })

        tools.parallelize(_call_command, commands, n_processes=n_processes)

        return datacards_postfit_shapes
Example No. 8
    def annotate_trees(self,
                       datacards_workspaces,
                       root_filename,
                       value_regex,
                       value_replacements=None,
                       n_processes=1,
                       *args):
        if value_replacements is None:
            value_replacements = {}

        commands = []
        for datacard, workspace in datacards_workspaces.iteritems():
            search_result = re.search(value_regex, workspace)
            if not search_result is None:
                value = search_result.groups()[0]
                float_value = float(value_replacements.get(value, value))

                commands.append(
                    "annotate-trees.py {FILES} --values {VALUE} {ARGS}".format(
                        FILES=os.path.join(os.path.dirname(workspace),
                                           root_filename),
                        VALUE=float_value,
                        ARGS=" ".join(args)))

        tools.parallelize(_call_command, commands, n_processes=n_processes)
	def postfit_shapes_fromworkspace(self, datacards_cbs, datacards_workspaces, s_fit_only=False, n_processes=1, *args, **kwargs):
		higgs_mass = kwargs.get("higgs_mass", 125)
				
		commands = []
		datacards_postfit_shapes = {}
		fit_type_list = kwargs.get("fit_type_list", ["fit_s", "fit_b"])
		if s_fit_only:
			fit_type_list.remove("fit_b")

		for fit_type in fit_type_list:
			commands.extend(["PostFitShapesFromWorkspace --postfit -w {WORKSPACE} -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}".format(
					WORKSPACE=datacards_workspaces[datacard],
					DATACARD=datacard,
					OUTPUT=os.path.splitext(datacard)[0]+"_"+fit_type+".root",
					MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
					FIT_RESULT=os.path.join(os.path.dirname(datacard), kwargs.get("fit_result", "fitDiagnostics.root")+":"+fit_type),
					ARGS=" ".join(args)
			) for datacard, cb in datacards_cbs.iteritems()])

			datacards_postfit_shapes.setdefault(fit_type, {}).update({
					datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root"
			for datacard, cb in datacards_cbs.iteritems()})

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="PostFitShapesFromWorkspace")

		return datacards_postfit_shapes
	def annotate_trees(self, datacards_workspaces, root_filename, value_regex_list, value_replacements=None, n_processes=1, values_tree_files=None, *args):
		if value_replacements is None:
			value_replacements = {}

		if values_tree_files is None:
			values_tree_files = {}

		commands = []
		for datacard, workspace in datacards_workspaces.iteritems():
			float_values = []
			found_match = False
			for value_regex in value_regex_list:
				search_result = re.search(value_regex, workspace)
				if not search_result is None:
					value = search_result.groups()[0]
					float_values.append(float(value_replacements.get(value, value)))
					found_match = True
				else:
					float_values.append(-999.0)

			if found_match:
				files = os.path.join(os.path.dirname(workspace), root_filename)
				values_tree_files.setdefault(tuple(float_values), []).extend(glob.glob(files))

				commands.append("annotate-trees.py {FILES} --values {VALUES} {ARGS}".format(
						FILES=files,
						VALUES=" ".join([str(value) for value in float_values]),
						ARGS=" ".join(args)
				))

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="annotate-trees.py")
		return values_tree_files
 def combine(self, datacards_cbs, datacards_workspaces, datacards_poi_ranges=None, n_processes=1, *args):
     if datacards_poi_ranges is None:
         datacards_poi_ranges = {}
     tmp_args = " ".join(args)
     
     chunks = [[None, None]]
     if "{CHUNK}" in tmp_args and "--points" in tmp_args:
         splited_args = tmp_args.split()
         n_points = int(splited_args[splited_args.index("--points") + 1])
         n_points_per_chunk = 199
         chunks = [[chunk*n_points_per_chunk, (chunk+1)*n_points_per_chunk-1] for chunk in xrange(n_points/n_points_per_chunk+1)]
     
     commands = []
     for index, (chunk_min, chunk_max) in enumerate(chunks):
         commands.extend([[
                 "combine -m {MASS} {POI_RANGE} {ARGS} {WORKSPACE} {CHUNK_POINTS}".format(
                         MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
                         POI_RANGE="--rMin {RMIN} --rMax {RMAX}" if datacard in datacards_poi_ranges else "",
                         ARGS=tmp_args.format(CHUNK=str(index), RMIN="{RMIN}", RMAX="{RMAX}"),
                         WORKSPACE=workspace,
                         CHUNK_POINTS = "" if (chunk_min is None) or (chunk_max is None) else "--firstPoint {CHUNK_MIN} --lastPoint {CHUNK_MAX}".format(
                                 CHUNK_MIN=chunk_min,
                                 CHUNK_MAX=chunk_max
                         )
                 ).format(RMIN=datacards_poi_ranges.get(datacard, ["", ""])[0], RMAX=datacards_poi_ranges.get(datacard, ["", ""])[1]),
                 os.path.dirname(workspace)
         ] for datacard, workspace in datacards_workspaces.iteritems()])
     
     tools.parallelize(_call_command, commands, n_processes=n_processes)
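For reference, the {CHUNK}/--points handling in the two combine methods above splits a likelihood scan into ranges of 199 points, which are then passed on via --firstPoint/--lastPoint. A worked example with --points 500 (the original snippets use Python 2's xrange and integer division):

n_points = 500
n_points_per_chunk = 199
chunks = [[chunk * n_points_per_chunk, (chunk + 1) * n_points_per_chunk - 1]
          for chunk in range(n_points // n_points_per_chunk + 1)]
print(chunks)  # [[0, 198], [199, 397], [398, 596]]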
Example No. 12
def main():

    parser = argparse.ArgumentParser(
        description="Merge Artus outputs per nick name.",
        parents=[logger.loggingParser])

    parser.add_argument(
        "project_dir",
        help=
        "Artus Project directory containing the files \"output/*/*.root\" to merge"
    )
    parser.add_argument(
        "-n",
        "--n-processes",
        type=int,
        default=1,
        help="Number of (parallel) processes. [Default: %(default)s]")
    parser.add_argument(
        "--output-dir",
        help="Directory to store merged files. Default: Same as project_dir.")
    args = parser.parse_args()
    logger.initLogger(args)
    output_dirs = glob.glob(os.path.join(args.project_dir, "output/*"))
    nick_names = [
        nick for nick in
        [output_dir[output_dir.rfind("/") + 1:] for output_dir in output_dirs]
        if not ".tar.gz" in nick
    ]
    outputs_per_nick = {
        nick:
        glob.glob(os.path.join(args.project_dir, "output", nick, "*.root"))
        for nick in nick_names
    }
    # drop potentially existing SvfitCaches from the filelist
    for nick, files in outputs_per_nick.iteritems():
        outputs_per_nick[nick] = [
            file for file in files if ("SvfitCache" not in file)
        ]
    outputs_per_nick = {
        nick: files
        for nick, files in outputs_per_nick.iteritems() if len(files) > 0
    }

    commands = []
    for nick_name, output_files in pi.ProgressIterator(
            outputs_per_nick.iteritems(),
            length=len(outputs_per_nick),
            description="Merging Artus outputs"):
        merged_dir = os.path.join(
            args.project_dir if (args.output_dir == None) else args.output_dir,
            "merged", nick_name)
        if not os.path.exists(merged_dir):
            os.makedirs(merged_dir)

        commands.append("hadd.py -a \" -f\" -t %s \"%s\"" % (os.path.join(
            merged_dir, nick_name + ".root"), " ".join(output_files)))

    tools.parallelize(_call_command, commands, n_processes=args.n_processes)
Example No. 13
	def text2workspace(self, datacards_cbs, n_processes=1, *args):
		commands = ["text2workspace.py -m {MASS} {ARGS} {DATACARD} -o {OUTPUT}".format(
				MASS=[mass for mass in cb.mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
				ARGS=" ".join(args),
				DATACARD=datacard,
				OUTPUT=os.path.splitext(datacard)[0]+".root"
		) for datacard, cb in datacards_cbs.iteritems()]

		tools.parallelize(_call_command, commands, n_processes=n_processes)

		return {datacard : os.path.splitext(datacard)[0]+".root" for datacard in datacards_cbs.keys()}
 def text2workspace(self, datacards_cbs, n_processes=1, *args):
     commands = ["text2workspace.py -m {MASS} {ARGS} {DATACARD} -o {OUTPUT}".format(
             MASS=[mass for mass in cb.mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
             ARGS=" ".join(args),
             DATACARD=datacard,
             OUTPUT=os.path.splitext(datacard)[0]+".root"
     ) for datacard, cb in datacards_cbs.iteritems()]
     
     tools.parallelize(_call_command, commands, n_processes=n_processes)
     
     return {datacard : os.path.splitext(datacard)[0]+".root" for datacard in datacards_cbs.keys()}
Example No. 15
def main():
	
	parser = argparse.ArgumentParser(description="Run multiple commands in parallel.", parents=[logger.loggingParser])

	parser.add_argument("commands", help="Commands to be executed on a batch system. They can also be piped into this program.", nargs="*", default=[])
	parser.add_argument("-n", "--n-processes", type=int, default=1,
	                    help="Number of (parallel) processes. [Default: %(default)s]")

	args = parser.parse_args()
	logger.initLogger(args)
	
	# prepare commands
	if (len(args.commands) == 0) and (not sys.stdin.isatty()):
		args.commands.extend(sys.stdin.read().strip().split("\n"))
	
	tools.parallelize(run_command, args.commands, n_processes=args.n_processes, description=os.path.basename(sys.argv[0]))
	def print_pulls(self, datacards_cbs, n_processes=1, *args, **kwargs):
		commands = []
		for pulls_format, file_format in zip(["latex", "text"], ["tex", "txt"]):
			for all_nuissances in [False, True]:
				commands.extend([[
						"execute-command.py \"python $CMSSW_BASE/src/HiggsAnalysis/CombinedLimit/test/diffNuisances.py -f {FORMAT} {ALL} {PLOT} {ARGS} {FIT_RESULT}\" --log-file {LOG_FILE}".format(
								FORMAT=pulls_format,
								ALL=("-a" if all_nuissances else ""),
								PLOT="-g "+("" if all_nuissances else "largest_")+"pulls.root",
								ARGS=" ".join(args),
								FIT_RESULT=os.path.join(os.path.dirname(datacard), kwargs.get("fit_result", "fitDiagnostics.root")),
								LOG_FILE=("" if all_nuissances else "largest_")+"pulls."+file_format
						),
						os.path.dirname(datacard)
				] for datacard in datacards_cbs.keys()])

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="diffNuisances.py")
 def print_pulls(self, datacards_cbs, n_processes=1, *args):
     commands = []
     for pulls_format, file_format in zip(["latex", "text"], ["tex", "txt"]):
         for all_nuissances in [False, True]:
             commands.extend([[
                     "execute-command.py \"python $CMSSW_BASE/src/HiggsAnalysis/CombinedLimit/test/diffNuisances.py -f {FORMAT} {ALL} {PLOT} {ARGS} {FIT_RESULT}\" --log-file {LOG_FILE}".format(
                             FORMAT=pulls_format,
                             ALL=("-a" if all_nuissances else ""),
                             PLOT="-g "+("" if all_nuissances else "largest_")+"pulls.root",
                             ARGS=" ".join(args),
                             FIT_RESULT=os.path.join(os.path.dirname(datacard), "mlfit.root"),
                             LOG_FILE=("" if all_nuissances else "largest_")+"pulls."+file_format
                     ),
                     os.path.dirname(datacard)
             ] for datacard in datacards_cbs.keys()])
     
     tools.parallelize(_call_command, commands, n_processes=n_processes)
	def text2workspace(self, datacards_cbs, n_processes=1, *args, **kwargs):
		physics_model = re.search("(-P|--physics-model)[\s=\"\']*\S*:(?P<physics_model>\S*)[\"\']?\s", " ".join(args))
		if physics_model is None:
			physics_model = {}
		else:
			physics_model = physics_model.groupdict()
			
		higgs_mass = kwargs.get("higgs_mass", 125)
		commands = ["text2workspace.py -m {MASS} {ARGS} {DATACARD} -o {OUTPUT}".format(
				MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
				ARGS=" ".join(args),
				DATACARD=datacard,
				OUTPUT=os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root"
		) for datacard, cb in datacards_cbs.iteritems()]

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="text2workspace.py")

		return {datacard : os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root" for datacard in datacards_cbs.keys()}
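The physics model name that is appended to the workspace file name is scraped from the text2workspace.py argument string by the regular expression above; with a hypothetical argument string this yields:

import re

# Hypothetical argument string handed to text2workspace.py via *args
args_string = "-P HiggsAnalysis.CombinedLimit.PhysicsModel:multiSignalModel --PO verbose "
match = re.search(r"(-P|--physics-model)[\s=\"\']*\S*:(?P<physics_model>\S*)[\"\']?\s", args_string)
physics_model = {} if match is None else match.groupdict()
print(physics_model.get("physics_model", "default"))  # multiSignalModel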
 def annotate_trees(self, datacards_workspaces, root_filename, value_regex, value_replacements=None, n_processes=1, *args):
     if value_replacements is None:
         value_replacements = {}
     
     commands = []
     for datacard, workspace in datacards_workspaces.iteritems():
         search_result = re.search(value_regex, workspace)
         if not search_result is None:
             value = search_result.groups()[0]
             float_value = float(value_replacements.get(value, value))
             
             commands.append("annotate-trees.py {FILES} --values {VALUE} {ARGS}".format(
                     FILES=os.path.join(os.path.dirname(workspace), root_filename),
                     VALUE=float_value,
                     ARGS=" ".join(args)
             ))
     
     tools.parallelize(_call_command, commands, n_processes=n_processes)
Example No. 20
def main():
	
	parser = argparse.ArgumentParser(description="Convert CSV files to ROOT files.",
	                                 parents=[logger.loggingParser])
	
	parser.add_argument("files", nargs="+",
	                    help="CSV Files.")
	parser.add_argument("--variable-lists", nargs="+", default=[""],
	                    help="Variable lists (in case the CSV has no header), e.g. var1:var2:... [Default: %(default)s]")
	parser.add_argument("-n", "--n-processes", type=int, default=1,
	                    help="Number of (parallel) processes. [Default: %(default)s]")
	
	args = parser.parse_args()
	logger.initLogger(args)
	
	if len(args.variable_lists) == 1:
		args.variable_lists = args.variable_lists * len(args.files)
	
	tools.parallelize(csv2root, zip(args.files, args.variable_lists), n_processes=args.n_processes, description="Converting")
Example No. 21
def merge_local(args):
    outputs_per_nick = folders_to_merge(args)
    if (args.project_subdir != None):  # keep only single path
        outputs_per_nick = {
            args.project_subdir: outputs_per_nick[args.project_subdir]
        }
    # drop potentially existing SvfitCaches from the filelist
    for nick, files in outputs_per_nick.iteritems():
        outputs_per_nick[nick] = [
            file for file in files if ("SvfitCache" not in file)
        ]
    outputs_per_nick = {
        nick: files
        for nick, files in outputs_per_nick.iteritems() if len(files) > 0
    }

    hadd_arguments = []
    for nick_name, output_files in pi.ProgressIterator(
            outputs_per_nick.iteritems(),
            length=len(outputs_per_nick),
            description="Merging Artus outputs"):
        merged_dir = os.path.join(
            args.project_dir[0] if
            (args.output_dir == None) else args.output_dir, "merged",
            nick_name)
        if not os.path.exists(merged_dir):
            os.makedirs(merged_dir)

        target_filename = os.path.join(merged_dir, nick_name + ".root")
        if (args.project_subdir != None):
            target_filename = "merged.root"

        hadd_arguments.append({
            "target_file": target_filename,
            "source_files": output_files,
            "hadd_args": " -f ",
            "max_files": 500
        })

    tools.parallelize(hadd2,
                      hadd_arguments,
                      n_processes=args.n_processes,
                      description="Merging Artus outputs")
	def text2workspace(self, datacards_cbs, n_processes=1, *args, **kwargs):
		physics_model = re.search("(-P|--physics-model)[\s=\"\']*\S*:(?P<physics_model>\S*)[\"\']?\s", " ".join(args))
		if physics_model is None:
			physics_model = {}
		else:
			physics_model = physics_model.groupdict()
		
		for key, value in kwargs.items():
			higgs_mass = value if "higgs_mass" in key else "0"
	
		commands = ["text2workspace.py -m {MASS} {ARGS} {DATACARD} -o {OUTPUT}".format(
				MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
				ARGS=" ".join(args),
				DATACARD=datacard,
				OUTPUT=os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root"
		) for datacard, cb in datacards_cbs.iteritems()]

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="text2workspace.py")

		return {datacard : os.path.splitext(datacard)[0]+"_"+physics_model.get("physics_model", "default")+".root" for datacard in datacards_cbs.keys()}
	def plot1DScan(self, datacards_cbs, datacards_workspaces, poi, n_processes=1, *args, **kwargs):
		tmp_args = "".join(args)		
		higgs_mass = kwargs.get("higgs_mass", 125)		
			
		for datacard, workspace in datacards_workspaces.iteritems():
			if not os.path.exists(os.path.join(os.path.dirname(workspace), "plots/")):
				os.makedirs(os.path.join(os.path.dirname(workspace), "plots/"))
				
		commandsPlot = []
		commandsPlot.extend([[
				"$CMSSW_BASE/src/CombineHarvester/CombineTools/scripts/plot1DScan.py --POI {POI} --output={OUTPUT} {ARGS} higgsCombine{NAME}.MultiDimFit.mH{MASS}.root".format(
						OUTPUT="nll",	
						MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass,
						POI=poi,
						NAME="Test",
						ARGS=tmp_args.format()				
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])
		
		tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)")	
	def plot1DScan(self, datacards_cbs, datacards_workspaces, poi, n_processes=1, *args, **kwargs):
		tmp_args = "".join(args)
		
		for key, value in kwargs.items():
			higgs_mass = value if "higgs_mass" in key else "0"
					
		for datacard, workspace in datacards_workspaces.iteritems():
			if not os.path.exists(os.path.join(os.path.dirname(workspace), "plots/")):
				os.makedirs(os.path.join(os.path.dirname(workspace), "plots/"))
				
		commandsPlot = []
		commandsPlot.extend([[
				"$CMSSW_BASE/src/CombineHarvester/CombineTools/scripts/plot1DScan.py --POI {POI} higgsCombine.MultiDimFit.mH{MASS}.root".format(
						MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass,
						POI=poi,
						ARGS=tmp_args.format()				
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])
		
		tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)")	
	def nuisance_impacts(self, datacards_cbs, datacards_workspaces, n_processes=1, *args, **kwargs):

		tmp_args = " ".join(args)
		higgs_mass = kwargs.get("higgs_mass", 125)	
		
		commandsInitialFit = []
		commandsInitialFit.extend([[
				"combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} --robustFit 1  --doInitialFit --allPars {ARGS}".format(
						MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass,
						ARGS=tmp_args.format(),
						WORKSPACE=workspace
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])

		commandsFits = []
		commandsFits.extend([[
				"combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} --robustFit 1 --doFits --parallel {NPROCS} --allPars {ARGS}".format(
						MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass,
						ARGS=tmp_args.format(),
						WORKSPACE=workspace,
						NPROCS=n_processes
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])

		commandsOutput = []
		commandsOutput.extend([[
				"combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} -o impacts.json --parallel {NPROCS} --allPars {ARGS}".format(
						MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass,
						ARGS=tmp_args.format(),
						WORKSPACE=workspace,
						NPROCS=n_processes
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])

		commandsPlot = []
		commandsPlot.extend([[
				"plotImpacts.py -i {INPUT} -o {OUTPUT}".format(
						INPUT="impacts.json",
						OUTPUT="plots/nuisance_impacts"
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])

		tools.parallelize(_call_command, commandsInitialFit, n_processes=n_processes, description="combineTool.py (initial fits)")
		tools.parallelize(_call_command, commandsFits, n_processes=1, description="combineTool.py (fits)")
		tools.parallelize(_call_command, commandsOutput, n_processes=1, description="combineTool.py (outputs)")
		tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)")
	def hypotestresulttree(self, datacards_cbs, n_processes=1, rvalue="1", poiname="x"):
		commands = []
		hypotestresulttree = {}

		#for fit_type in fit_type_list:
		commands.extend(["root -q -b \"HiggsAnalysis/KITHiggsToTauTau/scripts/hypoTestResultTree.cxx(\\\"{INPUT}\\\",\\\"{OUTPUT}\\\",{MASS},{RVALUE},\\\"{POINAME}\\\")\"".format(
				INPUT=os.path.join(os.path.dirname(datacard),"higgsCombine.HybridNew.mH{angle}.root".format(angle = [mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")),
				OUTPUT=os.path.join(os.path.dirname(datacard), "higgsCombine.HybridNew.mH{angle}_qmu.root".format(angle =[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")),
				MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0", # TODO: maybe there are more masses?
				RVALUE= str(rvalue),
				POINAME=str(poiname)

				#ARGS=", ".join(args)
			) for datacard, cb in datacards_cbs.iteritems()])

			#datacards_postfit_shapes.setdefault(fit_type, {}).update({
			#		datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root"
			#for datacard, cb in datacards_cbs.iteritems()})

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="hypoTestResultTree.cxx")

		return {datacard : os.path.join(os.path.dirname(datacard), "higgsCombine.HybridNew.mH{angle}_qmu.root".format(angle =[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")) for datacard in datacards_cbs.keys()}
def submission(base_paths, n_processes=1):
	
	# retrieve and prepare input files
	filenames_per_sample_per_pipeline = {}
	for base_path in base_paths:
		stdout_directories, stderr_directories = tools.subprocessCall(shlex.split("gfal-ls " + base_path))
		tmp_filenames_per_sample_per_pipeline = tools.parallelize(
				get_filenames,
				[[base_path, sample] for sample in stdout_directories.decode().strip().split("\n")],
				n_processes=n_processes,
				description="Retrieving inputs"
		)
		for item in tmp_filenames_per_sample_per_pipeline:
			for sample, filenames_per_pipeline in item.iteritems():
				for pipeline, tmp_filenames in filenames_per_pipeline.iteritems():
					filenames_per_sample_per_pipeline.setdefault(sample, {}).setdefault("pipeline", []).extend(tmp_filenames)
	configs, jobfiles = build_configs(filenames_per_sample_per_pipeline)
	
	# submit tasks
	submit_args = []
	for config, jobfile in zip(configs, jobfiles):
		submit_args.append([config, jobfile])
	tools.parallelize(submit, submit_args, n_processes=1, description="Submitting crab tasks")
    def postfit_shapes(self, datacards_cbs, s_fit_only=False, n_processes=1, *args):
        commands = []
        datacards_postfit_shapes = {}
        fit_type_list = ["fit_s", "fit_b"]
        if s_fit_only:
            fit_type_list.remove("fit_b")

        for fit_type in fit_type_list:
            commands.extend(["PostFitShapes --postfit -d {DATACARD} -o {OUTPUT} -m {MASS} -f {FIT_RESULT} {ARGS}".format(
                    DATACARD=datacard,
                    OUTPUT=os.path.splitext(datacard)[0]+"_"+fit_type+".root",
                    MASS=[mass for mass in cb.mass_set() if mass != "*"][0], # TODO: maybe there are more masses?
                    FIT_RESULT=os.path.join(os.path.dirname(datacard), "mlfit.root:"+fit_type),
                    ARGS=" ".join(args)
            ) for datacard, cb in datacards_cbs.iteritems()])
            
            datacards_postfit_shapes.setdefault(fit_type, {}).update({
                    datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root"
            for datacard, cb in datacards_cbs.iteritems()})
        
        tools.parallelize(_call_command, commands, n_processes=n_processes)
        
        return datacards_postfit_shapes
Example No. 29
def merge_local(args):
	outputs_per_nick = folders_to_merge(args)
	if(args.project_subdir != None): # keep only single path
		outputs_per_nick = { args.project_subdir : outputs_per_nick[args.project_subdir] }
	# drop potentially existing SvfitCaches from the filelist
	for nick, files in outputs_per_nick.iteritems():
		outputs_per_nick[nick] = [file for file in files if ("SvfitCache" not in file)]
	outputs_per_nick = {nick : files for nick, files in outputs_per_nick.iteritems() if len(files) > 0}
	
	hadd_arguments = []
	for nick_name, output_files in pi.ProgressIterator(outputs_per_nick.iteritems(),
	                                                   length=len(outputs_per_nick),
	                                                   description="Merging Artus outputs"):
		merged_dir = os.path.join(args.project_dir[0] if(args.output_dir == None) else args.output_dir, "merged", nick_name)
		if not os.path.exists(merged_dir):
			os.makedirs(merged_dir)

		target_filename = os.path.join(merged_dir, nick_name+".root") 
		if(args.project_subdir != None):
			target_filename = "merged.root"

		hadd_arguments.append({"target_file": target_filename, "source_files": output_files, "hadd_args" : " -f -v 0 ", "max_files" : 500})

	tools.parallelize(hadd2, hadd_arguments, n_processes=args.n_processes, description="Merging Artus outputs")
	def hypotestresulttree(self, datacards_cbs, n_processes=1, rvalue="1", poiname="x"):
		commands = []
		hypotestresulttree = {}



		#for fit_type in fit_type_list:
		commands.extend(["root -q -b \"HiggsAnalysis/KITHiggsToTauTau/scripts/hypoTestResultTree.cxx(\\\"{INPUT}\\\",\\\"{OUTPUT}\\\",{MASS},{RVALUE},\\\"{POINAME}\\\")\"".format(
				INPUT=os.path.join(os.path.dirname(datacard),"higgsCombine.HybridNew.mH{angle}.root".format(angle = [mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")),
				OUTPUT=os.path.join(os.path.dirname(datacard), "higgsCombine.HybridNew.mH{angle}_qmu.root".format(angle =[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")),
				MASS=[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0", # TODO: maybe there are more masses?
				RVALUE= str(rvalue),
				POINAME=str(poiname)

				#ARGS=", ".join(args)
			) for datacard, cb in datacards_cbs.iteritems()])

			#datacards_postfit_shapes.setdefault(fit_type, {}).update({
			#		datacard : os.path.splitext(datacard)[0]+"_"+fit_type+".root"
			#for datacard, cb in datacards_cbs.iteritems()})

		tools.parallelize(_call_command, commands, n_processes=n_processes, description="hypoTestResultTree.cxx")

		return {datacard : os.path.join(os.path.dirname(datacard), "higgsCombine.HybridNew.mH{angle}_qmu.root".format(angle =[mass for mass in cb.mass_set() if mass != "*"][0] if len(cb.mass_set()) > 1 else "0")) for datacard in datacards_cbs.keys()}
Example No. 31
def main():
    parser = argparse.ArgumentParser(
        description=
        "Collect matching trees from input files into one output tree",
        parents=[logger.loggingParser])

    parser.add_argument(
        "-i",
        "--input-dirs",
        help=
        "Input directories = crab project directories containing the subdirectories with crab tasks",
        nargs="+")
    parser.add_argument(
        "-o",
        "--output-dir",
        default=None,
        help=
        "Local output directory. [Default: subdir \"results\" in first input directory]"
    )
    parser.add_argument(
        "-d",
        "--dcache-target",
        default=None,
        help=
        "Directory on dCache (srm) where the files should be copied to. [Default: %(default)s]"
    )

    parser.add_argument(
        "--input-trees",
        nargs="+",
        default=["svfitCache"],
        help="Paths of input SVfit cache trees. [Default: %(default)s]")
    parser.add_argument(
        "--output-tree",
        default="svfitCache",
        help="Name of output SVfit cache tree. [Default: %(default)s]")
    parser.add_argument(
        "--previous-cache",
        default="",
        help=
        "Path to a previous cache which will be merged. [Default: %(default)s]"
    )
    parser.add_argument(
        "--dcache",
        type=bool,
        default=False,
        help="Read&Write from and to desy dcache[Default: %(default)s]")
    parser.add_argument(
        "--no-run",
        default=False,
        action="store_true",
        help="Do not run but only print dict  [Default: %(default)s]")
    parser.add_argument(
        "-n",
        "--n-processes",
        type=int,
        default=1,
        help="Number of (parallel) processes. [Default: %(default)s]")

    args = parser.parse_args()
    logger.initLogger(args)

    if args.output_dir is None:
        args.output_dir = os.path.join(args.input_dirs[0], "results")

    # get paths to crab outputs
    #max_n_jobs = 8000
    #max_n_retrieve = 500
    get_crab_outputs_args = []
    for input_dir in args.input_dirs:
        #for jobid_start in xrange(1, max_n_jobs, max_n_retrieve):
        #	jobid_end = jobid_start + max_n_retrieve - 1
        #	get_crab_outputs_args.append([input_dir, "{jobid_start}-{jobid_end}".format(jobid_start=jobid_start, jobid_end=jobid_end)])
        get_crab_outputs_args.append([input_dir, "1-10"])

    tar_files = tools.parallelize(_get_crab_outputs,
                                  get_crab_outputs_args,
                                  max(args.n_processes, 2),
                                  description="crab getoutput --dump")
    tar_files = tools.flattenList(tar_files)

    # download and un-tar
    download_untar_args = [[tar_file, args.output_dir]
                           for tar_file in tar_files]
    tools.parallelize(_download_untar,
                      download_untar_args,
                      args.n_processes,
                      description="download and un-tar crab outputs")

    root_files = glob.glob(os.path.join(args.output_dir, "*.root"))
    # TODO: maybe add more root files from -i arguments, that did not need to be un-tared

    root_files_per_sample_nick = {}
    for root_file in root_files:
        basename = os.path.basename(root_file)
        sample_nick = basename[:basename.index("_job_")]
        root_files_per_sample_nick.setdefault(sample_nick,
                                              []).append(root_file)

    merged_output_dir = os.path.join(args.output_dir, "merged")
    if not os.path.exists(merged_output_dir):
        os.makedirs(merged_output_dir)
    merge_outputs_args = [[
        os.path.join(merged_output_dir, sample_nick + ".root"), tmp_root_files,
        "-f"
    ] for sample_nick, tmp_root_files in
                          root_files_per_sample_nick.iteritems()]
    tools.parallelize(_merge_outputs,
                      merge_outputs_args,
                      args.n_processes,
                      description="merging")

    if args.dcache_target:
        dcache_copy_commands = [
            "gfal-copy -v -f -r " + merged_output_dir + " " +
            args.dcache_target
        ]
        tools.parallelize(_call_command,
                          dcache_copy_commands,
                          args.n_processes,
                          description="copying to dCache")

    rm_commands = ["rm " + root_file for root_file in root_files]
    if args.dcache_target:
        rm_commands.extend([
            "rm " + os.path.join(merged_output_dir, sample_nick + ".root")
            for sample_nick in root_files_per_sample_nick.keys()
        ])
    tools.parallelize(_call_command,
                      rm_commands,
                      args.n_processes,
                      description="deleting temporary files")

    log.info("\nJSON configuration for Artus:\n")
    config_output_dir = args.dcache_target if args.dcache_target else merged_output_dir
    for src, dst in filename_replacements.iteritems():
        config_output_dir = config_output_dir.replace(src, dst)
    for sample_nick in sorted(root_files_per_sample_nick.keys()):
        log.info("\"" + sample_nick + "\" : \"" +
                 os.path.join(config_output_dir, sample_nick + ".root") +
                 "\",")
                            files_dict[file_name][channel]["weights"].append(w)
                            files_dict[file_name][channel]["ntuples"].append(
                                n2)
                #if n not in files_dict[file_name][channel]["ntuples"]:
    jsonTools.JsonDict(files_dict).save(os.path.join(output_dir,
                                                     "ReduceFiles.json"),
                                        indent=4)

    give_away_list = []
    for filename, item in files_dict.iteritems():
        item["in_dir"] = input_dir
        item["out_dir"] = output_dir
        give_away_list.append({filename: item})

    aTools.parallelize(reduce_file,
                       give_away_list,
                       n_processes=args.n_processes)

    #for aufruf in give_away_list:
    #reduce_file(aufruf)

    #for filename, item in files_dict.iteritems():
    ##setting file paths
    #input_file_path = os.path.join(input_dir, os.path.join(filename.replace(".root", ""), filename))
    #output_file_path = os.path.join(output_dir, os.path.join(filename.replace(".root", ""), filename))
    #if not os.path.exists(os.path.join(output_dir, filename.replace(".root", ""))):
    #os.makedirs(os.path.join(output_dir, filename.replace(".root", "")))
    ##debugging to check which files are processed
    #log.debug("Reduce input from file:")
    #log.debug(input_file_path)
    #if os.path.isfile(input_file_path):
def main():
	parser = argparse.ArgumentParser(description="Collect matching trees from input files into one output tree",
	                                 parents=[logger.loggingParser])

	parser.add_argument("-i", "--input-dirs", help="Input directories = crab project directories containing the subdirectories with crab tasks", nargs="+")
	parser.add_argument("-o", "--output-dir", default=None,
	                    help="Local output directory. [Default: subdir \"results\" in first input directory]")
	parser.add_argument("-d", "--dcache-target", default=None,
	                    help="Directory on dCache (srm) where the files should be copied to. [Default: %(default)s]")
	
	parser.add_argument("--input-trees", nargs="+", default=["svfitCache"],
	                    help="Paths of input SVfit cache trees. [Default: %(default)s]")
	parser.add_argument("--output-tree", default="svfitCache",
	                    help="Name of output SVfit cache tree. [Default: %(default)s]")
	parser.add_argument("--previous-cache", default="",
	                    help="Path to a previous cache which will be merged. [Default: %(default)s]")
	parser.add_argument("--dcache", type=bool, default=False,
	                    help="Read&Write from and to desy dcache[Default: %(default)s]")
	parser.add_argument("--no-run", default=False, action="store_true",
	                    help="Do not run but only print dict  [Default: %(default)s]")
	parser.add_argument("-n", "--n-processes", type=int, default=1,
	                    help="Number of (parallel) processes. [Default: %(default)s]")
	
	args = parser.parse_args()
	logger.initLogger(args)
	
	if args.output_dir is None:
		args.output_dir = os.path.join(args.input_dirs[0], "results")
	
	tar_files = []
	for input_dir in args.input_dirs:
		tar_files.extend(glob.glob(os.path.join(input_dir, "*/results/*.tar")))
		tar_files.extend(glob.glob(os.path.join(input_dir, "results/*.tar")))
		
	tar_commands = ["tar -x -f "+tar_file+" -C "+args.output_dir+" --overwrite" for tar_file in tar_files]
	tools.parallelize(_call_command, tar_commands, args.n_processes, description="un-tar crab outputs")
	
	root_files = glob.glob(os.path.join(args.output_dir, "*.root"))
	# TODO: maybe add more root files from -i arguments, that did not need to be un-tared
	
	root_files_per_sample_nick = {}
	for root_file in root_files:
		basename = os.path.basename(root_file)
		sample_nick = basename[:basename.index("_job_")]
		root_files_per_sample_nick.setdefault(sample_nick, []).append(root_file)
	
	merged_output_dir = os.path.join(args.output_dir, "merged")
	if not os.path.exists(merged_output_dir):
		os.makedirs(merged_output_dir)
	hadd_commands = ["hadd.py "+(" ".join(tmp_root_files))+" -t "+os.path.join(merged_output_dir, sample_nick+".root")+" -a \" -f -v 0\"" for sample_nick, tmp_root_files in root_files_per_sample_nick.iteritems()]
	tools.parallelize(_call_command, hadd_commands, args.n_processes, description="merging")
	
	if args.dcache_target:
		dcache_copy_commands = ["gfal-copy -f -r "+merged_output_dir+" "+args.dcache_target]
		tools.parallelize(_call_command, dcache_copy_commands, args.n_processes, description="copying to dCache")
	
	rm_commands = ["rm "+root_file for root_file in root_files]
	if args.dcache_target:
		rm_commands.extend(["rm "+os.path.join(merged_output_dir, sample_nick+".root") for sample_nick in root_files_per_sample_nick.keys()])
	tools.parallelize(_call_command, rm_commands, args.n_processes, description="deleting temporary files")
	
	log.info("\nJSON configuration for Artus:\n")
	config_output_dir = args.dcache_target if args.dcache_target else merged_output_dir
	for src, dst in filename_replacements.iteritems():
		config_output_dir = config_output_dir.replace(src, dst)
	for sample_nick in sorted(root_files_per_sample_nick.keys()):
		log.info("\""+sample_nick+"\" : \""+os.path.join(config_output_dir, sample_nick+".root")+"\",")
	def combine(self, datacards_cbs, datacards_workspaces, datacards_poi_ranges=None, n_processes=1, *args, **kwargs):
		if datacards_poi_ranges is None:
			datacards_poi_ranges = {}
		tmp_args = " ".join(args)
		
		higgs_mass = kwargs.get("higgs_mass", 125)
		chunks = [[None, None]]
		if "{CHUNK}" in tmp_args and "--points" in tmp_args:
			splited_args = tmp_args.split()
			n_points = int(splited_args[splited_args.index("--points") + 1])
			n_points_per_chunk = 199
			chunks = [[chunk*n_points_per_chunk, (chunk+1)*n_points_per_chunk-1] for chunk in xrange(n_points/n_points_per_chunk+1)]

		method = re.search("(-M|--method)[\s=\"\']*(?P<method>\w*)[\"\']?\s", tmp_args)
		if not method is None:
			method = method.groupdict()["method"]

		name = re.search("(-n|--name)[\s=\"\']*(?P<name>\w*)[\"\']?\s", tmp_args)
		if not name is None:
			name = name.groupdict()["name"]

		split_stat_syst_uncs = kwargs.get("split_stat_syst_uncs", False)
		if split_stat_syst_uncs and (method is None):
			log.error("Uncertainties are not split into stat. and syst. components, since the method for combine is unknown!")
			split_stat_syst_uncs = False
		if split_stat_syst_uncs and (not "MultiDimFit" in method):
			log.error("Uncertainties are not split into stat. and syst. components. This is only supported for the MultiDimFit method!")
			split_stat_syst_uncs = False

		split_stat_syst_uncs_options = [""]
		split_stat_syst_uncs_names = [""]
		if split_stat_syst_uncs:
			split_stat_syst_uncs_options = [
				"--saveWorkspace",
				"--snapshotName {method} -w w".format(method=method),
				"--snapshotName {method} -w w --freezeNuisanceGroups syst_plus_bbb".format(method=method, uncs="{uncs}"), #DBUG TEST!!!!!!!!!18.1.2017 --freezeNuisances
			]
			split_stat_syst_uncs_names = [
				"Workspace",
				"TotUnc",
				"StatUnc",
			]

		for split_stat_syst_uncs_index, (split_stat_syst_uncs_option, split_stat_syst_uncs_name) in enumerate(zip(split_stat_syst_uncs_options, split_stat_syst_uncs_names)):
			prepared_tmp_args = None
			new_name = None
			if split_stat_syst_uncs:
				new_name = ("" if name is None else name) + split_stat_syst_uncs_name
				if name is None:
					prepared_tmp_args = tmp_args + " -n " + new_name
				else:
					prepared_tmp_args = copy.deepcopy(tmp_args)
					prepared_tmp_args = re.sub("(--algo)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2 "+("none" if split_stat_syst_uncs_index == 0 else "\\3")+"\\4", prepared_tmp_args)
					prepared_tmp_args = re.sub("(-n|--name)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2"+new_name+"\\4", prepared_tmp_args)
			else:
				prepared_tmp_args = tmp_args

			prepared_tmp_args = re.sub("-n -n", "-n", prepared_tmp_args)

			commands = []
			for chunk_index, (chunk_min, chunk_max) in enumerate(chunks):
				commands.extend([[
						"combine -m {MASS} {POI_RANGE} {ARGS} {CHUNK_POINTS} {SPLIT_STAT_SYST_UNCS} {WORKSPACE}".format(
								MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
								POI_RANGE="--rMin {RMIN} --rMax {RMAX}" if datacard in datacards_poi_ranges else "",
								ARGS=prepared_tmp_args.format(CHUNK=str(chunk_index), RMIN="{RMIN}", RMAX="{RMAX}"),
								CHUNK_POINTS = "" if (chunk_min is None) or (chunk_max is None) else "--firstPoint {CHUNK_MIN} --lastPoint {CHUNK_MAX}".format(
										CHUNK_MIN=chunk_min,
										CHUNK_MAX=chunk_max
								),
								SPLIT_STAT_SYST_UNCS=split_stat_syst_uncs_option.format(uncs=",".join(kwargs.get("additional_freeze_nuisances", [])+datacards_cbs[datacard].syst_name_set())),
								WORKSPACE="-d "+workspace
						).format(RMIN=datacards_poi_ranges.get(datacard, ["", ""])[0], RMAX=datacards_poi_ranges.get(datacard, ["", ""])[1]),
						os.path.dirname(workspace)
				] for datacard, workspace in datacards_workspaces.iteritems()])

			tools.parallelize(_call_command, commands, n_processes=n_processes, description="combine")

			if split_stat_syst_uncs and (split_stat_syst_uncs_index == 0):
				# replace workspaces by saved versions from the first fit including the postfit nuisance parameter values
				for datacard, workspace in datacards_workspaces.iteritems():
					datacards_workspaces[datacard] = glob.glob(os.path.join(os.path.dirname(workspace), "higgsCombine"+new_name+"."+method+".*.root"))[0]
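The -M/--method and -n/--name values that steer the splitting into total and statistics-only uncertainties in the combine method above are likewise parsed out of the argument string with re.search; for a hypothetical argument string:

import re

tmp_args = "-M MultiDimFit --algo grid --points 100 -n Test "  # hypothetical argument string
method = re.search(r"(-M|--method)[\s=\"\']*(?P<method>\w*)[\"\']?\s", tmp_args)
if method is not None:
    method = method.groupdict()["method"]  # "MultiDimFit"
name = re.search(r"(-n|--name)[\s=\"\']*(?P<name>\w*)[\"\']?\s", tmp_args)
if name is not None:
    name = name.groupdict()["name"]        # "Test"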
Example No. 35
	def multi_plots(self, list_of_config_dicts, list_of_args_strings, n_processes=1, n_fast_plots=None):
		config_dicts = list_of_config_dicts if isinstance(list_of_config_dicts, collections.Iterable) and not isinstance(list_of_config_dicts, basestring) else [list_of_config_dicts]
		args_strings = list_of_args_strings if isinstance(list_of_args_strings, collections.Iterable) and not isinstance(list_of_args_strings, basestring) else [list_of_args_strings]
		
		# fill up missing list elements by repeating previous ones
		n_plots = max(len(config_dicts), len(args_strings))
		if (len(config_dicts) > 1) and (len(config_dicts) < n_plots):
			log.warning("Too few config dicts specified!")
		if (len(args_strings) > 1) and (len(args_strings) < n_plots):
			log.warning("Too few argument lists specified!")
		config_dicts = (config_dicts*n_plots)[:n_plots]
		args_strings = (args_strings*n_plots)[:n_plots]
		
		harry_args = []
		for config_dict, args_string in zip(config_dicts, args_strings):
			if config_dict is None:
				harry_args.append(None)
			else:
				config_dict["comment"] = " ".join(sys.argv)
				if "json_defaults" in config_dict:
					json_defaults_dict = jsonTools.JsonDict(config_dict["json_defaults"]).doIncludes().doComments()
					config_dict.pop("json_defaults")
					json_defaults_dict.update(config_dict)
					config_dict = json_defaults_dict
				harry_args.append("--json-defaults \"%s\"" % jsonTools.JsonDict(config_dict).toString(indent=None).replace("\"", "'"))
			
			if not args_string is None:
				if harry_args[-1] is None:
					harry_args[-1] = args_string
				else:
					harry_args[-1] += (" "+args_string)
				if config_dict is None:
					harry_args[-1] += (" --comment " + (" ".join(sys.argv)))
		if not n_fast_plots is None:
			harry_args = harry_args[:n_fast_plots]
		
		# multi processing of multiple plots
		output_filenames = []
		failed_plots = []
		if len(harry_args) > 1 and n_processes > 1:
			log.info("Creating {:d} plots in {:d} processes".format(len(harry_args), min(n_processes, len(harry_args))))
			results = tools.parallelize(pool_plot, zip([self]*len(harry_args), harry_args), n_processes)
			tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip(*([result for result in results if not result is None and result != (None,)]))
			output_filenames = [output_filename for output_filename in tmp_output_filenames if not output_filename is None]
			failed_plots = [(failed_plot, error_message) for failed_plot, error_message in zip(tmp_failed_plots, tmp_error_messages) if not failed_plot is None]
		
		# single processing of multiple plots
		elif len(harry_args) > 1:
			log.info("Creating {:d} plots".format(len(harry_args)))
			for harry_args in harry_args:
				try:
					output_filenames.append(self.plot(harry_args))
				except SystemExit as e:
					failed_plots.append((harry_args, None))
				except Exception as e:
					log.info(str(e))
					failed_plots.append((harry_args, None))
		
		# single plot
		elif len(harry_args) > 0:
			output_filenames.append(self.plot(harry_args[0]))
		
		if len(failed_plots) > 0:
			log.error("%d failed plots:" % len(failed_plots))
			for failed_plot in failed_plots:
				log.info("\n"+tools.get_colored_string("Failed plot:", color='red'))
				log.info("\t%s" % failed_plot[0])
				if failed_plot[1] is not None:
					log.info(tools.get_indented_text("    ", tools.get_colored_string("Traceback for this plot:", color='red')+"\n" + failed_plot[1]))
		
		return output_filenames
Example No. 36
	def multi_plots(self, list_of_config_dicts, list_of_args_strings, n_processes=1, n_fast_plots=None, batch=None):
		config_dicts = list_of_config_dicts if isinstance(list_of_config_dicts, collections.Iterable) and not isinstance(list_of_config_dicts, basestring) else [list_of_config_dicts]
		args_strings = list_of_args_strings if isinstance(list_of_args_strings, collections.Iterable) and not isinstance(list_of_args_strings, basestring) else [list_of_args_strings]
		
		# fill up missing list elements by repeating previous ones
		n_plots = max(len(config_dicts), len(args_strings))
		if (len(config_dicts) > 1) and (len(config_dicts) < n_plots):
			log.warning("Too few config dicts specified!")
		if (len(args_strings) > 1) and (len(args_strings) < n_plots):
			log.warning("Too few argument lists specified!")
		config_dicts = (config_dicts*n_plots)[:n_plots]
		args_strings = (args_strings*n_plots)[:n_plots]
		
		if n_processes>1:
			for i in range(len(args_strings)):
				args_strings[i] += (" --hide-progressbar ")

		self.harry_args = []
		for config_dict, args_string in zip(config_dicts, args_strings):
			if config_dict is None:
				self.harry_args.append(None)
			else:
				config_dict["comment"] = " ".join(sys.argv)
				if not batch is None:
					config_dict["dry_run"] = True
				if "json_defaults" in config_dict:
					json_defaults_dict = jsonTools.JsonDict(config_dict["json_defaults"]).doIncludes().doComments()
					config_dict.pop("json_defaults")
					json_defaults_dict.update(config_dict)
					config_dict = json_defaults_dict
				self.harry_args.append("--json-defaults \"%s\"" % jsonTools.JsonDict(config_dict).toString(indent=None).replace("\"", "'"))
			
			if not args_string is None:
				if self.harry_args[-1] is None:
					self.harry_args[-1] = args_string
				else:
					self.harry_args[-1] += (" "+args_string)
				if config_dict is None:
					self.harry_args[-1] += (" --comment " + (" ".join(sys.argv)))
					if not batch is None:
						self.harry_args[-1] += " --dry-run"
		
		if not n_fast_plots is None:
			self.harry_args = self.harry_args[:n_fast_plots]
			n_plots = len(self.harry_args)
		
		self.harry_cores = [None]*n_plots
		
		# multi processing of multiple plots
		output_filenames = []
		failed_plots = []
		if (n_plots > 1) and (n_processes > 1):
			log.info("Creating {:d} plots in {:d} processes".format(n_plots, min(n_processes, n_plots)))
			results = tools.parallelize(pool_plot, zip([self]*n_plots, range(n_plots)), n_processes, description="Plotting")
			tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip(*([result for result in results if not result is None and result != (None,)]))
			output_filenames = [output_filename for output_filename in tmp_output_filenames if not output_filename is None]
			failed_plots = [(failed_plot, error_message) for failed_plot, error_message in zip(tmp_failed_plots, tmp_error_messages) if not failed_plot is None]
		
		# single processing of multiple plots
		elif n_plots > 1:
			log.info("Creating {:d} plots".format(n_plots))
			for plot_index in xrange(n_plots):
				try:
					output_filenames.append(self.plot(plot_index))
				except SystemExit as e:
					failed_plots.append((self.harry_args[plot_index], None))
				except Exception as e:
					log.info(str(e))
					failed_plots.append((self.harry_args[plot_index], None))
		
		# single plot
		elif n_plots > 0:
			output_filenames.append(self.plot(0))
		
		# batch submission
		if (not (batch is None)) and (len(failed_plots) < n_plots):
			try:
				os.makedirs(os.path.expandvars("$HP_WORK_BASE"))
			except OSError:
				if not os.path.isdir(os.path.expandvars("$HP_WORK_BASE")):
					raise
	
			workdir = tempfile.mkdtemp(prefix="harry_work_"+datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")+"_", dir=os.path.expandvars("$HP_WORK_BASE"))
			
			main_config = ""
			with open(os.path.expandvars("$CMSSW_BASE/src/Artus/HarryPlotter/data/grid-control_base_config.conf"), "r") as main_config_file:
				main_config = main_config_file.read()
			
			backend_config = ""
			with open(os.path.expandvars("$CMSSW_BASE/src/Artus/Configuration/data/grid-control_backend_" + batch + ".conf"), "r") as backend_config_file:
				backend_config = backend_config_file.read()
			
			final_config = string.Template(main_config).safe_substitute(
					cmsswbase=os.path.expandvars("$CMSSW_BASE"),
					hpworkbase=os.path.expandvars("$HP_WORK_BASE"),
					cwd=os.getcwd(),
					jsonconfigs="\n\t"+("\n\t".join([item[0] for item in output_filenames])),
					executable=self.standalone_executable,
					workdir=workdir,
					backend=backend_config
			)
			final_config_filename = workdir+".conf"
			with open(final_config_filename, "w") as final_config_file:
				final_config_file.write(final_config)
			
			command = "go.py " + final_config_filename
			log.info(command)
			logger.subprocessCall(shlex.split(command))
		
		if len(failed_plots) > 0:
			log.error("%d failed plots:" % len(failed_plots))
			for failed_plot in failed_plots:
				log.info("\n"+tools.get_colored_string("Failed plot:", color='red'))
				log.info("\t%s" % failed_plot[0])
				if failed_plot[1] is not None:
					try:
						log.info(tools.get_indented_text("    ", tools.get_colored_string("Traceback for this plot:", color='red')+"\n" + failed_plot[1]))
					except:
						log.info("\t Traceback for this plot: \n" + failed_plot[1])
		
		return output_filenames
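A minimal usage sketch of the multi_plots() method shown above (assumptions: "plotter" is an instance of the class providing this method, and "my_plot_config.json" is a placeholder for an existing HarryPlotter JSON config; in the other examples on this page the same functionality is usually reached via the higgsplot.HiggsPlotter constructor, which takes list_of_config_dicts and list_of_args_strings directly):

	configs = [{"json_defaults": ["my_plot_config.json"]}]  # config dicts as consumed by multi_plots()
	args_strings = ["--hide-progressbar"]                   # extra command-line style arguments per plot
	output_filenames = plotter.multi_plots(configs, args_strings, n_processes=2)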
Exemplo n.º 37
0
def main():
    parser = argparse.ArgumentParser(
        description=
        "Collect matching trees from input files into one output tree",
        parents=[logger.loggingParser])

    parser.add_argument(
        "-i",
        "--input",
        help=
        "Input directory with merged Artus outputs including Svit Cache files")
    parser.add_argument("-o",
                        "--output",
                        default="svfitCache.root",
                        help="Output ROOT file. [Default: %(default)s]")

    parser.add_argument(
        "--input-trees",
        nargs="+",
        default=["svfitCache"],
        help="Paths of input SVfit cache trees. [Default: %(default)s]")
    parser.add_argument(
        "--output-tree",
        default="svfitCache",
        help="Name of output SVfit cache tree. [Default: %(default)s]")
    parser.add_argument(
        "--previous-cache",
        default="",
        help=
        "Path to a previous cache which will be merged. [Default: %(default)s]"
    )
    parser.add_argument(
        "--dcache",
        type=bool,
        default=False,
        help="Read&Write from and to desy dcache[Default: %(default)s]")
    parser.add_argument(
        "--no-run",
        default=False,
        action="store_true",
        help="Do not run but only print dict  [Default: %(default)s]")
    parser.add_argument(
        "-n",
        "--n-processes",
        type=int,
        default=1,
        help="Number of (parallel) processes. [Default: %(default)s]")

    args = parser.parse_args()
    logger.initLogger(args)

    merge_commands = []
    copy_commands = []
    config_file = []

    ls_command = "gfal-ls %s" % (srm(args.output))
    retCode = logger.subprocessCall(ls_command.split())
    if (retCode != 0):
        mkdir_command = "gfal-mkdir %s" % (srm(args.output))
        log.info("Creating " + srm(args.output))
        logger.subprocessCall(mkdir_command.split())
    tmpdir = tempfile.mkdtemp(suffix='', prefix='tmp',
                              dir="/tmp")  #dir=os.getcwd())

    if not args.dcache:
        if not args.no_run:
            for input in glob.glob(args.input + "/*/*.root"):
                output = tmpdir
                input_trees = args.input_trees
                output_trees = args.output_tree
                config = jsonTools.JsonDict(input)
                pipelines = config.get("Pipelines", {}).keys()
                # extract names without the leading channel
                pipelines = [
                    "_".join(pipeline.split("_")[1:]) for pipeline in pipelines
                ]
                pipelines = list(set(pipelines))
                pipelines = [x for x in pipelines if x != '']
                merge_commands = []
                for pipeline in pipelines:
                    out_filename = os.path.join(
                        output, pipeline,
                        "svfitCache_" + os.path.basename(input))
                    if not os.path.exists(os.path.dirname(out_filename)):
                        os.makedirs(os.path.dirname(out_filename))
                    pipeline_input_trees = [
                        pipeline + "/" + input_tree
                        for input_tree in input_trees
                    ]
                    merged_tree_name = treemerge.treemerge(
                        [input],
                        pipeline_input_trees,
                        out_filename,
                        output_trees,
                        match_input_tree_names=True)
                    log.info("SVfit cache trees collected in \"%s\"." %
                             merged_tree_name)

            if args.previous_cache:  # check for all available files in previous_cache
                previous_caches = glob.glob(args.previous_cache + "*/*.root")
                previous_cachefiles = [
                    "/".join(cache.split("/")[-2:])
                    for cache in previous_caches
                ]
                for cachefile in previous_cachefiles:
                    current = os.path.join(output, cachefile)
                    previous = os.path.join(args.previous_cache, cachefile)
                    if not os.path.exists(os.path.dirname(current)):
                        os.makedirs(os.path.dirname(current))
                    if os.path.exists(current):
                        merge_commands.append("mv %s %s_tmp.root " %
                                              (current, current))
                        merge_commands.append(
                            "hadd -f -f6 %s %s_tmp.root %s " %
                            (current, current, previous))
                        merge_commands.append("rm %s_tmp.root " % (current))
                    else:
                        merge_commands.append("hadd -f -f6 %s %s" %
                                              (current, previous))
                tools.parallelize(_call_command,
                                  merge_commands,
                                  args.n_processes,
                                  description="merging")

            # move to output-directory
            copy_command = "gfal-copy -r file:///%s %s" % (output,
                                                           srm(args.output))
            logger.subprocessCall(copy_command.split())

        # print c&p summary
        current_caches = glob.glob(args.output + "*/*.root")
        nicknames = list(
            set([
                os.path.basename(cache).split(".")[0].replace(
                    "svfitCache_", "") for cache in current_caches
            ]))
        for nick in sorted(nicknames):
            config_file.append(
                '\t\t\t"%s" : "%s",' %
                (nick, xrd(args.output) + "/svfitCache_" + nick + ".root"))

    else:
        input_dirs = glob.glob(args.input + "/*/*/*")
        untar_commands = [
            "tar xf %s -C %s" % (file, tmpdir) for input_dir in input_dirs
            for file in glob.glob(input_dir + "/*.tar*")
        ]
        if not args.no_run:
            tools.parallelize(_call_command,
                              untar_commands,
                              args.n_processes,
                              description="unpacking")
        regex = re.compile(".*/(.*)_job_[0-9]+_SvfitCache.._(.*?)[0-9]+.root")
        matches = [(regex.match(file).groups(), file)
                   for file in glob.glob(tmpdir + "/*.root")]
        dirs = {}

        # go through matches and create nested dict {'sample' : {'Pipeline' : [files]}}
        for match in matches:
            if match[0][0] not in dirs:
                dirs[match[0][0]] = {}
            if match[0][1] not in dirs[match[0][0]]:
                dirs[match[0][0]][match[0][1]] = []
            dirs[match[0][0]][match[0][1]].append(match[1])

        for sample in dirs:
            for pipeline in dirs[sample]:
                # create folders as needed
                if not os.path.exists(tmpdir + "/" + pipeline):
                    os.makedirs(tmpdir + "/" + pipeline)
                previous_cache_file = ""
                if args.previous_cache:
                    if os.path.isfile(args.previous_cache + "/" + pipeline +
                                      "/svfitCache_" + sample + ".root"):
                        previous_cache_file = args.previous_cache + "/" + pipeline + "/svfitCache_" + sample + ".root"
                tmp_filename = tmpdir + "/" + pipeline + "/svfitCache_" + sample + ".root"
                out_filename = args.output + "/" + pipeline + "/svfitCache_" + sample + ".root"
                merge_commands.append(
                    "hadd -f %s %s %s" % (tmp_filename, " ".join(
                        dirs[sample][pipeline]), previous_cache_file))
                copy_commands.append("gfal-copy -f file:///%s %s" %
                                     (tmp_filename, srm(out_filename)))
            config_file.append(
                '"%s" : "%s",' %
                (sample, xrd(args.output) + "/svfitCache_" + sample + ".root"))

        if not args.no_run:
            tools.parallelize(_call_command,
                              merge_commands,
                              args.n_processes,
                              description="merging")
            tools.parallelize(_call_command,
                              copy_commands,
                              args.n_processes,
                              description="copying")

    shutil.rmtree(tmpdir)
    log.info("done. Artus SvfitCacheFile settings: ")

    for entry in config_file:
        log.info(entry)
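    # For orientation: each line printed by the loop above follows the pattern built
    # further up, e.g. (sample nickname and storage prefix are purely illustrative):
    #     "SomeSampleNick" : "<xrd path of args.output>/svfitCache_SomeSampleNick.root",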
    #parser.add_argument("-S", "--Samples", nargs="+", default=["ggh", "qqh"],
    #help="Samples to be compared [Default: %(default)s]")
    #parser.add_argument("-o", "--output-dir",
    #default="./",
    #help="path to output file. [Default: %(default)s]")
    args = parser.parse_args()

    #clean argument input-files
    inputs = []
    for entry in args.input_files:
        inputs.append(entry.strip(',').strip('"'))
    if len(inputs) > 1:
        filenames = inputs
    else:
        if os.path.isdir(inputs[0]):
            filenames = glob.glob(os.path.join(inputs[0], "*", "*.root"))
        else:
            filenames = inputs
    #ntuple_strings = ["mt_jecUncDown_tauEsNom/ntuple","mt_jecUncNom_tauEsDown/ntuple","mt_jecUncNom_tauEsNom/ntuple","mt_jecUncNom_tauEsUp/ntuple","mt_jecUncUp_tauEsNom/ntuple"]
    #training_logs = [jsonTools.JsonDict("TrainingLog.json")]
    training_logs = []
    for element in args.training_logs:
        training_logs.append(jsonTools.JsonDict(element))

    for channel in args.channels:
        args_list = []
        for element in filenames:
            args_list.append(
                [element, training_logs, channel, args.calc_Training_BDT])
        aTools.parallelize(file_wrapper, args_list, args.j)
	#if log.isEnabledFor(logging.DEBUG):
	#	import pprint
	#	pprint.pprint(plot_configs)
	
	# delete existing output files
	tmp_output_files = list(set([os.path.join(config["output_dir"], config["filename"]+".root") for config in plot_configs[:args.n_plots[0]]]))
	for output_file in tmp_output_files:
		if os.path.exists(output_file):
			os.remove(output_file)
			log.debug("Removed file \""+output_file+"\" before it is recreated again.")
	output_files = list(set(output_files))
	
	# create input histograms with HarryPlotter
	higgsplot.HiggsPlotter(list_of_config_dicts=plot_configs, list_of_args_strings=[args.args], n_processes=args.n_processes, n_plots=args.n_plots[0])
	if args.n_plots[0] != 0:
		tools.parallelize(_call_command, hadd_commands, n_processes=args.n_processes)
	
	debug_plot_configs = []
	for output_file in (output_files if not args.for_dcsync else merged_output_files):
		debug_plot_configs.extend(plotconfigs.PlotConfigs().all_histograms(output_file, plot_config_template={"markers":["E"], "colors":["#FF0000"]}))
	higgsplot.HiggsPlotter(list_of_config_dicts=debug_plot_configs, list_of_args_strings=[args.args], n_processes=args.n_processes, n_plots=args.n_plots[1])
	
	# update CombineHarvester with the yields and shapes
	datacards.extract_shapes(
			os.path.join(args.output_dir, input_root_filename_template.replace("$", "")),
			bkg_histogram_name_template, sig_histogram_name_template,
			bkg_syst_histogram_name_template, sig_syst_histogram_name_template,
			update_systematics=True
	)
	
	# add bin-by-bin uncertainties
Exemplo n.º 40
0
	def multi_plots(self, list_of_config_dicts, list_of_args_strings, n_processes=1, n_fast_plots=None):
		config_dicts = list_of_config_dicts if isinstance(list_of_config_dicts, collections.Iterable) and not isinstance(list_of_config_dicts, basestring) else [list_of_config_dicts]
		args_strings = list_of_args_strings if isinstance(list_of_args_strings, collections.Iterable) and not isinstance(list_of_args_strings, basestring) else [list_of_args_strings]
		
		# fill up missing list elements by repeating previous ones
		n_plots = max(len(config_dicts), len(args_strings))
		if (len(config_dicts) > 1) and (len(config_dicts) < n_plots):
			log.warning("Too few config dicts specified!")
		if (len(args_strings) > 1) and (len(args_strings) < n_plots):
			log.warning("Too few argument lists specified!")
		config_dicts = (config_dicts*n_plots)[:n_plots]
		args_strings = (args_strings*n_plots)[:n_plots]
		
		harry_args = []
		for config_dict, args_string in zip(config_dicts, args_strings):
			if config_dict is None:
				harry_args.append(None)
			else:
				config_dict["comment"] = " ".join(sys.argv)
				if "json_defaults" in config_dict:
					json_defaults_dict = jsonTools.JsonDict(config_dict["json_defaults"]).doIncludes().doComments()
					config_dict.pop("json_defaults")
					json_defaults_dict.update(config_dict)
					config_dict = json_defaults_dict
				harry_args.append("--json-defaults \"%s\"" % jsonTools.JsonDict(config_dict).toString(indent=None).replace("\"", "'"))
			
			if not args_string is None:
				if harry_args[-1] is None:
					harry_args[-1] = args_string
				else:
					harry_args[-1] += (" "+args_string)
				if config_dict is None:
					harry_args[-1] += (" --comment " + (" ".join(sys.argv)))
		if not n_fast_plots is None:
			harry_args = harry_args[:n_fast_plots]
		
		# multi processing of multiple plots
		output_filenames = []
		failed_plots = []
		if len(harry_args) > 1 and n_processes > 1:
			log.info("Creating {:d} plots in {:d} processes".format(len(harry_args), min(n_processes, len(harry_args))))
			results = tools.parallelize(pool_plot, zip([self]*len(harry_args), harry_args), n_processes)
			tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip(*([result for result in results if not result is None and result != (None,)]))
			output_filenames = [output_filename for output_filename in tmp_output_filenames if not output_filename is None]
			failed_plots = [(failed_plot, error_message) for failed_plot, error_message in zip(tmp_failed_plots, tmp_error_messages) if not failed_plot is None]
		
		# single processing of multiple plots
		elif len(harry_args) > 1:
			log.info("Creating {:d} plots".format(len(harry_args)))
			for harry_arg in harry_args:  # iterate with a distinct name to avoid shadowing the harry_args list
				try:
					output_filenames.append(self.plot(harry_arg))
				except SystemExit as e:
					failed_plots.append((harry_arg, None))
				except Exception as e:
					log.info(str(e))
					failed_plots.append((harry_arg, None))
		
		# single plot
		elif len(harry_args) > 0:
			output_filenames.append(self.plot(harry_args[0]))
		
		if len(failed_plots) > 0:
			log.error("%d failed plots:" % len(failed_plots))
			for failed_plot in failed_plots:
				log.info("\n"+tools.get_colored_string("Failed plot:", color='red'))
				log.info("\t%s" % failed_plot[0])
				if failed_plot[1] is not None:
					try:
						log.info(tools.get_indented_text("    ", tools.get_colored_string("Traceback for this plot:", color='red')+"\n" + failed_plot[1]))
					except:
						log.info("\t Traceback for this plot: \n" + failed_plot[1])
		
		return output_filenames
	def combine(self, datacards_cbs, datacards_workspaces, datacards_poi_ranges=None, n_processes=1, *args, **kwargs):
		if datacards_poi_ranges is None:
			datacards_poi_ranges = {}
		tmp_args = " ".join(args)
		
		# pick up the Higgs mass hypothesis from the kwargs (default "0")
		higgs_mass = "0"
		for key, value in kwargs.items():
			if "higgs_mass" in key:
				higgs_mass = value

		chunks = [[None, None]]
		if "{CHUNK}" in tmp_args and "--points" in tmp_args:
			splited_args = tmp_args.split()
			n_points = int(splited_args[splited_args.index("--points") + 1])
			n_points_per_chunk = 199
			chunks = [[chunk*n_points_per_chunk, (chunk+1)*n_points_per_chunk-1] for chunk in xrange(n_points/n_points_per_chunk+1)]

		method = re.search("(-M|--method)[\s=\"\']*(?P<method>\w*)[\"\']?\s", tmp_args)
		if not method is None:
			method = method.groupdict()["method"]

		name = re.search("(-n|--name)[\s=\"\']*(?P<name>\w*)[\"\']?\s", tmp_args)
		if not name is None:
			name = name.groupdict()["name"]

		split_stat_syst_uncs = kwargs.get("split_stat_syst_uncs", False)
		if split_stat_syst_uncs and (method is None):
			log.error("Uncertainties are not split into stat. and syst. components, since the method for combine is unknown!")
			split_stat_syst_uncs = False
		if split_stat_syst_uncs and (not "MultiDimFit" in method):
			log.error("Uncertainties are not split into stat. and syst. components. This is only supported for the MultiDimFit method!")
			split_stat_syst_uncs = False

		split_stat_syst_uncs_options = [""]
		split_stat_syst_uncs_names = [""]
		if split_stat_syst_uncs:
			split_stat_syst_uncs_options = [
				"--saveWorkspace",
				"--snapshotName {method} -w w".format(method=method),
				"--snapshotName {method} -w w --freezeNuisanceGroups syst_plus_bbb".format(method=method, uncs="{uncs}"), #DBUG TEST!!!!!!!!!18.1.2017 --freezeNuisances
			]
			split_stat_syst_uncs_names = [
				"Workspace",
				"TotUnc",
				"StatUnc",
			]

		for split_stat_syst_uncs_index, (split_stat_syst_uncs_option, split_stat_syst_uncs_name) in enumerate(zip(split_stat_syst_uncs_options, split_stat_syst_uncs_names)):
			prepared_tmp_args = None
			new_name = None
			if split_stat_syst_uncs:
				new_name = ("" if name is None else name) + split_stat_syst_uncs_name
				if name is None:
					prepared_tmp_args = tmp_args + " -n " + new_name
				else:
					prepared_tmp_args = copy.deepcopy(tmp_args)
					prepared_tmp_args = re.sub("(--algo)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2 "+("none" if split_stat_syst_uncs_index == 0 else "\\3")+"\\4", prepared_tmp_args)
					prepared_tmp_args = re.sub("(-n|--name)([\s=\"\']*)(\w*)([\"\']?\s)", "\\1\\2"+new_name+"\\4", prepared_tmp_args)
			else:
				prepared_tmp_args = tmp_args

			prepared_tmp_args = re.sub("-n -n", "-n", prepared_tmp_args)

			commands = []
			for chunk_index, (chunk_min, chunk_max) in enumerate(chunks):
				commands.extend([[
						"combine -m {MASS} {POI_RANGE} {ARGS} {CHUNK_POINTS} {SPLIT_STAT_SYST_UNCS} {WORKSPACE}".format(
								MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass, # TODO: maybe there are more masses?
								POI_RANGE="--rMin {RMIN} --rMax {RMAX}" if datacard in datacards_poi_ranges else "",
								ARGS=prepared_tmp_args.format(CHUNK=str(chunk_index), RMIN="{RMIN}", RMAX="{RMAX}"),
								CHUNK_POINTS = "" if (chunk_min is None) or (chunk_max is None) else "--firstPoint {CHUNK_MIN} --lastPoint {CHUNK_MAX}".format(
										CHUNK_MIN=chunk_min,
										CHUNK_MAX=chunk_max
								),
								SPLIT_STAT_SYST_UNCS=split_stat_syst_uncs_option.format(uncs=",".join(kwargs.get("additional_freeze_nuisances", [])+datacards_cbs[datacard].syst_name_set())),
								WORKSPACE="-d "+workspace
						).format(RMIN=datacards_poi_ranges.get(datacard, ["", ""])[0], RMAX=datacards_poi_ranges.get(datacard, ["", ""])[1]),
						os.path.dirname(workspace)
				] for datacard, workspace in datacards_workspaces.iteritems()])

			tools.parallelize(_call_command, commands, n_processes=n_processes, description="combine")

			if split_stat_syst_uncs and (split_stat_syst_uncs_index == 0):
				# replace workspaces by saved versions from the first fit including the postfit nuisance parameter values
				for datacard, workspace in datacards_workspaces.iteritems():
					datacards_workspaces[datacard] = glob.glob(os.path.join(os.path.dirname(workspace), "higgsCombine"+new_name+"."+method+".*.root"))[0]
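	# A sketch of how the combine() wrapper above might be invoked (the instance name
	# "datacards" and the concrete combine arguments are assumptions, not taken from
	# this page; datacards_cbs/datacards_workspaces are the CombineHarvester objects
	# and workspace files prepared beforehand):
	#
	# 	datacards.combine(
	# 			datacards_cbs, datacards_workspaces, None, n_processes,
	# 			"-M MultiDimFit --algo grid --points 400 -n Scan{CHUNK}",
	# 			higgs_mass="125"
	# 	)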
		config["parameters_list"] = []
		#print "Open RootFile\t", os.path.join(folder, file_list[0])
		infile = ROOT.TFile(os.path.join(folder, file_list[0]), "READ")
		intree = infile.Get("TestTree")
		for branch in intree.GetListOfBranches():
			branch_name = branch.GetName()
			if not "class" in branch_name and not "weight" in branch_name:
				config["parameters_list"].append(branch.GetName())
		infile.Close()

		config["request_nick"] = container[2]+"_signal"
		config["nicks"] = [container[2]+"_signal"]
		config["weights"]= ["(classID==1)"]
		channel, category_string, requested_sample = config["channel"], config["category"], config["request_nick"]
		config["storage_name_extension"] = os.path.join(storage_name_extension, channel, category_string, requested_sample)
		plot_configs.append(copy.deepcopy(config))
		if not os.path.exists(config["storage_name_extension"]):
			os.makedirs(config["storage_name_extension"])

		config["request_nick"] = container[2]+"_bkg"
		config["nicks"] = [container[2]+"_bkg"]
		config["weights"]= ["(classID==0)"]
		channel, category_string, requested_sample = config["channel"], config["category"], config["request_nick"]
		config["storage_name_extension"] = os.path.join(storage_name_extension, channel, category_string, requested_sample)
		plot_configs.append(copy.deepcopy(config))
		if not os.path.exists(config["storage_name_extension"]):
			os.makedirs(config["storage_name_extension"])

	aTools.parallelize(calculate_partial_correlation, plot_configs, n_processes=args.n_processes)
Exemplo n.º 43
0
    def multi_plots(self,
                    list_of_config_dicts,
                    list_of_args_strings,
                    n_processes=1,
                    n_fast_plots=None,
                    batch=None):
        config_dicts = list_of_config_dicts if isinstance(
            list_of_config_dicts, collections.Iterable) and not isinstance(
                list_of_config_dicts, basestring) else [list_of_config_dicts]
        args_strings = list_of_args_strings if isinstance(
            list_of_args_strings, collections.Iterable) and not isinstance(
                list_of_args_strings, basestring) else [list_of_args_strings]

        # fill up missing list elements by repeating previous ones
        n_plots = max(len(config_dicts), len(args_strings))
        if (len(config_dicts) > 1) and (len(config_dicts) < n_plots):
            log.warning("Too few config dicts specified!")
        if (len(args_strings) > 1) and (len(args_strings) < n_plots):
            log.warning("Too few argument lists specified!")
        config_dicts = (config_dicts * n_plots)[:n_plots]
        args_strings = (args_strings * n_plots)[:n_plots]

        if n_processes > 1:
            for i in range(len(args_strings)):
                args_strings[i] += (" --hide-progressbar ")

        self.harry_args = []
        for config_dict, args_string in zip(config_dicts, args_strings):
            if config_dict is None:
                self.harry_args.append(None)
            else:
                config_dict["comment"] = " ".join(sys.argv)
                if not batch is None:
                    config_dict["dry_run"] = True
                if "json_defaults" in config_dict:
                    json_defaults_dict = jsonTools.JsonDict(
                        config_dict["json_defaults"]).doIncludes().doComments(
                        )
                    config_dict.pop("json_defaults")
                    json_defaults_dict.update(config_dict)
                    config_dict = json_defaults_dict
                self.harry_args.append(
                    "--json-defaults \"%s\"" %
                    jsonTools.JsonDict(config_dict).toString(
                        indent=None).replace("\"", "'"))

            if not args_string is None:
                if self.harry_args[-1] is None:
                    self.harry_args[-1] = args_string
                else:
                    self.harry_args[-1] += (" " + args_string)
                if config_dict is None:
                    self.harry_args[-1] += (" --comment " +
                                            (" ".join(sys.argv)))
                    if not batch is None:
                        self.harry_args[-1] += " --dry-run"

        if not n_fast_plots is None:
            self.harry_args = self.harry_args[:n_fast_plots]
            n_plots = len(self.harry_args)

        self.harry_cores = [None] * n_plots

        # multi processing of multiple plots
        output_filenames = []
        failed_plots = []
        if (n_plots > 1) and (n_processes > 1):
            log.info("Creating {:d} plots in {:d} processes".format(
                n_plots, min(n_processes, n_plots)))
            results = tools.parallelize(pool_plot,
                                        zip([self] * n_plots, range(n_plots)),
                                        n_processes,
                                        description="Plotting")
            tmp_output_filenames, tmp_failed_plots, tmp_error_messages = zip(
                *([
                    result for result in results
                    if not result is None and result != (None, )
                ]))
            output_filenames = [
                output_filename for output_filename in tmp_output_filenames
                if not output_filename is None
            ]
            failed_plots = [(failed_plot, error_message)
                            for failed_plot, error_message in zip(
                                tmp_failed_plots, tmp_error_messages)
                            if not failed_plot is None]

        # single processing of multiple plots
        elif n_plots > 1:
            log.info("Creating {:d} plots".format(n_plots))
            for plot_index in xrange(n_plots):
                try:
                    output_filenames.append(self.plot(plot_index))
                except SystemExit as e:
                    failed_plots.append((self.harry_args[plot_index], None))
                except Exception as e:
                    log.info(str(e))
                    failed_plots.append((self.harry_args[plot_index], None))

        # single plot
        elif n_plots > 0:
            output_filenames.append(self.plot(0))

        # batch submission
        if (not (batch is None)) and (len(failed_plots) < n_plots):
            workdir = tempfile.mkdtemp(
                prefix="harry_work_" +
                datetime.datetime.now().strftime("%Y-%m-%d_%H-%M") + "_")

            main_config = ""
            with open(
                    os.path.expandvars(
                        "$CMSSW_BASE/src/Artus/HarryPlotter/data/grid-control_base_config.conf"
                    ), "r") as main_config_file:
                main_config = main_config_file.read()

            backend_config = ""
            with open(
                    os.path.expandvars(
                        "$CMSSW_BASE/src/Artus/Configuration/data/grid-control_backend_"
                        + batch + ".conf"), "r") as backend_config_file:
                backend_config = backend_config_file.read()

            final_config = string.Template(main_config).safe_substitute(
                cmsswbase=os.path.expandvars("$CMSSW_BASE"),
                hpworkbase=os.path.expandvars("$HP_WORK_BASE"),
                cwd=os.getcwd(),
                jsonconfigs="\n\t" +
                ("\n\t".join([item[0] for item in output_filenames])),
                executable=self.standalone_executable,
                workdir=workdir,
                backend=backend_config)
            final_config_filename = workdir + ".conf"
            with open(final_config_filename, "w") as final_config_file:
                final_config_file.write(final_config)

            command = "go.py " + final_config_filename
            log.info(command)
            logger.subprocessCall(shlex.split(command))

        if len(failed_plots) > 0:
            log.error("%d failed plots:" % len(failed_plots))
            for failed_plot in failed_plots:
                log.info("\n" +
                         tools.get_colored_string("Failed plot:", color='red'))
                log.info("\t%s" % failed_plot[0])
                if failed_plot[1] is not None:
                    try:
                        log.info(
                            tools.get_indented_text(
                                "    ",
                                tools.get_colored_string(
                                    "Traceback for this plot:", color='red') +
                                "\n" + failed_plot[1]))
                    except:
                        log.info("\t Traceback for this plot: \n" +
                                 failed_plot[1])

        return output_filenames
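When batch is set, the variant above switches every plot to a dry run and afterwards submits the prepared JSON configs via grid-control (go.py); a sketch of such a call (all names are placeholders, and <backend> must correspond to an existing grid-control_backend_<backend>.conf file as read in the code):

    configs = [{"json_defaults": ["my_plot_config.json"]}]
    plotter.multi_plots(configs, [None], n_processes=1, batch="<backend>")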
						#help="Second Categories, can be specified multiple times. Several categories specified at once will be concatenated with or[Default: %(default)s]")
	#parser.add_argument("-S", "--Samples", nargs="+", default=["ggh", "qqh"],
	                    #help="Samples to be compared [Default: %(default)s]")
	#parser.add_argument("-o", "--output-dir",
						#default="./",
						#help="path to output file. [Default: %(default)s]")
	args = parser.parse_args()
	
	#clean argument input-files
	inputs = []
	for entry in args.input_files:
		inputs.append(entry.strip(',').strip('"'))
	if len(inputs)>1:
		filenames = inputs
	else:
		if os.path.isdir(inputs[0]):
			filenames = glob.glob(os.path.join(inputs[0], "*", "*.root"))
		else:
			filenames = inputs
	#ntuple_strings = ["mt_jecUncDown_tauEsNom/ntuple","mt_jecUncNom_tauEsDown/ntuple","mt_jecUncNom_tauEsNom/ntuple","mt_jecUncNom_tauEsUp/ntuple","mt_jecUncUp_tauEsNom/ntuple"]
	#training_logs = [jsonTools.JsonDict("TrainingLog.json")]
	training_logs = []
	for element in args.training_logs:
		training_logs.append(jsonTools.JsonDict(element))
	
	for channel in args.channels:
		args_list = []
		for element in filenames:
			args_list.append([element, training_logs, channel, args.calc_Training_BDT])
		aTools.parallelize(file_wrapper, args_list, args.j)
				"combineTool.py -M Impacts -d {WORKSPACE} -m {MASS} --robustFit 1 --minimizerTolerance 0.1 --minimizerStrategy 0 --minimizerAlgoForMinos Minuit2,migrad --output impacts.json --parallel {NPROCS} --allPars {ARGS}".format(
						MASS=[mass for mass in datacards_cbs[datacard].mass_set() if mass != "*"][0] if len(datacards_cbs[datacard].mass_set()) > 1 else higgs_mass,
						ARGS=tmp_args.format(),
						WORKSPACE=workspace,
						NPROCS=n_processes
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])

		commandsPlot = []
		commandsPlot.extend([[
				"plotImpacts.py -i {INPUT} -o {OUTPUT}".format(
						INPUT="impacts.json",
						OUTPUT="plots/nuisance_impacts"
				),
				os.path.dirname(workspace)
		] for datacard, workspace in datacards_workspaces.iteritems()])

		tools.parallelize(_call_command, commandsInitialFit, n_processes=n_processes, description="combineTool.py (initial fits)")
		tools.parallelize(_call_command, commandsFits, n_processes=1, description="combineTool.py (fits)")
		tools.parallelize(_call_command, commandsOutput, n_processes=1, description="combineTool.py (outputs)")
		tools.parallelize(_call_command, commandsPlot, n_processes=n_processes, description="combineTool.py (plots)")

	def auto_rebin(self, bin_threshold = 1.0, rebin_mode = 0):
		rebin = ch.AutoRebin()
		rebin.SetBinThreshold(bin_threshold)
		rebin.SetRebinMode(rebin_mode)
		rebin.SetPerformRebin(True)
		rebin.SetVerbosity(0)
		rebin.Rebin(self.cb, self.cb)
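The auto_rebin() helper above just configures CombineHarvester's ch.AutoRebin and applies it in place to self.cb, so a call reduces to a one-liner (sketch; "datacards" is assumed to be an instance of the class defining the method):

	datacards.auto_rebin(bin_threshold=1.0, rebin_mode=0)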
Exemplo n.º 46
0
    # 	# Delete existing output files
    # output_files = list(set([os.path.join(config["output_dir"], config["filename"]+".root") for config in plot_configs[:args.n_plots[0]]]))
    # for output_file in output_files:
    # 		if os.path.exists(output_file):
    # 			os.remove(output_file)
    # 			log.debug("Removed file \""+output_file+"\" before it is recreated again.")

    # Create input histograms with HarryPlotter
    if "inputs" in args.steps:
        higgsplot.HiggsPlotter(list_of_config_dicts=plot_configs,
                               n_processes=args.n_processes,
                               n_plots=args.n_plots[0])
        if args.n_plots[0] != 0:
            tools.parallelize(_call_command,
                              hadd_commands,
                              n_processes=args.n_processes)

    # Update CombineHarvester with the yields and shapes
    datacards.extract_shapes(os.path.join(
        args.output_dir, input_root_filename_template.replace("$", "")),
                             bkg_histogram_name_template,
                             sig_histogram_name_template,
                             bkg_syst_histogram_name_template,
                             sig_syst_histogram_name_template,
                             update_systematics=True)

    # Add bin-by-bin uncertainties
    if not args.no_bbb_uncs:
        datacards.add_bin_by_bin_uncertainties(
            processes=datacards.cb.cp().backgrounds().process_set() +
def main():
	parser = argparse.ArgumentParser(description="Collect matching trees from input files into one output tree",
	                                 parents=[logger.loggingParser])

	parser.add_argument("-i", "--input-dirs", help="Input directories = crab project directories containing the subdirectories with crab tasks", nargs="+")
	parser.add_argument("-o", "--output-dir", default=None,
	                    help="Local output directory. [Default: subdir \"results\" in first input directory]")
	parser.add_argument("-d", "--dcache-target", default=None,
	                    help="Directory on dCache (srm) where the files should be copied to. [Default: %(default)s]")
	
	parser.add_argument("--input-trees", nargs="+", default=["svfitCache"],
	                    help="Paths of input SVfit cache trees. [Default: %(default)s]")
	parser.add_argument("--output-tree", default="svfitCache",
	                    help="Name of output SVfit cache tree. [Default: %(default)s]")
	parser.add_argument("--previous-cache", default="",
	                    help="Path to a previous cache which will be merged. [Default: %(default)s]")
	parser.add_argument("--dcache", type=bool, default=False,
	                    help="Read&Write from and to desy dcache[Default: %(default)s]")
	parser.add_argument("--no-run", default=False, action="store_true",
	                    help="Do not run but only print dict  [Default: %(default)s]")
	parser.add_argument("-n", "--n-processes", type=int, default=1,
	                    help="Number of (parallel) processes. [Default: %(default)s]")
	
	args = parser.parse_args()
	logger.initLogger(args)
	
	if args.output_dir is None:
		args.output_dir = os.path.join(args.input_dirs[0], "results")
	
	# get paths to crab outputs
	#max_n_jobs = 8000
	#max_n_retrieve = 500
	get_crab_outputs_args = []
	for input_dir in args.input_dirs:
		#for jobid_start in xrange(1, max_n_jobs, max_n_retrieve):
		#	jobid_end = jobid_start + max_n_retrieve - 1
		#	get_crab_outputs_args.append([input_dir, "{jobid_start}-{jobid_end}".format(jobid_start=jobid_start, jobid_end=jobid_end)])
		get_crab_outputs_args.append([input_dir, "1-10"])
	
	tar_files = tools.parallelize(_get_crab_outputs, get_crab_outputs_args, max(args.n_processes, 2), description="crab getoutput --dump")
	tar_files = tools.flattenList(tar_files)
	
	# download and un-tar
	download_untar_args = [[tar_file, args.output_dir] for tar_file in tar_files]
	tools.parallelize(_download_untar, download_untar_args, args.n_processes, description="download and un-tar crab outputs")
	
	root_files = glob.glob(os.path.join(args.output_dir, "*.root"))
	# TODO: maybe add more root files from -i arguments, that did not need to be un-tared
	
	root_files_per_sample_nick = {}
	for root_file in root_files:
		basename = os.path.basename(root_file)
		sample_nick = basename[:basename.index("_job_")]
		root_files_per_sample_nick.setdefault(sample_nick, []).append(root_file)
	
	merged_output_dir = os.path.join(args.output_dir, "merged")
	if not os.path.exists(merged_output_dir):
		os.makedirs(merged_output_dir)
	merge_outputs_args = [[os.path.join(merged_output_dir, sample_nick+".root"), tmp_root_files, "-f"] for sample_nick, tmp_root_files in root_files_per_sample_nick.iteritems()]
	tools.parallelize(_merge_outputs, merge_outputs_args, args.n_processes, description="merging")
	
	if args.dcache_target:
		dcache_copy_commands = ["gfal-copy -v -f -r "+merged_output_dir+" "+args.dcache_target]
		tools.parallelize(_call_command, dcache_copy_commands, args.n_processes, description="copying to dCache")
	
	rm_commands = ["rm "+root_file for root_file in root_files]
	if args.dcache_target:
		rm_commands.extend(["rm "+os.path.join(merged_output_dir, sample_nick+".root") for sample_nick in root_files_per_sample_nick.keys()])
	tools.parallelize(_call_command, rm_commands, args.n_processes, description="deleting temporary files")
	
	log.info("\nJSON configuration for Artus:\n")
	config_output_dir = args.dcache_target if args.dcache_target else merged_output_dir
	for src, dst in filename_replacements.iteritems():
		config_output_dir = config_output_dir.replace(src, dst)
	for sample_nick in sorted(root_files_per_sample_nick.keys()):
		log.info("\""+sample_nick+"\" : \""+os.path.join(config_output_dir, sample_nick+".root")+"\",")
def main():
	parser = argparse.ArgumentParser(description="Collect matching trees from input files into one output tree",
	                                 parents=[logger.loggingParser])

	parser.add_argument("-i", "--input", help="Input directory with merged Artus outputs including Svit Cache files")
	parser.add_argument("-o", "--output", default="svfitCache.root",
	                    help="Output ROOT file. [Default: %(default)s]")
	
	parser.add_argument("--input-trees", nargs="+", default=["svfitCache"],
	                    help="Paths of input SVfit cache trees. [Default: %(default)s]")
	parser.add_argument("--output-tree", default="svfitCache",
	                    help="Name of output SVfit cache tree. [Default: %(default)s]")
	parser.add_argument("--previous-cache", default="",
	                    help="Path to a previous cache which will be merged. [Default: %(default)s]")
	parser.add_argument("--dcache", type=bool, default=False,
	                    help="Read&Write from and to desy dcache[Default: %(default)s]")
	parser.add_argument("--no-run", default=False, action="store_true",
	                    help="Do not run but only print dict  [Default: %(default)s]")
	parser.add_argument("-n", "--n-processes", type=int, default=1,
	                    help="Number of (parallel) processes. [Default: %(default)s]")
	
	args = parser.parse_args()
	logger.initLogger(args)
	
	merge_commands = []
	copy_commands = []
	config_file = []
	
	ls_command = "gfal-ls %s" %(srm(args.output))
	retCode = logger.subprocessCall(ls_command.split())
	if(retCode != 0):
		mkdir_command = "gfal-mkdir %s" %(srm(args.output))
		log.info("Creating " + srm(args.output))
		logger.subprocessCall(mkdir_command.split())
	tmpdir = tempfile.mkdtemp(suffix='', prefix='tmp', dir="/tmp")  # alternative: dir=os.getcwd()
	
	if not args.dcache:
		if not args.no_run:
			for input in glob.glob(args.input + "/*/*.root"):
				output = tmpdir 
				input_trees = args.input_trees
				output_trees = args.output_tree
				config = jsonTools.JsonDict(input)
				pipelines = config.get("Pipelines", {}).keys()
				# extract names without the leading channel
				pipelines = ["_".join(pipeline.split("_")[1:]) for pipeline in pipelines]
				pipelines = list(set(pipelines))
				pipelines = [x for x in pipelines if x != '']
				merge_commands = []
				for pipeline in pipelines:
					out_filename = os.path.join(output, pipeline, "svfitCache_" + os.path.basename(input))
					if not os.path.exists(os.path.dirname(out_filename)):
						os.makedirs(os.path.dirname(out_filename))
					pipeline_input_trees = [pipeline+"/"+input_tree for input_tree in input_trees]
					merged_tree_name = treemerge.treemerge(
							[input],  pipeline_input_trees,
							out_filename, output_trees,
							match_input_tree_names=True
					)
					log.info("SVfit cache trees collected in \"%s\"." % merged_tree_name)
			
			if args.previous_cache: # check for all available files in previous_cache
				previous_caches = glob.glob(args.previous_cache + "*/*.root")
				previous_cachefiles = [ "/".join(cache.split("/")[-2:]) for cache in previous_caches ]
				for cachefile in previous_cachefiles:
					current = os.path.join(output, cachefile)
					previous = os.path.join(args.previous_cache, cachefile)
					if not os.path.exists(os.path.dirname(current)):
						os.makedirs(os.path.dirname(current))
					if os.path.exists(current):
						merge_commands.append("mv %s %s_tmp.root "%(current, current))
						merge_commands.append("hadd -f -f6 %s %s_tmp.root %s "%(current, current, previous))
						merge_commands.append("rm %s_tmp.root "%(current))
					else:
						merge_commands.append("hadd -f -f6 %s %s"%(current, previous))
				tools.parallelize(_call_command, merge_commands, args.n_processes, description="merging")
			
			# move to output-directory
			copy_command = "gfal-copy -r file:///%s %s" % (output, srm(args.output) )
			logger.subprocessCall(copy_command.split())
		
		# print c&p summary
		current_caches = glob.glob(args.output + "*/*.root")
		nicknames = list(set([ os.path.basename(cache).split(".")[0].replace("svfitCache_", "") for cache in current_caches ]))
		for nick in sorted(nicknames):
			config_file.append('\t\t\t"%s" : "%s",' % (nick, xrd(args.output) + "/svfitCache_" + nick + ".root"))
	
	else:
		input_dirs = glob.glob(args.input + "/*/*/*")
		untar_commands = ["tar xf %s -C %s"%(file,tmpdir) for input_dir in input_dirs for file in glob.glob(input_dir + "/*.tar*")]
		if not args.no_run:
			tools.parallelize(_call_command, untar_commands, args.n_processes, description="unpacking")
		regex=re.compile(".*/(.*)_job_[0-9]+_SvfitCache.._(.*?)[0-9]+.root")
		matches = [(regex.match(file).groups(),file) for file in glob.glob(tmpdir+"/*.root")]
		dirs = {}
		
		# go through matches and create nested dict {'sample' : {'Pipeline' : [files]}}
		for match in matches:
			if match[0][0] not in dirs:
				dirs[match[0][0]] = {}
			if match[0][1] not in dirs[match[0][0]]:
				dirs[match[0][0]][match[0][1]] = []
			dirs[match[0][0]][match[0][1]].append(match[1])
		
		for sample in dirs:
			for pipeline in dirs[sample]:
				# create folders as needed
				if not os.path.exists(tmpdir + "/" + pipeline):
					os.makedirs(tmpdir + "/" + pipeline)
				previous_cache_file = ""
				if args.previous_cache:
					if os.path.isfile(args.previous_cache + "/" + pipeline + "/svfitCache_" + sample + ".root"):
						previous_cache_file = args.previous_cache + "/" + pipeline + "/svfitCache_" + sample + ".root"
				tmp_filename = tmpdir + "/" + pipeline + "/svfitCache_" + sample + ".root"
				out_filename = args.output + "/" + pipeline + "/svfitCache_" + sample + ".root"
				merge_commands.append("hadd -f %s %s %s"%(tmp_filename, " ".join(dirs[sample][pipeline]), previous_cache_file))
				copy_commands.append("gfal-copy -f file:///%s %s" % (tmp_filename, srm(out_filename) ))
			config_file.append('"%s" : "%s",' % (sample, xrd(args.output) + "/svfitCache_" + sample + ".root"))
		
		if not args.no_run:
			tools.parallelize(_call_command, merge_commands, args.n_processes, description="merging")
			tools.parallelize(_call_command, copy_commands, args.n_processes, description="copying")
	
	shutil.rmtree(tmpdir)
	log.info("done. Artus SvfitCacheFile settings: ")
	
	for entry in config_file: 
		log.info(entry)
					files_dict[file_name][channel]["ntuples"].append(n)
					for front, back in itertools.product(args.replacements+["-.;+"], repeat = 2):
						n2 = n.replace(args.rs, front, 1).replace(args.rs, back,1).replace("-.;+", args.rs)
						if w not in files_dict[file_name][channel]["weights"] or n2 not in files_dict[file_name][channel]["ntuples"]:
							files_dict[file_name][channel]["weights"].append(w)
							files_dict[file_name][channel]["ntuples"].append(n2)
				#if n not in files_dict[file_name][channel]["ntuples"]:
	jsonTools.JsonDict(files_dict).save(os.path.join(output_dir, "ReduceFiles.json"), indent=4)

	give_away_list = []
	for filename, item in files_dict.iteritems():
		item["in_dir"] = input_dir
		item["out_dir"] = output_dir
		give_away_list.append({filename:item})

	aTools.parallelize(reduce_file, give_away_list, n_processes=args.n_processes)

	#for aufruf in give_away_list:
		#reduce_file(aufruf)
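	# note: the commented-out loop above is the serial equivalent of the
	# aTools.parallelize() call and can be handy for debugging a single
	# reduce_file() invocation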