def punzi_target(priors, relevant_classes, params, mode = "S"): zzroot = os.environ["CMSSW_BASE"] bin_dir = os.path.join(zzroot, "bin/slc6_amd64_gcc630/") cost_function_evaluator = "run_prior_evaluator" output = check_output([bin_dir + cost_function_evaluator, run_dir, out_dir, engine, str(params["min_iterations"]), str(params["max_iterations"]), str(priors["ggh_prior"]), str(priors["whhadr_prior"]), str(priors["zhhadr_prior"]), str(priors["whlept_prior"]), str(priors["zhlept_prior"]), str(priors["zhmet_prior"]), str(priors["tthhadr_prior"]), str(priors["tthlept_prior"]), str(priors["bkg_prior"]), str(priors["qq_prior"]), mode, ref_dir]) if mode == "S": punzi_file = "Mor18_punzi_S_comp.conf" elif mode == "SB": punzi_file = "Mor18_punzi_comp.conf" # read directly the configuration file containing the relative Punzi improvements w.r.t. the reference # (the one with flat priors) punzihandler = ConfigFileHandler() punzihandler.load_configuration(os.path.join(out_dir, punzi_file)) costval = 0.0 # use the weighted cost function delta_pi = [] for relevant_class in relevant_classes: delta_pi.append(float(punzihandler.get_field('Punzi', relevant_class)) - 1.0) costval = cost_func(delta_pi, 8.0, 2) if math.isnan(costval): print "caught NaN!" costval = -7.0 return costval
# older variant of punzi_target with a hard-coded evaluator path; it passes neither
# the bkg/qq priors nor the mode switch, and it relies on run_dir, out_dir, engine,
# punzi_file and cost_func being defined at module level
def punzi_target(priors, relevant_classes, params):
    bin_dir = "/home/llr/cms/wind/cmssw/CMSSW_9_4_2/bin/slc6_amd64_gcc630/"
    cost_function_evaluator = "run_prior_evaluator"

    output = check_output([bin_dir + cost_function_evaluator, run_dir, out_dir, engine,
                           str(params["min_iterations"]), str(params["max_iterations"]),
                           str(priors["ggh_prior"]), str(priors["whhadr_prior"]),
                           str(priors["zhhadr_prior"]), str(priors["whlept_prior"]),
                           str(priors["zhlept_prior"]), str(priors["zhmet_prior"]),
                           str(priors["tthhadr_prior"]), str(priors["tthlept_prior"])])

    # read directly the configuration file containing the relative Punzi improvements
    # w.r.t. the reference (the one with flat priors)
    punzihandler = ConfigFileHandler()
    punzihandler.load_configuration(os.path.join(out_dir, punzi_file))

    # use the weighted cost function
    delta_pi = []
    for relevant_class in relevant_classes:
        delta_pi.append(float(punzihandler.get_field('Punzi', relevant_class)) - 1.0)

    costval = cost_func(delta_pi, 8.0, 2)

    if math.isnan(costval):
        print "caught NaN!"
        costval = -7.0

    return costval
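# Usage sketch for the mode-aware variant of punzi_target above. The prior keys and
# params keys are those read inside the function; the category names, iteration
# counts and the flat-prior starting point are hypothetical, and run_dir, out_dir,
# engine and ref_dir must be set as module-level globals before the call:
#
#   flat_priors = {"ggh_prior": 1.0, "whhadr_prior": 1.0, "zhhadr_prior": 1.0,
#                  "whlept_prior": 1.0, "zhlept_prior": 1.0, "zhmet_prior": 1.0,
#                  "tthhadr_prior": 1.0, "tthlept_prior": 1.0,
#                  "bkg_prior": 1.0, "qq_prior": 1.0}
#   params = {"min_iterations": 25, "max_iterations": 100}
#   cost = punzi_target(flat_priors, ["VBF1jTagged", "VBF2jTagged"], params, mode = "S")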
def main(): if len(sys.argv) < 3: print "Error: at least 2 arguments are required" campaign_dir = sys.argv[1] workdir = sys.argv[2] if len(sys.argv) >= 4: input_config_file = sys.argv[3] else: input_config_file = None # make sure that the given directory ends with a / if not campaign_dir.endswith('/'): campaign_dir += "/" confhandler = ConfigFileHandler() confhandler.load_configuration(campaign_dir + "campaign.conf") iterables = {} for section in confhandler.get_sections(): if '!' in section: sweep_name = re.sub('!', '', section) sweep_sections = ConfigFileUtils.parse_list(confhandler.get_field(section, 'variables'), lambda x: x) # now look for the sweep variables that belong to this sweep for sweep_section in sweep_sections: # this is a section that determines a new sweep direction, possibly linked sweep_metadata = confhandler.get_field(sweep_section, 'variable').split(':') sweep_scope = sweep_metadata[0] sweep_parameter = sweep_metadata[1] # request more information sweep_behaviour = confhandler.get_field(sweep_section, 'behaviour') if ConfigFileUtils.is_dict(confhandler.get_field(sweep_section, 'start')): # will need a dictionary iterable start_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'start'), lambda x: float(x)) end_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'end'), lambda x: float(x)) step_dict = ConfigFileUtils.parse_dict(confhandler.get_field(sweep_section, 'step'), lambda x: float(x)) if sweep_name not in iterables: it = SweepDimensionDict(sweep_scope, sweep_parameter, start_dict, end_dict, step_dict, sweep_behaviour) iterables[sweep_name] = it else: iterables[sweep_name].add(sweep_scope, sweep_parameter, start_dict, end_dict, step_dict, sweep_behaviour) else: # construct a list iterable instead start_list = ConfigFileUtils.parse_list(confhandler.get_field(sweep_section, 'start'), lambda x: x) end_list = ConfigFileUtils.parse_list(confhandler.get_field(sweep_section, 'end'), lambda x: x) if sweep_name not in iterables: it = SweepDimensionList(sweep_scope, sweep_parameter, start_list, end_list, sweep_behaviour) iterables[sweep_name] = it else: iterables[sweep_name].add(sweep_scope, sweep_parameter, start_list, end_list, sweep_behaviour) MC_path = os.path.join(workdir, "trainval/") model_type = confhandler.get_field('global', 'model_type') # get the mass point from the global config file in a way that ensures backward compatibility try: mass_point = float(confhandler.get_field('global', 'mass_point')) except KeyError: mass_point = 125.0 if model_type == 'SimpleModel': # using the full mass range for training, not using the 118/130GeV cut mcoll = SimpleModelFactoryDynamic.GenerateSimpleModelCollections(MC_path, input_config_file = input_config_file, hyperparam_config_file = None, mass_point = mass_point) elif model_type == 'CombinedModel': mcoll = ModelFactoryFullCategorySetOptimizedInputs.GenerateCombinedModelCollections(MC_path) iterate(iterables, {}, lambda it: augment_config(mcoll, campaign_dir, it))
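# A hypothetical sketch of the campaign.conf sweep syntax parsed above. A section
# whose name contains '!' declares a sweep; its 'variables' field lists the
# sections defining the individual (possibly linked) sweep dimensions; each of
# those carries 'variable' (as scope:parameter), 'behaviour', and 'start' / 'end'
# bounds ('step' as well in the dictionary-valued form, which maps to
# SweepDimensionDict instead of SweepDimensionList). All names and values here
# are made up:
#
#   [global]
#   model_type = SimpleModel
#   mass_point = 125.0
#
#   [!lr_sweep]
#   variables = lr_direction
#
#   [lr_direction]
#   variable = hyperparameters:learning_rate
#   behaviour = linear
#   start = 0.001
#   end = 0.1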
# read the validation loss of a given model from the benchmark file written during training
def get_loss(run, mcoll, model):
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(os.path.join(run, "training", mcoll, "model_benchmark.txt"))
    return float(confhandler.get_field(model, 'val_loss'))
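# Usage sketch for get_loss: pick the model with the smallest validation loss in a
# run (the run, collection and model names below are hypothetical):
#
#   candidates = ["model_0", "model_1", "model_2"]
#   losses = {model: get_loss("campaign/run_1", "D_VBF_ggH_ML", model) for model in candidates}
#   best_model = min(losses, key = losses.get)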
# imports needed by this script; ConfigFileHandler and ConfigFileUtils are project modules
import os
import sys
import uuid
import subprocess as sp
import numpy as np

def main():
    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required!"
        return

    source_path = sys.argv[1]
    #source_path = "/data_CMS/cms/wind/CJLST_NTuples_prepared_systematics/"
    dest_path = sys.argv[2]

    # global settings:
    zzroot = os.environ["CMSSW_BASE"]
    bin_dir = os.path.join(zzroot, "bin/slc6_amd64_gcc630/")
    scrambler = os.path.join(bin_dir, "run_scrambler")
    chunk_extractor = os.path.join(bin_dir, "run_chunk_extractor")

    settings_path = os.path.join(dest_path, "settings.conf")
    confhandler = ConfigFileHandler()
    confhandler.load_configuration(settings_path)

    # load global settings from the configuration file
    root_file_name = confhandler.get_field("Global", "root_file_name")
    source_dir = confhandler.get_field("Global", "source_dir")
    chunk_size = int(confhandler.get_field("Global", "chunk_size"))

    def submit_job(cmd_dir, command):
        job_submitter = os.environ["JOB_SUBMITTER"]

        # wrap the command into a small shell script with a unique name
        filename = str(uuid.uuid4()) + ".sh"
        file_path = os.path.join(cmd_dir, filename)

        with open(file_path, "w") as cmd_file:
            cmd_file.write("#!/bin/bash\n")
            cmd_file.write(command)

        # keep resubmitting until the batch system accepts the job
        while True:
            try:
                output = sp.check_output([job_submitter, "-short", file_path])
                break
            except sp.CalledProcessError:
                print "-------------------------------------------------"
                print " error submitting job, retrying ... "
                print "-------------------------------------------------"

        print output

    def chunk_file(in_dir, out_root, base_name, number_chunks, cmd_dir):
        # number_chunks + 1 split boundaries delimit number_chunks chunks
        splits = np.linspace(0, 1, number_chunks + 1)

        in_file = os.path.join(in_dir, root_file_name)

        if number_chunks == 1:
            out_folder = os.path.join(out_root, base_name + "_chunk_0/")
            if not os.path.exists(out_folder):
                os.makedirs(out_folder)
            out_file = os.path.join(out_folder, root_file_name)
            command = " ".join([chunk_extractor, in_file, out_file, str(0.0), str(1.0), str(0)])
            submit_job(cmd_dir, command)
            print command
        else:
            for i in range(len(splits) - 1):
                start_split = splits[i]
                end_split = splits[i + 1]
                out_folder = os.path.join(out_root, base_name + "_chunk_" + str(i) + "/")
                if not os.path.exists(out_folder):
                    os.makedirs(out_folder)
                out_file = os.path.join(out_folder, root_file_name)
                command = " ".join([chunk_extractor, in_file, out_file, str(start_split), str(end_split), str(0)])
                submit_job(cmd_dir, command)
                print command

    # create the needed folders:
    train_dir = os.path.join(dest_path, "training/")
    validation_dir = os.path.join(dest_path, "validation/")
    test_dir = os.path.join(dest_path, "test/")
    trainval_dir = os.path.join(dest_path, "trainval/")
    temp_dir = os.path.join(dest_path, "temp/")

    # create these directories
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    if not os.path.exists(validation_dir):
        os.makedirs(validation_dir)
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    if not os.path.exists(trainval_dir):
        os.makedirs(trainval_dir)
    if not os.path.exists(temp_dir):
        os.makedirs(temp_dir)

    training_files = [cur_file for cur_file in confhandler.get_sections() if "Global" not in cur_file]
    available_files = next(os.walk(source_path))[1]
    used_files = []

    for training_file in training_files:
        sect = confhandler.get_section(training_file)
        print "--------------------------------------------------"
        print "currently splitting: " + training_file
        source_files = ConfigFileUtils.parse_list(sect["source"], lambda x: x)
        train_val_splits = ConfigFileUtils.parse_list(sect["train_val_split"], lambda x: float(x))
        val_test_splits = ConfigFileUtils.parse_list(sect["val_test_split"], lambda x: float(x))

        # first split the needed files into 3 pieces, as dictated by the splits read from the config file
        for source_file, train_val_split, val_test_split in zip(source_files, train_val_splits, val_test_splits):
            print "extracting 0.0 - " + str(train_val_split) + " from " + source_file
            dest_dir = os.path.join(train_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name), os.path.join(dest_dir, root_file_name), str(0.0), str(train_val_split)])
            print output

            print "-- -- -- -- -- -- -- -- -- -- -- --"

            print "extracting " + str(train_val_split) + " - " + str(val_test_split) + " from " + source_file
            dest_dir = os.path.join(validation_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name), os.path.join(dest_dir, root_file_name), str(train_val_split), str(val_test_split)])
            print output

            print "-- -- -- -- -- -- -- -- -- -- -- --"

            print "extracting " + str(val_test_split) + " - 1.0 from " + source_file
            dest_dir = os.path.join(test_dir, source_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            output = sp.check_output([chunk_extractor, os.path.join(source_path, source_file, root_file_name), os.path.join(dest_dir, root_file_name), str(val_test_split), str(1.0)])
            print output

            used_files.append(source_file)

        print "--------------------------------------------------"

    unused_files = [cur_file for cur_file in available_files if cur_file not in used_files]

    # for all files that are not used for training, split them 50:50 into validation and test ...
    for unused_file in unused_files:
        source_dir = os.path.join(source_path, unused_file)

        # ... unless they are only needed to assess systematics, i.e. are not going to be used at all during the validation step
        if "ext" in unused_file or "tuneup" in unused_file or "tunedown" in unused_file:
            print "extracting 0.0 - 1.0 from " + unused_file
            dest_dir = os.path.join(test_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name), os.path.join(dest_dir, root_file_name), str(0.0), str(1.0)])
            print output
        else:
            print "extracting 0.0 - 0.5 from " + unused_file
            dest_dir = os.path.join(validation_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name), os.path.join(dest_dir, root_file_name), str(0.0), str(0.5)])
            print output

            print "-- -- -- -- -- -- -- -- -- -- -- --"

            print "extracting 0.5 - 1.0 from " + unused_file
            dest_dir = os.path.join(test_dir, unused_file)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            output = sp.check_output([chunk_extractor, os.path.join(source_dir, root_file_name), os.path.join(dest_dir, root_file_name), str(0.5), str(1.0)])
            print output

    # all the needed files are now split apart; proceed to combine them into the
    # training datasets that will end up in trainval
    for training_file in training_files:
        print "now building training dataset: " + training_file
        sect = confhandler.get_section(training_file)
        source_folders = ConfigFileUtils.parse_list(sect["source"], lambda x: x)

        for mode in ["training", "validation"]:
            temp_dest_folder = os.path.join(dest_path, temp_dir, training_file, mode)
            temp_dest_file = os.path.join(temp_dest_folder, root_file_name)
            if not os.path.exists(temp_dest_folder):
                os.makedirs(temp_dest_folder)

            source_files = [os.path.join(dest_path, mode, cur_file, root_file_name) for cur_file in source_folders]

            print "hadd " + temp_dest_file + " " + " ".join(source_files)
            output = sp.check_output(["hadd", temp_dest_file] + source_files)
            print output

            temp_scrambled_folder = os.path.join(dest_path, temp_dir, "scrambled", training_file, mode)
            if not os.path.exists(temp_scrambled_folder):
                os.makedirs(temp_scrambled_folder)
            temp_scrambled_file = os.path.join(temp_scrambled_folder, root_file_name)

            print scrambler + " " + temp_dest_file + " " + temp_scrambled_file
            output = sp.check_output([scrambler, temp_dest_file, temp_scrambled_file])
            print output

        trainval_dest_folder = os.path.join(trainval_dir, training_file)
        if not os.path.exists(trainval_dest_folder):
            os.makedirs(trainval_dest_folder)

        print "hadd " + os.path.join(trainval_dest_folder, root_file_name) + " " + os.path.join(dest_path, temp_dir, "scrambled", training_file, "training", root_file_name) + " " + os.path.join(dest_path, temp_dir, "scrambled", training_file, "validation", root_file_name)
        output = sp.check_output(["hadd", os.path.join(trainval_dest_folder, root_file_name),
                                  os.path.join(dest_path, temp_dir, "scrambled", training_file, "training", root_file_name),
                                  os.path.join(dest_path, temp_dir, "scrambled", training_file, "validation", root_file_name)])
        print output

    # at the end, chunk the ROOT files into many smaller ones, to keep the augmentation time short
    train_chunks_dir = os.path.join(dest_path, "training_chunks/")
    validation_chunks_dir = os.path.join(dest_path, "validation_chunks/")
    test_chunks_dir = os.path.join(dest_path, "test_chunks/")

    # create these directories
    if not os.path.exists(train_chunks_dir):
        os.makedirs(train_chunks_dir)
    if not os.path.exists(validation_chunks_dir):
        os.makedirs(validation_chunks_dir)
    if not os.path.exists(test_chunks_dir):
        os.makedirs(test_chunks_dir)

    for mode in ["training", "validation", "test"]:
        # look at each file individually and put it into chunks
        cur_dir = os.path.join(dest_path, mode)
        available_folders = next(os.walk(cur_dir))[1]

        for available_folder in available_folders:
            available_file = os.path.join(cur_dir, available_folder, root_file_name)
            # integer division: aim for roughly chunk_size bytes per chunk
            number_chunks = max(1, os.path.getsize(available_file) / chunk_size)
            print "now splitting file " + available_file + " into " + str(number_chunks) + " chunks"
            out_root = os.path.join(dest_path, mode + "_chunks")
            chunk_file(os.path.join(dest_path, mode, available_folder), out_root, available_folder, number_chunks, temp_dir)

    print "done."
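# A hypothetical minimal settings.conf consumed by the splitting script above. The
# section and field names come from the get_field / get_section calls (chunk_size
# is compared against os.path.getsize, so it is a size in bytes); the sample name,
# file name and split values are made up for illustration:
#
#   [Global]
#   root_file_name = ZZ4lAnalysis.root
#   source_dir = /data_CMS/cms/wind/CJLST_NTuples_prepared_systematics/
#   chunk_size = 100000000
#
#   [ggH125]
#   source = ggH125
#   train_val_split = 0.5
#   val_test_split = 0.75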
# imports needed by this script; ConfigFileHandler is a project module
import os
import re
import sys
import subprocess as sp

def main():
    if len(sys.argv) != 3:
        print "Error: exactly 2 arguments are required!"
        return

    settings_path = sys.argv[1]
    run_dir = sys.argv[2]

    confhandler = ConfigFileHandler()
    confhandler.load_configuration(settings_path)
    root_file_name = confhandler.get_field("Global", "root_file_name")

    # need to merge the many individual chunks coming from the augmentation;
    # take care that the weights are updated correctly!
    augmentation_training_chunks_dir = os.path.join(run_dir, "augmentation_training_chunks")
    augmentation_validation_chunks_dir = os.path.join(run_dir, "augmentation_validation_chunks")
    augmentation_test_chunks_dir = os.path.join(run_dir, "augmentation_test_chunks")

    augmentation_training_dir = os.path.join(run_dir, "augmentation_training")
    augmentation_validation_dir = os.path.join(run_dir, "augmentation_validation")
    augmentation_test_dir = os.path.join(run_dir, "augmentation_test")

    if not os.path.exists(augmentation_training_dir):
        os.makedirs(augmentation_training_dir)
    if not os.path.exists(augmentation_validation_dir):
        os.makedirs(augmentation_validation_dir)
    if not os.path.exists(augmentation_test_dir):
        os.makedirs(augmentation_test_dir)

    def merge_chunks(source_dir, dest_dir):
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)

        # recover the common stem of each group of chunk directories
        available_dirs = next(os.walk(source_dir))[1]
        merged_dirs = list(set(map(lambda x: re.sub('_chunk_.*$', '', x), available_dirs)))

        for merged_dir in merged_dirs:
            chunks = sorted([cur_dir for cur_dir in available_dirs if merged_dir + "_chunk_" in cur_dir])

            dest_folder = os.path.join(dest_dir, merged_dir)
            if not os.path.exists(dest_folder):
                os.makedirs(dest_folder)

            dest_file = os.path.join(dest_folder, root_file_name)
            source_files = [os.path.join(source_dir, chunk, root_file_name) for chunk in chunks]

            print "merging " + " ".join(chunks) + " into " + merged_dir

            # do the raw merging
            output = sp.check_output(["hadd", dest_file] + source_files)
            print output

            # now, need to ensure that the metadata is corrected (as hadd also modifies it in a way that is incorrect here)
            command = ["rootcp", "--replace", source_files[0] + ":ClassTree/Counters", dest_file + ":/ClassTree/Counters"]
            print " ".join(command)
            output = sp.check_output(command)
            print output

    merge_chunks(augmentation_training_chunks_dir, augmentation_training_dir)
    merge_chunks(augmentation_validation_chunks_dir, augmentation_validation_dir)
    merge_chunks(augmentation_test_chunks_dir, augmentation_test_dir)
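# Minimal runnable sketch of the chunk-name normalization used in merge_chunks
# above: chunk directories share a common stem before "_chunk_", which the re.sub
# call recovers. The directory names below are hypothetical.
import re

chunk_dirs = ["ggH125_chunk_0", "ggH125_chunk_1", "qqZZ_chunk_0"]
stems = list(set(map(lambda x: re.sub('_chunk_.*$', '', x), chunk_dirs)))
print stems   # -> ['ggH125', 'qqZZ'] (in some order)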