def plotting_main(data, trainvar, filename, info_dir):
    """ Builds the canvas for one training variable and saves it to a file.

    Parameters:
    -----------
    data : pandas DataFrame
        Data to be used for creating the TProfiles
    trainvar : str
        Name of the training variable.
    filename : str
        Path to the file where the TProfile will be saved
    info_dir : str
        Path to the info_directory of the required channel. The file
        'histo_dict.json' is read from this directory.

    Returns:
    --------
    Nothing
    """
    mass_points = find_masses()
    all_histo_dicts = ut.read_parameters(
        os.path.join(info_dir, 'histo_dict.json'))
    # Pick the histogram settings whose 'Variable' entry matches the trainvar
    trainvar_histo_dict = dlt.find_correct_dict(
        'Variable', str(trainvar), all_histo_dicts)
    # Only the canvas is needed here; the profile object is not used further
    canvas, _ = plotting_init(
        data, trainvar, trainvar_histo_dict, mass_points)
    canvas.SaveAs(filename)
def initialize_trainvars(channel='2l_2tau', process='HH', random_sample='TTZ'):
    '''Collects all the possible trainvars for the initial run

    Parameters:
    ----------
    channel : str
        Name of the channel where the .root file is taken from (e.g 2l_2tau)
    process : str
        Name of the process where the .root is loaded (e.g. ttH or HH)
    random_sample : str
        Sample whose folder the .root file to be loaded belongs to

    Returns:
    -------
    trainvars : list
        list of all possible trainvars that are to be used in the optimization
    '''
    info_root = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/info')

    # Locate one .root file of the requested sample
    training_info_path = os.path.join(
        info_root, process, channel, 'tauID_training.json')
    input_dir = ut.read_parameters(training_info_path)[1]['inputPath']
    root_file_pattern = os.path.join(
        input_dir, '*' + random_sample + '*', 'central', '*.root')
    first_root_file = glob.glob(root_file_pattern)[0]

    # Channel name as it is used inside the tree path
    channel_info = ut.read_multiline_json_to_dict(
        os.path.join(info_root, process, channel, 'info.json'))
    tree_channel = channel_info['channelInTree']

    samplename_info_path = os.path.join(info_root, 'samplename_info.json')
    global_settings = ut.read_settings('global')
    known_samples = ut.read_parameters(samplename_info_path)
    sample_dict = dlt.find_sample(random_sample, known_samples)
    if sample_dict == {}:
        # No entry found for this folder name, fall back to the
        # advanced lookup (the default 'TTZ' is just a random choice)
        sample_dict = dl.advanced_sample_name(
            global_settings['bdtType'], random_sample, [])

    tree_path = str(os.path.join(
        tree_channel, 'sel/evtntuple', sample_dict['sampleName'], 'evtTree'))
    all_branches = access_ttree(first_root_file, tree_path)
    return data_related_trainvars(all_branches)
def read_trainvars_from_histo_dict(histo_dict_path):
    """Collects the 'Variable' entry of every histogram info in the file

    Parameters:
    -----------
    histo_dict_path : str
        Path where the histo_dict.json is located

    Returns:
    -------
    old_trainvars : list
        List of trainvars read from the histo_dict.json file, in file order
    """
    old_trainvars = []
    for histo_info in ut.read_parameters(histo_dict_path):
        old_trainvars.append(histo_info['Variable'])
    return old_trainvars
def do_fit(info_dir, data, preferences):
    """ Fits the TProfile of every trainvar and writes the fit function
    to a .root file

    Parameters:
    -----------
    info_dir : str
        Path to the info_directory of the required channel. The file
        'histo_dict.json' is read from this directory.
    data : pandas DataFrame
        Data to be used for creating the TProfiles & fits
    preferences : dict
        Must contain the key 'trainvars' (list). NB: 'gen_mHH' is removed
        from that list in place, so the caller sees the change as well.

    Returns:
    --------
    Nothing
    """
    trainvars = preferences['trainvars']
    if 'gen_mHH' in trainvars:
        trainvars.remove('gen_mHH')
    # The masses do not depend on the trainvar, so they are looked up once
    masses = find_masses()
    histo_dicts_json = os.path.join(info_dir, 'histo_dict.json')
    histo_dicts = ut.read_parameters(histo_dicts_json)
    for trainvar in trainvars:
        histo_dict = dlt.find_correct_dict(
            'Variable', str(trainvar), histo_dicts)
        fit_poly_order = get_fit_function(histo_dict)
        canvas, profile = plotting_init(data, trainvar, histo_dict, masses)
        print('Variable Name: ' + str(trainvar))
        filename = '_'.join(['TProfile_signal_fit_func', str(trainvar)])
        # NOTE(review): weight_dir is neither a parameter nor a local of this
        # function; it has to be available in the enclosing (module) scope.
        out_file = os.path.join(weight_dir, filename + '.root')
        fit_function = 'fitFunction_' + str(trainvar)
        mass_min = min(masses)
        mass_max = max(masses)
        print('Fitfunction: ' + fit_function)
        print('Range: ' + '[' + str(mass_min) + ',' + str(mass_max) + ']')
        function_TF1 = TF1(
            fit_function, fit_poly_order, float(mass_min), float(mass_max))
        profile.Fit(function_TF1, 'SF')  # Fit with Minuit
        function_TF1.Draw('same')
        canvas.Modified()
        canvas.Update()
        canvas.SaveAs(out_file)
        # NOTE(review): the file is reopened below with "RECREATE" on the very
        # same path the canvas was just saved to -- confirm this is intended.
        tfile = ROOT.TFile(out_file, "RECREATE")
        try:
            function_TF1.Write()
        finally:
            # Close the file even if writing the function fails
            tfile.Close()
def read_trainvar_info(self, path):
    """Builds a mapping from each entry's 'key' to its 'true_int' value

    Parameters:
    -----------
    path : str
        Path to the training file

    Returns:
    -------
    trainvar_info : dict
        Dictionary containing trainvar info (e.g is the trainvar supposed
        to be an integer or not). If a key occurs more than once, the
        last entry wins.
    """
    return {
        entry['key']: entry['true_int']
        for entry in ut.read_parameters(path)
    }
def create_renewed_histo_dict(missing_trainvars, redundant_trainvars,
                              histo_dict_path):
    """ Assembles the renewed list of histogram infos.

    Old infos are kept unless their variable is redundant; every missing
    trainvar gets a new info filled with default values.

    Parameters:
    -----------
    missing_trainvars : list
        List of new trainvars not present in the old histo_dict.json
    redundant_trainvars : list
        List of trainvars in the old histo_dict.json not present in the
        new trainvars.json
    histo_dict_path : str
        Path where the histo_dict.json is located

    Returns:
    -------
    new_histo_infos : list of dicts
        List of histo_infos to be saved into the renewed histo_dict.json.
        Kept old infos come first, followed by the newly created ones.
    """
    default_histo_info = {
        'Variable': '',
        'nbins': 55,
        'min': 0.0,
        'max': 1100.0,
        'fitFunc_AllMassTraining': 'pol1',
        'fitFunc_LowMassTraining': 'pol1',
        'fitFunc_HighMassTraining': 'pol1'
    }
    kept_histo_infos = [
        old_info for old_info in ut.read_parameters(histo_dict_path)
        if old_info['Variable'] not in redundant_trainvars
    ]
    # Each new info is an independent copy of the defaults with its own name
    added_histo_infos = [
        dict(default_histo_info, Variable=trainvar_name)
        for trainvar_name in missing_trainvars
    ]
    return kept_histo_infos + added_histo_infos
def main(hyperparameter_file, output_dir):
    """ Evaluates one set of hyperparameters and saves the resulting score

    Parameters:
    -----------
    hyperparameter_file : str
        Path to the file with the hyperparameters. The first entry of the
        file is used. The score (and prediction files, if any) are saved
        into the directory of this file.
    output_dir : str
        Path to the output directory. The global settings are read from
        its 'run_settings' subdirectory.

    Returns:
    --------
    Nothing
    """
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    save_dir = str(Path(hyperparameter_file).parent)
    hyperparameters = ut.read_parameters(hyperparameter_file)[0]
    preferences = dlt.get_parameters(
        global_settings['process'],
        global_settings['channel'],
        global_settings['bkg_mass_rand'],
        global_settings['tauID_training'])
    data = dlt.load_data(
        preferences['inputPath'],
        preferences['channelInTree'],
        preferences['trainvars'],
        global_settings['bdtType'],
        global_settings['channel'],
        preferences['keys'],
        preferences['masses'],
        global_settings['bkg_mass_rand'],
    )
    dlt.reweigh_dataframe(
        data,
        preferences['weight_dir'],
        preferences['trainvars'],
        ['gen_mHH'],
        preferences['masses'])
    normalize_hh_dataframe(data, preferences, global_settings)
    if global_settings['use_kfold']:
        score = et.kfold_cv(
            xt.model_evaluation_main,
            data,
            preferences['trainvars'],
            global_settings,
            hyperparameters)
    else:
        score, pred_train, pred_test = et.get_evaluation(
            xt.model_evaluation_main,
            data,
            preferences['trainvars'],
            global_settings,
            hyperparameters)
        # Predictions are only returned by the non-kfold evaluation
        st.save_prediction_files(pred_train, pred_test, save_dir)
    score_path = os.path.join(save_dir, 'score.json')
    with open(score_path, 'w') as score_file:
        json.dump({global_settings['fitness_fn']: score}, score_file)
def main():
    """ Runs the hyperparameter optimization with PSO and saves the best
    hyperparameters to 'best_hyperparameters.json' in the output directory

    The settings are read from the settings directory under $CMSSW_BASE;
    the output directory is created if it does not exist yet.

    Returns:
    --------
    Nothing
    """
    cmssw_base = os.path.expandvars('$CMSSW_BASE')
    settings_dir = os.path.join(
        cmssw_base, 'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    output_dir = os.path.expandvars(global_settings['output_dir'])
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    ut.save_run_settings(output_dir)

    print("::::::: Reading parameters :::::::")
    value_dicts = ut.read_parameters(
        os.path.join(settings_dir, 'xgb_parameters.json'))
    pso_settings = ut.read_settings(settings_dir, 'pso')
    initial_parameter_sets = xt.prepare_run_params(
        value_dicts, pso_settings['sample_size'])

    print("\n============ Starting hyperparameter optimization ==========\n")
    best_hyperparameters = pt.run_pso(
        value_dicts, st.get_fitness_score, initial_parameter_sets, output_dir)

    print("\n============ Saving results ================\n")
    ut.save_dict_to_json(
        best_hyperparameters,
        os.path.join(output_dir, 'best_hyperparameters.json'))
    print("Results saved to " + str(output_dir))
def read_fitness(output_dir, fitness_key='d_roc'):
    '''Collects the fitness of each sample from its score.json. The list is
    ordered according to the number of the sample

    Parameters:
    ----------
    output_dir : str
        Path to the directory of output. The scores are read from
        <output_dir>/samples/<number>/score.json
    fitness_key : str
        Key of the fitness value in the score dictionary

    Returns:
    -------
    scores : list
        List of fitnesses
    '''
    samples_dir = os.path.join(output_dir, 'samples')
    # The number of score files found sets how many samples are read;
    # the sample folders are then addressed by number: 0, 1, 2, ...
    found_score_files = glob.glob(
        os.path.join(samples_dir, '*', 'score.json'))
    score_dicts = [
        ut.read_parameters(
            os.path.join(samples_dir, str(sample_number), 'score.json'))[0]
        for sample_number in range(len(found_score_files))
    ]
    return [single_score[fitness_key] for single_score in score_dicts]
def test_read_parameters():
    """Checks that read_parameters returns the expected list of dicts for
    the 'best_parameters.json' resource file"""
    expected = [{'a': 1, 'b': 2, 'c': 3}, {'stuff': 1}]
    test_file = os.path.join(resources_dir, 'best_parameters.json')
    assert ut.read_parameters(test_file) == expected