def main(to_continue, opt_dir, bbww):
    if not to_continue:
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = os.path.expandvars(global_settings['output_dir'])
    else:
        settings_dir = os.path.join(opt_dir, 'run_settings')
        global_settings = ut.read_settings(settings_dir, 'global')
        output_dir = opt_dir
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    if not to_continue:
        if not os.path.exists(os.path.join(output_dir, 'run_settings')):
            ut.save_run_settings(output_dir)
        if not os.path.exists(os.path.join(output_dir, 'run_info')):
            ut.save_info_dir(output_dir)
    # use_scratch_for_data(global_settings)
    print("::::::: Reading parameters :::::::")
    if global_settings['ml_method'] == 'xgb':
        param_file = os.path.join(settings_dir, 'xgb_parameters.json')
    else:
        param_file = os.path.join(settings_dir, 'nn_parameters.json')
    hyperparameter_info = ut.read_json_cfg(param_file)
    pso_settings = ut.read_settings(settings_dir, 'pso')
    pso_settings.update(global_settings)
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    normalizer = hht.HHDataNormalizer if not bbww else bbwwt.bbWWDataNormalizer
    if os.path.exists(preferences['data_csv']):
        print(':::::::: Loading data from .csv file ::::::::')
        data = pandas.read_csv(preferences['data_csv'])
    else:
        print('::::::: Loading data to be saved to pandas.DataFrame :::::::')
        if not bbww:
            loader = hht.HHDataLoader(normalizer, preferences, global_settings)
        else:
            loader = bbwwt.bbWWLoader(normalizer, preferences, global_settings)
        data = loader.data
        if global_settings['ml_method'] in ['lbn', 'nn']:
            use_Wjets = True
            if 'bb2l' in global_settings['channel']:
                use_Wjets = False
            data = mt.multiclass_encoding(data, use_Wjets)
        loader.save_to_csv()
    print("\n============ Starting hyperparameter optimization ==========\n")
    swarm = pt.ParticleSwarm(
        pso_settings, st.get_fitness_score, hyperparameter_info,
        to_continue, output_dir)
    optimal_hyperparameters = swarm.particleSwarmOptimization()[0]
    print("\n============ Saving results ================\n")
    best_parameters_path = os.path.join(
        output_dir, 'best_hyperparameters.json')
    ut.save_dict_to_json(optimal_hyperparameters, best_parameters_path)
    print("Results saved to " + str(output_dir))
def main():
    cmssw_path = os.path.expandvars('$CMSSW_BASE')
    package_dir = os.path.join(
        cmssw_path, 'src/machineLearning/machineLearning')
    settings_dir = os.path.join(package_dir, 'settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    modes = ['nonres/default', 'spin0', 'spin2']
    table_infos = []
    output_file = os.path.expandvars(
        os.path.join(global_settings['output_dir'], 'EventYield.tex'))
    for mode in modes:
        global_settings['scenario'] = mode
        channel_dir = os.path.join(
            package_dir, 'info', 'HH', global_settings['channel'])
        reader = hpr.HHParameterReader(channel_dir, mode)
        preferences = reader.parameters
        normalizer = hht.HHDataNormalizer
        loader = hht.HHDataLoader(normalizer, preferences, global_settings)
        mode_data = loader.data
        for era in set(mode_data['era']):
            era_data = mode_data.loc[mode_data['era'] == era]
            channel = global_settings['channel']
            table_creator = eyc.EventYieldTable(era_data, channel, era, mode)
            table_info = table_creator.create_table()
            table_infos.append(table_info)
    table_writer = eyc.EventYieldsFile(table_infos, output_file)
    table_writer.fill_document_file()
    print('File saved to %s' % output_file)
def choose_trainvar(datacard_dir, channel, trainvar, bdt_type):
    '''Reads the training variables from the data folder from the file
    'optimization_trainvars.txt'. Is used for the xgb_tth cf function.

    Parameters:
    ----------
    datacard_dir : dummy argument
        Needed for compatibility with the other trainvars loading
    channel : dummy argument
        Needed for compatibility with the other trainvars loading
    trainvar : dummy argument
        Needed for compatibility with the other trainvars loading
    bdt_type : dummy argument
        Needed for compatibility with the other trainvars loading

    Returns:
    -------
    trainvars : list
        List of trainvars that are to be used in the optimization.
    '''
    global_settings = ut.read_settings('global')
    out_dir = os.path.expandvars(global_settings['output_dir'])
    trainvars_path = os.path.join(out_dir, 'optimization_trainvars.txt')
    try:
        trainvars = dlt.read_list(trainvars_path)
    except IOError:
        print('Could not find trainvars')
        trainvars = ''
    return trainvars
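# Illustrative sketch, not part of the original source: all four arguments of
# choose_trainvar() are dummies kept only for interface compatibility with the
# other trainvar loaders, so placeholder values can be passed; the list is
# always read from <output_dir>/optimization_trainvars.txt.
def example_choose_trainvar_usage():
    return choose_trainvar(
        datacard_dir=None, channel=None, trainvar=None, bdt_type=None)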
def main(output_dir, save_model, channel, mode, era, BM):
    settings_dir = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    global_settings['ml_method'] = 'lbn'
    global_settings['channel'] = 'bb1l'
    if output_dir == 'None':
        output_dir = (
            global_settings['channel'] + '/' + global_settings['ml_method']
            + '/' + res_nonres + '/' + mode + '/' + era)
        global_settings['output_dir'] = output_dir
    else:
        global_settings['output_dir'] = output_dir
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    if BM != 'None':
        preferences["nonResScenarios"] = [BM]
        print('BM point to be considered: '
              + str(preferences["nonResScenarios"]))
    if era != '0':
        preferences['included_eras'] = [era.replace('20', '')]
        print('era: ' + str(preferences['included_eras']))
    preferences = define_trainvars(global_settings, preferences, info_dir)
    particles = PARTICLE_INFO[global_settings['channel']]
    data_dict = create_data_dict(preferences, global_settings)
    classes = set(data_dict["even_data"]["process"])
    for class_ in classes:
        multitarget = list(set(
            data_dict["even_data"].loc[
                data_dict["even_data"]["process"] == class_, "multitarget"]
        ))[0]
        print(str(class_) + '\t' + str(multitarget))
    even_model = create_model(
        preferences, global_settings, data_dict, "even_data", save_model)
    if global_settings['feature_importance'] == 1:
        trainvars = preferences['trainvars']
        data = data_dict['odd_data']
        LBNFeatureImportance = nt.LBNFeatureImportances(
            even_model, data, trainvars, global_settings['channel'])
        score_dict = LBNFeatureImportance.custom_permutation_importance()
        hhvt.plot_feature_importances_from_dict(
            score_dict, global_settings['output_dir'])
    odd_model = create_model(
        preferences, global_settings, data_dict, "odd_data", save_model)
    print(odd_model.summary())
    nodewise_performance(
        data_dict['odd_data'], data_dict['even_data'],
        odd_model, even_model, data_dict['trainvars'], particles,
        global_settings, preferences)
    even_train_info, even_test_info = evaluate_model(
        even_model, data_dict['even_data'], data_dict['odd_data'],
        data_dict['trainvars'], global_settings, "even_data", particles)
    odd_train_info, odd_test_info = evaluate_model(
        odd_model, data_dict['odd_data'], data_dict['even_data'],
        data_dict['trainvars'], global_settings, "odd_data", particles)
    hhvt.plotROC(
        [odd_train_info, odd_test_info],
        [even_train_info, even_test_info],
        global_settings)
def main():
    settings_dir = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    output_dir = os.path.expandvars(global_settings['output_dir'])
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    ut.save_run_settings(output_dir)
    print("::::::: Reading parameters :::::::")
    param_file = os.path.join(settings_dir, 'xgb_parameters.json')
    value_dicts = ut.read_parameters(param_file)
    pso_settings = ut.read_settings(settings_dir, 'pso')
    hyperparameter_sets = xt.prepare_run_params(
        value_dicts, pso_settings['sample_size'])
    print("\n============ Starting hyperparameter optimization ==========\n")
    best_hyperparameters = pt.run_pso(
        value_dicts, st.get_fitness_score, hyperparameter_sets, output_dir)
    print("\n============ Saving results ================\n")
    best_parameters_path = os.path.join(
        output_dir, 'best_hyperparameters.json')
    ut.save_dict_to_json(best_hyperparameters, best_parameters_path)
    print("Results saved to " + str(output_dir))
def initialize_trainvars(channel='2l_2tau', process='HH', random_sample='TTZ'):
    '''Reads in all the possible trainvars for the initial run

    Parameters:
    ----------
    channel : str
        Name of the channel the .root file is taken from (e.g. 2l_2tau)
    process : str
        Name of the process the .root file is loaded for (e.g. ttH or HH)
    random_sample : str
        A random sample the .root file that is to be loaded belongs to

    Returns:
    -------
    trainvars : list
        List of all possible trainvars that are to be used in the
        optimization
    '''
    info_folder = os.path.join(
        os.path.expandvars('$CMSSW_BASE'),
        'src/machineLearning/machineLearning/info')
    inputpath_info_path = os.path.join(
        info_folder, process, channel, 'tauID_training.json')
    info_dict = ut.read_parameters(inputpath_info_path)[1]
    path_to_files = info_dict['inputPath']
    wildcard_root_files = os.path.join(
        path_to_files, '*' + random_sample + '*', 'central', '*.root')
    single_root_file = glob.glob(wildcard_root_files)[0]
    channel_info_path = os.path.join(
        info_folder, process, channel, 'info.json')
    channel_info_dict = ut.read_multiline_json_to_dict(channel_info_path)
    channel_in_tree = channel_info_dict['channelInTree']
    samplename_info = os.path.join(info_folder, 'samplename_info.json')
    global_settings = ut.read_settings('global')
    samplename_info = ut.read_parameters(samplename_info)
    folder_name = random_sample
    sample_dict = dlt.find_sample(folder_name, samplename_info)
    if sample_dict == {}:
        # TTZ is just a random choice
        sample_dict = dl.advanced_sample_name(
            global_settings['bdtType'], folder_name, [])
    sample_name = sample_dict['sampleName']
    input_tree = str(os.path.join(
        channel_in_tree, 'sel/evtntuple', sample_name, 'evtTree'))
    trainvars = access_ttree(single_root_file, input_tree)
    trainvars = data_related_trainvars(trainvars)
    return trainvars
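# Illustrative sketch, not part of the original source: initialize_trainvars()
# collects every data-related branch of one sample TTree before the
# optimization narrows the list down; the channel, process and sample names
# below simply repeat the function's own default arguments.
def example_initialize_trainvars_usage():
    return initialize_trainvars(
        channel='2l_2tau', process='HH', random_sample='TTZ')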
def get_fitness_score(hyperparameter_sets, global_settings, sample_size=0):
    '''The main function call that is the slurm equivalent of
    ensemble_fitness in xgb_tools

    Parameters:
    ----------
    hyperparameter_sets : list of dicts
        Parameter-sets for all particles
    global_settings : dict
        Global settings for the hyperparameter optimization
    sample_size : integer
        Sample size in case where it does not correspond to the value
        given in the settings file

    Returns:
    -------
    scores : list of floats
        Fitnesses for each hyperparameter-set
    '''
    output_dir = os.path.expandvars(global_settings['output_dir'])
    previous_files_dir = os.path.join(output_dir, 'previous_files')
    if not os.path.exists(previous_files_dir):
        os.makedirs(previous_files_dir)
    settings_dir = os.path.join(output_dir, 'run_settings')
    if sample_size == 0:
        opt_settings = ut.read_settings(
            settings_dir, global_settings['optimization_algo'])
        sample_size = opt_settings['sample_size']
    parameters_to_file(output_dir, hyperparameter_sets)
    wild_card_path = os.path.join(
        output_dir, 'samples', '*', 'parameters.json')
    zero_sized = 1
    while zero_sized != 0:
        zero_sized = check_parameter_file_sizes(wild_card_path)
        time.sleep(2)
    for parameter_file in glob.glob(wild_card_path):
        sample_nr = get_sample_nr(parameter_file)
        job_file = prepare_job_file(parameter_file, sample_nr, global_settings)
        subprocess.call(['sbatch', job_file])
    wait_iteration(output_dir, sample_size)
    time.sleep(30)
    scores = read_fitness(output_dir, global_settings['fitness_fn'])
    move_previous_files(output_dir, previous_files_dir)
    return scores
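# Illustrative sketch, not part of the original source: get_fitness_score() is
# meant to be used as the fitness callback of the PSO driver (see
# pt.run_pso(value_dicts, st.get_fitness_score, ...) above). The helper below
# evaluates a single generation directly; the ut/xt module aliases and the
# 'xgb_parameters.json' file name are assumptions borrowed from the other
# scripts in this section.
def example_single_generation_fitness(output_dir):
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    pso_settings = ut.read_settings(settings_dir, 'pso')
    value_dicts = ut.read_parameters(
        os.path.join(settings_dir, 'xgb_parameters.json'))
    hyperparameter_sets = xt.prepare_run_params(
        value_dicts, pso_settings['sample_size'])
    # One slurm-based fitness evaluation for every particle in the swarm.
    return get_fitness_score(hyperparameter_sets, global_settings)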
def main(hyperparameter_file, output_dir):
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    nthread = global_settings['nthread']
    path = Path(hyperparameter_file)
    save_dir = str(path.parent)
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    addition = ut.create_infoPath_addition(global_settings)
    channel_dir = os.path.join(output_dir, 'run_info')
    info_dir = os.path.join(channel_dir, addition)
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    global_settings['debug'] = False
    data_file = os.path.join(output_dir, 'data.csv')
    data = pandas.read_csv(data_file)
    if bool(global_settings['use_kfold']):
        score, train, test = et.kfold_cv(
            xt.model_evaluation_main, data, preferences['trainvars'],
            global_settings, hyperparameters)
    else:
        score, train, test = et.get_evaluation(
            xt.model_evaluation_main, data, preferences['trainvars'],
            global_settings, hyperparameters)
    score_path = os.path.join(save_dir, 'score.json')
    score_dict = {
        global_settings['fitness_fn']: score,
        'train': train,
        'test': test
    }
    with open(score_path, 'w') as score_file:
        json.dump(score_dict, score_file)
def main(output_dir, settings_dir, hyperparameter_file, debug):
    if settings_dir == 'None':
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    if output_dir == 'None':
        output_dir = global_settings['output_dir']
    else:
        global_settings['output_dir'] = output_dir
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    if hyperparameter_file == 'None':
        hyperparameter_file = os.path.join(info_dir, 'hyperparameters.json')
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    evaluation_main(global_settings, preferences, hyperparameters, debug)
def main(output_dir, settings_dir, hyperparameter_file, debug):
    if settings_dir == 'None':
        settings_dir = os.path.join(
            os.path.expandvars('$CMSSW_BASE'),
            'src/machineLearning/machineLearning/settings')
    global_settings_file = settings_dir + '/' + 'global_%s_%s_%s_settings.json' % (
        channel, mode, res_nonres)
    command = 'rsync %s ~/machineLearning/CMSSW_11_2_0_pre1/src/machineLearning/machineLearning/settings/global_settings.json' % global_settings_file
    p = subprocess.Popen(
        command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    p.communicate()  # wait for the settings file to be copied before reading it
    global_settings = ut.read_settings(settings_dir, 'global')
    if output_dir == 'None':
        output_dir = (
            global_settings['channel'] + '/' + global_settings['ml_method']
            + '/' + res_nonres + '/' + mode + '/' + era)
        global_settings['output_dir'] = output_dir
    else:
        global_settings['output_dir'] = output_dir
    global_settings['output_dir'] = os.path.expandvars(
        global_settings['output_dir'])
    if not os.path.exists(global_settings['output_dir']):
        os.makedirs(global_settings['output_dir'])
    channel_dir, info_dir, _ = ut.find_settings()
    scenario = global_settings['scenario']
    reader = hpr.HHParameterReader(channel_dir, scenario)
    preferences = reader.parameters
    if BM != 'None':
        preferences["nonResScenarios"] = [BM]
        print('BM point to be considered: '
              + str(preferences["nonResScenarios"]))
    if era != '0':
        preferences['included_eras'] = [era.replace('20', '')]
        print('era: ' + str(preferences['included_eras']))
    preferences = define_trainvars(global_settings, preferences, info_dir)
    if hyperparameter_file == 'None':
        hyperparameter_file = os.path.join(info_dir, 'hyperparameters.json')
    hyperparameters = ut.read_json_cfg(hyperparameter_file)
    print('hyperparameters: ' + str(hyperparameters))
    evaluation_main(global_settings, preferences, hyperparameters, debug)
def main(hyperparameter_file, output_dir):
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    num_classes = global_settings['num_classes']
    nthread = global_settings['nthread']
    path = Path(hyperparameter_file)
    save_dir = str(path.parent)
    hyperparameters = ut.read_parameters(hyperparameter_file)[0]
    preferences = dlt.get_parameters(
        global_settings['process'],
        global_settings['channel'],
        global_settings['bkg_mass_rand'],
        global_settings['tauID_training'])
    data = dlt.load_data(
        preferences['inputPath'],
        preferences['channelInTree'],
        preferences['trainvars'],
        global_settings['bdtType'],
        global_settings['channel'],
        preferences['keys'],
        preferences['masses'],
        global_settings['bkg_mass_rand'],
    )
    dlt.reweigh_dataframe(
        data, preferences['weight_dir'], preferences['trainvars'],
        ['gen_mHH'], preferences['masses'])
    normalize_hh_dataframe(data, preferences, global_settings)
    if bool(global_settings['use_kfold']):
        score = et.kfold_cv(
            xt.model_evaluation_main, data, preferences['trainvars'],
            global_settings, hyperparameters)
    else:
        score, pred_train, pred_test = et.get_evaluation(
            xt.model_evaluation_main, data, preferences['trainvars'],
            global_settings, hyperparameters)
        st.save_prediction_files(pred_train, pred_test, save_dir)
    score_path = os.path.join(save_dir, 'score.json')
    with open(score_path, 'w') as score_file:
        json.dump({global_settings['fitness_fn']: score}, score_file)
def test_read_settings():
    pso_settings = ut.read_settings(settings_dir, 'pso')
    global_settings = ut.read_settings(settings_dir, 'global')
    assert len(pso_settings.keys()) == 7
    assert len(global_settings.keys()) == 14
def run_pso(value_dicts, calculate_fitnesses, hyperparameter_sets, output_dir):
    '''Performs the whole particle swarm optimization. Note that the best
    fitness is the maximum value, not the minimum (multiply by -1 if needed).

    Parameters:
    ----------
    value_dicts : list of dicts
        Info about every variable that is to be optimized
    calculate_fitnesses : method
        Function that calculates the fitness and returns the score
    hyperparameter_sets : list of dicts
        The parameter-sets of all particles.
    output_dir : str
        Path to the directory of the output

    Returns:
    -------
    best_hyperparameters : dict
        Best hyperparameters found.
    '''
    print(':::::::: Initializing :::::::::')
    settings_dir = os.path.join(output_dir, 'run_settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    pso_settings = ut.read_settings(settings_dir, 'pso')
    inertial_weight, inertial_weight_step = get_weight_step(pso_settings)
    iteration = 1
    new_hyperparameter_sets = hyperparameter_sets
    personal_bests = {}
    compactness = et.calculate_compactness(hyperparameter_sets)
    fitnesses = calculate_fitnesses(hyperparameter_sets, global_settings)
    personal_bests = hyperparameter_sets
    best_fitnesses = fitnesses
    index = np.argmax(fitnesses)
    best_hyperparameters = hyperparameter_sets[index]
    best_fitness = fitnesses[index]
    current_speeds = initialize_speeds(hyperparameter_sets)
    max_iterations_not_reached = True
    not_clustered = True
    while max_iterations_not_reached and not_clustered:
        print('::::::: Iteration: ' + str(iteration) + ' ::::::::')
        hyperparameter_sets = new_hyperparameter_sets
        compactness = et.calculate_compactness(hyperparameter_sets)
        print(' --- Compactness: ' + str(compactness) + ' ---')
        fitnesses = calculate_fitnesses(hyperparameter_sets, global_settings)
        best_fitnesses = find_best_fitness(fitnesses, best_fitnesses)
        personal_bests = calculate_personal_bests(
            fitnesses, best_fitnesses, hyperparameter_sets, personal_bests)
        weight_dict = {
            'c1': pso_settings['c1'],
            'c2': pso_settings['c2'],
            'w': inertial_weight
        }
        new_hyperparameter_sets, current_speeds = prepare_new_day(
            personal_bests, hyperparameter_sets, best_hyperparameters,
            current_speeds, value_dicts, weight_dict)
        index = np.argmax(fitnesses)
        if best_fitness < max(fitnesses):
            best_hyperparameters = hyperparameter_sets[index]
            best_fitness = fitnesses[index]
        inertial_weight += inertial_weight_step
        iteration += 1
        max_iterations_not_reached = iteration <= pso_settings['iterations']
        not_clustered = pso_settings['compactness_threshold'] < compactness
    return best_hyperparameters
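# Illustrative driver sketch, not part of the original source: wiring run_pso()
# together as the xgb optimization script above does and persisting the winner.
# The st/ut aliases and the 'best_hyperparameters.json' file name are taken
# from that script; everything else is an assumption.
def example_run_pso_driver(value_dicts, hyperparameter_sets, output_dir):
    best_hyperparameters = run_pso(
        value_dicts, st.get_fitness_score, hyperparameter_sets, output_dir)
    ut.save_dict_to_json(
        best_hyperparameters,
        os.path.join(output_dir, 'best_hyperparameters.json'))
    return best_hyperparameters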