def predict_class(files, config, yaml_file, prefix, config_save, test_mode):
    """Apply the trained models (one per pt bin) to each file and save the
    result, with a 'Probability' column, to the 'filtered' folder."""
    if config_save is None:
        config_save = config

    d_cuts = configyaml.ConfigYaml(yaml_file)
    pt_bins = np.array(d_cuts.values['model_building']['bins_pt'])

    base_name = definitions.PROCESSING_FOLDER + config + '/ml-dataset/' + prefix + 'model_pt'
    models = [base_name + str(pt_bin) + '_main_mojo.zip' for pt_bin in range(len(pt_bins) - 1)]

    for file in files:
        print('Processing file: ')
        print(file)
        dataset = pd.read_parquet(file)
        if test_mode:
            dataset = dataset.iloc[:1000]

        # Default value for candidates that do not fall into any pt bin
        dataset['Probability'] = -999.
        pt_bins_df = pd.cut(dataset['Pt'], list(pt_bins), labels=False)
        predictions = dataset.groupby(pt_bins_df, as_index=False, group_keys=False).apply(
            add_prediction, models)
        dataset['Probability'] = predictions.astype('float32')

        file_name = file.split('/')[-1]
        dataset.to_parquet(definitions.PROCESSING_FOLDER + config_save + '/filtered/' + file_name)
        print()
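# Example usage (a minimal sketch; 'my_config', 'config.yaml', the 'xgb_'
# prefix and the glob pattern are hypothetical placeholders):
#
#     import glob
#     files = glob.glob(definitions.PROCESSING_FOLDER + 'my_config/skimmed/*.parquet')
#     predict_class(files, 'my_config', 'config.yaml', 'xgb_',
#                   config_save=None, test_mode=True)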
def train_model(dataset_name, pt_bin, yaml_file, prefix):
    """Train a LightGBM model for the given pt bin. The number of boosting
    rounds is determined with cross-validation before the final training.
    Returns the trained booster and the file it was saved to."""
    d_cuts = configyaml.ConfigYaml(yaml_file)
    train = dr.get_ml_dataset(dataset_name, d_cuts, pt_bin)

    params = d_cuts.values['model_building']['model_parameters']
    train_parameters = d_cuts.values['model_building']['train_parameters']
    cv_params = d_cuts.values['model_building']['cv_parameters']
    cv_params.update(train_parameters)

    features = d_cuts.values['model_building']['features']
    target = d_cuts.values['model_building']['target']

    lgb_dataset = lgb.Dataset(train[features], label=train[target])
    del train

    start = time.time()
    cv = lgb.cv(params, lgb_dataset, **cv_params)
    print('Total CV time: ' + str(time.time() - start))

    results_cv = pd.DataFrame(cv)
    cv_results_file = dr.get_location_step(dataset_name, 'ml') + 'cv_' + str(pt_bin) + '.pkl'
    try:
        os.remove(cv_results_file)
    except FileNotFoundError:
        pass

    print('Best iteration of the model: ')
    print(results_cv.iloc[-1])
    results_cv.to_pickle(cv_results_file)

    # Use the number of rounds found in the CV for the final training
    train_parameters['num_boost_round'] = len(results_cv)

    start = time.time()
    gbm = lgb.train(params, lgb_dataset, **train_parameters)
    print('Total training time: ' + str(time.time() - start))

    name_to_save = dr.get_location_step(dataset_name, 'ml') + prefix + 'model_' + str(pt_bin) + '.txt'
    try:
        os.remove(name_to_save)
    except FileNotFoundError:
        pass

    # Save to a temporary location first, then copy to the final destination
    temp_file = dr.definitions.TEMP + 'temp_model.txt'
    gbm.save_model(temp_file)
    shutil.copyfile(temp_file, name_to_save)
    os.remove(temp_file)

    return gbm, name_to_save
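# Example usage (a minimal sketch; 'my_dataset', the pt bin index and
# 'default_config.yaml' are hypothetical placeholders):
#
#     gbm, model_file = train_model('my_dataset', pt_bin=2,
#                                   yaml_file='default_config.yaml', prefix='lgb_')
#     print('Model saved to ' + model_file)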
def reduce_opt(files_to_reduce, config, yaml_file, id_job, particle, pre_filter_bkg, maximum_pt_filter):
    """Merge the given files, apply a background pre-filter at low pt and save
    the result split by pt bin in the 'skimmed' folder."""
    d_cuts = configyaml.ConfigYaml(yaml_file)
    pt_bins = np.array(d_cuts.values['reduce_data']['bins_pt'])
    cols_keep = d_cuts.values['reduce_data']['features']
    base_name = definitions.PROCESSING_FOLDER + config + '/skimmed/'

    dataset = pd.concat([pd.read_parquet(file, columns=cols_keep) for file in files_to_reduce])

    # Below maximum_pt_filter, keep only candidates with a background score
    # smaller than pre_filter_bkg; above it, keep all candidates.
    dataset = dataset.loc[((dataset['bkg'] < pre_filter_bkg) & (dataset['Pt'] < maximum_pt_filter))
                          | (dataset['Pt'] >= maximum_pt_filter)]
    reduce_dataframe_memory(dataset)

    df_pt_bins = pd.cut(dataset['Pt'], list(pt_bins), labels=False)
    dataset.groupby(df_pt_bins).apply(
        lambda x: x.to_parquet(base_name + 'id' + str(id_job) + '_pt' + str(x.name)
                               + '_' + particle + '.parquet'))
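# Example usage (a minimal sketch; the file list, the 0.05 background cut and
# the 24 GeV/c pt threshold are hypothetical placeholders):
#
#     reduce_opt(['part0.parquet', 'part1.parquet'], 'my_config', 'config.yaml',
#                id_job=0, particle='dmeson', pre_filter_bkg=0.05,
#                maximum_pt_filter=24.)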
def submit_train(dataset_name, yaml_config, prefix=None):
    """Submit one training job (train_lgb.py) per pt bin to the cluster queue."""
    d_cuts = configyaml.ConfigYaml(yaml_config)
    pt_bins = np.array(d_cuts.values['model_building']['bins_pt'])
    # Convert the bin edges to pd.Interval objects (one interval per bin)
    pt_bins = pd.cut(0.5 * (pt_bins[:-1] + pt_bins[1:]), bins=pt_bins)

    base_f = definitions.ROOT_DIR
    queue = d_cuts.values['model_building']['queue']

    for i in reversed(range(len(pt_bins))):
        arguments = str(i) + ' ' + str(dataset_name)
        if prefix is not None:
            arguments += ' --prefix ' + prefix
        command = get_job_command(dataset_name + '_t_pt_' + str(i), base_f + "/ml/train_lgb.py ",
                                  arguments, queue=queue)
        subprocess.run(command, shell=True)
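# Example usage (a minimal sketch; 'my_dataset' and 'default_config.yaml' are
# hypothetical placeholders):
#
#     submit_train('my_dataset', 'default_config.yaml', prefix='lgb_')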
def __init__(self, name_file, particle='D0'):
    """Default constructor. name_file should come from the class CutsYaml.
    The particle is set by default to 'D0'."""
    yaml_config = configyaml.ConfigYaml(name_file, default_file=ROOT_DIR + "/config/config_retangular.yaml")
    try:
        d_meson_cuts = yaml_config.values[particle]['cuts']
    except KeyError as key_error:
        print(key_error)
        raise ValueError("The particle " + str(particle) + " cuts were not found.") from key_error

    # Save the cuts to a DataFrame
    self.cut_df = pd.DataFrame(d_meson_cuts).apply(pd.to_numeric, errors='ignore')
    self.cut_df.set_index('PtBin', inplace=True)

    # Strip the _range, _min, _max and _bool suffixes from the column names
    names = [a.split('_')[0] for a in self.cut_df.columns]
    type_col = [a.split('_')[1] for a in self.cut_df.columns]  # save the type of cut

    self.range_features = [names[i] for i in range(len(names)) if type_col[i] == "range"]
    self.min_features = [names[i] for i in range(len(names)) if type_col[i] == "min"]
    self.max_features = [names[i] for i in range(len(names)) if type_col[i] == "max"]
    self.bool_features = [names[i] for i in range(len(names)) if type_col[i] == "bool"]

    self.cut_df.columns = names
    self.cut_type = type_col

    pt_ = self.cut_df['Pt']
    min_pt = [pt_[i][0] for i in range(len(pt_))]
    max_pt = [pt_[i][1] for i in range(len(pt_))]

    # Define basic selection variable types
    self.pt_bins = list(min_pt) + list([max_pt[-1]])

    # Change pt_bins to intervals
    mid_pt = (np.array(min_pt) + np.array(max_pt)) / 2.
    self.cut_df['PtBin'] = pd.cut(mid_pt, self.pt_bins)
    self.cut_df.set_index('PtBin', inplace=True)

    self.particle_mass = float(yaml_config.values[particle]['particle_mass'])
    self.particle_name = str(yaml_config.values[particle]['particle_name'])
    self.features_absolute = tuple(yaml_config.values[particle]['features_abs'])
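# Example usage (a minimal sketch; only the constructor is shown above, so the
# class name Cuts and the file 'my_cuts.yaml' are hypothetical placeholders):
#
#     cuts = Cuts('my_cuts.yaml', particle='D0')
#     print(cuts.pt_bins)       # bin edges derived from the 'Pt' ranges
#     print(cuts.min_features)  # features with a lower-bound cut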
def train_model(config, pt_bin, yaml_file, prefix):
    """Train an H2O XGBoost model for the given pt bin and save the main model
    together with its cross-validation models."""
    train_f = definitions.PROCESSING_FOLDER + config + '/ml-dataset/ml_sample_train_' + str(pt_bin) + '.parquet'
    train = h2o.import_file(train_f)

    d_cuts = configyaml.ConfigYaml(yaml_file)

    # Configuration of the GRID Search
    features = d_cuts.values['model_building']['features']
    target = d_cuts.values['model_building']['target']
    parameters = d_cuts.values['model_building']['model_parameters']

    # Convert the target to a binary factor for classification
    train[target] = train[target] > -1
    train[target] = train[target].asfactor()

    model = H2OXGBoostEstimator(**parameters)
    model.train(features, target, training_frame=train)

    place_to_save = definitions.PROCESSING_FOLDER + config + '/ml-dataset/'
    file_list_saved = list()

    # Save the main model
    path_main = h2o.save_model(model, place_to_save, force=True)
    path_main_rename = ''.join([x + '/' for x in path_main.split('/')[:-1]]) \
        + prefix + 'model_pt' + str(pt_bin) + '_main'
    os.rename(path_main, path_main_rename)
    file_list_saved.append(path_main_rename)

    # Save the cross-validation models
    model_list = model.cross_validation_models()
    for i, model_cv in enumerate(model_list):
        path = h2o.save_model(model_cv, place_to_save, force=True)
        path_new = ''.join([x + '/' for x in path.split('/')[:-1]]) \
            + prefix + 'model_pt' + str(pt_bin) + '_cv' + str(i)
        os.rename(path, path_new)
        file_list_saved.append(path_new)

    return model, model_list, file_list_saved
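# Example usage (a minimal sketch; 'my_config' and 'config.yaml' are
# hypothetical placeholders, and an H2O cluster must be running first):
#
#     h2o.init()
#     model, cv_models, saved_files = train_model('my_config', pt_bin=1,
#                                                 yaml_file='config.yaml',
#                                                 prefix='xgb_')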
def fit_d_meson_inv_mass(config_file_name=None, suffix='_t'):
    """Fit the D meson invariant-mass distribution in each (associated, trigger)
    pt bin, then save the fits and their plots."""
    config = configyaml.ConfigYaml(config_file_name)
    data_sample = reader.load_pairs(config_file_name, 'selected')
    base_folder = config.values['base_folder']

    print("Fitting the Invariant Mass")
    fits = data_sample.groupby(['APtBin', 'TPtBin']).apply(fit_inv_mass, suffix=suffix,
                                                           **config.values['inv_mass'])
    fits.columns = ['Fits']
    fits.to_pickle(base_folder + '/fits_inv_mass' + suffix + '.pkl')

    print("Plotting the fits")
    for index, row in fits.items():
        a_i = index[0]
        t_i = index[1]
        fig, ax = plt.subplots()
        plot_inv_mass_fit(row, ax, **config.values['correlation_qa_style'])
        fig.savefig(base_folder + '/plots/mass_pt_a' + str(a_i) + '_t' + str(t_i) + '.pdf',
                    bbox_inches="tight")
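# Example usage (a minimal sketch; 'correlation_config.yaml' is a hypothetical
# placeholder for the correlation configuration file):
#
#     fit_d_meson_inv_mass('correlation_config.yaml', suffix='_t')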
                    ' generation on cluster')
parser.add_argument('-s', "--skip_signal", dest='skip_signal', action='store_true',
                    help='Skip signal processing')
parser.set_defaults(submit_bkg=True)
parser.set_defaults(skip_signal=False)

args = parser.parse_args()

print("The following configuration will be used:")
print('Configuration in MC (for signal): ' + args.mc_config)
print('Configuration in data (for background): ' + args.data_config)

d_cuts = configyaml.ConfigYaml(args.yaml_file)
dr.check_for_folder(dr.get_location_step(args.data_config, 'ml'))

if not args.skip_signal:
    prepare_signal(args.mc_config, d_cuts.values['model_building']['bins_pt'], 'dmeson')

from dhfcorr.utils import batch, format_list_to_bash

runs = dr.get_run_numbers(args.data_config)

print("Processing Background:")
# Remove leftover temporary background files from previous runs
clear = subprocess.Popen('rm -f bkg_*', shell=True)
clear.wait()

job_id = 0
                    type=str, help='Configuration name (used to save the temporary files)')
parser.add_argument("--yaml_config", default=None, help='Configuration file')
parser.add_argument("--id", default=0, help='id to save the file')
parser.add_argument("--particle_name", default='dmeson', help='particle name')

args = parser.parse_args()

run_list = args.run_list
run_list = run_list.split(',')
yaml_config = args.yaml_config
d_cuts = configyaml.ConfigYaml(yaml_config)

folder_to_save = reader.get_location_step('ml')
mc_mean = pd.read_pickle(folder_to_save + '/mc_mean_sigma.pkl')


def filter_bkg(df, mc_shape, n_sigma=4.0):
    """Select background candidates from the invariant-mass sidebands, i.e.
    candidates further than n_sigma standard deviations from the MC mean."""
    pt_bin = df.name
    mean = mc_shape.loc[pt_bin]['mean']
    std = mc_shape.loc[pt_bin]['std']
    bkg_sidebands = df[np.abs(df['InvMass'] - mean) > n_sigma * std]
    return bkg_sidebands


candidates_df = list()
for run in run_list:
    bkg = reader.load(args.config_name, args.particle_name,
variables_to_keep_trig = ['GridPID', 'EventNumber', 'ID', 'IsParticleCandidate',
                          'Pt', 'Eta', 'Phi', 'InvMass', 'prediction']
variables_to_keep_assoc = ['GridPID', 'EventNumber', 'Charge', 'Pt', 'Eta', 'Phi',
                           'InvMassPartnersULS', 'InvMassPartnersLS']
index = ['GridPID', 'EventNumber']

df = reader.load('D0_HMV0', ['dmeson', 'electron'], columns=variables_to_keep_trig,
                 index=index, lazy=True)

config_corr = configyaml.ConfigYaml('dhfcorr/config/optimize_bdt_cut.yaml')
pt_bins_trig = config_corr.values['correlation']['bins_trig']
pt_bins_assoc = config_corr.values['correlation']['bins_assoc']

trig_suffix = '_t'
assoc_suffix = '_a'

inv_mass_trig_list = list()
pairs = dhfcorr.correlate.make_pairs.build_pairs_from_lazy(df, (trig_suffix, assoc_suffix),
                                                           pt_bins_trig, pt_bins_assoc,
                                                           **config_corr.values['correlation'])

selected = pd.read_pickle('pairs_d_hfe_hm.pkl').reset_index(level=0, drop=True)

# Remove ROOT messages