def train_model(dataset_name, pt_bin, yaml_file, prefix):
    """Train a LightGBM model for one pt bin, using CV to fix the boosting rounds.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset to load via ``dr.get_ml_dataset``.
    pt_bin : int
        Index of the transverse-momentum bin to train on.
    yaml_file : str
        Path to the YAML analysis configuration (read by ``configyaml.ConfigYaml``).
    prefix : str
        Prefix prepended to the saved model file name.

    Returns
    -------
    tuple
        ``(gbm, name_to_save)`` — the trained booster and the path the model
        text file was saved to.
    """
    import tempfile  # local import: used only for the save step below

    d_cuts = configyaml.ConfigYaml(yaml_file)
    train = dr.get_ml_dataset(dataset_name, d_cuts, pt_bin)

    model_cfg = d_cuts.values['model_building']
    params = model_cfg['model_parameters']
    train_parameters = model_cfg['train_parameters']
    cv_params = model_cfg['cv_parameters']
    cv_params.update(train_parameters)

    features = model_cfg['features']
    target = model_cfg['target']

    lgb_dataset = lgb.Dataset(train[features], label=train[target])
    del train  # release the raw DataFrame before the (memory-hungry) CV/training

    start = time.time()
    cv = lgb.cv(params, lgb_dataset, **cv_params)
    print('Total CV time: ' + str(time.time() - start))

    results_cv = pd.DataFrame(cv)
    cv_results_file = dr.get_location_step(dataset_name, 'ml') + 'cv_' + str(pt_bin) + '.pkl'
    try:
        os.remove(cv_results_file)
    except FileNotFoundError:
        pass

    print('Best iteration of the model: ')
    print(results_cv.iloc[-1])
    results_cv.to_pickle(cv_results_file)

    # Train for exactly the number of rounds that survived early stopping in CV.
    train_parameters['num_boost_round'] = len(results_cv)

    start = time.time()
    gbm = lgb.train(params, lgb_dataset, **train_parameters)
    print('Total training time: ' + str(time.time() - start))

    name_to_save = dr.get_location_step(dataset_name, 'ml') + prefix + 'model_' + str(pt_bin) + '.txt'
    try:
        os.remove(name_to_save)
    except FileNotFoundError:
        pass

    # BUG FIX: the original wrote to a fixed temp file ('temp_model.txt'),
    # which races when several pt-bin jobs run concurrently, and leaked the
    # temp file if save/copy raised. Use a unique temp file and always clean up.
    fd, temp_file = tempfile.mkstemp(suffix='.txt', dir=dr.definitions.TEMP)
    os.close(fd)  # lightgbm writes by path, not by descriptor
    try:
        gbm.save_model(temp_file)
        shutil.copyfile(temp_file, name_to_save)
    finally:
        os.remove(temp_file)

    return gbm, name_to_save
def prepare_signal(mc_config, pt_bins, particle):
    """Build the per-pt-bin signal samples from MC and save them as parquet files.

    Loads the true D mesons from the MC sample, bins them in pt, labels them
    for the ML training, writes one ``sig_<bin>.parquet`` per pt bin and the
    MC mean/sigma table used later for background sideband selection.

    Parameters
    ----------
    mc_config : str
        Name of the MC dataset configuration.
    pt_bins : list
        Pt bin edges passed to ``pd.cut``.
    particle : str
        Particle name passed to ``dr.load``.
    """
    import glob  # local import: used only to clear previous outputs

    print("Processing signal")
    signal = sl.get_true_dmesons(dr.load(mc_config, particle))
    signal['PtBin'] = pd.cut(signal['Pt'], pt_bins)

    # Create the column which will be used for the ML training 'CandidateType':
    # CandidateType = -1 -> Background
    # CandidateType =  0 -> Non-Prompt D mesons
    # CandidateType =  1 -> Prompt D mesons
    # BUG FIX: np.int was deprecated in NumPy 1.20 and removed in 1.24; the
    # builtin int is the exact value it aliased, so behavior is unchanged.
    signal['CandidateType'] = signal['IsPrompt'].astype(int)

    folder_to_save = dr.get_location_step(mc_config, 'ml')
    dr.check_for_folder(folder_to_save)

    # BUG FIX: replaced "rm -f ..." through a shell (shell=True string) with
    # glob + os.remove — portable and not subject to shell interpretation.
    for old_file in glob.glob(folder_to_save + 'sig_*'):
        os.remove(old_file)

    for name, group in signal.groupby('PtBin', as_index=False):
        print(name)
        df = group.drop(['PtBin'], axis='columns')
        df.to_parquet(folder_to_save + 'sig_' + str(name) + '.parquet')

    mc_mean_sigma = calculate_mean_sigma_mc(signal)
    # BUG FIX: the unguarded os.remove raised FileNotFoundError on the first
    # run (file does not exist yet). Guarded like the other removals in this file.
    try:
        os.remove(folder_to_save + 'mc_mean_sigma.pkl')
    except FileNotFoundError:
        pass
    mc_mean_sigma.to_pickle(folder_to_save + 'mc_mean_sigma.pkl')
def predict_class(files, yaml_file, prefix):
    """Score each parquet file with the trained model and save the result.

    The trained model is resolved from the dataset name of the first file.
    Each input file gets a 'Probability' column and is written to the
    'consolidated' step folder under its original file name (any existing
    output with that name is removed first).

    Parameters
    ----------
    files : list
        Full paths of the parquet files to score; must be non-empty.
    yaml_file : str
        Path to the YAML configuration file.
    prefix : str
        Prefix identifying which saved model to load.
    """
    dataset_name = dr.get_dataset_name_from_file(files[0])
    config = ConfigYaml(yaml_file)
    gbm = Model(dataset_name, yaml_config=config, prefix=prefix)
    output_dir = dr.get_location_step(dataset_name, 'consolidated')

    for current_file in tqdm(files):
        candidates = pd.read_parquet(current_file)
        candidates['Probability'] = gbm.predict(candidates)['Probability']

        destination = output_dir + dr.get_file_name(current_file)
        try:
            os.remove(destination)
        except FileNotFoundError:
            pass
        candidates.to_parquet(destination)
def submit_predict(dataset_name, particle, n_files, prefix=None, yaml_file=None):
    """Submit batch jobs that run predict.py over the not-yet-processed files.

    Finds the raw files that are missing from the 'consolidated' step and
    hands them to ``split_submit_job`` in batches of ``n_files``.

    Parameters
    ----------
    dataset_name : str
        Dataset configuration name.
    particle : str
        Particle name used to locate the raw files.
    n_files : int
        Number of files per submitted job.
    prefix : str, optional
        Forwarded to predict.py as ``--prefix`` when given.
    yaml_file : str, optional
        Forwarded to predict.py as ``--yaml_file`` when given.

    Returns
    -------
    int
        Number of files queued for processing (from ``split_submit_job``).
    """
    check_for_folder(dr.get_location_step(dataset_name, 'consolidated'))
    files = dr.find_missing_processed_files(
        dataset_name, 'raw', 'consolidated', particle, full_file_path=True)
    print(files)

    # Assemble the optional CLI flags forwarded to the worker script.
    extra_args = []
    if prefix is not None:
        extra_args.append(' --prefix ' + str(prefix))
    if yaml_file is not None:
        extra_args.append(' --yaml_file ' + str(yaml_file))
    additional_arguments = ''.join(extra_args)

    n_files_to_process = split_submit_job(
        files,
        definitions.ROOT_DIR + '/ml/predict.py',
        dataset_name + '_p_',
        n_files,
        additional_arguments)
    return n_files_to_process
"--skip_signal", dest='skip_signal', action='store_true', help='Skip signal processing') parser.set_defaults(submit_bkg=True) parser.set_defaults(skip_signal=False) args = parser.parse_args() print("The following configuration will be used:") print('Configuration in MC (for signal): ' + args.mc_config) print('Configuration in data (for background): ' + args.data_config) d_cuts = configyaml.ConfigYaml(args.yaml_file) dr.check_for_folder(dr.get_location_step(args.data_config, 'ml')) if not args.skip_signal: prepare_signal(args.mc_config, d_cuts.values['model_building']['bins_pt'], 'dmeson') from dhfcorr.utils import batch, format_list_to_bash runs = dr.get_run_numbers(args.data_config) print("Processing Background:") clear = subprocess.Popen('rm -f ' + ' bkg_*', shell=True) clear.wait() job_id = 0 for run_list in tqdm(list(batch(runs, args.nfiles))):
parser.add_argument("--yaml_config", default=None, help='Configuration file)') parser.add_argument("--id", default=0, help='id to save the file') parser.add_argument("--particle_name", default='dmeson', help='particle name') args = parser.parse_args() run_list = args.run_list run_list = run_list.split(',') yaml_config = args.yaml_config d_cuts = configyaml.ConfigYaml(yaml_config) folder_to_save = reader.get_location_step('ml') mc_mean = pd.read_pickle(folder_to_save + '/mc_mean_sigma.pkl') def filter_bkg(df, mc_shape, n_sigma=4.0): pt_bin = df.name mean = mc_shape.loc[pt_bin]['mean'] std = mc_shape.loc[pt_bin]['std'] bkg_sidebands = df[np.abs(df['InvMass'] - mean) > n_sigma * std] return bkg_sidebands candidates_df = list() for run in run_list: bkg = reader.load(args.config_name, args.particle_name, run_number=[run]) bkg['PtBin'] = pd.cut(bkg['Pt'],