Example #1
import os
import shutil
import time

import lightgbm as lgb
import pandas as pd

# The project-internal modules `configyaml` (YAML cut configuration) and `dr`
# (dataset reader/locator) are assumed to be importable as in the repo.


def train_model(dataset_name, pt_bin, yaml_file, prefix):
    d_cuts = configyaml.ConfigYaml(yaml_file)

    train = dr.get_ml_dataset(dataset_name, d_cuts, pt_bin)

    params = d_cuts.values['model_building']['model_parameters']
    train_parameters = d_cuts.values['model_building']['train_parameters']

    cv_params = d_cuts.values['model_building']['cv_parameters']
    cv_params.update(train_parameters)

    features = d_cuts.values['model_building']['features']
    target = d_cuts.values['model_building']['target']

    lgb_dataset = lgb.Dataset(train[features], label=train[target])

    del train

    start = time.time()
    cv = lgb.cv(params, lgb_dataset, **cv_params)
    print('Total CV time: ' + str(time.time() - start))
    results_cv = pd.DataFrame(cv)

    cv_results_file = dr.get_location_step(dataset_name, 'ml') + 'cv_' + str(pt_bin) + '.pkl'

    try:
        os.remove(cv_results_file)
    except FileNotFoundError:
        pass

    print('Best iteration of the model: ')
    print(results_cv.iloc[-1])
    results_cv.to_pickle(cv_results_file)

    # Train the final model with the number of boosting rounds selected by CV
    train_parameters['num_boost_round'] = len(results_cv)

    start = time.time()
    gbm = lgb.train(params, lgb_dataset, **train_parameters)
    print('Total training time: ' + str(time.time() - start))

    name_to_save = dr.get_location_step(dataset_name, 'ml') + prefix + 'model_' + str(pt_bin) + '.txt'

    try:
        os.remove(name_to_save)
    except FileNotFoundError:
        pass

    # Save to a temporary file first, then copy to the final location
    temp_file = dr.definitions.TEMP + 'temp_model.txt'

    gbm.save_model(temp_file)
    shutil.copyfile(temp_file, name_to_save)

    os.remove(temp_file)

    return gbm, name_to_save
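A minimal usage sketch for the function above; the dataset name, pt bin, YAML path and prefix are hypothetical placeholders, not values from the original project.

gbm, model_file = train_model('d0_run2', pt_bin=2,
                              yaml_file='config/default_cuts.yaml',
                              prefix='test_')
print('Model saved to: ' + model_file)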
Example #2
import os
import subprocess

import pandas as pd

# The project-internal modules `sl` and `dr` (dataset reader/locator) are
# assumed to be importable as in the repo.


def prepare_signal(mc_config, pt_bins, particle):
    print("Processing signal")
    signal = sl.get_true_dmesons(dr.load(mc_config, particle))
    signal['PtBin'] = pd.cut(signal['Pt'], pt_bins)

    # Create the column used as the ML training target: 'CandidateType'
    # CandidateType = -1 -> Background
    # CandidateType =  0 -> Non-Prompt D mesons
    # CandidateType =  1 -> Prompt D mesons
    signal['CandidateType'] = signal['IsPrompt'].astype(int)

    folder_to_save = dr.get_location_step(mc_config, 'ml')
    dr.check_for_folder(folder_to_save)

    # Remove signal files left over from a previous run
    delete_previous = subprocess.Popen('rm -f ' + folder_to_save + 'sig_*', shell=True)
    delete_previous.wait()

    for name, group in signal.groupby('PtBin', as_index=False):
        print(name)
        df = group.drop(['PtBin'], axis='columns')
        df.to_parquet(folder_to_save + 'sig_' + str(name) + '.parquet')

    mc_mean_sigma = calculate_mean_sigma_mc(signal)

    try:
        os.remove(folder_to_save + 'mc_mean_sigma.pkl')
    except FileNotFoundError:
        pass

    mc_mean_sigma.to_pickle(folder_to_save + 'mc_mean_sigma.pkl')
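A hedged usage sketch; the MC configuration name and pt-bin edges (GeV/c) below are hypothetical placeholders.

prepare_signal('mc_d0_2018', pt_bins=[1., 2., 4., 6., 8., 12., 24.],
               particle='dmeson')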
Example #3
import os

import pandas as pd
from tqdm import tqdm

# The project-internal names `dr`, `ConfigYaml` and `Model` are assumed to be
# importable as in the repo.


def predict_class(files, yaml_file, prefix):
    dataset_name = dr.get_dataset_name_from_file(files[0])
    config = ConfigYaml(yaml_file)
    gbm = Model(dataset_name, yaml_config=config, prefix=prefix)
    location_to_save = dr.get_location_step(dataset_name, 'consolidated')

    for file in tqdm(files):
        data = pd.read_parquet(file)
        # Attach the model response to each candidate
        data['Probability'] = gbm.predict(data)['Probability']
        file_name = location_to_save + dr.get_file_name(file)

        try:
            os.remove(file_name)
        except FileNotFoundError:
            pass

        data.to_parquet(file_name)
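A hedged usage sketch; the file pattern, YAML path and prefix are hypothetical placeholders.

import glob

parquet_files = sorted(glob.glob('/data/d0_run2/raw/*.parquet'))
predict_class(parquet_files, yaml_file='config/default_cuts.yaml', prefix='test_')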
Example #4
# The helpers `dr`, `definitions`, `check_for_folder` and `split_submit_job`
# are assumed to be importable from the project, as in the other examples.
def submit_predict(dataset_name,
                   particle,
                   n_files,
                   prefix=None,
                   yaml_file=None):
    check_for_folder(dr.get_location_step(dataset_name, 'consolidated'))
    files = dr.find_missing_processed_files(dataset_name,
                                            'raw',
                                            'consolidated',
                                            particle,
                                            full_file_path=True)
    print(files)

    additional_arguments = ''
    if prefix is not None:
        additional_arguments += ' --prefix ' + str(prefix)
    if yaml_file is not None:
        additional_arguments += ' --yaml_file ' + str(yaml_file)

    n_files_to_process = split_submit_job(
        files, definitions.ROOT_DIR + '/ml/predict.py', dataset_name + '_p_',
        n_files, additional_arguments)

    return n_files_to_process
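A hedged usage sketch; the dataset and particle names are hypothetical, and n_files sets how many files each submitted job processes.

n_to_process = submit_predict('d0_run2', 'dmeson', n_files=50,
                              prefix='test_', yaml_file='config/default_cuts.yaml')
print(str(n_to_process) + ' files queued for processing')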
Example #5
                        "--skip_signal",
                        dest='skip_signal',
                        action='store_true',
                        help='Skip signal processing')
    parser.set_defaults(submit_bkg=True)
    parser.set_defaults(skip_signal=False)

    args = parser.parse_args()

    print("The following configuration will be used:")
    print('Configuration in MC (for signal): ' + args.mc_config)
    print('Configuration in data (for background): ' + args.data_config)

    d_cuts = configyaml.ConfigYaml(args.yaml_file)

    dr.check_for_folder(dr.get_location_step(args.data_config, 'ml'))

    if not args.skip_signal:
        prepare_signal(args.mc_config,
                       d_cuts.values['model_building']['bins_pt'], 'dmeson')

    from dhfcorr.utils import batch, format_list_to_bash

    runs = dr.get_run_numbers(args.data_config)

    print("Processing Background:")
    # Remove leftover bkg_* files; note this targets the current working directory
    clear = subprocess.Popen('rm -f bkg_*', shell=True)
    clear.wait()
    job_id = 0

    for run_list in tqdm(list(batch(runs, args.nfiles))):
Example #6
    parser.add_argument("--yaml_config",
                        default=None,
                        help='Configuration file')
    parser.add_argument("--id", default=0, help='id to save the file')
    parser.add_argument("--particle_name",
                        default='dmeson',
                        help='particle name')

    args = parser.parse_args()
    run_list = args.run_list.split(',')

    yaml_config = args.yaml_config
    d_cuts = configyaml.ConfigYaml(yaml_config)

    # `args.config_name` is assumed as the first argument here, matching the
    # get_location_step(dataset_name, step) signature used in the other examples
    folder_to_save = reader.get_location_step(args.config_name, 'ml')
    mc_mean = pd.read_pickle(folder_to_save + 'mc_mean_sigma.pkl')

    def filter_bkg(df, mc_shape, n_sigma=4.0):
        pt_bin = df.name
        mean = mc_shape.loc[pt_bin]['mean']
        std = mc_shape.loc[pt_bin]['std']
        bkg_sidebands = df[np.abs(df['InvMass'] - mean) > n_sigma * std]
        return bkg_sidebands
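    # Note: filter_bkg relies on df.name, which pandas sets to the PtBin
    # interval when the function is applied per group, e.g. (assumed usage):
    #   sidebands = bkg.groupby('PtBin').apply(filter_bkg, mc_shape=mc_mean)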

    candidates_df = list()
    for run in run_list:
        bkg = reader.load(args.config_name,
                          args.particle_name,
                          run_number=[run])
        bkg['PtBin'] = pd.cut(bkg['Pt'],