def plotting_main(data, trainvar, filename, info_dir):
    """ Draws the profile of one training variable and saves the canvas.

    The histogram settings of the variable are looked up from the
    'histo_dict.json' file located in info_dir.

    Parameters:
    -----------
    data : pandas DataFrame
        Data handed over to plotting_init for creating the TProfile
    trainvar : str
        Name of the training variable.
    filename : str
        Path to the file where the canvas will be saved
    info_dir : str
        Path to the info_directory of the required channel

    Returns:
    --------
    Nothing
    """
    mass_points = find_masses()
    all_histo_infos = ut.read_parameters(
        os.path.join(info_dir, 'histo_dict.json'))
    trainvar_histo_info = dlt.find_correct_dict(
        'Variable', str(trainvar), all_histo_infos)
    # Only the canvas is written out; the profile is not used any further
    canvas, _profile = plotting_init(
        data, trainvar, trainvar_histo_info, mass_points)
    canvas.SaveAs(filename)
def initialize_trainvars(channel='2l_2tau', process='HH', random_sample='TTZ'):
    '''Collects all the possible trainvars from a single .root file

    The input path is taken from the second entry of the channel's
    'tauID_training.json' and the first .root file matching
    '*<random_sample>*/central/*.root' below it is the one inspected.

    Parameters:
    ----------
    channel : str
        Name of the channel where the .root file is taken from (e.g 2l_2tau)
    process : str
        Name of the process where the .root is loaded (e.g. ttH or HH)
    random_sample : str
        Name (fragment) of the sample folder the .root file belongs to

    Returns:
    -------
    trainvars : list
        Result of access_ttree on the chosen file, passed through
        data_related_trainvars

    Raises:
    ------
    IndexError
        When no .root file matches the wildcard path
    '''
    info_root = os.path.join(os.path.expandvars('$CMSSW_BASE'),
                             'src/machineLearning/machineLearning/info')
    channel_info_dir = os.path.join(info_root, process, channel)
    training_info = ut.read_parameters(
        os.path.join(channel_info_dir, 'tauID_training.json'))[1]
    root_file_pattern = os.path.join(
        training_info['inputPath'], '*' + random_sample + '*', 'central',
        '*.root')
    first_root_file = glob.glob(root_file_pattern)[0]
    channel_info = ut.read_multiline_json_to_dict(
        os.path.join(channel_info_dir, 'info.json'))
    tree_channel = channel_info['channelInTree']
    global_settings = ut.read_settings('global')
    known_samples = ut.read_parameters(
        os.path.join(info_root, 'samplename_info.json'))
    sample_dict = dlt.find_sample(random_sample, known_samples)
    if sample_dict == {}:
        # Nothing found in samplename_info.json: fall back to the
        # bdtType based lookup
        sample_dict = dl.advanced_sample_name(
            global_settings['bdtType'], random_sample, [])
    tree_path = str(os.path.join(
        tree_channel, 'sel/evtntuple', sample_dict['sampleName'], 'evtTree'))
    return data_related_trainvars(access_ttree(first_root_file, tree_path))
def read_trainvars_from_histo_dict(histo_dict_path):
    """Lists the 'Variable' entry of every histogram info in the file

    Parameters:
    -----------
    histo_dict_path : str
        Path where the histo_dict.json is located

    Returns:
    -------
    old_trainvars : list
        The 'Variable' values in the order they appear in the file
    """
    old_trainvars = []
    for histo_info in ut.read_parameters(histo_dict_path):
        old_trainvars.append(histo_info['Variable'])
    return old_trainvars
def do_fit(info_dir, data, preferences):
    """ Fits the TProfile of every trainvar with its configured function

    For each trainvar (except 'gen_mHH') the profile is created with
    plotting_init, fitted over the [min, max] range of the masses and the
    fit function is written to 'TProfile_signal_fit_func_<trainvar>.root'
    inside weight_dir.

    Parameters:
    -----------
    info_dir : str
        Path to the info_directory of the required channel
    data : pandas DataFrame
        Data to be used for creating the TProfiles & fits
    preferences : dict
        Needs the key 'trainvars' (list of trainvar names). The list is
        only read, it is not modified.

    Returns:
    --------
    Nothing
    """
    # Filter into a new list instead of calling .remove() so that the
    # caller's preferences['trainvars'] stays untouched
    trainvars = [
        trainvar for trainvar in preferences['trainvars']
        if trainvar != 'gen_mHH'
    ]
    # The masses are the same for every trainvar, so they are looked up once
    masses = find_masses()
    histo_dicts_json = os.path.join(info_dir, 'histo_dict.json')
    histo_dicts = ut.read_parameters(histo_dicts_json)
    for trainvar in trainvars:
        histo_dict = dlt.find_correct_dict('Variable', str(trainvar),
                                           histo_dicts)
        fit_poly_order = get_fit_function(histo_dict)
        canvas, profile = plotting_init(data, trainvar, histo_dict, masses)
        print('Variable Name: ' + str(trainvar))
        filename = '_'.join(['TProfile_signal_fit_func', str(trainvar)])
        # NOTE(review): weight_dir is not defined inside this function, it
        # has to exist at module level -- confirm where it comes from
        out_file = os.path.join(weight_dir, filename + '.root')
        fit_function = 'fitFunction_' + str(trainvar)
        mass_min = min(masses)
        mass_max = max(masses)
        print('Fitfunction: ' + fit_function)
        print('Range: ' + '[' + str(mass_min) + ',' + str(mass_max) + ']')
        function_TF1 = TF1(fit_function, fit_poly_order, float(mass_min),
                           float(mass_max))
        profile.Fit(function_TF1, 'SF')  # Fit with Minuit
        function_TF1.Draw('same')
        canvas.Modified()
        canvas.Update()
        canvas.SaveAs(out_file)
        # NOTE(review): "RECREATE" on the same path replaces the file the
        # canvas was just saved to, so only the fit function remains in
        # out_file -- confirm that this is intended
        tfile = ROOT.TFile(out_file, "RECREATE")
        function_TF1.Write()
        tfile.Close()
示例#5
0
    def read_trainvar_info(self, path):
        """Maps the 'key' of every entry in the file to its 'true_int' value

        Parameters:
        -----------
        path : str
            Path to the file that is read with ut.read_parameters

        Returns:
        -------
        trainvar_info : dict
            Dictionary with one item per entry: entry['key'] ->
            entry['true_int']. For a repeated key the last entry wins.
        """
        trainvar_info = {}
        for entry in ut.read_parameters(path):
            true_int_flag = entry['true_int']
            trainvar_info[entry['key']] = true_int_flag
        return trainvar_info
def create_renewed_histo_dict(missing_trainvars, redundant_trainvars,
                              histo_dict_path):
    """ Builds the updated list of histogram infos.

    Old entries are kept unless their 'Variable' is redundant; every missing
    trainvar gets a new entry filled with default values.

    Parameters:
    -----------
    missing_trainvars : list
        List of new trainvars not present in the old histo_dict.json
    redundant_trainvars : list
        List of trainvars in the old histo_dict.json that are to be dropped
    histo_dict_path : str
        Path where the histo_dict.json is located

    Returns:
    -------
    new_histo_infos : list of dicts
        Kept old histo_infos followed by the default ones of the missing
        trainvars
    """
    kept_histo_infos = [
        old_info for old_info in ut.read_parameters(histo_dict_path)
        if old_info['Variable'] not in redundant_trainvars
    ]
    # Default settings given to a trainvar without previous histogram info
    default_histo_info = {
        'Variable': '',
        'nbins': 55,
        'min': 0.0,
        'max': 1100.0,
        'fitFunc_AllMassTraining': 'pol1',
        'fitFunc_LowMassTraining': 'pol1',
        'fitFunc_HighMassTraining': 'pol1'
    }
    added_histo_infos = [
        dict(default_histo_info, Variable=new_trainvar)
        for new_trainvar in missing_trainvars
    ]
    return kept_histo_infos + added_histo_infos
示例#7
0
def main(hyperparameter_file, output_dir):
    """Evaluates one set of hyperparameters and saves the score

    The global settings are read from '<output_dir>/run_settings'. The data
    is loaded, reweighed and normalized before the evaluation. The score is
    written to 'score.json' in the directory of hyperparameter_file, keyed
    by the 'fitness_fn' setting. Without k-fold the train and test
    predictions are saved to the same directory.

    Parameters:
    -----------
    hyperparameter_file : str
        Path to the file whose first entry holds the hyperparameters
    output_dir : str
        Path to the directory containing the 'run_settings' folder

    Returns:
    --------
    Nothing
    """
    global_settings = ut.read_settings(
        os.path.join(output_dir, 'run_settings'), 'global')
    # Not used below, but a missing setting still fails right here
    num_classes = global_settings['num_classes']
    nthread = global_settings['nthread']
    save_dir = str(Path(hyperparameter_file).parent)
    hyperparameters = ut.read_parameters(hyperparameter_file)[0]
    preferences = dlt.get_parameters(
        global_settings['process'],
        global_settings['channel'],
        global_settings['bkg_mass_rand'],
        global_settings['tauID_training'])
    data = dlt.load_data(
        preferences['inputPath'], preferences['channelInTree'],
        preferences['trainvars'], global_settings['bdtType'],
        global_settings['channel'], preferences['keys'],
        preferences['masses'], global_settings['bkg_mass_rand'])
    dlt.reweigh_dataframe(
        data, preferences['weight_dir'], preferences['trainvars'],
        ['gen_mHH'], preferences['masses'])
    normalize_hh_dataframe(data, preferences, global_settings)
    if not global_settings['use_kfold']:
        # Single evaluation: the predictions are available and stored too
        score, pred_train, pred_test = et.get_evaluation(
            xt.model_evaluation_main, data, preferences['trainvars'],
            global_settings, hyperparameters)
        st.save_prediction_files(pred_train, pred_test, save_dir)
    else:
        score = et.kfold_cv(
            xt.model_evaluation_main, data, preferences['trainvars'],
            global_settings, hyperparameters)
    with open(os.path.join(save_dir, 'score.json'), 'w') as score_file:
        json.dump({global_settings['fitness_fn']: score}, score_file)
示例#8
0
def main():
    """Runs the PSO hyperparameter optimization and saves the best result

    Settings are read from the 'settings' folder below $CMSSW_BASE. The
    output directory of the global settings is created when needed, the run
    settings are saved there and the outcome of pt.run_pso is written to
    'best_hyperparameters.json' in the same directory.
    """
    cmssw_base = os.path.expandvars('$CMSSW_BASE')
    settings_dir = os.path.join(
        cmssw_base, 'src/machineLearning/machineLearning/settings')
    global_settings = ut.read_settings(settings_dir, 'global')
    output_dir = os.path.expandvars(global_settings['output_dir'])
    output_dir_exists = os.path.isdir(output_dir)
    if not output_dir_exists:
        os.makedirs(output_dir)
    ut.save_run_settings(output_dir)
    print("::::::: Reading parameters :::::::")
    value_dicts = ut.read_parameters(
        os.path.join(settings_dir, 'xgb_parameters.json'))
    pso_settings = ut.read_settings(settings_dir, 'pso')
    initial_parameter_sets = xt.prepare_run_params(
        value_dicts, pso_settings['sample_size'])
    print("\n============ Starting hyperparameter optimization ==========\n")
    optimized_parameters = pt.run_pso(
        value_dicts, st.get_fitness_score, initial_parameter_sets,
        output_dir)
    print("\n============ Saving results ================\n")
    ut.save_dict_to_json(
        optimized_parameters,
        os.path.join(output_dir, 'best_hyperparameters.json'))
    print("Results saved to " + str(output_dir))
示例#9
0
def read_fitness(output_dir, fitness_key='d_roc'):
    '''Collects the fitness of each sample. List is ordered according to the
    number of the sample

    The amount of samples is the number of '<output_dir>/samples/*/score.json'
    files; the scores are then read from the folders named 0, 1, 2, ...

    Parameters:
    ----------
    output_dir : str
        Path to the directory of output
    fitness_key : str
        Key of the value that is picked from each score dictionary

    Returns:
    -------
    scores : list
        List of fitnesses
    '''
    samples_dir = os.path.join(output_dir, 'samples')
    score_file_count = len(
        glob.glob(os.path.join(samples_dir, '*', 'score.json')))
    # All the score files are read before any of the values is picked
    score_dicts = [
        ut.read_parameters(
            os.path.join(samples_dir, str(sample_nr), 'score.json'))[0]
        for sample_nr in range(score_file_count)
    ]
    return [sample_scores[fitness_key] for sample_scores in score_dicts]
示例#10
0
def test_read_parameters():
    """Checks that ut.read_parameters returns the list of dicts expected for
    'best_parameters.json' of the resources directory"""
    expected = [{'a': 1, 'b': 2, 'c': 3}, {'stuff': 1}]
    test_file = os.path.join(resources_dir, 'best_parameters.json')
    assert ut.read_parameters(test_file) == expected