Exemplo n.º 1
0
def get_frac_correct(df_train,
                     df_test,
                     pipeline_str=None,
                     num_groups=4,
                     energy_key='MC_log_energy'):
    '''Calculates the fraction of correctly identified samples in each energy
    bin for each composition in comp_list, and for all compositions combined
    ('total'). In addition, the statistical error for the fraction correctly
    identified is calculated.

    A pipeline is fit on ``df_train`` and used to predict ``df_test``. The
    names ``comp_list`` and ``energybins`` are not defined in this function
    and must be available from the enclosing scope.

    Parameters
    ----------
    df_train : DataFrame
        Training set. Must contain the training feature columns and the
        ``'comp_target_{num_groups}'`` column.
    df_test : DataFrame
        Testing set. Must contain the training feature columns, the
        ``'comp_target_{num_groups}'`` and ``'comp_group_{num_groups}'``
        columns, and the ``energy_key`` column.
    pipeline_str : str, optional
        Name passed to ``comp.get_pipeline``. Defaults to
        ``'BDT_comp_IC86.2012_{num_groups}-groups'``.
    num_groups : int, optional
        Number of composition groups; selects the target / group columns.
    energy_key : {'MC_log_energy', 'reco_log_energy'}, optional
        Column of ``df_test`` that is histogrammed.

    Returns
    -------
    data : dict
        For each composition (and ``'total'``) the keys
        ``'frac_correct_{composition}'`` and
        ``'frac_correct_err_{composition}'`` hold the values returned by
        ``comp.ratio_error`` for each bin of ``energybins.log_energy_bins``.

    Raises
    ------
    ValueError
        If ``energy_key`` is not one of the two supported column names.
    '''

    # Input validation
    if energy_key not in ('MC_log_energy', 'reco_log_energy'):
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    # Fit pipeline and get mask for correctly identified events
    feature_list, _ = comp.get_training_features()
    pipeline = comp.get_pipeline(pipeline_str)
    comp_target_str = 'comp_target_{}'.format(num_groups)
    pipeline.fit(df_train[feature_list], df_train[comp_target_str])

    test_predictions = pipeline.predict(df_test[feature_list])
    correctly_identified_mask = (test_predictions == df_test[comp_target_str])

    # Loop-invariant lookups
    comp_group = df_test['comp_group_{}'.format(num_groups)]
    log_energy_bins = energybins.log_energy_bins

    data = {}
    for composition in comp_list + ['total']:
        if composition == 'total':
            # 'total' is not a composition label, so comparing the group
            # column against it would select nothing. Use every test event.
            comp_mask = np.ones(len(df_test), dtype=bool)
        else:
            comp_mask = comp_group == composition

        # Get number of MC comp in each energy bin
        num_MC_energy, _ = np.histogram(df_test.loc[comp_mask, energy_key],
                                        bins=log_energy_bins)
        # Square root of the counts is used as the uncertainty on the counts
        num_MC_energy_err = np.sqrt(num_MC_energy)

        # Get number of correctly identified comp in each energy bin
        combined_mask = correctly_identified_mask & comp_mask
        num_reco_energy, _ = np.histogram(df_test.loc[combined_mask,
                                                      energy_key],
                                          bins=log_energy_bins)
        num_reco_energy_err = np.sqrt(num_reco_energy)

        # Calculate correctly identified fractions as a function of energy
        frac_correct, frac_correct_err = comp.ratio_error(
            num_reco_energy, num_reco_energy_err, num_MC_energy,
            num_MC_energy_err)
        data['frac_correct_{}'.format(composition)] = frac_correct
        data['frac_correct_err_{}'.format(composition)] = frac_correct_err

    return data
Exemplo n.º 2
0
def get_config_flux(config):
    '''Collect the per-bin event counts and exposure terms for a data config.

    A 'BDT' pipeline is fit on the simulation training set that corresponds
    to ``config`` and used to classify the data events. The classified events
    are then histogrammed in ``'lap_log_energy'``.

    Parameters
    ----------
    config : str
        Detector data configuration. It is passed to ``comp.load_data`` and
        ``comp.get_detector_livetime``, and converted to a simulation
        configuration with ``data_config_to_sim_config``.

    Returns
    -------
    df_flux : dict
        Plain dictionary with the keys

        * ``'counts_light'``, ``'counts_heavy'``, ``'counts_total'`` --
          number of events in each bin of ``energybins.log_energy_bins``
        * ``'counts_light_err'``, ``'counts_heavy_err'``,
          ``'counts_total_err'`` -- square root of the matching counts
        * ``'solid_angle'`` -- ``2 * pi * (1 - cos(max zenith))`` where the
          maximum is taken over ``'lap_zenith'`` in the simulation training
          set (presumably in radians; confirm against the data source)
        * ``'livetime'``, ``'livetime_err'`` -- values returned by
          ``comp.get_detector_livetime``
    '''

    sim_config = data_config_to_sim_config(config)

    pipeline_str = 'BDT'
    pipeline = comp.get_pipeline(pipeline_str)
    energybins = comp.analysis.get_energybins()
    log_energy_bins = energybins.log_energy_bins
    # Load simulation and training features
    df_sim_train, df_sim_test = comp.load_sim(config=sim_config, verbose=False)
    feature_list, _ = comp.analysis.get_training_features()
    # Load data. The energy column is appended to the features so that both
    # are extracted in a single call and stay row-aligned.
    df_data = comp.load_data(config=config)
    X_data = comp.dataframe_functions.dataframe_to_array(
        df_data, feature_list + ['lap_log_energy'])
    log_energy = X_data[:, -1]
    X_data = X_data[:, :-1]

    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
    data_predictions = pipeline.predict(X_data)
    # Convert predicted labels to composition names
    data_labels = np.array([
        comp.dataframe_functions.label_to_comp(pred)
        for pred in data_predictions
    ])

    # Get number of identified comp in each energy bin
    df_flux = {}
    comp_list = ['light', 'heavy']
    for composition in comp_list:
        comp_mask = data_labels == composition
        counts_key = 'counts_' + composition
        df_flux[counts_key] = np.histogram(log_energy[comp_mask],
                                           bins=log_energy_bins)[0]
        df_flux[counts_key + '_err'] = np.sqrt(df_flux[counts_key])

    df_flux['counts_total'] = np.histogram(log_energy,
                                           bins=log_energy_bins)[0]
    df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])
    # Solid angle
    max_zenith_rad = df_sim_train['lap_zenith'].max()
    solid_angle = 2 * np.pi * (1 - np.cos(max_zenith_rad))
    df_flux['solid_angle'] = solid_angle
    # Livetime
    livetime, livetime_err = comp.get_detector_livetime(config=config)
    df_flux['livetime'] = livetime
    df_flux['livetime_err'] = livetime_err

    return df_flux
def save_anisotropy_dataframe(config, outfile):
    '''Classify the data events for ``config`` and save them to an HDF5 file.

    A 'GBDT' pipeline is fit on the full (unsplit) simulation DataFrame and
    used to predict a composition for every data event that has no missing
    training feature. The columns ``'lap_zenith'``, ``'lap_azimuth'``,
    ``'start_time_mjd'``, ``'pred_comp'`` and ``'lap_log_energy'`` are then
    written to ``outfile`` under the key ``'dataframe'``.

    Parameters
    ----------
    config : str
        Detector data configuration. Simulation from ``'IC86.2012'`` is used
        when ``'IC86'`` is contained in ``config``, ``'IC79'`` otherwise.
    outfile : str
        Path of the HDF5 file to write. An existing file is overwritten
        (the store is opened in ``'w'`` mode).

    Returns
    -------
    None
    '''

    print('Loading data...')
    data_df = comp.load_dataframe(datatype='data',
                                  config=config,
                                  verbose=False)
    keep_columns = [
        'lap_zenith', 'lap_azimuth', 'start_time_mjd', 'pred_comp',
        'lap_log_energy'
    ]

    pipeline_str = 'GBDT'
    pipeline = comp.get_pipeline(pipeline_str)
    feature_list, _ = comp.get_training_features()
    # Drop events with a missing training feature. This must be called on
    # data_df itself with ``subset``: calling dropna(inplace=True) on
    # ``data_df.loc[:, feature_list]`` only modifies a temporary copy.
    data_df.dropna(axis=0, how='any', subset=feature_list, inplace=True)

    print('Loading simulation...')
    if 'IC86' in config:
        sim_config = 'IC86.2012'
    else:
        sim_config = 'IC79'
    sim_df = comp.load_dataframe(datatype='sim',
                                 config=sim_config,
                                 verbose=False,
                                 split=False)
    X_train, y_train = comp.dataframe_functions.dataframe_to_X_y(
        sim_df, feature_list)
    print('Training classifier...')
    pipeline = pipeline.fit(X_train, y_train)
    X_data = comp.dataframe_functions.dataframe_to_array(data_df, feature_list)
    # Column assignment aligns on index labels, so the predictions must carry
    # data_df's own index rather than a default 0..n-1 index.
    # NOTE(review): assumes dataframe_to_array returns one row per row of
    # data_df -- confirm. A length mismatch raises ValueError here instead
    # of silently misaligning the predictions.
    data_pred = pd.Series(pipeline.predict(X_data),
                          index=data_df.index,
                          dtype=int)
    data_df['pred_comp'] = data_pred.apply(
        comp.dataframe_functions.label_to_comp)

    print('Saving anisotropy DataFrame for {}'.format(config))
    with pd.HDFStore(outfile, 'w') as store:
        store.put('dataframe', data_df.loc[:, keep_columns], format='table')
Exemplo n.º 4
0
def get_composition_pipeline(pipeline, p, use_sample_weights):
    '''Return a composition classifier, either loaded or freshly fit.

    The names ``config``, ``num_groups``, ``df_sim_train``, ``feature_list``
    and ``calculate_sample_weights`` are not defined in this function and
    must be available from the enclosing scope.

    Parameters
    ----------
    pipeline : str
        Pipeline name used as the first component of
        ``'{pipeline}_comp_{config}_{num_groups}-groups'``.
    p : object or None
        Only compared against ``None`` here; when it is ``None`` (and no
        sample weights are requested) a previously trained model is loaded.
    use_sample_weights : object or None
        When not ``None``, it is passed as ``model`` to
        ``calculate_sample_weights`` and a new pipeline is fit on
        ``df_sim_train`` with the resulting sample weights.

    Returns
    -------
    pipeline
        * the weighted, freshly fit pipeline if ``use_sample_weights`` is
          not ``None``;
        * otherwise the model returned by ``comp.load_trained_model`` if
          ``p`` is ``None``;
        * otherwise the ``pipeline`` argument, unchanged.
    '''
    if use_sample_weights is None and p is not None:
        # Nothing to load or fit
        return pipeline

    # Build the name from the ``pipeline`` argument before anything rebinds
    # it, so the name is never formatted from a loaded model object.
    pipeline_str = '{}_comp_{}_{}-groups'.format(pipeline, config,
                                                 num_groups)

    if use_sample_weights is None:
        return comp.load_trained_model(pipeline_str)

    # A weighted fit takes precedence over loading a trained model: the
    # loaded model would be discarded anyway.
    model = use_sample_weights
    weighted_pipeline = comp.get_pipeline(pipeline_str)
    compositions = df_sim_train['comp_group_{}'.format(num_groups)].values
    energies = df_sim_train['reco_energy'].values
    sample_weight = calculate_sample_weights(compositions,
                                             energies,
                                             model=model)
    X = df_sim_train[feature_list].values
    y = df_sim_train['comp_target_{}'.format(num_groups)].values
    # The weights are routed to the pipeline step named 'classifier'
    fit_params = {'classifier__sample_weight': sample_weight}
    weighted_pipeline.fit(X, y, **fit_params)

    return weighted_pipeline
Exemplo n.º 5
0
        df['reco_log_energy'] = energy_pipeline.predict(
            df[feature_list].values)
        df['reco_energy'] = 10**df['reco_log_energy']

    print('Loading or fitting composition classifier...')
    if any([
            args.weights_model, args.energy_spectrum_weights,
            args.compositon_weights
    ]):
        model = args.weights_model
        energy_spectrum_weights = args.energy_spectrum_weights
        compositon_weights = args.compositon_weights

        pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, config,
                                                     num_groups)
        pipeline = comp.get_pipeline(pipeline_str)
        compositions = df_sim_train['comp_group_{}'.format(num_groups)].values
        energies = df_sim_train['reco_energy'].values
        sample_weight = calculate_sample_weights(
            compositions,
            energies,
            model=model,
            compositon_weights=compositon_weights,
            energy_spectrum_weights=energy_spectrum_weights)
        X = df_sim_train[feature_list].values
        y = df_sim_train['comp_target_{}'.format(num_groups)].values
        fit_params = {'classifier__sample_weight': sample_weight}
        pipeline.fit(X, y, **fit_params)
    elif p is None:
        pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, config,
                                                     num_groups)
def get_classified_fractions(df_train,
                             df_test,
                             pipeline_str=None,
                             num_groups=4,
                             energy_key='MC_log_energy'):
    '''Calculates, for each pair of (true composition, identified composition)
    in comp_list, the fraction of true-composition samples in each energy bin
    that were classified as the identified composition. In addition, the
    statistical error for each fraction is calculated.

    The names ``comp_list`` and ``energybins`` are not defined in this
    function and must be available from the enclosing scope.

    Parameters
    ----------
    df_train : DataFrame
        Not used by this function (no pipeline is fit here); kept in the
        signature for interface compatibility.
    df_test : DataFrame
        Testing set. Must contain the ``'comp_group_{num_groups}'`` and
        ``energy_key`` columns, plus the training feature columns (or the
        ``'comp_target_{num_groups}'`` column for a 'CustomClassifier').
    pipeline_str : str, optional
        Pipeline name. Defaults to
        ``'BDT_comp_IC86.2012_{num_groups}-groups'``. If it contains
        ``'CustomClassifier'`` the pipeline is built with
        ``comp.get_pipeline`` and predicts from the target column; otherwise
        a trained model is loaded with ``comp.load_trained_model`` and
        predicts from the training features.
    num_groups : int, optional
        Number of composition groups; selects the target / group columns.
    energy_key : {'MC_log_energy', 'reco_log_energy'}, optional
        Column of ``df_test`` that is histogrammed.

    Returns
    -------
    data : dict
        For each pair the keys ``'true_{true}_identified_{identified}'`` and
        ``'true_{true}_identified_{identified}_err'`` hold the values
        returned by ``comp.ratio_error`` for each bin of
        ``energybins.log_energy_bins``.

    Raises
    ------
    ValueError
        If ``energy_key`` is not one of the two supported column names.
    '''

    # Input validation
    if energy_key not in ('MC_log_energy', 'reco_log_energy'):
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    # Get pipeline and predictions for the testing set
    feature_list, _ = comp.get_training_features()
    comp_target_str = 'comp_target_{}'.format(num_groups)
    if 'CustomClassifier' in pipeline_str:
        pipeline = comp.get_pipeline(pipeline_str)
        test_predictions = pipeline.predict(df_test[comp_target_str])
    else:
        pipeline = comp.load_trained_model(pipeline_str)
        test_predictions = pipeline.predict(df_test[feature_list])
    pred_comp = np.array(
        comp.decode_composition_groups(test_predictions,
                                       num_groups=num_groups))

    # Loop-invariant lookups
    comp_group = df_test['comp_group_{}'.format(num_groups)]
    log_energy_bins = energybins.log_energy_bins

    data = {}
    for true_composition in comp_list:
        true_comp_mask = comp_group == true_composition

        # Get number of MC comp in each energy bin. This only depends on the
        # true composition, so it is computed once per true composition
        # rather than once per (true, identified) pair.
        num_true_comp, _ = np.histogram(df_test.loc[true_comp_mask,
                                                    energy_key],
                                        bins=log_energy_bins)
        num_true_comp_err = np.sqrt(num_true_comp)

        for identified_composition in comp_list:
            ident_comp_mask = pred_comp == identified_composition

            # Get number of true comp identified as identified_composition
            # in each energy bin
            combined_mask = true_comp_mask & ident_comp_mask
            num_identified_comp, _ = np.histogram(
                df_test.loc[combined_mask, energy_key],
                bins=log_energy_bins)
            num_identified_comp_err = np.sqrt(num_identified_comp)

            # Calculate identified fractions as a function of energy
            frac_identified, frac_identified_err = comp.ratio_error(
                num_identified_comp, num_identified_comp_err, num_true_comp,
                num_true_comp_err)
            key = 'true_{}_identified_{}'.format(true_composition,
                                                 identified_composition)
            data[key] = frac_identified
            data[key + '_err'] = frac_identified_err

    return data