Example #1
def get_config_flux(config):

    sim_config = data_config_to_sim_config(config)

    pipeline_str = 'BDT'
    pipeline = comp.get_pipeline(pipeline_str)
    energybins = comp.analysis.get_energybins()
    # Load simulation and training features
    df_sim_train, df_sim_test = comp.load_sim(config=sim_config, verbose=False)
    feature_list, feature_labels = comp.analysis.get_training_features()
    # Load data
    df_data = comp.load_data(config=config)
    X_data = comp.dataframe_functions.dataframe_to_array(
        df_data, feature_list + ['lap_log_energy'])
    log_energy = X_data[:, -1]
    X_data = X_data[:, :-1]

    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
    data_predictions = pipeline.predict(X_data)
    # Get composition masks
    data_labels = np.array([
        comp.dataframe_functions.label_to_comp(pred)
        for pred in data_predictions
    ])
    data_light_mask = data_labels == 'light'
    data_heavy_mask = data_labels == 'heavy'
    # Get number of events identified as each composition in each energy bin
    df_flux = {}
    comp_list = ['light', 'heavy']
    for composition in comp_list:
        comp_mask = data_labels == composition
        df_flux['counts_' + composition] = np.histogram(
            log_energy[comp_mask], bins=energybins.log_energy_bins)[0]
        df_flux['counts_' + composition + '_err'] = np.sqrt(
            df_flux['counts_' + composition])

    df_flux['counts_total'] = np.histogram(log_energy,
                                           bins=energybins.log_energy_bins)[0]
    df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])
    # Solid angle
    max_zenith_rad = df_sim_train['lap_zenith'].max()
    solid_angle = 2 * np.pi * (1 - np.cos(max_zenith_rad))
    df_flux['solid_angle'] = solid_angle
    # Livetime
    livetime, livetime_err = comp.get_detector_livetime(config=config)
    df_flux['livetime'] = livetime
    df_flux['livetime_err'] = livetime_err

    return df_flux
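The dictionary returned above only collects the raw ingredients of a flux measurement (per-bin counts, solid angle, livetime); it does not combine them. As a rough usage sketch, assuming a hypothetical constant effective area `eff_area` and deriving linear-energy bin widths from the `energybins.log_energy_bins` edges used above:

df_flux = get_config_flux(config='IC86.2012')
energybins = comp.analysis.get_energybins()
# Hypothetical effective area; the snippet above does not compute one.
eff_area = 2.5e5
# Linear-energy bin widths from the log-energy bin edges
energy_bin_widths = (10**energybins.log_energy_bins[1:]
                     - 10**energybins.log_energy_bins[:-1])
for composition in ['light', 'heavy', 'total']:
    counts = df_flux['counts_' + composition]
    flux = counts / (eff_area * df_flux['solid_angle'] *
                     df_flux['livetime'] * energy_bin_widths)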
Example #2
    config = args.config
    num_groups = args.num_groups
    p = args.prob_correct

    comp_list = comp.get_comp_list(num_groups=num_groups)
    energybins = comp.get_energybins(config)
    num_ebins = len(energybins.log_energy_midpoints)

    data_dir = os.path.join(comp.paths.comp_data_dir, config, 'unfolding',
                            'datachallenge')

    # Load simulation and train composition classifier
    df_sim_train, df_sim_test = comp.load_sim(config=config,
                                              energy_reco=False,
                                              log_energy_min=None,
                                              log_energy_max=None,
                                              test_size=0.5,
                                              verbose=True)

    feature_list, feature_labels = comp.get_training_features()

    print('Loading energy regressor...')
    energy_pipeline = comp.load_trained_model(
        'linearregression_energy_{}'.format(config))
    # energy_pipeline = comp.load_trained_model('RF_energy_{}'.format(config))
    for df in [df_sim_train, df_sim_test]:
        df['reco_log_energy'] = energy_pipeline.predict(
            df[feature_list].values)
        df['reco_energy'] = 10**df['reco_log_energy']

    print('Loading or fitting composition classifier...')
Example #3
                        'gridsearch. Ignored if gridsearch=False.')
    args = parser.parse_args()

    config = args.config
    num_groups = args.num_groups

    comp_list = comp.get_comp_list(num_groups=num_groups)
    energybins = comp.get_energybins(config=config)
    log_energy_min = energybins.log_energy_min
    log_energy_max = energybins.log_energy_max

    # Load training data and fit model
    df_sim_train, df_sim_test = comp.load_sim(
        config=config,
        energy_reco=False,
        log_energy_min=None,
        log_energy_max=None,
        # log_energy_min=log_energy_min,
        # log_energy_max=log_energy_max,
        test_size=0.5)
    feature_list, feature_labels = comp.get_training_features()
    X_train = df_sim_train[feature_list].values
    y_train = df_sim_train['comp_target_{}'.format(num_groups)].values

    # Load untrained model
    pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, config,
                                                 num_groups)
    pipeline = comp.get_pipeline(pipeline_str)

    if args.gridsearch:
        param_grid = comp.get_param_grid(pipeline_name=pipeline_str)
        pipeline = comp.gridsearch_optimize(pipeline=pipeline,
Example #4
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split

import comptools as comp

color_dict = comp.get_color_dict()

config = 'IC79.2010'
num_groups = 4
comp_list = comp.get_comp_list(num_groups)
energybins = comp.get_energybins(config=config)

df_sim_train, df_sim_test = comp.load_sim(config=config, test_size=0.5,
                                          log_energy_min=energybins.log_energy_min,
                                          log_energy_max=energybins.log_energy_max,
                                          verbose=True)

ldf_cols = [col for col in df_sim_train.columns if 'ldf' in col]

isnull_mask_train = df_sim_train[ldf_cols].isnull().sum(axis=1).astype(bool)
isnull_mask_test = df_sim_test[ldf_cols].isnull().sum(axis=1).astype(bool)
zero_ldf = df_sim_train[ldf_cols].sum(axis=1) == 0

X_train = df_sim_train.loc[~isnull_mask_train, ldf_cols].values
X_train = X_train / X_train.sum(axis=1)[:, None]
y_train = df_sim_train.loc[~isnull_mask_train, f'comp_target_{num_groups}'].values

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=2)

X_test = df_sim_test.loc[~isnull_mask_test, ldf_cols].values
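The Keras imports at the top of this example (Sequential, Dense, Dropout, to_categorical) suggest the normalized LDF columns feed a small dense classifier, but the model itself is cut off from this snippet. A minimal sketch with illustrative layer sizes and training settings, none of which are taken from the source:

# Sketch only: architecture and hyperparameters are illustrative assumptions.
y_train_cat = to_categorical(y_train, num_classes=num_groups)
y_val_cat = to_categorical(y_val, num_classes=num_groups)

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_groups, activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train, y_train_cat,
                    validation_data=(X_val, y_val_cat),
                    epochs=10,
                    batch_size=128)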
Example #5
def fit_efficiencies(df_file=None,
                     config='IC86.2012',
                     num_groups=2,
                     sigmoid='slant',
                     n_samples=1000):
    print('Loading df_file: {}'.format(df_file))

    comp_list = comp.get_comp_list(num_groups=num_groups)

    energybins = comp.get_energybins(config=config)
    # Want to include energy bins for energies below the normal analysis energy
    # range so we can get a better estimate of how the detector efficiencies turn on
    low_energy_bins = np.arange(5.0, energybins.log_energy_min, 0.1)
    bins = np.concatenate((low_energy_bins, energybins.log_energy_bins))
    bin_midpoints = (bins[1:] + bins[:-1]) / 2

    df_sim = comp.load_sim(df_file=df_file,
                           config=config,
                           test_size=0,
                           log_energy_min=None,
                           log_energy_max=None)

    # Thrown areas are different for different energy bins
    thrown_radii = comp.simfunctions.get_sim_thrown_radius(bin_midpoints)
    thrown_areas = np.pi * thrown_radii**2
    thrown_areas_max = thrown_areas.max()

    # Calculate efficiencies and effective areas for each composition group
    efficiencies = pd.DataFrame()
    effective_area, effective_area_err = {}, {}
    for composition in comp_list + ['total']:
        compositions = df_sim['comp_group_{}'.format(num_groups)]
        # Need list of simulation sets for composition to get number of thrown showers
        if composition == 'total':
            comp_mask = np.ones(len(compositions), dtype=bool)
        else:
            comp_mask = compositions == composition
        sim_list = df_sim.loc[comp_mask, 'sim'].unique()
        thrown_showers = thrown_showers_per_ebin(sim_list,
                                                 log_energy_bins=bins)
        print('thrown_showers ({}) = {}'.format(composition, thrown_showers))
        passed_showers = np.histogram(df_sim.loc[comp_mask, 'MC_log_energy'],
                                      bins=bins)[0]

        efficiency, efficiency_err = comp.ratio_error(
            num=passed_showers,
            num_err=np.sqrt(passed_showers),
            den=thrown_showers,
            den_err=np.sqrt(thrown_showers))

        # Calculate effective area from efficiencies and thrown areas
        effective_area[composition] = efficiency * thrown_areas
        effective_area_err[composition] = efficiency_err * thrown_areas

        # Scale efficiencies by geometric factor to take into account
        # different simulated thrown radii
        thrown_radius_factor = thrown_areas / thrown_areas_max
        efficiencies['eff_{}'.format(
            composition)] = efficiency * thrown_radius_factor
        efficiencies['eff_err_{}'.format(
            composition)] = efficiency_err * thrown_radius_factor

    # Fit sigmoid function to efficiency vs. energy distribution
    # fit_func = sigmoid_flat if sigmoid == 'flat' else sigmoid_slant
    poly_degree = 1
    num_params = poly_degree + 3
    fit_func = generate_fit_func(degree=poly_degree)
    # p0 = [7e4, 8.0, 50.0] if sigmoid == 'flat' else [7e4, 8.5, 50.0, 800]
    init_params = [8.5, 50.0, 7e4, 800]
    p0 = np.empty(num_params)
    p0[:min(num_params, len(init_params))] = init_params[:num_params]

    efficiencies_fit = {}
    energy_min_fit, energy_max_fit = 5.8, energybins.log_energy_max
    midpoints_fitmask = np.logical_and(bin_midpoints > energy_min_fit,
                                       bin_midpoints < energy_max_fit)
    # Find best-fit sigmoid function
    for composition in comp_list + ['total']:
        eff = efficiencies.loc[midpoints_fitmask, 'eff_{}'.format(composition)]
        eff_err = efficiencies.loc[midpoints_fitmask,
                                   'eff_err_{}'.format(composition)]
        popt, pcov = curve_fit(fit_func,
                               bin_midpoints[midpoints_fitmask],
                               eff,
                               p0=p0,
                               sigma=eff_err)
        eff_fit = fit_func(bin_midpoints, *popt)
        efficiencies_fit[composition] = eff_fit

        chi2 = np.sum((eff - eff_fit[midpoints_fitmask])**2 / (eff_err)**2)
        ndof = len(eff_fit[midpoints_fitmask]) - len(p0)
        print('({}) chi2 / ndof = {} / {} = {}'.format(composition, chi2, ndof,
                                                       chi2 / ndof))

    # Perform many fits to random statistical fluctuations of the best-fit efficiency.
    # These are used to estimate the uncertainty in the best-fit efficiency.
    np.random.seed(2)
    efficiencies_fit_samples = defaultdict(list)
    for _ in range(n_samples):
        for composition in comp_list + ['total']:
            # Get new random sample to fit
            eff_err = efficiencies.loc[midpoints_fitmask,
                                       'eff_err_{}'.format(composition)]
            eff_sample = np.random.normal(
                efficiencies_fit[composition][midpoints_fitmask], eff_err)
            # Fit with error bars
            popt, pcov = curve_fit(fit_func,
                                   bin_midpoints[midpoints_fitmask],
                                   eff_sample,
                                   p0=p0,
                                   sigma=eff_err)

            eff_fit_sample = fit_func(bin_midpoints, *popt)
            efficiencies_fit_samples[composition].append(eff_fit_sample)

    # Calculate median and error of efficiency fits
    eff_fit = pd.DataFrame()
    for composition in comp_list + ['total']:
        fit_median, fit_err_low, fit_err_high = np.percentile(
            efficiencies_fit_samples[composition], (50, 16, 84), axis=0)
        fit_err_low = np.abs(fit_err_low - fit_median)
        fit_err_high = np.abs(fit_err_high - fit_median)

        eff_fit['eff_median_{}'.format(composition)] = fit_median
        eff_fit['eff_err_low_{}'.format(composition)] = fit_err_low
        eff_fit['eff_err_high_{}'.format(composition)] = fit_err_high

    return efficiencies.loc[midpoints_fitmask, :], eff_fit
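`generate_fit_func` is defined elsewhere in the source and is not shown here. Judging from the parameter count used above (`poly_degree + 3`) and the 'slant' naming, one plausible form is a sigmoid turn-on multiplied by a degree-`degree` polynomial plateau. The sketch below illustrates that idea only; it is not the repository's actual function:

def generate_fit_func(degree=1):
    # Sketch: parameters are (x0, k, c0, ..., c_degree), i.e. degree + 3
    # free parameters, matching num_params = poly_degree + 3 above.
    def fit_func(log_energy, x0, k, *coeffs):
        log_energy = np.asarray(log_energy)
        turn_on = 1.0 / (1.0 + np.exp(-k * (log_energy - x0)))
        plateau = np.polyval(coeffs[::-1], log_energy)
        return turn_on * plateau
    return fit_func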
Example #6
    parser = argparse.ArgumentParser(
        description=
        'Extracts and saves desired information from simulation/data .i3 files'
    )
    parser.add_argument('-c',
                        '--config',
                        dest='config',
                        nargs='*',
                        choices=comp.simfunctions.get_sim_configs(),
                        help='Detector configuration')
    args = parser.parse_args()

    for config in args.config:

        df_sim = comp.load_sim(config=config, test_size=0)

        comp_list = ['light', 'heavy']
        MC_comp_mask = {}
        for composition in comp_list:
            MC_comp_mask[composition] = df_sim['MC_comp_class'] == composition
        light_mask = df_sim['MC_comp_class'] == 'light'
        heavy_mask = df_sim['MC_comp_class'] == 'heavy'

        energybins = comp.analysis.get_energybins()

        # Energy resolution
        energy_res = np.log10(df_sim['lap_energy'] / df_sim['MC_energy'])

        medians_light, stds_light, _ = comp.analysis.get_median_std(
            df_sim['MC_log_energy'][light_mask], energy_res[light_mask],
Example #7
                        help='Energy that should be used.')
    args = parser.parse_args()

    config = args.config
    num_groups = args.num_groups
    n_splits = args.n_splits
    n_jobs = args.n_jobs
    energy_key = 'MC_log_energy' if args.energy == 'MC' else 'reco_log_energy'

    energybins = comp.get_energybins(config)
    comp_list = comp.get_comp_list(num_groups=num_groups)
    feature_list, feature_labels = comp.get_training_features()
    pipeline_str = 'BDT_comp_{}_{}-groups'.format(config, num_groups)

    df_train, df_test = comp.load_sim(config=config,
                                      log_energy_min=energybins.log_energy_min,
                                      log_energy_max=energybins.log_energy_max,
                                      test_size=0.5)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2)
    folds = []
    for train_index, test_index in skf.split(
            df_train, df_train['comp_target_{}'.format(num_groups)]):
        df_train_fold = df_train.iloc[train_index]
        df_test_fold = df_train.iloc[test_index]
        frac_correct = get_frac_correct(df_train_fold,
                                        df_test_fold,
                                        pipeline_str=pipeline_str,
                                        num_groups=num_groups,
                                        energy_key=energy_key)
        folds.append(frac_correct)
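A typical follow-up is to summarize the per-fold results into a mean and spread of the classification accuracy in each energy bin. The sketch below assumes each element of `folds` maps a composition name to a per-energy-bin array; the return type of `get_frac_correct` is not shown in this snippet, so treat this as an assumption:

    # Sketch only: aggregate per-fold accuracies across cross-validation folds.
    frac_correct_folds = {
        composition: np.array([fold[composition] for fold in folds])
        for composition in comp_list
    }
    frac_correct_mean = {c: arr.mean(axis=0)
                         for c, arr in frac_correct_folds.items()}
    frac_correct_std = {c: arr.std(axis=0)
                        for c, arr in frac_correct_folds.items()}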
Example #8
                        help='Number of jobs to run in parallel')
    args = parser.parse_args()

    config = args.config
    n_jobs = args.n_jobs
    energybins = comp.get_energybins(config=config)
    log_energy_min = energybins.log_energy_min
    log_energy_max = energybins.log_energy_max
    feature_list, feature_labels = comp.get_training_features()

    print('Loading full non-processed dataset for {} into memory...'.format(
        config))
    ddf = comp.load_sim(
        config=config,
        # processed=False,
        test_size=0,
        energy_reco=False,
        log_energy_min=None,
        log_energy_max=None,
        compute=False)

    # ddf = comp.load_data(config=config,
    #                      processed=False,
    #                      energy_reco=False,
    #                      log_energy_min=None,
    #                      log_energy_max=None,
    #                      compute=False)

    # Energy reconstruction model
    energy_pipeline = comp.load_trained_model(
        'linearregression_energy_{}'.format(config), return_metadata=False)
Example #9
                        default=10,
                        type=int,
                        help='Number CV folds to run')
    parser.add_argument('--n_jobs',
                        dest='n_jobs',
                        default=20,
                        type=int,
                        help='Number of jobs to run in parallel')
    args = parser.parse_args()

    comp_list = comp.get_comp_list(num_groups=args.num_groups)
    energybins = comp.get_energybins(args.config)

    # Load simulation data and pipeline
    df_sim_train, df_sim_test = comp.load_sim(
        config=args.config,
        log_energy_min=energybins.log_energy_min,
        log_energy_max=energybins.log_energy_max)
    feature_list, feature_labels = comp.get_training_features()

    # pipeline_str = 'LinearSVC_comp_{}_{}-groups'.format(args.config, args.num_groups)
    # pipeline_str = 'BDT_comp_{}_{}-groups'.format(args.config, args.num_groups)
    pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, args.config,
                                                 args.num_groups)
    pipeline = comp.get_pipeline(pipeline_str)

    # Get learning curve scores
    X = df_sim_train[feature_list]
    y = df_sim_train['comp_target_{}'.format(args.num_groups)]
    train_sizes = np.linspace(0.1, 1.0, 10)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=pipeline,
Example #10
if __name__ == '__main__':

    description = 'Makes performance plots for IceTop Laputop reconstruction'
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-c',
                        '--config',
                        dest='config',
                        nargs='*',
                        choices=comp.simfunctions.get_sim_configs(),
                        help='Detector configuration')
    args = parser.parse_args()

    for config in args.config:

        df_sim = comp.load_sim(config=config, test_size=0, verbose=True)

        comp_list = ['light', 'heavy']
        MC_comp_mask = {}
        for composition in comp_list:
            MC_comp_mask[composition] = df_sim['MC_comp_class'] == composition
        light_mask = df_sim['MC_comp_class'] == 'light'
        heavy_mask = df_sim['MC_comp_class'] == 'heavy'

        energybins = comp.get_energybins()

        # Energy resolution
        energy_res = np.log10(df_sim['lap_energy'] / df_sim['MC_energy'])

        medians_light, stds_light, _ = comp.data_functions.get_median_std(
            df_sim['MC_log_energy'][light_mask], energy_res[light_mask],
Example #11
import comptools.analysis.plotting as plotting

color_dict = comp.analysis.get_color_dict()

if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='Makes and saves feature importance plot')
    parser.add_argument('-c',
                        '--config',
                        dest='config',
                        choices=comp.simfunctions.get_sim_configs(),
                        help='Detector configuration')
    args = parser.parse_args()

    df_sim_train, df_sim_test = comp.load_sim(config=args.config)
    pipeline_str = 'BDT'
    pipeline = comp.get_pipeline(pipeline_str)
    feature_list, feature_labels = comp.analysis.get_training_features()

    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])

    num_features = len(feature_list)
    importances = pipeline.named_steps['classifier'].feature_importances_
    indices = np.argsort(importances)[::-1]

    for f in range(num_features):
        print('{}) {}'.format(f + 1, importances[indices[f]]))

    # Make feature importance plot
    fig, ax = plt.subplots()
Example #12
def save_data_MC_plots(config, june_july_only):

    df_sim = comp.load_sim(config='IC86.2012', test_size=0, verbose=False)
    # energy_mask_sim = (df_sim['lap_log_energy'] > 6.0)
    # energy_mask_sim = (df_sim['lap_log_energy'] > 6.4) & (df_sim['lap_log_energy'] < 8.0)
    # df_sim = df_sim[energy_mask_sim]

    df_data = comp.load_data(config=config, verbose=False)
    df_data = df_data[np.isfinite(df_data['log_dEdX'])]
    # energy_mask_data = (df_data['lap_log_energy'] > 6.4) & (df_data['lap_log_energy'] < 8.0)
    # df_data = df_data[energy_mask_data]

    if june_july_only:
        print('Masking out all data events not in June or July')

        def is_june_july(time):
            i3_time = dataclasses.I3Time(time)
            return i3_time.date_time.month in [6, 7]

        june_july_mask = df_data.end_time_mjd.apply(is_june_july)
        df_data = df_data[june_july_mask].reset_index(drop=True)

    months = (6, 7) if june_july_only else None
    livetime, livetime_err = comp.get_detector_livetime(config, months=months)

    weights = get_sim_weights(df_sim)
    df_sim['weights'] = flux(df_sim['MC_energy']) * weights

    MC_comp_mask = {}
    comp_list = ['PPlus', 'Fe56Nucleus']
    for composition in comp_list:
        MC_comp_mask[composition] = df_sim['MC_comp'] == composition
    #     MC_comp_mask[composition] = df_sim['MC_comp_class'] == composition

    # S125 data-MC plot
    log_s125_bins = np.linspace(-0.5, 3.5, 50)
    gs_s125 = plot_data_MC_comparison(df_sim,
                                      df_data,
                                      'log_s125',
                                      log_s125_bins,
                                      '$\mathrm{\log_{10}(S_{125})}$',
                                      livetime,
                                      ylim_ratio=(0, 2))
    s125_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                's125_{}.png'.format(config))
    plt.savefig(s125_outfile)

    # dE/dX data-MC plot
    log_dEdX_bins = np.linspace(-2, 4, 50)
    gs_dEdX = plot_data_MC_comparison(df_sim,
                                      df_data,
                                      'log_dEdX',
                                      log_dEdX_bins,
                                      '$\mathrm{\log_{10}(dE/dX)}$',
                                      livetime,
                                      ylim_ratio=(0, 5.5))
    dEdX_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                'dEdX_{}.png'.format(config))
    plt.savefig(dEdX_outfile)

    # cos(zenith) data-MC plot
    cos_zenith_bins = np.linspace(0.8, 1.0, 50)
    gs_zenith = plot_data_MC_comparison(df_sim,
                                        df_data,
                                        'lap_cos_zenith',
                                        cos_zenith_bins,
                                        '$\mathrm{\cos(\\theta_{reco})}$',
                                        livetime,
                                        ylim_ratio=(0, 3))
    zenith_outfile = os.path.join(comp.paths.figures_dir, 'data-MC-comparison',
                                  'zenith_{}.png'.format(config))
    plt.savefig(zenith_outfile)

    # InIce median radius data-MC plot
    inice_radius_bins = np.linspace(0, 200, 50)
    gs_inice_radius = plot_data_MC_comparison(
        df_sim,
        df_data,
        'median_inice_radius',
        inice_radius_bins,
        'Median in-ice radius',
        livetime,
        ylim_ratio=(0, 3))
    inice_radius_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'median_inice_radius_{}.png'.format(config))
    plt.savefig(inice_radius_outfile)

    # log_d4r_peak_energy data-MC plot
    log_d4r_peak_energy_bins = np.linspace(-0.5, 3.5, 50)
    gs_d4R_peak_energy = plot_data_MC_comparison(
        df_sim,
        df_data,
        'log_d4r_peak_energy',
        log_d4r_peak_energy_bins,
        '$\mathrm{\log_{10}(E_{D4R}/GeV)}$',
        livetime,
        ylim_ratio=(0, 5.5))
    d4R_peak_energy_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_energy_{}.png'.format(config))
    plt.savefig(d4R_peak_energy_outfile)

    # log_d4r_peak_sigma data-MC plot
    log_d4r_peak_sigma_bins = np.linspace(-1, 3, 50)
    gs_d4R_peak_sigma = plot_data_MC_comparison(
        df_sim,
        df_data,
        'log_d4r_peak_sigma',
        log_d4r_peak_sigma_bins,
        '$\mathrm{\log_{10}(\sigma_{D4R})}$',
        livetime,
        ylim_ratio=(0, 5.5))
    d4R_peak_sigma_outfile = os.path.join(
        comp.paths.figures_dir, 'data-MC-comparison',
        'd4R_peak_sigma_{}.png'.format(config))
    plt.savefig(d4R_peak_sigma_outfile)