def calculate_ratio(flux, flux_err_stat, flux_err_sys, true_flux,
                    true_flux_err_stat, true_flux_err_sys):
    """Return the fractional difference of a flux w.r.t. a reference flux.

    Computes (flux - true_flux) / true_flux along with its statistical and
    systematic uncertainties, each propagated separately through
    comp.ratio_error.

    Parameters
    ----------
    flux, true_flux : array_like
        Measured and reference flux values.
    flux_err_stat, true_flux_err_stat : array_like
        Statistical uncertainties on flux and true_flux.
    flux_err_sys, true_flux_err_sys : array_like
        Systematic uncertainties on flux and true_flux.

    Returns
    -------
    frac_diff, frac_diff_stat, frac_diff_sys
        Fractional difference plus its statistical and systematic errors.
    """
    difference = flux - true_flux
    # Add the two independent error contributions in quadrature,
    # separately for the systematic and statistical components.
    difference_err_sys = np.sqrt(flux_err_sys ** 2 + true_flux_err_sys ** 2)
    difference_err_stat = np.sqrt(flux_err_stat ** 2 + true_flux_err_stat ** 2)
    # comp.ratio_error is called twice with the same numerator/denominator;
    # the ratio is identical both times — only the propagated error differs.
    frac_diff, frac_diff_sys = comp.ratio_error(difference, difference_err_sys,
                                                true_flux, true_flux_err_sys)
    frac_diff, frac_diff_stat = comp.ratio_error(difference,
                                                 difference_err_stat,
                                                 true_flux,
                                                 true_flux_err_stat)
    return frac_diff, frac_diff_stat, frac_diff_sys
def get_frac_correct(df_train, df_test, pipeline_str=None, num_groups=4,
                     energy_key='MC_log_energy'):
    """Calculate the fraction of correctly identified samples in each energy
    bin for each composition group (and for all events combined).

    In addition, the statistical error on the correctly identified fraction
    is calculated via comp.ratio_error.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training events used to fit the classification pipeline.
    df_test : pandas.DataFrame
        Testing events on which the identified fractions are evaluated.
    pipeline_str : str, optional
        Pipeline name passed to comp.get_pipeline. Defaults to
        'BDT_comp_IC86.2012_{num_groups}-groups'.
    num_groups : int, optional
        Number of composition groups (default 4).
    energy_key : {'MC_log_energy', 'reco_log_energy'}, optional
        Column used for the energy binning.

    Returns
    -------
    dict
        Keys 'frac_correct_{comp}' / 'frac_correct_err_{comp}' mapping to
        per-energy-bin arrays, for each composition group plus 'total'.

    Raises
    ------
    ValueError
        If energy_key is not one of the two supported columns.
    """
    # Input validation
    if energy_key not in ['MC_log_energy', 'reco_log_energy']:
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))
    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    # Derive the composition list from num_groups. Previously this relied on
    # a module-level comp_list, which could silently disagree with the
    # num_groups parameter (fit_efficiencies derives it the same way).
    comp_list = comp.get_comp_list(num_groups=num_groups)

    # Fit pipeline and get mask for correctly identified events
    feature_list, feature_labels = comp.get_training_features()
    pipeline = comp.get_pipeline(pipeline_str)
    comp_target_str = 'comp_target_{}'.format(num_groups)
    pipeline.fit(df_train[feature_list], df_train[comp_target_str])
    test_predictions = pipeline.predict(df_test[feature_list])
    correctly_identified_mask = (test_predictions == df_test[comp_target_str])

    data = {}
    for composition in comp_list + ['total']:
        if composition == 'total':
            # Select every event for the 'total' group. The comp_group
            # column holds group names, never the literal 'total' (see the
            # identical special-case in fit_efficiencies), so the previous
            # equality comparison selected nothing for this iteration.
            comp_mask = np.ones(len(df_test), dtype=bool)
        else:
            comp_mask = (df_test['comp_group_{}'.format(num_groups)] ==
                         composition)
        # Number of MC events of this composition in each energy bin
        num_MC_energy, _ = np.histogram(df_test.loc[comp_mask, energy_key],
                                        bins=energybins.log_energy_bins)
        num_MC_energy_err = np.sqrt(num_MC_energy)
        # Number of correctly identified events in each energy bin
        combined_mask = comp_mask & correctly_identified_mask
        num_reco_energy, _ = np.histogram(
            df_test.loc[combined_mask, energy_key],
            bins=energybins.log_energy_bins)
        num_reco_energy_err = np.sqrt(num_reco_energy)
        # Correctly identified fraction vs. energy with propagated errors.
        # NOTE(review): energybins is a module-level global — verify it
        # matches the detector configuration this pipeline was trained for.
        frac_correct, frac_correct_err = comp.ratio_error(
            num_reco_energy, num_reco_energy_err,
            num_MC_energy, num_MC_energy_err)
        data['frac_correct_{}'.format(composition)] = frac_correct
        data['frac_correct_err_{}'.format(composition)] = frac_correct_err
    return data
'livetime'] rate_2012 = df_flux_2012['counts_total'] / df_flux_2012[ 'livetime'] ratio[config] = rate[6] / rate_2012[6] else: ratio = {config: 1.0 for config in args.config} print(ratio) # Plot rate for each year on single plot fig, ax = plt.subplots() for composition in comp_list + ['total']: for config in args.config: df_flux_config = df_flux.loc[config] rate, rate_err = comp.ratio_error( df_flux_config['counts_' + composition], np.sqrt(df_flux_config['counts_' + composition]), df_flux_config['livetime'], df_flux_config['livetime_err']) plotting.plot_steps(energybins.log_energy_bins, rate, yerr=rate_err, ax=ax, color=df_flux_config[composition + '_color'], label=config + ' ' + composition) ax.set_yscale("log", nonposy='clip') ax.set_xlabel('$\mathrm{\log_{10}(E_{reco}/GeV)}$') ax.set_ylabel('Rate $\mathrm{[s^{-1}]}$') ax.set_xlim([energybins.log_energy_min, energybins.log_energy_max]) # ax.set_ylim([10**3, 10**5]) ax.grid(linestyle='dotted', which="both")
def fit_efficiencies(df_file=None, config='IC86.2012', num_groups=2,
                     sigmoid='slant', n_samples=1000):
    """Fit detection efficiency vs. energy for each composition group.

    Per-bin efficiencies (passed / thrown showers) are computed from
    simulation, scaled by the per-bin thrown-area geometric factor, and fit
    with a generated sigmoid-polynomial function. The fit uncertainty is
    estimated by refitting n_samples random Gaussian fluctuations of the
    best-fit efficiency.

    Parameters
    ----------
    df_file : str, optional
        Simulation DataFrame file passed to comp.load_sim.
    config : str, optional
        Detector configuration (default 'IC86.2012').
    num_groups : int, optional
        Number of composition groups (default 2).
    sigmoid : str, optional
        Historical fit-function selector ('flat'/'slant'); currently unused
        because the fit function comes from generate_fit_func. Kept for
        backward compatibility.
    n_samples : int, optional
        Number of fluctuation refits used for the uncertainty estimate.

    Returns
    -------
    efficiencies : pandas.DataFrame
        Per-bin (scaled) efficiencies and errors, restricted to the fitted
        energy range.
    eff_fit : pandas.DataFrame
        Median fitted efficiency with 16th/84th-percentile errors per group.
    """
    print('Loading df_file: {}'.format(df_file))
    comp_list = comp.get_comp_list(num_groups=num_groups)
    energybins = comp.get_energybins(config=config)
    # Include energy bins below the normal analysis range so we get a better
    # estimate of how the detector efficiencies turn on
    low_energy_bins = np.arange(5.0, energybins.log_energy_min, 0.1)
    bins = np.concatenate((low_energy_bins, energybins.log_energy_bins))
    bin_midpoints = (bins[1:] + bins[:-1]) / 2

    df_sim = comp.load_sim(df_file=df_file, config=config, test_size=0,
                           log_energy_min=None, log_energy_max=None)

    # Thrown areas are different for different energy bins
    thrown_radii = comp.simfunctions.get_sim_thrown_radius(bin_midpoints)
    thrown_areas = np.pi * thrown_radii**2
    thrown_areas_max = thrown_areas.max()

    # Calculate efficiencies and effective areas for each composition group
    efficiencies = pd.DataFrame()
    effective_area, effective_area_err = {}, {}
    for composition in comp_list + ['total']:
        compositions = df_sim['comp_group_{}'.format(num_groups)]
        # Need list of simulation sets for composition to get number of
        # thrown showers
        if composition == 'total':
            # All-events boolean mask. np.full_like(compositions, True)
            # inherited the string dtype of the composition column and so
            # did not yield a reliable boolean mask.
            comp_mask = np.ones(len(compositions), dtype=bool)
        else:
            comp_mask = compositions == composition
        sim_list = df_sim.loc[comp_mask, 'sim'].unique()
        thrown_showers = thrown_showers_per_ebin(sim_list,
                                                 log_energy_bins=bins)
        print('thrown_showers ({}) = {}'.format(composition, thrown_showers))
        passed_showers = np.histogram(df_sim.loc[comp_mask, 'MC_log_energy'],
                                      bins=bins)[0]
        efficiency, efficiency_err = comp.ratio_error(
            num=passed_showers,
            num_err=np.sqrt(passed_showers),
            den=thrown_showers,
            den_err=np.sqrt(thrown_showers))
        # Effective area from efficiencies and thrown areas
        effective_area[composition] = efficiency * thrown_areas
        effective_area_err[composition] = efficiency_err * thrown_areas
        # Scale efficiencies by geometric factor to take into account
        # different simulated thrown radii
        thrown_radius_factor = thrown_areas / thrown_areas_max
        efficiencies['eff_{}'.format(composition)] = (
            efficiency * thrown_radius_factor)
        efficiencies['eff_err_{}'.format(composition)] = (
            efficiency_err * thrown_radius_factor)

    # Fit sigmoid function to efficiency vs. energy distribution
    poly_degree = 1
    num_params = poly_degree + 3
    fit_func = generate_fit_func(degree=poly_degree)
    init_params = [8.5, 50.0, 7e4, 800]
    p0 = np.empty(num_params)
    p0[:min(num_params, len(init_params))] = init_params[:num_params]

    efficiencies_fit = {}
    energy_min_fit, energy_max_fit = 5.8, energybins.log_energy_max
    midpoints_fitmask = np.logical_and(bin_midpoints > energy_min_fit,
                                       bin_midpoints < energy_max_fit)
    # Find best-fit sigmoid function
    for composition in comp_list + ['total']:
        eff = efficiencies.loc[midpoints_fitmask,
                               'eff_{}'.format(composition)]
        eff_err = efficiencies.loc[midpoints_fitmask,
                                   'eff_err_{}'.format(composition)]
        popt, pcov = curve_fit(fit_func, bin_midpoints[midpoints_fitmask],
                               eff, p0=p0, sigma=eff_err)
        eff_fit = fit_func(bin_midpoints, *popt)
        efficiencies_fit[composition] = eff_fit
        chi2 = np.sum((eff - eff_fit[midpoints_fitmask])**2 / (eff_err)**2)
        ndof = len(eff_fit[midpoints_fitmask]) - len(p0)
        print('({}) chi2 / ndof = {} / {} = {}'.format(composition, chi2,
                                                       ndof, chi2 / ndof))

    # Perform many fits to random statistical fluctuations of the best-fit
    # efficiency; used to estimate the uncertainty in the best-fit efficiency
    np.random.seed(2)
    efficiencies_fit_samples = defaultdict(list)
    # range (not the Python-2-only xrange): the file already uses
    # Python 3 print() calls throughout.
    for _ in range(n_samples):
        for composition in comp_list + ['total']:
            # Get new random sample to fit
            eff_err = efficiencies.loc[midpoints_fitmask,
                                       'eff_err_{}'.format(composition)]
            eff_sample = np.random.normal(
                efficiencies_fit[composition][midpoints_fitmask], eff_err)
            # Fit with error bars
            popt, pcov = curve_fit(fit_func,
                                   bin_midpoints[midpoints_fitmask],
                                   eff_sample, p0=p0, sigma=eff_err)
            eff_fit_sample = fit_func(bin_midpoints, *popt)
            efficiencies_fit_samples[composition].append(eff_fit_sample)

    # Calculate median and asymmetric errors of the efficiency fits
    eff_fit = pd.DataFrame()
    for composition in comp_list + ['total']:
        fit_median, fit_err_low, fit_err_high = np.percentile(
            efficiencies_fit_samples[composition], (50, 16, 84), axis=0)
        fit_err_low = np.abs(fit_err_low - fit_median)
        fit_err_high = np.abs(fit_err_high - fit_median)
        eff_fit['eff_median_{}'.format(composition)] = fit_median
        eff_fit['eff_err_low_{}'.format(composition)] = fit_err_low
        eff_fit['eff_err_high_{}'.format(composition)] = fit_err_high

    return efficiencies.loc[midpoints_fitmask, :], eff_fit
def get_classified_fractions(df_train, df_test, pipeline_str=None,
                             num_groups=4, energy_key='MC_log_energy'):
    """Calculate, per energy bin, the fraction of events of each true
    composition that are classified as each composition.

    For every (true_composition, identified_composition) pair, the fraction
    of test events with that true composition that the pipeline classifies
    as the identified composition is computed in each energy bin, together
    with its statistical error (an energy-binned confusion fraction).

    Parameters
    ----------
    df_train : pandas.DataFrame
        Unused here (the pipeline is loaded pre-trained, not fit); kept for
        interface symmetry with get_frac_correct.
    df_test : pandas.DataFrame
        Testing events on which the classified fractions are evaluated.
    pipeline_str : str, optional
        Pipeline/model name. Defaults to
        'BDT_comp_IC86.2012_{num_groups}-groups'.
    num_groups : int, optional
        Number of composition groups (default 4).
    energy_key : {'MC_log_energy', 'reco_log_energy'}, optional
        Column used for the energy binning.

    Returns
    -------
    dict
        Keys 'true_{a}_identified_{b}' / 'true_{a}_identified_{b}_err'
        mapping to per-energy-bin arrays.

    Raises
    ------
    ValueError
        If energy_key is not one of the two supported columns.
    """
    # Input validation
    if energy_key not in ['MC_log_energy', 'reco_log_energy']:
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))
    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    # Derive the composition list from num_groups. Previously this relied on
    # a module-level comp_list, which could silently disagree with the
    # num_groups parameter (fit_efficiencies derives it the same way).
    comp_list = comp.get_comp_list(num_groups=num_groups)

    feature_list, feature_labels = comp.get_training_features()
    # Compute the CustomClassifier check once instead of twice
    is_custom = 'CustomClassifier' in pipeline_str
    if is_custom:
        pipeline = comp.get_pipeline(pipeline_str)
    else:
        pipeline = comp.load_trained_model(pipeline_str)
    comp_target_str = 'comp_target_{}'.format(num_groups)
    if is_custom:
        # CustomClassifier pipelines predict from the target column directly
        test_predictions = pipeline.predict(df_test[comp_target_str])
    else:
        test_predictions = pipeline.predict(df_test[feature_list])
    pred_comp = np.array(
        comp.decode_composition_groups(test_predictions,
                                       num_groups=num_groups))

    data = {}
    for true_composition, identified_composition in product(comp_list,
                                                            comp_list):
        true_comp_mask = (df_test['comp_group_{}'.format(num_groups)] ==
                          true_composition)
        ident_comp_mask = pred_comp == identified_composition
        # Number of MC events of the true composition in each energy bin.
        # NOTE(review): energybins is a module-level global — verify it
        # matches the detector configuration this model was trained for.
        num_true_comp, _ = np.histogram(
            df_test.loc[true_comp_mask, energy_key],
            bins=energybins.log_energy_bins)
        num_true_comp_err = np.sqrt(num_true_comp)
        # Number of those events classified as identified_composition
        combined_mask = true_comp_mask & ident_comp_mask
        num_identified_comp, _ = np.histogram(
            df_test.loc[combined_mask, energy_key],
            bins=energybins.log_energy_bins)
        num_identified_comp_err = np.sqrt(num_identified_comp)
        # Classified fraction vs. energy with propagated errors
        frac_identified, frac_identified_err = comp.ratio_error(
            num_identified_comp, num_identified_comp_err,
            num_true_comp, num_true_comp_err)
        data['true_{}_identified_{}'.format(
            true_composition, identified_composition)] = frac_identified
        data['true_{}_identified_{}_err'.format(
            true_composition, identified_composition)] = frac_identified_err
    return data