import logging

from fact.io import read_h5py
from fact.analysis import (
    split_on_off_source_dependent,
    split_on_off_source_independent,
)

log = logging.getLogger(__name__)


def load_gamma_subset(
    sourcefile,
    theta2_cut=0.0,
    conf_cut=0.9,
    num_off_positions=1,
    analysis_type='classic',
    with_runs=False,
):
    events = read_h5py(sourcefile, key='events')

    selection_columns = ['theta_deg', 'gamma_prediction', 'zd_tracking', 'conc_core']
    # the off keys are numbered 1..num_off_positions, so the range has to
    # end at num_off_positions + 1
    theta_off_columns = [
        'theta_deg_off_{}'.format(i) for i in range(1, num_off_positions + 1)
    ]
    bg_prediction_columns = [
        'gamma_prediction_off_{}'.format(i) for i in range(1, num_off_positions + 1)
    ]

    if analysis_type == 'source':
        log.info('\tSelecting events for the source dependent analysis')
        log.info('\t\tgamma_pred_cut={0:.2f}'.format(conf_cut))
        on_data, off_data = split_on_off_source_dependent(
            events=events,
            prediction_threshold=conf_cut,
            on_prediction_key='gamma_prediction',
            off_prediction_keys=bg_prediction_columns,
        )
        on_mc = events.query('gamma_prediction >= {}'.format(conf_cut))
    elif analysis_type == 'classic':
        log.info('\tSelecting events for the source independent analysis')
        log.info('\t\tgamma_pred_cut={0:.2f}'.format(conf_cut))
        log.info('\t\ttheta2_cut={0:.2f}'.format(theta2_cut))
        on_data, off_data = split_on_off_source_independent(
            events=events.query('gamma_prediction >= {}'.format(conf_cut)),
            theta2_cut=theta2_cut,
            theta_key='theta_deg',
            theta_off_keys=theta_off_columns,
        )
        # theta_deg is the angular distance itself, so it has to be squared
        # before comparing it to the theta^2 cut
        on_mc = events.query(
            '(theta_deg**2 <= {}) & (gamma_prediction >= {})'.format(
                theta2_cut, conf_cut,
            )
        )

    log.info('\t{} data events (on region)'.format(len(on_data)))
    log.info('\t\t{} data events ({} off regions)'.format(len(off_data), num_off_positions))
    log.info('\t{} MC gammas after selection'.format(len(on_mc)))

    # expected signal: on counts minus the mean off counts per region
    n_events_per_off_region = len(off_data) / num_off_positions
    n_events_on_region = len(on_data)
    n_events_expected_signal = n_events_on_region - n_events_per_off_region
    log.info('\t{:.1f} expected signal events'.format(n_events_expected_signal))

    if with_runs:
        # also return the total ontime of the observation runs
        runs = read_h5py(sourcefile, key='runs')
        t_obs = runs.ontime.sum()
        return on_mc, on_data, off_data, t_obs

    return on_mc, on_data, off_data
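# Usage sketch (not part of the original module; the file name and cut
# values below are assumptions chosen only to illustrate the call):
#
#     on_mc, on_data, off_data, t_obs = load_gamma_subset(
#         'crab_dl3.hdf5',
#         theta2_cut=0.03,
#         conf_cut=0.85,
#         num_off_positions=5,
#         with_runs=True,
#     )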
import numpy as np
import matplotlib.pyplot as plt

from fact.analysis import li_ma_significance, split_on_off_source_independent

# load_feature() is defined elsewhere in this project and returns the
# list of feature column names the models were trained on.


def model_significance(estimator, data):
    '''
    Evaluate the significance reached by a given trained model on a given dataset.

    Parameters:
    estimator: sklearn.model
        Trained model, so that the estimator can make predictions
        on the dataset.
    data: pd.DataFrame
        The dataset on which the significance should be calculated.

    Returns:
    max(significance): float
        Maximal significance on the dataset for the given model.
    '''
    feature = load_feature()
    data['gamma_prediction'] = estimator.predict_proba(data[feature])[:, 1]

    significance = []
    for threshold in np.linspace(0.01, 0.99, 99):
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction > @threshold'),
            theta2_cut=0.03,
        )
        significance.append(li_ma_significance(len(on_data), len(off_data), 0.2))

    return max(significance)
def plot_significance(estimator, data, save=True, path='significance.pdf'):
    '''
    Plot the significance in dependence of the prediction threshold.

    Parameters:
    estimator: sklearn.model
        Trained model, so that the estimator can make predictions
        on the dataset.
    data: pd.DataFrame
        The dataset on which the significance should be calculated.
    '''
    feature = load_feature()
    data['gamma_prediction'] = estimator.predict_proba(data[feature])[:, 1]

    thresholds = np.linspace(0.01, 0.99, 99)
    significance = []
    for threshold in thresholds:
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction > @threshold'),
            theta2_cut=0.03,
        )
        significance.append(li_ma_significance(len(on_data), len(off_data), 0.2))

    plt.plot(thresholds, significance)
    if save:
        plt.title('max({})'.format(round(max(significance), 2)))
        plt.xlabel('threshold')
        # the y axis shows the Li&Ma significance, not a confidence
        plt.ylabel('significance')
        plt.savefig(path)
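# Usage sketch for the two functions above (the classifier and the
# train/test split are assumptions for illustration; any fitted sklearn
# classifier with predict_proba works):
#
#     from sklearn.ensemble import RandomForestClassifier
#
#     clf = RandomForestClassifier(n_estimators=100)
#     clf.fit(train[load_feature()], train['label'])
#     print(model_significance(clf, test))
#     plot_significance(clf, test, path='build/significance.pdf')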
import logging

import h5py
import numpy as np
import astropy.units as u
import funfolding as ff

from fact.io import read_h5py
from fact.analysis import split_on_off_source_independent

# Config, logspace_binning, save_spectrum, read_simulated_spectrum,
# collection_area, setup_logging and the column-name constants E_PRED
# and E_TRUE are provided by the surrounding package.


def main(
    config,
    observation_file,
    gamma_file,
    corsika_file,
    output_file,
    seed,
    label,
    threshold,
    theta2_cut,
):
    ''' unfold fact data '''
    setup_logging()
    log = logging.getLogger('fact_funfolding')
    log.setLevel(logging.INFO)

    random_state = np.random.RandomState(seed)
    # also seed the global numpy RNG, it is used for the start values below
    np.random.set_state(random_state.get_state())

    config = Config.from_yaml(config)
    e_ref = config.e_ref
    threshold = threshold or config.threshold
    theta2_cut = theta2_cut or config.theta2_cut

    log.info(f'Using threshold {threshold}')
    log.info(f'Using theta2 cut {theta2_cut}')

    # define binning in e_est and e_true
    bins_obs = logspace_binning(
        config.e_est_low, config.e_est_high, e_ref, config.n_bins_est,
    )
    bins_true = logspace_binning(
        config.e_true_low, config.e_true_high, e_ref, config.n_bins_true,
    )

    # read in files
    query = 'gamma_prediction > {} and theta_deg**2 < {}'.format(
        threshold, theta2_cut,
    )

    log.info('Reading simulated gammas')
    gammas = read_h5py(gamma_file, key='events').query(query)
    with h5py.File(gamma_file, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)
    log.info('Using sampling fraction of {:.3f}'.format(sample_fraction))

    query = 'gamma_prediction > {}'.format(threshold)
    log.info('Reading observations')
    observations = read_h5py(observation_file, key='events').query(query)
    on, off = split_on_off_source_independent(observations, theta2_cut=theta2_cut)

    observation_runs = read_h5py(observation_file, key='runs')
    obstime = observation_runs.ontime.sum() * u.s

    corsika_events = read_h5py(
        corsika_file,
        key='corsika_events',
        columns=['total_energy'],
    )
    simulated_spectrum = read_simulated_spectrum(corsika_file)

    a_eff, bin_center, bin_width, a_eff_low, a_eff_high = collection_area(
        corsika_events.total_energy.values,
        gammas[E_TRUE].values,
        impact=simulated_spectrum['x_scatter'],
        bins=bins_true.to_value(u.GeV),
        sample_fraction=sample_fraction,
    )

    # unfold using funfolding
    X_model = gammas[E_PRED].values
    y_model = gammas[E_TRUE].values
    X_data = on[E_PRED].values

    g_model = np.digitize(X_model, bins_obs.to_value(u.GeV))
    f_model = np.digitize(y_model, bins_true.to_value(u.GeV))
    g_data = np.digitize(X_data, bins_obs.to_value(u.GeV))

    model = ff.model.LinearModel(random_state=random_state)
    model.initialize(digitized_obs=g_model, digitized_truth=f_model)

    vec_g_data, _ = model.generate_vectors(digitized_obs=g_data)
    vec_g_model, vec_f_model = model.generate_vectors(
        digitized_obs=g_model, digitized_truth=f_model,
    )

    # vec_g_bg has to exist even without background, it is passed
    # to save_spectrum below
    vec_g_bg = None
    if config.background:
        X_bg = off[E_PRED].values
        g_bg = np.digitize(X_bg, bins_obs.to_value(u.GeV))
        vec_g_bg, _ = model.generate_vectors(digitized_obs=g_bg)
        # scale the off counts by 0.2, there are five off regions
        model.add_background(vec_g_bg * 0.2)

    llh = ff.solution.StandardLLH(
        tau=config.tau,
        log_f=True,
        reg_factor_f=1 / a_eff.value[1:-1] if config.tau else None,
    )
    llh.initialize(
        vec_g=vec_g_data,
        model=model,
        ignore_n_bins_low=1,
        ignore_n_bins_high=1,
    )

    sol_mcmc = ff.solution.LLHSolutionMCMC(
        n_burn_steps=config.n_burn_steps,
        n_used_steps=config.n_used_steps,
        random_state=random_state,
    )
    sol_mcmc.initialize(llh=llh, model=model)
    # start from the MC truth scaled to the number of observed events
    sol_mcmc.set_x0_and_bounds(
        x0=np.random.poisson(vec_f_model * vec_g_data.sum() / vec_g_model.sum())
    )

    vec_f_est, sigma_vec_f, sample, probs, autocorr_time = sol_mcmc.fit()

    additional_features_to_save = dict()
    additional_features_to_save['a_eff'] = a_eff
    additional_features_to_save['a_eff_low'] = a_eff_low
    additional_features_to_save['a_eff_high'] = a_eff_high

    save_spectrum(
        output_file,
        bins_true,
        vec_f_est / a_eff / obstime / bin_width / u.GeV,
        sigma_vec_f / a_eff / obstime / bin_width / u.GeV,
        counts=vec_f_est,
        counts_err=sigma_vec_f,
        g=vec_g_data,
        bg=vec_g_bg,
        tau=config.tau,
        label=label or config.label,
        add_features=additional_features_to_save,
    )
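# A sketch of the YAML file read by Config.from_yaml, listing exactly the
# attributes accessed above; the values (and whether units are written as
# strings) are assumptions for illustration:
#
#     e_ref: 1 TeV
#     e_est_low: 500 GeV
#     e_est_high: 20 TeV
#     n_bins_est: 12
#     e_true_low: 500 GeV
#     e_true_high: 20 TeV
#     n_bins_true: 10
#     threshold: 0.85
#     theta2_cut: 0.025
#     background: true
#     tau: 0.0
#     n_burn_steps: 5000
#     n_used_steps: 2000
#     label: crab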
# np, pd, h5py and rpd (root_pandas) as well as the column lists
# (data_columns, mc_columns, output_columns, other_columns,
# bg_prediction_columns) are defined at module level.


def main(data_path, gamma_path, corsika_path, config_template, output_base,
         threshold, theta2_cut, gamma_fraction, title, start, end,
         zd_min, zd_max):
    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f['events'].keys()

    if source_dependent:
        other_columns.extend(bg_prediction_columns)
        theta2_cut = np.inf
        print('Source dependent separation, ignoring theta cut')

    # sqrt(inf) is inf, so this also covers the source dependent case
    theta_cut = np.sqrt(theta2_cut)

    data = read_h5py(
        data_path,
        key='events',
        columns=data_columns + output_columns + other_columns,
    )
    gammas = read_h5py(
        gamma_path,
        key='events',
        columns=mc_columns + output_columns + other_columns,
    )
    gammas.rename(
        columns={'corsika_evt_header_total_energy': 'true_energy'},
        inplace=True,
    )
    runs = read_h5py(data_path, key='runs')

    data['timestamp'] = pd.to_datetime(
        data['unix_time_utc_0'] * 1e6 + data['unix_time_utc_1'],
        unit='us',
    )

    if start:
        data = data.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end:
        data = data.query('timestamp <= @end')
        runs = runs.query('run_start <= @end')

    min_zenith = runs.zenith.min()
    max_zenith = runs.zenith.max()
    if zd_min:
        min_zenith = max(min_zenith, zd_min)
    if zd_max:
        max_zenith = min(max_zenith, zd_max)

    print('Zenith range of the input data:', min_zenith, max_zenith)

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(data, threshold)
        on_gammas = gammas.query('gamma_prediction >= {}'.format(threshold))
    else:
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction >= {}'.format(threshold)),
            theta2_cut=theta2_cut,
        )
        on_gammas = gammas.query(
            '(theta_deg <= {}) & (gamma_prediction >= {})'.format(
                theta_cut, threshold,
            )
        )

    query = '(zd_tracking >= {}) and (zd_tracking <= {})'.format(
        min_zenith, max_zenith,
    )
    on_gammas = on_gammas.query(query).copy()

    output_columns.append('theta_deg')
    on_gammas = on_gammas.loc[:, output_columns + ['true_energy']]
    on_data = on_data.loc[:, output_columns + data_columns]
    off_data = off_data.loc[:, output_columns + data_columns]

    # one on region and five off regions
    off_data['weight'] = 0.2
    on_data['weight'] = 1.0
    on_gammas['weight'] = 1.0

    rpd.to_root(on_data, output_base + '_on.root', key='events')
    rpd.to_root(off_data, output_base + '_off.root', key='events')
    rpd.to_root(on_gammas, output_base + '_mc.root', key='events')

    print('N_on: {}'.format(len(on_data)))
    print('N_off: {}'.format(len(off_data)))
    print('S(Li&Ma): {}'.format(
        li_ma_significance(len(on_data), len(off_data), 0.2)))
    print('N_mc: {}'.format(len(on_gammas)))

    n_excess = len(on_data) - 0.2 * len(off_data)
    fraction = n_excess / len(on_gammas)
    print('N_excess:', n_excess)
    print('Fraction: {:1.4f}'.format(fraction))

    with open(config_template) as f:
        template = f.read()

    t_obs = runs.ontime.sum()

    try:
        corsika = pd.read_hdf(corsika_path, key='table')
    except KeyError:
        with h5py.File(corsika_path, 'r') as f:
            print('given key not in file: possible keys are: {}'.format(
                list(f.keys())))
        return

    corsika['zenith'] = np.rad2deg(corsika['zenith'])
    corsika = corsika.query('(zenith >= {}) and (zenith <= {})'.format(
        min_zenith, max_zenith))
    print('Simulated events after zenith cut: {}'.format(len(corsika)))

    config = template.format(
        t_obs=t_obs,
        selection_fraction=gamma_fraction,
        n_gamma=len(corsika),
        source_file_on=output_base + '_on.root',
        source_file_off=output_base + '_off.root',
        source_file_mc=output_base + '_mc.root',
        tree_name='events',
        output_file=output_base + '_result.root',
        fraction=fraction,
        min_zenith=min_zenith,
        max_zenith=max_zenith,
        title=title,
    )

    with open(output_base + '.config', 'w') as f:
        f.write(config)
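# The config_template file is plain text with str.format placeholders.
# The placeholder names below are exactly those passed to template.format()
# above; the surrounding key layout is an assumption for illustration:
#
#     title: {title}
#     t_obs: {t_obs}
#     n_gamma: {n_gamma}
#     selection_fraction: {selection_fraction}
#     fraction: {fraction}
#     min_zenith: {min_zenith}
#     max_zenith: {max_zenith}
#     tree_name: {tree_name}
#     source_file_on: {source_file_on}
#     source_file_off: {source_file_off}
#     source_file_mc: {source_file_mc}
#     output_file: {output_file}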
# The classifiers (mess_Tree, mess_xgbc, mc_Tree, mc_xgbc), the training
# sets (mess_data, mc_data), eval_data and the feature list are defined
# earlier in this script.
mess_Tree.fit(mess_data.drop('label', axis=1), mess_data.label)
mess_xgbc.fit(mess_data.drop('label', axis=1), mess_data.label)
mc_Tree.fit(mc_data.drop('label', axis=1), mc_data.label)
mc_xgbc.fit(mc_data.drop('label', axis=1), mc_data.label)

predictions = {
    'sig_mess_tree': mess_Tree.predict_proba(eval_data[feature])[:, 1],
    'sig_mess_xgbc': mess_xgbc.predict_proba(eval_data[feature])[:, 1],
    'sig_mc_tree': mc_Tree.predict_proba(eval_data[feature])[:, 1],
    'sig_mc_xgbc': mc_xgbc.predict_proba(eval_data[feature])[:, 1],
}

# the four significance scans only differed in the prediction used,
# so run them in one loop
significances = {name: [] for name in predictions}
for threshold in np.linspace(0.01, 0.99, 99):
    for name, prediction in predictions.items():
        on_data, off_data = split_on_off_source_independent(
            eval_data[threshold <= prediction], theta2_cut=0.03)
        significances[name].append(
            li_ma_significance(len(on_data), len(off_data), 0.2))

data = pd.DataFrame(significances)
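# A plotting sketch for comparing the four significance curves (the axis
# labels and output file name are assumptions for illustration):
#
#     import matplotlib.pyplot as plt
#
#     thresholds = np.linspace(0.01, 0.99, 99)
#     for name in data.columns:
#         plt.plot(thresholds, data[name], label=name)
#     plt.xlabel('threshold')
#     plt.ylabel('significance')
#     plt.legend()
#     plt.savefig('significance_comparison.pdf')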
from fact.analysis import li_ma_significance, split_on_off_source_independent
from fact.io import read_data

df = read_data('crab_gammas_dl3.hdf5', key='events')

on, off = split_on_off_source_independent(
    df.query('gamma_prediction > 0.85'),
    0.025,
)

with open('build/significance.tex', 'w') as f:
    f.write(r'\SI{')
    f.write('{:.1f}'.format(li_ma_significance(len(on), len(off), 0.2)))
    f.write(r'}{σ}')