import logging

from fact.io import read_h5py
from fact.analysis import (
    split_on_off_source_dependent,
    split_on_off_source_independent,
)

log = logging.getLogger(__name__)


def load_gamma_subset(
    sourcefile,
    theta2_cut=0.0,
    conf_cut=0.9,
    num_off_positions=1,
    analysis_type='classic',
    with_runs=False,
):
    events = read_h5py(sourcefile, key='events')

    selection_columns = ['theta_deg', 'gamma_prediction', 'zd_tracking', 'conc_core']
    # the off keys are numbered 1..num_off_positions, so the range has to
    # end at num_off_positions + 1
    theta_off_columns = [
        'theta_deg_off_{}'.format(i) for i in range(1, num_off_positions + 1)
    ]
    bg_prediction_columns = [
        'gamma_prediction_off_{}'.format(i) for i in range(1, num_off_positions + 1)
    ]

    if analysis_type == 'source':
        log.info('\tSelecting events for the source dependent analysis')
        log.info('\t\tgamma_pred_cut={0:.2f}'.format(conf_cut))
        on_data, off_data = split_on_off_source_dependent(
            events=events,
            prediction_threshold=conf_cut,
            on_prediction_key='gamma_prediction',
            off_prediction_keys=bg_prediction_columns,
        )
        on_mc = events.query('gamma_prediction >= {}'.format(conf_cut))
    elif analysis_type == 'classic':
        log.info('\tSelecting events for the source independent analysis')
        log.info('\t\tgamma_pred_cut={0:.2f}'.format(conf_cut))
        log.info('\t\ttheta2_cut={0:.2f}'.format(theta2_cut))
        on_data, off_data = split_on_off_source_independent(
            events=events.query('gamma_prediction >= {}'.format(conf_cut)),
            theta2_cut=theta2_cut,
            theta_key='theta_deg',
            theta_off_keys=theta_off_columns,
        )
        # theta_deg is the angular distance itself, so it has to be squared
        # before comparing it to the theta^2 cut
        on_mc = events.query(
            '(theta_deg**2 <= {}) & (gamma_prediction >= {})'.format(
                theta2_cut, conf_cut,
            )
        )

    log.info('\t{} data events (on region)'.format(len(on_data)))
    log.info('\t\t{} data events ({} off regions)'.format(len(off_data), num_off_positions))
    log.info('\t{} MC gammas after selection'.format(len(on_mc)))

    # expected signal: on counts minus the mean off counts per region
    n_events_per_off_region = len(off_data) / num_off_positions
    n_events_on_region = len(on_data)
    n_events_expected_signal = n_events_on_region - n_events_per_off_region
    log.info('\t{:.1f} expected signal events'.format(n_events_expected_signal))

    if with_runs:
        # also return the total ontime of the observation runs
        runs = read_h5py(sourcefile, key='runs')
        t_obs = runs.ontime.sum()
        return on_mc, on_data, off_data, t_obs

    return on_mc, on_data, off_data
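# Usage sketch (not part of the original module; the file name and cut
# values below are assumptions chosen only to illustrate the call):
#
#     on_mc, on_data, off_data, t_obs = load_gamma_subset(
#         'crab_dl3.hdf5',
#         theta2_cut=0.03,
#         conf_cut=0.85,
#         num_off_positions=5,
#         with_runs=True,
#     )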
import numpy as np
import matplotlib.pyplot as plt

from fact.analysis import li_ma_significance, split_on_off_source_independent

# load_feature() is defined elsewhere in this project and returns the
# list of feature column names the models were trained on.


def model_significance(estimator, data):
    '''
    Evaluate the significance reached by a given trained model on a given dataset.

    Parameters:
    estimator: sklearn.model
        Trained model, so that the estimator can make predictions
        on the dataset.
    data: pd.DataFrame
        The dataset on which the significance should be calculated.

    Returns:
    max(significance): float
        Maximal significance on the dataset for the given model.
    '''
    feature = load_feature()
    data['gamma_prediction'] = estimator.predict_proba(data[feature])[:, 1]

    significance = []
    for threshold in np.linspace(0.01, 0.99, 99):
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction > @threshold'),
            theta2_cut=0.03,
        )
        significance.append(li_ma_significance(len(on_data), len(off_data), 0.2))

    return max(significance)
def plot_significance(estimator, data, save=True, path='significance.pdf'):
    '''
    Plot the significance in dependence of the prediction threshold.

    Parameters:
    estimator: sklearn.model
        Trained model, so that the estimator can make predictions
        on the dataset.
    data: pd.DataFrame
        The dataset on which the significance should be calculated.
    '''
    feature = load_feature()
    data['gamma_prediction'] = estimator.predict_proba(data[feature])[:, 1]

    thresholds = np.linspace(0.01, 0.99, 99)
    significance = []
    for threshold in thresholds:
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction > @threshold'),
            theta2_cut=0.03,
        )
        significance.append(li_ma_significance(len(on_data), len(off_data), 0.2))

    plt.plot(thresholds, significance)
    if save:
        plt.title('max({})'.format(round(max(significance), 2)))
        plt.xlabel('threshold')
        # the y axis shows the Li&Ma significance, not a confidence
        plt.ylabel('significance')
        plt.savefig(path)
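# Usage sketch for the two functions above (the classifier and the
# train/test split are assumptions for illustration; any fitted sklearn
# classifier with predict_proba works):
#
#     from sklearn.ensemble import RandomForestClassifier
#
#     clf = RandomForestClassifier(n_estimators=100)
#     clf.fit(train[load_feature()], train['label'])
#     print(model_significance(clf, test))
#     plot_significance(clf, test, path='build/significance.pdf')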
import logging

import h5py
import numpy as np
import astropy.units as u
import funfolding as ff

from fact.io import read_h5py
from fact.analysis import split_on_off_source_independent

# Config, logspace_binning, save_spectrum, read_simulated_spectrum,
# collection_area, setup_logging and the column-name constants E_PRED
# and E_TRUE are provided by the surrounding package.


def main(
    config,
    observation_file,
    gamma_file,
    corsika_file,
    output_file,
    seed,
    label,
    threshold,
    theta2_cut,
):
    ''' unfold fact data '''
    setup_logging()
    log = logging.getLogger('fact_funfolding')
    log.setLevel(logging.INFO)

    random_state = np.random.RandomState(seed)
    # also seed the global numpy RNG, it is used for the start values below
    np.random.set_state(random_state.get_state())

    config = Config.from_yaml(config)
    e_ref = config.e_ref
    threshold = threshold or config.threshold
    theta2_cut = theta2_cut or config.theta2_cut

    log.info(f'Using threshold {threshold}')
    log.info(f'Using theta2 cut {theta2_cut}')

    # define binning in e_est and e_true
    bins_obs = logspace_binning(
        config.e_est_low, config.e_est_high, e_ref, config.n_bins_est,
    )
    bins_true = logspace_binning(
        config.e_true_low, config.e_true_high, e_ref, config.n_bins_true,
    )

    # read in files
    query = 'gamma_prediction > {} and theta_deg**2 < {}'.format(
        threshold, theta2_cut,
    )

    log.info('Reading simulated gammas')
    gammas = read_h5py(gamma_file, key='events').query(query)
    with h5py.File(gamma_file, 'r') as f:
        sample_fraction = f.attrs.get('sample_fraction', 1.0)
    log.info('Using sampling fraction of {:.3f}'.format(sample_fraction))

    query = 'gamma_prediction > {}'.format(threshold)
    log.info('Reading observations')
    observations = read_h5py(observation_file, key='events').query(query)
    on, off = split_on_off_source_independent(observations, theta2_cut=theta2_cut)

    observation_runs = read_h5py(observation_file, key='runs')
    obstime = observation_runs.ontime.sum() * u.s

    corsika_events = read_h5py(
        corsika_file,
        key='corsika_events',
        columns=['total_energy'],
    )
    simulated_spectrum = read_simulated_spectrum(corsika_file)

    a_eff, bin_center, bin_width, a_eff_low, a_eff_high = collection_area(
        corsika_events.total_energy.values,
        gammas[E_TRUE].values,
        impact=simulated_spectrum['x_scatter'],
        bins=bins_true.to_value(u.GeV),
        sample_fraction=sample_fraction,
    )

    # unfold using funfolding
    X_model = gammas[E_PRED].values
    y_model = gammas[E_TRUE].values
    X_data = on[E_PRED].values

    g_model = np.digitize(X_model, bins_obs.to_value(u.GeV))
    f_model = np.digitize(y_model, bins_true.to_value(u.GeV))
    g_data = np.digitize(X_data, bins_obs.to_value(u.GeV))

    model = ff.model.LinearModel(random_state=random_state)
    model.initialize(digitized_obs=g_model, digitized_truth=f_model)

    vec_g_data, _ = model.generate_vectors(digitized_obs=g_data)
    vec_g_model, vec_f_model = model.generate_vectors(
        digitized_obs=g_model, digitized_truth=f_model,
    )

    # vec_g_bg has to exist even without background, it is passed
    # to save_spectrum below
    vec_g_bg = None
    if config.background:
        X_bg = off[E_PRED].values
        g_bg = np.digitize(X_bg, bins_obs.to_value(u.GeV))
        vec_g_bg, _ = model.generate_vectors(digitized_obs=g_bg)
        # scale the off counts by 0.2, there are five off regions
        model.add_background(vec_g_bg * 0.2)

    llh = ff.solution.StandardLLH(
        tau=config.tau,
        log_f=True,
        reg_factor_f=1 / a_eff.value[1:-1] if config.tau else None,
    )
    llh.initialize(
        vec_g=vec_g_data,
        model=model,
        ignore_n_bins_low=1,
        ignore_n_bins_high=1,
    )

    sol_mcmc = ff.solution.LLHSolutionMCMC(
        n_burn_steps=config.n_burn_steps,
        n_used_steps=config.n_used_steps,
        random_state=random_state,
    )
    sol_mcmc.initialize(llh=llh, model=model)
    # start from the MC truth scaled to the number of observed events
    sol_mcmc.set_x0_and_bounds(
        x0=np.random.poisson(vec_f_model * vec_g_data.sum() / vec_g_model.sum())
    )

    vec_f_est, sigma_vec_f, sample, probs, autocorr_time = sol_mcmc.fit()

    additional_features_to_save = dict()
    additional_features_to_save['a_eff'] = a_eff
    additional_features_to_save['a_eff_low'] = a_eff_low
    additional_features_to_save['a_eff_high'] = a_eff_high

    save_spectrum(
        output_file,
        bins_true,
        vec_f_est / a_eff / obstime / bin_width / u.GeV,
        sigma_vec_f / a_eff / obstime / bin_width / u.GeV,
        counts=vec_f_est,
        counts_err=sigma_vec_f,
        g=vec_g_data,
        bg=vec_g_bg,
        tau=config.tau,
        label=label or config.label,
        add_features=additional_features_to_save,
    )
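# A sketch of the YAML file read by Config.from_yaml, listing exactly the
# attributes accessed above; the values (and whether units are written as
# strings) are assumptions for illustration:
#
#     e_ref: 1 TeV
#     e_est_low: 500 GeV
#     e_est_high: 20 TeV
#     n_bins_est: 12
#     e_true_low: 500 GeV
#     e_true_high: 20 TeV
#     n_bins_true: 10
#     threshold: 0.85
#     theta2_cut: 0.025
#     background: true
#     tau: 0.0
#     n_burn_steps: 5000
#     n_used_steps: 2000
#     label: crab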
# np, pd, h5py and rpd (root_pandas) as well as the column lists
# (data_columns, mc_columns, output_columns, other_columns,
# bg_prediction_columns) are defined at module level.


def main(data_path, gamma_path, corsika_path, config_template, output_base,
         threshold, theta2_cut, gamma_fraction, title, start, end,
         zd_min, zd_max):
    with h5py.File(data_path, 'r') as f:
        source_dependent = 'gamma_prediction_off_1' in f['events'].keys()

    if source_dependent:
        other_columns.extend(bg_prediction_columns)
        theta2_cut = np.inf
        print('Source dependent separation, ignoring theta cut')

    # sqrt(inf) is inf, so this also covers the source dependent case
    theta_cut = np.sqrt(theta2_cut)

    data = read_h5py(
        data_path,
        key='events',
        columns=data_columns + output_columns + other_columns,
    )
    gammas = read_h5py(
        gamma_path,
        key='events',
        columns=mc_columns + output_columns + other_columns,
    )
    gammas.rename(
        columns={'corsika_evt_header_total_energy': 'true_energy'},
        inplace=True,
    )
    runs = read_h5py(data_path, key='runs')

    data['timestamp'] = pd.to_datetime(
        data['unix_time_utc_0'] * 1e6 + data['unix_time_utc_1'],
        unit='us',
    )

    if start:
        data = data.query('timestamp >= @start')
        runs = runs.query('run_start >= @start')
    if end:
        data = data.query('timestamp <= @end')
        runs = runs.query('run_start <= @end')

    min_zenith = runs.zenith.min()
    max_zenith = runs.zenith.max()
    if zd_min:
        min_zenith = max(min_zenith, zd_min)
    if zd_max:
        max_zenith = min(max_zenith, zd_max)

    print('Zenith range of the input data:', min_zenith, max_zenith)

    if source_dependent:
        on_data, off_data = split_on_off_source_dependent(data, threshold)
        on_gammas = gammas.query('gamma_prediction >= {}'.format(threshold))
    else:
        on_data, off_data = split_on_off_source_independent(
            data.query('gamma_prediction >= {}'.format(threshold)),
            theta2_cut=theta2_cut,
        )
        on_gammas = gammas.query(
            '(theta_deg <= {}) & (gamma_prediction >= {})'.format(
                theta_cut, threshold,
            )
        )

    query = '(zd_tracking >= {}) and (zd_tracking <= {})'.format(
        min_zenith, max_zenith,
    )
    on_gammas = on_gammas.query(query).copy()

    output_columns.append('theta_deg')
    on_gammas = on_gammas.loc[:, output_columns + ['true_energy']]
    on_data = on_data.loc[:, output_columns + data_columns]
    off_data = off_data.loc[:, output_columns + data_columns]

    # one on region and five off regions
    off_data['weight'] = 0.2
    on_data['weight'] = 1.0
    on_gammas['weight'] = 1.0

    rpd.to_root(on_data, output_base + '_on.root', key='events')
    rpd.to_root(off_data, output_base + '_off.root', key='events')
    rpd.to_root(on_gammas, output_base + '_mc.root', key='events')

    print('N_on: {}'.format(len(on_data)))
    print('N_off: {}'.format(len(off_data)))
    print('S(Li&Ma): {}'.format(
        li_ma_significance(len(on_data), len(off_data), 0.2)))
    print('N_mc: {}'.format(len(on_gammas)))

    n_excess = len(on_data) - 0.2 * len(off_data)
    fraction = n_excess / len(on_gammas)
    print('N_excess:', n_excess)
    print('Fraction: {:1.4f}'.format(fraction))

    with open(config_template) as f:
        template = f.read()

    t_obs = runs.ontime.sum()

    try:
        corsika = pd.read_hdf(corsika_path, key='table')
    except KeyError:
        with h5py.File(corsika_path, 'r') as f:
            print('given key not in file: possible keys are: {}'.format(
                list(f.keys())))
        return

    corsika['zenith'] = np.rad2deg(corsika['zenith'])
    corsika = corsika.query('(zenith >= {}) and (zenith <= {})'.format(
        min_zenith, max_zenith))
    print('Simulated events after zenith cut: {}'.format(len(corsika)))

    config = template.format(
        t_obs=t_obs,
        selection_fraction=gamma_fraction,
        n_gamma=len(corsika),
        source_file_on=output_base + '_on.root',
        source_file_off=output_base + '_off.root',
        source_file_mc=output_base + '_mc.root',
        tree_name='events',
        output_file=output_base + '_result.root',
        fraction=fraction,
        min_zenith=min_zenith,
        max_zenith=max_zenith,
        title=title,
    )

    with open(output_base + '.config', 'w') as f:
        f.write(config)
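# The config_template file is plain text with str.format placeholders.
# The placeholder names below are exactly those passed to template.format()
# above; the surrounding key layout is an assumption for illustration:
#
#     title: {title}
#     t_obs: {t_obs}
#     n_gamma: {n_gamma}
#     selection_fraction: {selection_fraction}
#     fraction: {fraction}
#     min_zenith: {min_zenith}
#     max_zenith: {max_zenith}
#     tree_name: {tree_name}
#     source_file_on: {source_file_on}
#     source_file_off: {source_file_off}
#     source_file_mc: {source_file_mc}
#     output_file: {output_file}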
# The classifiers (mess_Tree, mess_xgbc, mc_Tree, mc_xgbc), the training
# sets (mess_data, mc_data), eval_data and the feature list are defined
# earlier in this script.
mess_Tree.fit(mess_data.drop('label', axis=1), mess_data.label)
mess_xgbc.fit(mess_data.drop('label', axis=1), mess_data.label)
mc_Tree.fit(mc_data.drop('label', axis=1), mc_data.label)
mc_xgbc.fit(mc_data.drop('label', axis=1), mc_data.label)

predictions = {
    'sig_mess_tree': mess_Tree.predict_proba(eval_data[feature])[:, 1],
    'sig_mess_xgbc': mess_xgbc.predict_proba(eval_data[feature])[:, 1],
    'sig_mc_tree': mc_Tree.predict_proba(eval_data[feature])[:, 1],
    'sig_mc_xgbc': mc_xgbc.predict_proba(eval_data[feature])[:, 1],
}

# the four significance scans only differed in the prediction used,
# so run them in one loop
significances = {name: [] for name in predictions}
for threshold in np.linspace(0.01, 0.99, 99):
    for name, prediction in predictions.items():
        on_data, off_data = split_on_off_source_independent(
            eval_data[threshold <= prediction], theta2_cut=0.03)
        significances[name].append(
            li_ma_significance(len(on_data), len(off_data), 0.2))

data = pd.DataFrame(significances)
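# A plotting sketch for comparing the four significance curves (the axis
# labels and output file name are assumptions for illustration):
#
#     import matplotlib.pyplot as plt
#
#     thresholds = np.linspace(0.01, 0.99, 99)
#     for name in data.columns:
#         plt.plot(thresholds, data[name], label=name)
#     plt.xlabel('threshold')
#     plt.ylabel('significance')
#     plt.legend()
#     plt.savefig('significance_comparison.pdf')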
from fact.analysis import li_ma_significance, split_on_off_source_independent
from fact.io import read_data

df = read_data('crab_gammas_dl3.hdf5', key='events')

on, off = split_on_off_source_independent(
    df.query('gamma_prediction > 0.85'),
    0.025,
)

with open('build/significance.tex', 'w') as f:
    f.write(r'\SI{')
    f.write('{:.1f}'.format(li_ma_significance(len(on), len(off), 0.2)))
    f.write(r'}{σ}')