def get_frac_correct(df_train, df_test, pipeline_str=None, num_groups=4,
                     energy_key='MC_log_energy'):
    '''Calculates the fraction of correctly identified samples in each
    energy bin for each composition in comp_list, plus the overall
    ('total') fraction. In addition, the statistical error for the fraction
    correctly identified is calculated.

    The pipeline named by pipeline_str is fit on df_train and evaluated on
    df_test.

    Parameters
    ----------
    df_train : DataFrame
        Training sample. Must contain the training feature columns and the
        'comp_target_{num_groups}' column.
    df_test : DataFrame
        Testing sample. Must contain the training feature columns, the
        'comp_target_{num_groups}' and 'comp_group_{num_groups}' columns,
        and the energy_key column.
    pipeline_str : str, optional
        Name passed to comp.get_pipeline. Defaults to
        'BDT_comp_IC86.2012_{num_groups}-groups'.
    num_groups : int, optional
        Number of composition groups; selects which target/group columns
        are used (default is 4).
    energy_key : {'MC_log_energy', 'reco_log_energy'}, optional
        Column of df_test that is histogrammed.

    Returns
    -------
    data : dict
        For each composition in comp_list and for 'total', the entries
        'frac_correct_{composition}' and 'frac_correct_err_{composition}'
        as returned by comp.ratio_error.

    Raises
    ------
    ValueError
        If energy_key is not one of the two supported column names.

    Notes
    -----
    Relies on comp_list and energybins being defined in the enclosing
    module scope.
    '''
    # Input validation
    if energy_key not in ['MC_log_energy', 'reco_log_energy']:
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    # Fit pipeline and get mask for correctly identified events
    feature_list, _ = comp.get_training_features()
    pipeline = comp.get_pipeline(pipeline_str)
    comp_target_str = 'comp_target_{}'.format(num_groups)
    comp_group_str = 'comp_group_{}'.format(num_groups)
    pipeline.fit(df_train[feature_list], df_train[comp_target_str])
    test_predictions = pipeline.predict(df_test[feature_list])
    correctly_identified_mask = (test_predictions == df_test[comp_target_str])

    bins = energybins.log_energy_bins
    data = {}
    for composition in comp_list + ['total']:
        if composition == 'total':
            # 'total' is not a composition group label, so comparing the
            # group column against it would select no events at all. The
            # total fraction has to be computed over every test event.
            comp_mask = np.ones(len(df_test), dtype=bool)
        else:
            comp_mask = df_test[comp_group_str] == composition

        # Get number of MC comp in each energy bin (Poisson sqrt(N) errors)
        num_MC_energy, _ = np.histogram(df_test.loc[comp_mask, energy_key],
                                        bins=bins)
        num_MC_energy_err = np.sqrt(num_MC_energy)

        # Get number of correctly identified comp in each energy bin
        combined_mask = comp_mask & correctly_identified_mask
        num_reco_energy, _ = np.histogram(
            df_test.loc[combined_mask, energy_key], bins=bins)
        num_reco_energy_err = np.sqrt(num_reco_energy)

        # Calculate correctly identified fractions as a function of energy
        frac_correct, frac_correct_err = comp.ratio_error(
            num_reco_energy, num_reco_energy_err,
            num_MC_energy, num_MC_energy_err)
        data['frac_correct_{}'.format(composition)] = frac_correct
        data['frac_correct_err_{}'.format(composition)] = frac_correct_err

    return data
def get_config_flux(config):
    '''Collects the ingredients needed for a flux calculation for one
    detector configuration.

    A 'BDT' pipeline is trained on the simulation training sample that
    corresponds to config, applied to the data for config, and the data
    events are histogrammed in 'lap_log_energy' for each predicted
    composition.

    Parameters
    ----------
    config : str
        Data configuration. It is mapped to a simulation configuration with
        data_config_to_sim_config and also passed to comp.load_data and
        comp.get_detector_livetime.

    Returns
    -------
    df_flux : dict
        Despite the name this is a plain dict with the entries:
        'counts_light', 'counts_heavy', 'counts_total' (events per energy
        bin), the matching '*_err' entries (square root of the counts),
        'solid_angle', 'livetime' and 'livetime_err'.
    '''
    sim_config = data_config_to_sim_config(config)
    pipeline = comp.get_pipeline('BDT')
    energybins = comp.analysis.get_energybins()
    bins = energybins.log_energy_bins

    # Load simulation and training features (only the training split is
    # used here)
    df_sim_train, _ = comp.load_sim(config=sim_config, verbose=False)
    feature_list, _ = comp.analysis.get_training_features()

    # Load data. The energy column is appended as the last column so that it
    # goes through the same array conversion as the features, then it is
    # split off again.
    df_data = comp.load_data(config=config)
    X_data = comp.dataframe_functions.dataframe_to_array(
        df_data, feature_list + ['lap_log_energy'])
    log_energy = X_data[:, -1]
    X_data = X_data[:, :-1]

    pipeline.fit(df_sim_train[feature_list], df_sim_train['target'])
    data_predictions = pipeline.predict(X_data)

    # Convert predicted labels to composition names
    data_labels = np.array([
        comp.dataframe_functions.label_to_comp(pred)
        for pred in data_predictions
    ])

    # Get number of identified comp in each energy bin
    df_flux = {}
    for composition in ['light', 'heavy']:
        comp_mask = data_labels == composition
        counts = np.histogram(log_energy[comp_mask], bins=bins)[0]
        df_flux['counts_' + composition] = counts
        df_flux['counts_' + composition + '_err'] = np.sqrt(counts)

    df_flux['counts_total'] = np.histogram(log_energy, bins=bins)[0]
    df_flux['counts_total_err'] = np.sqrt(df_flux['counts_total'])

    # Solid angle of the cone out to the largest 'lap_zenith' found in the
    # training simulation
    max_zenith_rad = df_sim_train['lap_zenith'].max()
    df_flux['solid_angle'] = 2 * np.pi * (1 - np.cos(max_zenith_rad))

    # Livetime
    livetime, livetime_err = comp.get_detector_livetime(config=config)
    df_flux['livetime'] = livetime
    df_flux['livetime_err'] = livetime_err

    return df_flux
def save_anisotropy_dataframe(config, outfile):
    '''Classifies the data for config and writes the columns needed for the
    anisotropy analysis to an HDF5 file.

    A 'GBDT' pipeline is trained on the full (unsplit) simulation sample
    ('IC86.2012' when config contains 'IC86', otherwise 'IC79') and used to
    predict a composition for every data event.

    Parameters
    ----------
    config : str
        Data configuration passed to comp.load_dataframe.
    outfile : str
        Path of the HDF5 file to create. It is opened in 'w' mode, so an
        existing file is overwritten. The DataFrame is stored under the key
        'dataframe' in table format.

    Returns
    -------
    None
    '''
    print('Loading data...')
    data_df = comp.load_dataframe(datatype='data', config=config,
                                  verbose=False)
    keep_columns = [
        'lap_zenith', 'lap_azimuth', 'start_time_mjd', 'pred_comp',
        'lap_log_energy'
    ]

    pipeline = comp.get_pipeline('GBDT')
    feature_list, _ = comp.get_training_features()

    # Drop events with a missing training feature. This has to be applied to
    # data_df itself: calling dropna(inplace=True) on data_df.loc[:, ...]
    # only modifies a temporary copy and leaves data_df untouched.
    data_df.dropna(subset=feature_list, how='any', inplace=True)

    print('Loading simulation...')
    sim_config = 'IC86.2012' if 'IC86' in config else 'IC79'
    sim_df = comp.load_dataframe(datatype='sim', config=sim_config,
                                 verbose=False, split=False)
    X_train, y_train = comp.dataframe_functions.dataframe_to_X_y(
        sim_df, feature_list)

    print('Training classifier...')
    pipeline = pipeline.fit(X_train, y_train)

    X_data = comp.dataframe_functions.dataframe_to_array(data_df, feature_list)
    # Column assignment aligns on index labels, so the predictions must carry
    # data_df's index; a default RangeIndex would misalign (or give NaN) as
    # soon as data_df's index is not 0..N-1, e.g. after rows were dropped.
    # NOTE(review): assumes dataframe_to_array returns one row per data_df
    # row; pandas raises a ValueError here if the lengths differ.
    data_pred = pd.Series(pipeline.predict(X_data), dtype=int,
                          index=data_df.index)
    data_df['pred_comp'] = data_pred.apply(
        comp.dataframe_functions.label_to_comp)

    print('Saving anisotropy DataFrame for {}'.format(config))
    with pd.HDFStore(outfile, 'w') as store:
        store.put('dataframe', data_df.loc[:, keep_columns], format='table')
def get_composition_pipeline(pipeline, p, use_sample_weights):
    '''Returns the composition classifier, either freshly fit with sample
    weights or loaded from a previously trained model.

    Parameters
    ----------
    pipeline : str
        Base pipeline name. The full name is
        '{pipeline}_comp_{config}_{num_groups}-groups'.
    p : object or None
        When None (and no sample weights are requested) the trained model
        for the full pipeline name is loaded with comp.load_trained_model.
    use_sample_weights : object or None
        When not None, this value is passed as the model argument of
        calculate_sample_weights and a new pipeline is fit on df_sim_train
        using the resulting weights. This takes precedence over p.

    Returns
    -------
    The fitted pipeline, the loaded trained model, or, when p is not None
    and use_sample_weights is None, the pipeline argument unchanged.

    Notes
    -----
    Relies on config, num_groups, df_sim_train, feature_list and
    calculate_sample_weights being defined in the enclosing module scope.
    '''
    # Build the name once, from the pipeline *name* that was passed in.
    # Rebinding `pipeline` to a loaded model before formatting would embed
    # the model's repr in the name.
    pipeline_str = '{}_comp_{}_{}-groups'.format(pipeline, config, num_groups)

    if use_sample_weights is not None:
        # A weighted fit replaces any pre-trained model, so there is no
        # point in loading one first.
        model = use_sample_weights
        weighted_pipeline = comp.get_pipeline(pipeline_str)
        compositions = df_sim_train['comp_group_{}'.format(num_groups)].values
        energies = df_sim_train['reco_energy'].values
        sample_weight = calculate_sample_weights(compositions, energies,
                                                 model=model)
        X = df_sim_train[feature_list].values
        y = df_sim_train['comp_target_{}'.format(num_groups)].values
        # The 'classifier__' prefix routes the weights to the pipeline step
        # named 'classifier'
        fit_params = {'classifier__sample_weight': sample_weight}
        weighted_pipeline.fit(X, y, **fit_params)
        return weighted_pipeline

    if p is None:
        return comp.load_trained_model(pipeline_str)

    return pipeline
# NOTE(review): this is a fragment of a larger script/function; df,
# energy_pipeline, feature_list, args, config, num_groups, df_sim_train, p and
# calculate_sample_weights all come from code that is not visible here.

# Predict the log-energy for every row and store the linear energy next to it
df['reco_log_energy'] = energy_pipeline.predict(
    df[feature_list].values)
df['reco_energy'] = 10**df['reco_log_energy']

print('Loading or fitting composition classifier...')
# If any weighting option was given, fit a new pipeline using sample weights
if any([
        args.weights_model, args.energy_spectrum_weights,
        args.compositon_weights
]):
    model = args.weights_model
    energy_spectrum_weights = args.energy_spectrum_weights
    # 'compositon_weights' (sic) is the attribute/keyword name actually used
    # here; it must stay in sync with the argument parser and with
    # calculate_sample_weights.
    compositon_weights = args.compositon_weights
    pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, config,
                                                 num_groups)
    pipeline = comp.get_pipeline(pipeline_str)
    compositions = df_sim_train['comp_group_{}'.format(num_groups)].values
    energies = df_sim_train['reco_energy'].values
    sample_weight = calculate_sample_weights(
        compositions,
        energies,
        model=model,
        compositon_weights=compositon_weights,
        energy_spectrum_weights=energy_spectrum_weights)
    X = df_sim_train[feature_list].values
    y = df_sim_train['comp_target_{}'.format(num_groups)].values
    # The 'classifier__' prefix routes the weights to the pipeline step named
    # 'classifier'
    fit_params = {'classifier__sample_weight': sample_weight}
    pipeline.fit(X, y, **fit_params)
# No weighting requested and p is None: only the pipeline name is built here.
# NOTE(review): the rest of this branch is presumably cut off — confirm
# against the full file.
elif p is None:
    pipeline_str = '{}_comp_{}_{}-groups'.format(args.pipeline, config,
                                                 num_groups)
def get_classified_fractions(df_train, df_test, pipeline_str=None, num_groups=4,
                             energy_key='MC_log_energy'):
    '''Calculates, in each energy bin, the fraction of events of every true
    composition that is identified as each composition in comp_list (i.e. a
    per-energy-bin classification matrix), together with its statistical
    error.

    Parameters
    ----------
    df_train : DataFrame
        Not used by this function.
    df_test : DataFrame
        Testing sample. Must contain the training feature columns, the
        'comp_target_{num_groups}' and 'comp_group_{num_groups}' columns,
        and the energy_key column.
    pipeline_str : str, optional
        Pipeline name. Defaults to 'BDT_comp_IC86.2012_{num_groups}-groups'.
        Names containing 'CustomClassifier' are built with comp.get_pipeline
        and predict from the target column; all others are loaded with
        comp.load_trained_model and predict from the feature columns.
    num_groups : int, optional
        Number of composition groups (default is 4).
    energy_key : {'MC_log_energy', 'reco_log_energy'}, optional
        Column of df_test that is histogrammed.

    Returns
    -------
    data : dict
        For every (true, identified) pair from comp_list, the entries
        'true_{true}_identified_{identified}' and
        'true_{true}_identified_{identified}_err' as returned by
        comp.ratio_error.

    Raises
    ------
    ValueError
        If energy_key is not one of the two supported column names.

    Notes
    -----
    Relies on comp_list, energybins and product being available in the
    enclosing module scope.
    '''
    # Input validation
    allowed_energy_keys = ['MC_log_energy', 'reco_log_energy']
    if energy_key not in allowed_energy_keys:
        raise ValueError(
            "Invalid energy_key ({}) entered. Must be either "
            "'MC_log_energy' or 'reco_log_energy'.".format(energy_key))

    if pipeline_str is None:
        pipeline_str = 'BDT_comp_IC86.2012_{}-groups'.format(num_groups)

    feature_list, feature_labels = comp.get_training_features()
    target_column = 'comp_target_{}'.format(num_groups)
    group_column = 'comp_group_{}'.format(num_groups)

    # Build or load the classifier and predict on the testing sample
    if 'CustomClassifier' in pipeline_str:
        classifier = comp.get_pipeline(pipeline_str)
        test_predictions = classifier.predict(df_test[target_column])
    else:
        classifier = comp.load_trained_model(pipeline_str)
        test_predictions = classifier.predict(df_test[feature_list])

    decoded = comp.decode_composition_groups(test_predictions,
                                             num_groups=num_groups)
    pred_comp = np.array(decoded)

    data = {}
    for true_composition, identified_composition in product(comp_list,
                                                            comp_list):
        is_true = df_test[group_column] == true_composition
        is_identified = pred_comp == identified_composition

        # Events of the true composition per energy bin
        true_energies = df_test.loc[is_true, energy_key]
        n_true = np.histogram(true_energies,
                              bins=energybins.log_energy_bins)[0]
        n_true_err = np.sqrt(n_true)

        # Events of the true composition that were identified as
        # identified_composition, per energy bin
        both = is_true & is_identified
        identified_energies = df_test.loc[both, energy_key]
        n_identified = np.histogram(identified_energies,
                                    bins=energybins.log_energy_bins)[0]
        n_identified_err = np.sqrt(n_identified)

        # Identified fraction as a function of energy
        fraction, fraction_err = comp.ratio_error(
            n_identified, n_identified_err, n_true, n_true_err)

        key = 'true_{}_identified_{}'.format(true_composition,
                                             identified_composition)
        key_err = 'true_{}_identified_{}_err'.format(true_composition,
                                                     identified_composition)
        data[key] = fraction
        data[key_err] = fraction_err

    return data