def fit_feature_transformers(pack):
    """Fit a transformer for one feature and save pre/post histograms.

    ``pack`` bundles the arguments (for use with a process/thread map):
        key: feature/dataset name under the 'raw/' prefix.
        d: plot dictionary template, mutated in place for the histogram.
        clip_dict: optional {'min', 'max'} clipping for the plotted data.
        file_list: h5 files to read samples from.
        n_wanted_sample: approx. number of samples to read -- may overshoot
            by up to one file, since the check happens between files.
        n_wanted_histogram: number of samples drawn for the histogram/fit.
        particle_code: particle tag used in the saved figure names.
        transformer: scaler with an sklearn-style fit/transform interface.

    Returns:
        {key: fitted transformer}
    """
    # * Unpack
    key, d, clip_dict, file_list, \
        n_wanted_sample, n_wanted_histogram, particle_code, transformer = pack
    # * Read some data
    all_data = []
    for file in file_list:
        # * once enough data has been read, break out
        if len(all_data) > n_wanted_sample:
            break
        data = hf.read_h5_dataset(file, key, prefix='raw/')
        # Per-event array datasets are flattened into all_data;
        # scalar-per-event datasets are extended directly.
        if data[0].shape:
            for entry in data:
                all_data.extend(entry)
        else:
            all_data.extend(data)
    # * Data read. Now draw a random sample
    indices = np.arange(len(all_data))  # arange instead of array(range(...))
    random.shuffle(indices)
    random_subsample = sorted(
        indices[:min(len(indices), int(n_wanted_histogram))])
    # * Draw histogram and save it. The sorted subsample is computed once
    #   and copied (the original sorted the same data twice).
    plot_data_unclipped = np.array(
        sorted(np.array(all_data)[random_subsample]))
    plot_data = plot_data_unclipped.copy()
    if clip_dict:
        minimum = clip_dict['min']
        maximum = clip_dict['max']
        plot_data = np.clip(plot_data, minimum, maximum)
    d['data'] = [plot_data]
    d['title'] = key + '- Entries = %.1e' % (plot_data_unclipped.shape[0])
    path = hf.get_project_root() + '/reports/plots/features/'
    d['savefig'] = path + particle_code + '_' + key + '.png'
    fig = rpt.make_plot(d)
    # * Fit a transformer/scaler on the unclipped subsample
    transformer.fit(plot_data_unclipped.reshape(-1, 1))
    # * Transform plot data
    plot_data_transformed = transformer.transform(
        plot_data_unclipped.reshape(-1, 1))
    # * Plot and save
    d_transformed = {'data': [plot_data_transformed]}
    d_transformed['title'] = key + ' transformed - Entries = %.1e' % (
        plot_data_unclipped.shape[0])
    d_transformed[
        'savefig'] = path + particle_code + '_transformed_' + key + '.png'
    fig = rpt.make_plot(d_transformed)
    d_transformer = {key: transformer}
    return d_transformer
def energy_plot(models, perf_classes, title=None, savefig=None):
    """Plot relative-energy-resolution curves for each model, append the
    IceCube baseline, and overlay the energy histogram on the same axes.

    NOTE(review): a second ``energy_plot`` defined later in this file
    shadows this definition -- confirm which one is intended.
    """
    edges, y, yerr, label = [], [], [], []
    data, bins, weights, histtype, log = [], [], [], [], []
    for model, pc in zip(models, perf_classes):
        curve_d = pc.get_relE_dict()
        # Take a copy of the first curve of each per-model dict.
        edges.append(curve_d['edges'][0][:])
        y.append(curve_d['y'][0][:])
        yerr.append(curve_d['yerr'][0][:])
        label.append(model)
        hist_d = pc.get_energy_dict()
        data.extend(hist_d['data'])
        bins.extend(hist_d['bins'])
        weights.extend(hist_d['weights'])
        histtype.extend(hist_d['histtype'])
        log.extend(hist_d['log'])
        del hist_d['color']
    # IceCube baseline taken from the last performance class in the loop.
    edges.append(pc.bin_edges)
    y.append(pc.relE_crs_sigmas)
    yerr.append(pc.relE_crs_errors)
    label.append('Icecube')
    curve_d['edges'] = edges
    curve_d['y'] = y
    curve_d['yerr'] = yerr
    curve_d['label'] = label
    hist_d['data'] = data
    hist_d['bins'] = bins
    hist_d['weights'] = weights
    hist_d['histtype'] = histtype
    hist_d['log'] = log
    curve_d['grid'] = True
    curve_d['y_minor_ticks_multiple'] = 0.2
    if savefig:
        hist_d['savefig'] = savefig
    if title:
        hist_d['title'] = title
    fig = rpt.make_plot(curve_d)
    fig = rpt.make_plot(hist_d, h_figure=fig, axes_index=0)
    return fig
def t_plot(models, perf_classes, title=None, savefig=None):
    """Plot the timing-resolution curve ($\\sigma_t$ in ns vs log10 E) for
    each model, with the IceCube baseline appended as a final curve.
    """
    edges, y, yerr, label = [], [], [], []
    # NOTE(review): the histogram accumulators below are filled but never
    # written to a plot dict in this function -- kept for behaviour parity.
    data, bins, weights, histtype, log = [], [], [], [], []
    for model, pc in zip(models, perf_classes):
        curve_d = pc.get_t_dict()
        edges.extend(curve_d['edges'])
        y.extend(curve_d['y'])
        yerr.extend(curve_d['yerr'])
        label.append(model)
        hist_d = pc.get_energy_dict()
        data.extend(hist_d['data'])
        bins.extend(hist_d['bins'])
        weights.extend(hist_d['weights'])
        histtype.extend(hist_d['histtype'])
        log.extend(hist_d['log'])
        del hist_d['color']
    # IceCube baseline from the last performance class.
    edges.append(pc.bin_edges)
    y.append(pc.t_crs_sigmas)
    yerr.append(pc.t_crs_errors)
    curve_d['edges'] = edges
    curve_d['y'] = y
    curve_d['yerr'] = yerr
    curve_d['xlabel'] = r'$\log_{10}$E [E/GeV]'
    curve_d['ylabel'] = r'$\sigma_{t}$ [ns]'
    curve_d['yrange'] = [0, 420]
    if savefig:
        curve_d['savefig'] = savefig
    fig = rpt.make_plot(curve_d)
    return fig
def energy_plot(models, perf_classes, title=None, savefig=None):
    """Plot relative-energy-resolution curves per model plus the IceCube
    baseline (curves only -- no histogram overlay).

    NOTE(review): this shadows an earlier ``energy_plot`` in this file --
    confirm which definition is intended.
    """
    edges, y, yerr, label = [], [], [], []
    data, bins, weights, histtype, log = [], [], [], [], []
    for model, pc in zip(models, perf_classes):
        curve_d = pc.get_relE_dict()
        # Copy of the first curve of each per-model dict.
        edges.append(curve_d['edges'][0][:])
        y.append(curve_d['y'][0][:])
        yerr.append(curve_d['yerr'][0][:])
        label.append(model)
    # IceCube baseline from the last performance class.
    edges.append(pc.bin_edges)
    y.append(pc.relE_crs_sigmas)
    yerr.append(pc.relE_crs_errors)
    label.append('Icecube')
    curve_d['edges'] = edges
    curve_d['y'] = y
    curve_d['yerr'] = yerr
    curve_d['xlabel'] = r'$\log_{10}$E [E/GeV]'
    curve_d['ylabel'] = r'Relative Error, $\sigma\left( \left(E_{pred}-E_{true}\right)/E_{true}\right)$'
    if savefig:
        curve_d['savefig'] = savefig
    fig = rpt.make_plot(curve_d)
    return fig
def z_plot(models, perf_classes, title=None, savefig=None):
    """Plot the z-resolution curve for each model plus the IceCube
    baseline, overlaying the energy histogram on the same axes.
    """
    edges, y, yerr, label = [], [], [], []
    data, bins, weights, histtype, log = [], [], [], [], []
    for model, pc in zip(models, perf_classes):
        curve_d = pc.get_z_dict()
        edges.extend(curve_d['edges'])
        y.extend(curve_d['y'])
        yerr.extend(curve_d['yerr'])
        label.append(model)
        hist_d = pc.get_energy_dict()
        data.extend(hist_d['data'])
        bins.extend(hist_d['bins'])
        weights.extend(hist_d['weights'])
        histtype.extend(hist_d['histtype'])
        log.extend(hist_d['log'])
        del hist_d['color']
    # IceCube baseline from the last performance class.
    edges.append(pc.bin_edges)
    y.append(pc.z_crs_sigmas)
    yerr.append(pc.z_crs_errors)
    label.append('Icecube')
    curve_d['edges'] = edges
    curve_d['y'] = y
    curve_d['yerr'] = yerr
    curve_d['label'] = label
    hist_d['data'] = data
    hist_d['bins'] = bins
    hist_d['weights'] = weights
    hist_d['histtype'] = histtype
    hist_d['log'] = log
    if savefig:
        hist_d['savefig'] = savefig
    if title:
        hist_d['title'] = title
    fig = rpt.make_plot(curve_d)
    fig = rpt.make_plot(hist_d, h_figure=fig, axes_index=0)
    # mod = curve_d['y'][0]
    # ice = curve_d['y'][2]
    # print(-(np.array(mod)-np.array(ice))/np.array(ice))
    return fig
) else: weights, interpolator, savename = make_weights( name, ids, db, debug=args.dev, interpolator=interpolator, alpha=args.alpha ) # Save in DB ids_strings = [str(idx) for idx in ids] print(get_time(), 'Writing %s to database'%(savename)) db.write('scalar', savename, ids_strings, weights) print(get_time(), 'Weights saved!') # Save a figure of the weights if args.make_plot: if name == 'uniform_direction_weights': x = np.linspace(-1.0, 1.0) else: x = np.linspace(0.0, 3.0) y = interpolator(x) d = {'x': [x], 'y': [y]} d['savefig'] = '/'.join([get_project_root(), 'reports/plots', savename+'.png']) d['yscale'] = 'log' _ = make_plot(d) if args.save_interpolator: path = PATH_DATA_OSCNEXT + '/weights/' + savename + '.pickle' with open(path, 'wb') as f: pickle.dump(interpolator, f)
index for index in range(len(lrs)) if from_lr <= lrs[index] <= to_lr ] chosen_lrs = np.array(lrs)[indices] chosen_losses = np.array(losses)[indices] if args.max_yrange != np.inf: maxy = args.max_yrange else: # maxy = np.max(chosen_losses) maxy = None if args.min_yrange != np.inf: miny = args.min_yrange else: # miny = np.min(chosen_losses) miny = None d = { 'x': [chosen_lrs], 'y': [chosen_losses], 'xscale': 'log', 'savefig': model + '/lr_vs_loss.png', 'xlabel': 'Learning Rate', 'ylabel': 'Loss', 'yrange': { 'bottom': miny, 'top': maxy } } fig = make_plot(d)
perf_class_path = path + '/data/Performance.pickle'
# Use a context manager so the pickle file handle is closed deterministically
# (the original pickle.load(open(...)) leaked the handle).
with open(perf_class_path, 'rb') as pickle_file:
    perf_class = pickle.load(pickle_file)
perf_classes.append(perf_class)
# attrs = vars(perf_classes[0])
# for attr in attrs:
#     print(attr)
perf = perf_classes[0]
# One figure per (prediction, reconstruction) key pair.
for pred_key, reco_key in zip(perf._performance_keys, perf._reco_keys):
    d = perf._get_perf_dict(pred_key, reco_key)
    d['ylabel'] = 'Energy Resolution [%]'
    d['yrange'] = [-0.05, 1.3]
    d['title'] = 'Energy Regression Performance'
    if perf._reco_keys:
        # Main panel, relative-performance subplot, and energy histogram
        # stacked on the same figure.
        h_fig = rpt.make_plot(d, position=[0.125, 0.26, 0.775, 0.62])
        d = perf._get_rel_perf_dict(pred_key)
        d['subplot'] = True
        d['axhline'] = [0.0]
        h_fig = rpt.make_plot(d, h_figure=h_fig,
                              position=[0.125, 0.11, 0.775, 0.15])
        d_energy = perf._get_energy_dict()
        _ = rpt.make_plot(d_energy, h_figure=h_fig, axes_index=0)
# path = get_project_root() + '/plots/polar_L2_vs_sqr_angle.png'
# title = 'Energy: Stacked 256 LSTM-size Huber (blue), 1028 LSTM L2 (orange)'
# fig = energy_plot(models, perf_classes, title=title)#, savefig=path)
help='Saves figure(s) in root directory', action='store_true') args = parser.parse_args() if __name__ == '__main__': # * First create plot dictionaries plot_dicts = [] for model in args.inputs: #* Locate the model directory paths = hf.find_files(model) for path in paths: if path.split('/')[-1] == model: break # * Make a plotting dictionary with the datasets from the different models plot_dicts = rprt.get_performance_plot_dicts(path, plot_dicts) # * Now display (or save) desired performance plots for i, plot_dict in enumerate(plot_dicts): if args.save: plot_dict['savefig'] = hf.get_project_root( ) + '/comparisons/' + plot_dict['title'] + '.png' try: fig = rprt.make_plot(plot_dict) except FileNotFoundError: Path(hf.get_project_root() + '/comparisons/').mkdir() fig = rprt.make_plot(plot_dict)
FRAC = 0.1 from_, to_ = 0.0, 0.1 end = int(FRAC * len(tot_energy)) from_i, to_i = int(from_ * len(tot_energy)), int(to_ * len(tot_energy)) from2_, to2_ = 0.9, 1.0 end = int(FRAC * len(tot_energy)) from2_i, to2_i = int(from2_ * len(tot_energy)), int(to2_ * len(tot_energy)) path1 = get_project_root() + '/plots/transformed_E_dist.png' title1 = 'Transformed energy distribution' d1 = { 'data': [tot_charge_sorted[from_i:to_i], tot_charge_sorted[from2_i:to2_i]], 'density': [True, True] } #, 'title': title1, 'savefig': path1} a1 = rpt.make_plot(d1) x = np.arange(len(energy_sorted)) d = {'x': [x], 'y': [energy_sorted]} f = rpt.make_plot(d) # * MAKING A CUT IN TOT CHARGE tot_charge_sorted, energy_sorted = sort_pairs(tot_tot_charge, tot_energy) tot_charge_sorted = np.array(tot_charge_sorted) energy_sorted = np.array(energy_sorted) charge_cut = 80.0 indices = tot_charge_sorted < charge_cut energy_cutted = energy_sorted[indices] d1 = { 'data': [energy_sorted[indices], energy_sorted[~indices]], 'density': [False, False]
def load_and_fit_transformer(pack):
    """Fit the transformer for a single feature on up to ``n_data`` samples
    drawn from a shelve database, saving before/after histograms.

    ``pack`` bundles the arguments (for use with a process/thread map):
        ids: iterable of event ids (shelve keys) to draw samples from.
        (key, feature_dict): feature name and its config; ``feature_dict``
            holds 'transformer', 'feature_calculator' (callable for derived
            features, or falsy) and optionally 'clip' ({'min', 'max'}).
        db_path: path to the shelve database.
        n_data: number of samples to collect before fitting.

    Returns:
        {key: transformer} -- the transformer is fitted only if it is truthy.
    """
    ids, (key, feature_dict), db_path, n_data = pack
    with shelve.open(db_path, 'r') as db:
        id_iter = iter(ids)
        # Accumulate chunks and concatenate once at the end -- repeated
        # np.append() in the loop is quadratic. The empty float array seeds
        # the same dtype promotion the np.append() chain produced.
        chunks = [np.array([])]
        loaded = 0
        transformer = feature_dict['transformer']
        clip_d = feature_dict.get('clip', None)
        # * If we are dealing with a feature that needs to be transformed, make the transformer!
        if transformer:
            # * Extract the function needed for derived features
            fnc = feature_dict['feature_calculator']
            # * Loop until we have enough samples for the transformer
            while loaded < n_data:
                # * If we iterated over all data, thats it - just exit loop.
                try:
                    event = db[next(id_iter)]['raw']
                except StopIteration:
                    break
                # * If dealing with a derived feature, calculate it!
                if fnc:
                    new_data = fnc(event)
                # * If not, just load it
                else:
                    new_data = event[key]
                # Flatten, matching np.append() semantics.
                chunks.append(np.ravel(new_data))
                if isinstance(new_data, np.ndarray):
                    loaded += new_data.shape[0]
                elif isinstance(new_data, (float, int)):
                    loaded += 1
                else:
                    raise ValueError(
                        'load_and_fit_transformer: Unknown type (%s) encountered'
                        % (type(new_data)))
            data = np.concatenate(chunks)
            # * Save plot of pre-transformed data
            path = get_project_root() + '/reports/shelve_data'
            if not Path(path).exists():
                Path(path).mkdir()
            plot_d = {'data': [data], 'savefig': path + '/%s.png' % key}
            _ = make_plot(plot_d)
            # * Now fit a transformer
            transformer.fit(data.reshape(-1, 1))
            # * save plot of transformed data (clipped first, if configured)
            if clip_d:
                data_transformed = np.clip(data, clip_d['min'], clip_d['max'])
                data_transformed = transformer.transform(
                    data_transformed.reshape(-1, 1))
            else:
                data_transformed = transformer.transform(data.reshape(-1, 1))
            plot_d = {'data': [data_transformed],
                      'savefig': path + '/%s_transformed.png' % key}
            _ = make_plot(plot_d)
    return {key: transformer}
# energy_train.extend(energy)
# Count events per validation file (energy loading left commented out).
n_read = 0
for index in rand_val:
    file = get_project_root() + train[index]
    if n_read >= n_wanted:
        break
    with h5.File(file, 'r') as f:
        # energy = f[key]
        n_in_file_val.append(f['meta/events'][()])
        # n_read += len(energy)
        # energy_val.extend(energy)
# d = {'data': [energy_train, energy_val]}
# fig = rpt.make_plot(d)
title = 'Distribution of number of events in files'
path_save = get_project_root() + '/plots/n_events_in_files.png'
d = {
    'data': [n_in_file_train, n_in_file_val],
    'density': [True, True],
    'title': title,
    'label': ['Train', 'Val'],
    'savefig': path_save,
}
fig2 = rpt.make_plot(d)
# d = {'x': [np.arange(len(energy_train))], 'y': [energy_train]}
# train_fig = rpt.make_plot(d_train)
# tot_n_doms = [entry for entry in tot_n_doms if entry<100]
# from src.modules.classes import *
import src.modules.loss_funcs as lf
from src.modules.helper_functions import *
from src.modules.eval_funcs import *
import src.modules.reporting as rpt

# Collect the true primary energy of every event for one particle type.
particle = 'muon_neutrino'
dataset = get_project_root() + get_path_from_root(
    '/CubeML/data/oscnext-genie-level5-v01-01-pass2')
tot_energy = []
events_wanted = np.inf  # no cap: read everything
events_loaded = 0
for file in Path(dataset).iterdir():
    if events_loaded >= events_wanted:
        break
    # Only h5 files matching the requested particle code are read.
    if not (file.suffix == '.h5'
            and confirm_particle_type(get_particle_code(particle), file)):
        continue
    with h5.File(file, 'r') as f:
        energy = f['raw/true_primary_energy']
        events_loaded += len(energy)
        tot_energy.extend(energy)
# tot_n_doms = [entry for entry in tot_n_doms if entry<100]
# %%
path1 = get_project_root() + '/plots/transformed_E_dist.png'
title1 = 'Transformed energy distribution'
d1 = {'data': [tot_energy]}  # , 'title': title1, 'savefig': path1}
a1 = rpt.make_plot(d1)
tot_n_doms.extend(n_doms) tot_energy.extend(energy) doms, energy = sort_pairs(tot_n_doms, tot_energy) # tot_n_doms = [entry for entry in tot_n_doms if entry<100] # %% FRAC = 0.1 from_, to_ = 0.0, 0.9 end = int(FRAC * len(doms)) from_i, to_i = int(from_ * len(doms)), int(to_ * len(doms)) path1 = get_project_root() + '/plots/%s_energy_vs_seqlen.png' % (particle) title1 = '%s: Bottom and upper %.0f %% seq. length log(e) dist' % (particle, FRAC * 100) d1 = { 'data': [energy[0:end], energy[-end:-50]], 'title': title1, 'savefig': path1 } a1 = rpt.make_plot(d1) path2 = get_project_root() + '/plots/%s_seqlen.png' % (particle) title2 = '%s: Seq. length dist (entries: %.2e)' % (particle, len(doms[from_i:to_i])) d2 = {'data': [doms[from_i:to_i]], 'title': title2, 'savefig': path2} a2 = rpt.make_plot(d2) # %% morethan200 = len([entry for entry in doms if entry > 200]) print(morethan200 / len(doms))