best_run = argmedian(runs.loss_train)
best_auc = np.argmax(runs.auc)
gym = training.best_model_factory()
model = gym.model
model.load_state_dict(runs.model[best_run].to_dict())
model.eval()

#%%
loss = runs.losses_train[best_run]
x = np.linspace(1, (len(loss) + 1) * dl_config["batch_size"], len(loss))
plt.plot(x, loss, label="training loss")
plt.xlabel("# of waveforms used for training")
plt.ylabel("EMD loss")
plt.xscale("log")
plt.show_and_save("loss during training")

# %%
val_losses = runs.loss_val[best_run]
names = dataset_val.df["MC_name"].values
unames = np.unique(names)
data = [val_losses[names == name] for name in unames]
name_translator = {
    "valid": "normal",
    "gaussian_reshaped": "Gaussian reshaped",
    "double pulse": "double peak",
}
label_names = [name_translator[n] for n in unames]
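# Note: argmedian is not a NumPy built-in; it is assumed to be a project helper
# that returns the index of the run whose training loss sits at (or closest to)
# the median. A minimal sketch under that assumption:
def argmedian(values):
    """Return the index of the element closest to the median."""
    values = np.asarray(values)
    return int(np.argmin(np.abs(values - np.median(values))))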
interactive.save_value("ensemble size 1e5 without outliers", len(without_outliers.runs), ".1e") # %% for run in [small_runs, big_runs]: data = [ run.with_outliers.auc, run.without_outliers.auc[run.without_outliers.auc != None] ] labels = ["with outliers", "without outliers"] plt.hist(data, bins=30, histtype="step", density=True, label=labels) plt.xlabel("AUC") plt.ylabel("frequency") plt.legend(loc="upper left") plt.show_and_save("training without outliers - AUC comparison " + run.name) KS_statistic, p_value = scipy.stats.ks_2samp(*data, mode='exact') interactive.save_value( "p-value null hypo AUC of training wo outliers is the same as training with outliers " + run.name, p_value, ".2e") interactive.save_value( "ks statistic null hypo AUC of training wo outliers is the same as training with outliers " + run.name, KS_statistic, ".1e") interactive.save_value("mean AUC with outliers " + run.name, f"{data[0].mean():.2f} ") interactive.save_value("mean AUC without outliers " + run.name, f"{data[1].mean():.2f}") data = [ run.with_outliers.loss_val.mean(axis=1), run.without_outliers.loss_val.mean(axis=1)
plt.set_plot_path(__file__)
interactive.set_saved_value_yaml_root_by_filename(__file__)

#%%
ana = tune.Analysis("~/ray_results/final-1/")

# %%
dfs = ana.trial_dataframes
interactive.save_value("number of configurations", len(dfs), ".1e")

#%%
plot_training_overview = False
if plot_training_overview:
    ax = None  # this plots everything on the same plot
    for d in tqdm(dfs.values()):
        d.auc.plot(ax=ax, legend=False)
    plt.show_and_save("training overview")

#%%
aucs = []
times = []
for d in tqdm(dfs.values()):
    aucs.extend(d.auc)
    times.extend(range(len(d.auc)))
times = np.asarray(times)  # +1)*hpo.waveforms_per_step
plt.hist(times, bins=100)
plt.xlabel("HPO steps")
plt.ylabel("models still training")
plt.show_and_save("HPO models vs time")

interactive.save_value("waveforms used for training per HPO step",
                       hpo.waveforms_per_step, ".1e")
interactive.save_value(
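# A minimal sketch (assuming every trial dataframe has an "auc" column, as used
# above) of how the best HPO trial could be read off these dataframes; the
# original analysis may rely on ray.tune's own helpers instead:
final_aucs = {trial: df.auc.iloc[-1] for trial, df in dfs.items()}
best_trial = max(final_aucs, key=final_aucs.get)
print(f"best trial: {best_trial}, final AUC: {final_aucs[best_trial]:.3f}")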
unique_seps = np.unique(separations)
loss_valid = losses[separations == 0]
aucs = []
for sep in unique_seps:
    if sep == 0:
        continue
    loss_peak = losses[separations == sep]
    classes = [0] * len(loss_valid) + [1] * len(loss_peak)
    auc = analysis.calc_auc(np.hstack((loss_valid, loss_peak)), classes)
    aucs.append(auc)

# %%
plt.plot(unique_seps[1:], aucs)
plt.xlabel("peak separation in s")
plt.ylabel("AUC")
plt.show_and_save("peak separation vs auc")

# %%
means = []
stds = []
for sep in unique_seps:
    loss = losses[separations == sep]
    means.append(loss.mean())
    stds.append(loss.std(ddof=1))
means = np.array(means)
stds = np.array(stds)
plt.plot(unique_seps, means, label="mean loss of outlier waveforms")
plt.fill_between(unique_seps, means + stds, means - stds,
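# analysis.calc_auc is project code; a minimal sketch of the behaviour assumed
# here (ROC AUC for separating normal from double-peak waveforms by their loss),
# written with sklearn purely for illustration:
from sklearn.metrics import roc_auc_score

def calc_auc_sketch(losses, classes):
    """AUC of classifying `classes` (0 = normal, 1 = outlier) by loss value."""
    return roc_auc_score(classes, losses)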
        validation, max_validation_steps=max_iter)
    yield np.hstack(g.validation_loss())


loss_MC, loss_toy = compare(model_MC, model_toy, val_toy)

# %%
plt.hist([loss_MC, loss_toy],
         bins=int(np.sqrt(len(loss_MC))),
         density=True,
         histtype="step",
         label=["IceCube MC", "Toy MC"])
plt.xlabel("EMD loss")
plt.ylabel("frequency")
plt.legend()
plt.show_and_save("toy vs MC on val toy")

# %%
analysis.plot_auc(loss_MC, dataset_val_toy.df.MC_type != 0)
plt.show_and_save("MC model on toy val")
interactive.save_value(
    "AUC of MC model on toy val",
    analysis.calc_auc(loss_MC, dataset_val_toy.df.MC_type != 0), ".2f")

#%%
max_iter = 20
loss_MC, loss_toy = compare(model_MC, model_toy, val_MC, max_iter=max_iter)

# %%
plt.hist([loss_MC, loss_toy],
         bins=int(np.sqrt(len(loss_MC))),
         density=True,
r = TrainingStability("Ribbles_w_outliers_1e6", None)
r.load()
interactive.save_value("trained models used for stability estimates",
                       len(r.model))

#%%
loss_val = r.loss_val.mean(axis=1)
cut = np.quantile(r.loss_train, 0.8)  # np.mean(loss_val) #+ np.std(loss_val)*0.5
# cut2 = np.mean(r.auc) + np.std(loss_val)*0.5
filter = (r.loss_train < cut)  # & (r.auc > cut2)
interactive.save_value("excluded values from stability plot",
                       f"{100 * sum(~filter) / len(filter):0.1f}")
plt.hexbin(loss_val[filter], r.auc[filter], linewidths=0.2,
           gridsize=int(np.sqrt(len(r.auc[filter]))))
plt.xlabel("EMD validation loss")
plt.ylabel("AUC")
plt.colorbar(label="count")
plt.show_and_save(f"va training stability on {len(r.auc)} samples")

# %%
plt.hexbin(r.loss_train[filter], r.auc[filter], linewidths=0.2,
           gridsize=int(np.sqrt(len(r.auc[filter]))))
plt.xlabel("EMD training loss")
plt.ylabel("AUC")
plt.colorbar(label="count")
plt.show_and_save(f"ta training stability on {len(r.auc)} samples")

#%%
# cut = np.quantile(r.loss_train, 0.8)
filter = (r.loss_train < 0.08)  # cut
cut_percentage = 1 - len(r.loss_train[filter]) / len(r.loss_train)
interactive.save_value("underperformance percentage", cut_percentage * 100, ".1f")
plt.subplot(2, 1, 1)
bins = int(np.sqrt(len(data)))
# plt.subplot(2, 1, 1)
plt.hist(low_loss, bins=bins)
plt.xlabel("EMD loss")
plt.ylabel("count")
plt.yscale("log")
# plt.xlim(0, max(low_loss) * 1.01)
# plt.subplot(2, 1, 2)
# plt.hist(data[data > 0], bins=bins)
# plt.xlabel("EMD loss")
# plt.ylabel("count")
# plt.yscale("log")
plt.show_and_save("loss hist 1e6")
interactive.save_value("loss hist 1e6 number of models", len(data), ".1e")
interactive.save_value(
    "number of waveforms for loss hist 1e6 plot", equivalent_1e6 * batch_size, ".1e"
)
interactive.save_value(
    "percentage of bad models", len(data[data > 0.07]) / len(data) * 100, ".1f"
)
interactive.save_value(
    "median loss for 1e6", np.median(data), ".3f"
)

#%%
# remove failed models
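# A minimal sketch of the "remove failed models" step announced above, assuming
# failed runs show up as non-finite or non-positive losses (the project's actual
# criterion may differ):
finished = np.isfinite(data) & (data > 0)
data = data[finished]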
try:
    modelname = model.model.name
except AttributeError:
    modelname = model.model.__class__.__name__
name = f"{i}, Model:{modelname}, Loss: {lossname}"
print(f"Training {name}:")

loss = model.train_batches(steps)
x = np.linspace(0, (steps + 1) * dl_config["batch_size"], len(loss))
plt.plot(x, loss)
plt.xlabel("# of waveforms used for training")
plt.ylabel(f"loss {lossname}")
plt.xscale("log")
plt.figtext(0, 0, name)
plt.show_and_save(f"{name} + training")

loss_func = lambda p, t: EMD.torch_auto(p, t, mean=False)
val_losses = np.hstack(model.validation_loss(loss_func))  # restack batches
names = dataset_val.df["MC_name"].values
_, bins, _ = plt.hist(val_losses, int(np.sqrt(len(val_losses))),
                      label="everything")
plt.clf()
unames = np.unique(names)
data = [val_losses[names == name] for name in unames]
plt.hist(data, bins=bins, label=unames, stacked=True)
plt.xlabel("EMD loss")
plt.xscale("log") plt.ylim([0.5, 0.9]) plt.legend() # loc='lower bottom') ax1 = plt.gca() ax2 = ax1.twiny() ax2.plot(times, times) labels = [ "${10^{" + f"{np.log10(i):.0f}" + "}}$" for i in ax1.get_xticks() / training_set_size ] ax2.set_xscale("log") ax2.set_xticklabels(labels) ax2.set_xlabel("epochs") plt.show_and_save("training time vs outliers mean") #%% for i, latent_dim in enumerate(latents): plt.errorbar( times, val_loss_mean[:, i], yerr=val_loss_mean_std[:, i], label=f"latent dim: {latent_dim}", color=f"C{i}", ) # plt.plot(times,train_loss_mean[:,i],color=f"C{i}") plt.ylabel("mean validation EMD loss") plt.xlabel("waveforms used for training")
lossname = model.loss_func.__class__.__name__
try:
    modelname = model.model.name
except AttributeError:
    modelname = model.model.__class__.__name__
name = f"on IceCube MC latent space 6: {modelname}, Loss: {lossname}"
print(f"Training {name}:")

count_training_waveforms = (steps + 1) * dataset_train.batch_loading_size * train.batch_size
x = np.linspace(0, count_training_waveforms, len(loss))
plt.plot(x, loss)
plt.xlabel("# of waveforms used for training")
plt.ylabel(f"loss {lossname}")
plt.xscale("log")
plt.figtext(0, 0, name)
plt.show_and_save(f"{name} + training")
plt.clf()

#%%
bins = 200  # int(np.sqrt(len(val_losses)))
_, bins, _ = plt.hist(val_losses, bins, density=True)
plt.xlabel("EMD loss")
plt.ylabel("frequency")
# plt.yscale('log')
plt.show_and_save(f"{name} + hist")
plt.clf()

# %%
from icae.tools.config_loader import config

torch.save(
    model.model.state_dict(),
    config.root + f"icae/models/trained/{count_training_waveforms:.0e} samples {val_losses.mean():.1e} loss latent_space 6 IceMC.pt",
)

# %%
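# A minimal sketch of restoring the checkpoint written above for later
# evaluation (the "<checkpoint name>" placeholder stands for the file name
# produced by the save call):
state = torch.load(
    config.root + "icae/models/trained/<checkpoint name>.pt",
    map_location="cpu",
)
model.model.load_state_dict(state)
model.model.eval()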