# Example #1
# Pick a representative run (by median training loss) from the trained
# ensemble and restore its weights for evaluation.
best_run = argmedian(runs.loss_train)
best_auc = np.argmax(runs.auc)  # index of the best-AUC run (not used below)
gym = training.best_model_factory()
model = gym.model
model.load_state_dict(runs.model[best_run].to_dict())
model.eval()

#%%
# Training-loss curve of the chosen run.
# NOTE(review): `losses_train` here vs. `loss_train` above — confirm both
# attributes actually exist on `runs`.
loss = runs.losses_train[best_run]
x = np.linspace(1, (len(loss) + 1) * dl_config["batch_size"], len(loss))
plt.plot(x, loss, label="training loss")
plt.xlabel("# of waveforms used for training")
plt.ylabel("EMD loss")  # fixed: was an f-string without placeholders
plt.xscale("log")
plt.show_and_save("loss during training")

# %%
# Per-waveform validation losses of the chosen run, grouped by MC class name.
val_losses = runs.loss_val[best_run]

names = dataset_val.df["MC_name"].values
unames = np.unique(names)
data = [val_losses[names == name] for name in unames]

# Map internal MC class names to the labels used in the plots.
name_translator = {
    "valid": "normal",
    "gaussian_reshaped": "Gaussian reshaped",
    "double pulse": "double peak"
}
label_names = [name_translator[n] for n in unames]
# Example #2
    # (fragment — the enclosing indented block starts before this excerpt)
    interactive.save_value("ensemble size 1e5 without outliers",
                           len(without_outliers.runs), ".1e")


# %%
# Compare AUC distributions of models trained with vs. without outliers,
# for both the small and the big run collection.
for run in [small_runs, big_runs]:
    data = [
        run.with_outliers.auc,
        # elementwise `!= None` drops unset entries; presumably `auc` is an
        # object array containing None for unfinished runs — TODO confirm
        run.without_outliers.auc[run.without_outliers.auc != None]
    ]
    labels = ["with outliers", "without outliers"]
    plt.hist(data, bins=30, histtype="step", density=True, label=labels)
    plt.xlabel("AUC")
    plt.ylabel("frequency")
    plt.legend(loc="upper left")
    plt.show_and_save("training without outliers - AUC comparison " + run.name)
    # Two-sample Kolmogorov-Smirnov test: are the two AUC samples drawn from
    # the same distribution?
    KS_statistic, p_value = scipy.stats.ks_2samp(*data, mode='exact')
    interactive.save_value(
        "p-value null hypo AUC of training wo outliers is the same as training with outliers "
        + run.name, p_value, ".2e")
    interactive.save_value(
        "ks statistic null hypo AUC of training wo outliers is the same as training with outliers "
        + run.name, KS_statistic, ".1e")
    interactive.save_value("mean AUC with outliers " + run.name,
                           f"{data[0].mean():.2f} ")
    interactive.save_value("mean AUC without outliers " + run.name,
                           f"{data[1].mean():.2f}")

    # Same comparison for the per-run mean validation loss.
    # NOTE(review): the excerpt ends mid-statement here.
    data = [
        run.with_outliers.loss_val.mean(axis=1),
        run.without_outliers.loss_val.mean(axis=1)
# Example #3
# Configure where plots and saved values for this script go.
plt.set_plot_path(__file__)
interactive.set_saved_value_yaml_root_by_filename(__file__)

#%%
# Load the Ray Tune hyper-parameter-optimization results of run "final-1".
ana = tune.Analysis("~/ray_results/final-1/")
# %%
dfs = ana.trial_dataframes
interactive.save_value("number of configurations", len(dfs), ".1e")
#%%
plot_training_overview = False  # flip to True to plot every trial's AUC curve
if plot_training_overview:
    ax = None  # This plots everything on the same plot
    for d in tqdm(dfs.values()):
        d.auc.plot(ax=ax, legend=False)
    plt.show_and_save("training overview")

#%%
# Collect the AUC of every trial at every HPO step; `times` is the step index
# within each trial, so its histogram shows how many models were still
# training at a given step (trials stop at different times).
aucs = []
times = []
for d in tqdm(dfs.values()):
    aucs.extend(d.auc)
    times.extend(range(len(d.auc)))
times = np.asarray(times)  # +1)*hpo.waveforms_per_step
plt.hist(times, bins=100)
plt.xlabel("HPO steps")
plt.ylabel("models still training")
plt.show_and_save("HPO models vs time")
interactive.save_value("waveforms used for training per HPO step",
                       hpo.waveforms_per_step, ".1e")
# NOTE(review): the excerpt is cut off mid-call here.
interactive.save_value(
# (fragment) Peak-separation study: `separations` and `losses` are defined in
# a part of the file not shown here.
unique_seps = np.unique(separations)
loss_valid = losses[separations == 0]  # separation 0 == normal waveforms

# For each nonzero peak separation, compute the AUC for discriminating
# double-peak waveforms (class 1) from normal ones (class 0) by their loss.
aucs = []
for sep in unique_seps:
    if sep == 0: continue
    loss_peak = losses[separations == sep]
    classes = [0] * len(loss_valid) + [1] * len(loss_peak)
    auc = analysis.calc_auc(np.hstack((loss_valid, loss_peak)), classes)
    aucs.append(auc)

# %%
# unique_seps[1:] skips the first entry to match the `continue` above —
# assumes 0 is the smallest separation present (np.unique sorts ascending).
plt.plot(unique_seps[1:], aucs)
plt.xlabel("peak separation in s")
plt.ylabel("AUC")
plt.show_and_save("peak separation vs auc")

# %%
# Mean and sample standard deviation (ddof=1) of the loss per separation.
means = []
stds = []
for sep in unique_seps:
    loss = losses[separations == sep]
    means.append(loss.mean())
    stds.append(loss.std(ddof=1))

means = np.array(means)
stds = np.array(stds)
plt.plot(unique_seps, means, label="mean loss of outlier waveforms")
# Shade a one-sigma band around the mean.
plt.fill_between(unique_seps,
                 means + stds,
                 means - stds,
# NOTE(review): the excerpt jumps here mid-call; the lines below are the tail
# of an unrelated generator function from elsewhere in the file.
                validation,
                max_validation_steps=max_iter)
        yield np.hstack(g.validation_loss())
# (fragment) Compare a model trained on IceCube MC with one trained on toy MC,
# both evaluated on the toy validation set. `compare` is defined outside this
# excerpt.
loss_MC, loss_toy = compare(model_MC, model_toy, val_toy)
# %%
plt.hist([loss_MC, loss_toy],
         bins=int(np.sqrt(len(loss_MC))),  # sqrt rule-of-thumb bin count
         density=True,
         histtype="step",
         label=["IceCube MC", "Toy MC"])
plt.xlabel("EMD loss")
plt.ylabel("frequency")
plt.legend()
plt.show_and_save("toy vs MC on val toy")

# %%
# ROC/AUC of the MC-trained model for separating outliers (MC_type != 0)
# on the toy validation data.
analysis.plot_auc(loss_MC, dataset_val_toy.df.MC_type != 0)
plt.show_and_save("MC model on toy val")
interactive.save_value(
    "AUC of MC model on toy val",
    analysis.calc_auc(loss_MC, dataset_val_toy.df.MC_type != 0), ".2f")

#%%
# Same comparison on the MC validation set; max_iter presumably caps the
# number of validation steps in `compare` — TODO confirm.
max_iter = 20
loss_MC, loss_toy = compare(model_MC, model_toy, val_MC, max_iter=max_iter)
# %%
# NOTE(review): the excerpt ends mid-call here.
plt.hist([loss_MC, loss_toy],
         bins=int(np.sqrt(len(loss_MC))),
         density=True,
# Example #6
# Stability study: load an ensemble of identically-configured trainings and
# plot validation/training loss against AUC as 2D density (hexbin) plots.
r = TrainingStability("Ribbles_w_outliers_1e6", None)
r.load()

interactive.save_value("trained models used for stability estimates", len(r.model))
#%%
loss_val = r.loss_val.mean(axis=1)  # mean validation loss per trained model
# Keep the 80% of models with the lowest training loss; the remaining 20% are
# treated as underperforming trainings and excluded from the plots.
cut = np.quantile(r.loss_train, 0.8)
# `mask` selects the models to plot (was `filter`, which shadowed the builtin).
mask = r.loss_train < cut
interactive.save_value("excluded values from stability plot",
                       f"{100*sum(~mask)/len(mask):0.1f}")

# gridsize ~ sqrt(N) mirrors the sqrt rule-of-thumb for histogram binning.
plt.hexbin(loss_val[mask], r.auc[mask], linewidths=0.2,
           gridsize=int(np.sqrt(len(r.auc[mask]))))
plt.xlabel("EMD validation loss")
plt.ylabel("AUC")
plt.colorbar(label="count")
plt.show_and_save(f"va training stability on {len(r.auc)} samples")

# %%
# Same density plot against the training loss.
plt.hexbin(r.loss_train[mask], r.auc[mask], linewidths=0.2,
           gridsize=int(np.sqrt(len(r.auc[mask]))))
plt.xlabel("EMD training loss")
plt.ylabel("AUC")
plt.colorbar(label="count")
plt.show_and_save(f"ta training stability on {len(r.auc)} samples")

#%%
#cut = #np.quantile(r.loss_train,0.8)
# Fixed threshold: models with training loss >= 0.08 are considered
# underperforming.  NOTE(review): `filter` shadows the builtin — rename once
# all downstream cells that might use it are in view.
filter = (r.loss_train < 0.08)#cut)
cut_percentage = 1-len(r.loss_train[filter])/len(r.loss_train)
interactive.save_value("underperformance percentage",cut_percentage*100,".1f")

plt.subplot(2,1,1)
# NOTE(review): `data` and `low_loss` are defined in a part of the file not
# shown in this excerpt.
bins = int(np.sqrt(len(data)))

#plt.subplot(2, 1, 1)
plt.hist(low_loss, bins=bins)
plt.xlabel("EMD loss")
plt.ylabel("count")
plt.yscale("log")
#plt.xlim(0,max(low_loss)*1.01)

# plt.subplot(2, 1, 2)
# plt.hist(data[data > 0], bins=bins)
# plt.xlabel("EMD loss")
# plt.ylabel("count")
# plt.yscale("log")

plt.show_and_save("loss hist 1e6")

# Persist summary statistics of the ensemble for the write-up.
interactive.save_value("loss hist 1e6 number of models", len(data), ".1e")
interactive.save_value(
    "number of waveforms for loss hist 1e6 plot", equivalent_1e6 * batch_size, ".1e"
)
# Models with loss above 0.07 are counted as "bad".
interactive.save_value(
    "percentage of bad models", len(data[data > 0.07]) / len(data) * 100, ".1f"
)

interactive.save_value(
    "median loss for 1e6", np.median(data), ".3f"
)

#%%
# remove failed models
# Example #8
        # (fragment — interior of a loop over models; `model`, `i`, `lossname`,
        # `steps`, `dl_config`, `dataset_val`, `EMD` come from outside this
        # excerpt)
        try:
            modelname = model.model.name
        except:
            # NOTE(review): bare except — should be `except AttributeError:`.
            # Falls back to the wrapped model's class name.
            modelname = model.model.__class__.__name__
        name = f"{i}, Model:{modelname}, Loss: {lossname}"
        print(f"Training {name}:")

        loss = model.train_batches(steps)

        # x-axis: cumulative number of training waveforms (steps * batch size).
        x = np.linspace(0, (steps + 1) * dl_config["batch_size"], len(loss))
        plt.plot(x, loss)
        plt.xlabel("# of waveforms used for training")
        plt.ylabel(f"loss {lossname}")
        plt.xscale("log")
        plt.figtext(0, 0, name)
        plt.show_and_save(f"{name} + training")

        # Per-waveform (mean=False) EMD validation losses, restacked into one
        # flat array.
        loss_func = lambda p, t: EMD.torch_auto(p, t, mean=False)
        val_losses = np.hstack(
            model.validation_loss(loss_func))  # restack batches
        names = dataset_val.df["MC_name"].values

        # First histogram is only used to obtain common bin edges; the figure
        # is cleared right away.
        _, bins, _ = plt.hist(val_losses,
                              int(np.sqrt(len(val_losses))),
                              label="everything")
        plt.clf()

        # Stacked histogram of the validation loss, split by MC class name.
        unames = np.unique(names)
        data = [val_losses[names == name] for name in unames]
        plt.hist(data, bins=bins, label=unames, stacked=True)
        plt.xlabel("EMD loss")
# Example #9
# (fragment) Tail of a training-comparison plot; `times`, `training_set_size`,
# `latents`, `val_loss_mean`, `val_loss_mean_std` are defined outside this
# excerpt.
plt.xscale("log")
plt.ylim([0.5, 0.9])
plt.legend()  # loc='lower bottom')

# Add a second x-axis on top, relabelled in epochs instead of waveform counts.
ax1 = plt.gca()
ax2 = ax1.twiny()
ax2.plot(times, times)  # presumably to give the twin axis a data range — TODO confirm
labels = [
    "${10^{" + f"{np.log10(i):.0f}" + "}}$"
    for i in ax1.get_xticks() / training_set_size
]
ax2.set_xscale("log")
ax2.set_xticklabels(labels)
ax2.set_xlabel("epochs")

plt.show_and_save("training time vs outliers mean")

#%%
# Mean validation loss over training time: one error-bar curve per latent
# dimension, with stable colors from the C{i} color cycle.
for i, latent_dim in enumerate(latents):
    plt.errorbar(
        times,
        val_loss_mean[:, i],
        yerr=val_loss_mean_std[:, i],
        label=f"latent dim: {latent_dim}",
        color=f"C{i}",
    )

    # plt.plot(times,train_loss_mean[:,i],color=f"C{i}")

plt.ylabel("mean validation EMD loss")
plt.xlabel("waveforms used for training")
# Example #10
    # (fragment) tail of an except-block whose `try` lies before this excerpt:
    # fall back to the loss function's class name.
    lossname=model.loss_func.__class__.__name__
# Derive a human-readable model name: prefer an explicit `.name` attribute,
# falling back to the class name.  The original bare `except:` (which would
# also have swallowed KeyboardInterrupt etc.) is narrowed to AttributeError.
try:
    modelname = model.model.name
except AttributeError:
    modelname = model.model.__class__.__name__
name = f"on IceCube MC latent space 6: {modelname}, Loss: {lossname}"
print(f"Training {name}:")

# Total waveforms seen during training: steps * (loading chunk size * batch size).
count_training_waveforms = (steps + 1) * dataset_train.batch_loading_size * train.batch_size
x = np.linspace(0, count_training_waveforms, len(loss))
plt.plot(x, loss)
plt.xlabel("# of waveforms used for training")
plt.ylabel(f"loss {lossname}")
plt.xscale("log")
plt.figtext(0, 0, name)  # tag the figure with the run name
plt.show_and_save(f"{name} + training")
plt.clf()
#%%
# Histogram of per-waveform validation losses with a fixed bin count.
bins = 200  # alternative: int(np.sqrt(len(val_losses)))
_, bins, _ = plt.hist(val_losses, bins, density=True)  # keep the bin edges
plt.xlabel("EMD loss")
plt.ylabel("frequency")
#plt.yscale('log')
plt.show_and_save(f"{name} + hist")
plt.clf()

# %%
# Persist the trained weights; the filename encodes the training-set size and
# the mean validation loss.
from icae.tools.config_loader import config
torch.save(model.model.state_dict(), config.root + f"icae/models/trained/{count_training_waveforms:.0e} samples {val_losses.mean():.1e} loss latent_space 6 IceMC.pt")
# %%