def _split_fold(Y, U, T, fold, folds):
    """Split parallel sequences into train and test sets for one fold.

    Index ``i`` is held out for testing when ``i % folds == fold``; every
    other index goes to the training set. ``U`` and ``T`` are indexed with
    the same positions as ``Y``.

    Returns
    -------
    tuple
        ``(train_Y, train_U, train_T, test_Y, test_U, test_T)`` as lists.
    """
    test_idx = [i for i in range(len(Y)) if i % folds == fold]
    train_idx = [i for i in range(len(Y)) if i % folds != fold]
    return (
        [Y[i] for i in train_idx],
        [U[i] for i in train_idx],
        [T[i] for i in train_idx],
        [Y[i] for i in test_idx],
        [U[i] for i in test_idx],
        [T[i] for i in test_idx],
    )


def _load_or_fit(cache_path, fit):
    """Return the predictions pickled at ``cache_path``, fitting if absent.

    ``fit`` is a zero-argument callable that is only invoked when no cache
    file exists; its result is pickled to ``cache_path`` before returning.
    """
    # NOTE: pickle is only appropriate here because the cache files are
    # written by this same function; never point this at untrusted files.
    try:
        with open(cache_path, "rb") as cache_file:
            return pkl.load(cache_file)
    except FileNotFoundError:
        pass

    predictions = fit()
    with open(cache_path, "wb") as cache_file:
        pkl.dump(predictions, cache_file)
    return predictions


def fit_model(Y, U, T, model):
    """Fit one model on every cross-validation fold and cache its predictions.

    The data are split into 7 folds (sequence ``i`` is a test sequence of
    fold ``i % 7``). For each fold the predictions are loaded from
    ``tmp/bucci_diet_predictions-<fold>-<model>`` when that file exists;
    otherwise the model is fit (with ``folds=3`` passed to the fitting
    routine) and the predictions are pickled to that path.

    Parameters
    ----------
    Y, U, T : sequence
        Parallel sequences indexed by the same positions; they are passed
        through to the fitting routines unchanged.
    model : str
        One of ``"clv"``, ``"alr"``, ``"lra"``, ``"glv"``, ``"glv-ra"``.

    Returns
    -------
    None
        Results are only written to the cache files.

    Raises
    ------
    SystemExit
        With exit status 1 (after printing a message to stderr) when
        ``model`` is not a recognised name.
    """
    models = ["clv", "alr", "lra", "glv", "glv-ra"]
    if model not in models:
        print("model", (model), "must be one of", models, file=sys.stderr)
        sys.exit(1)

    # Built after validation so an invalid name fails fast without touching
    # any fitting routine.
    fitters = {
        "clv": fit_clv,
        "alr": fit_linear_alr,
        "lra": fit_linear_rel_abun,
        "glv": fit_glv,
        "glv-ra": fit_glv,
    }
    fit = fitters[model]
    # "glv-ra" is the gLV fit run with the use_rel_abun flag switched on.
    extra_kwargs = {"use_rel_abun": True} if model == "glv-ra" else {}

    folds = 7
    for fold in range(folds):
        print("running fold", fold)
        train_Y, train_U, train_T, test_Y, test_U, test_T = _split_fold(
            Y, U, T, fold, folds)

        # The cache-file suffix is exactly the model name.
        cache_path = "tmp/bucci_diet_predictions-{}-{}".format(fold, model)
        _load_or_fit(
            cache_path,
            lambda: fit(train_Y, train_T, train_U, test_Y, test_T, test_U,
                        folds=3, **extra_kwargs))
def _holdout_split(Y, U, T, fold, folds):
    """Split parallel sequences into train/test parts for one fold.

    Index ``i`` is a test index when ``i % folds == fold``.

    Returns
    -------
    tuple
        ``((train_Y, train_U, train_T), (test_Y, test_U, test_T))``.
    """
    held_out = [i for i in range(len(Y)) if i % folds == fold]
    kept = [i for i in range(len(Y)) if i % folds != fold]
    train = tuple([seq[i] for i in kept] for seq in (Y, U, T))
    test = tuple([seq[i] for i in held_out] for seq in (Y, U, T))
    return train, test


def _cached_prediction(cache_path, fit, *fit_args, **fit_kwargs):
    """Load pickled predictions from ``cache_path`` or fit and cache them.

    ``fit(*fit_args, **fit_kwargs)`` is only called when the cache file does
    not exist; its result is pickled to ``cache_path`` and returned.
    """
    # NOTE: pickle is acceptable only because these files are produced by
    # this function itself; do not load untrusted files this way.
    try:
        with open(cache_path, "rb") as cache_file:
            return pkl.load(cache_file)
    except FileNotFoundError:
        pass

    predictions = fit(*fit_args, **fit_kwargs)
    with open(cache_path, "wb") as cache_file:
        pkl.dump(predictions, cache_file)
    return predictions


def _plot_test_sequence(idx, obs_dim, clv_seq, glv_seq, alr_seq, ra_seq):
    """Save the per-sequence comparison figure for test sequence ``idx``.

    Each ``*_seq`` argument is one 2-D array from ``compute_errors_by_time``.
    Judging by the panel titles and axis labels, column 0 is the x value
    (presumably time), column 1 the squared error, the next ``obs_dim``
    columns the truth and the following ``obs_dim`` columns the prediction
    -- TODO confirm against ``compute_errors_by_time``.
    """
    truth_cols = slice(2, 2 + obs_dim)
    pred_cols = slice(2 + obs_dim, 2 + 2 * obs_dim)

    fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(8, 10))

    # Left column (plus top right): stacked bars of truth and predictions.
    bar_panels = [
        (ax[0][0], clv_seq, truth_cols, "Truth"),
        (ax[0][1], clv_seq, pred_cols, "cLV"),
        (ax[1][0], glv_seq, pred_cols, "gLV"),
        (ax[2][0], alr_seq, pred_cols, "alr-linear"),
        (ax[3][0], ra_seq, pred_cols, "ra-linear"),
    ]
    for panel, seq, cols, title in bar_panels:
        plot_bar(panel, seq[:, cols], seq[:, 0])
        # NOTE(review): ``clv_seq[:0]`` is an empty row slice, so this clears
        # the x ticks. It may be a typo for ``clv_seq[:, 0]``; behaviour is
        # kept unchanged here -- confirm the intent before changing it.
        panel.set_xticks(clv_seq[:0].tolist())
        panel.set_xticklabels(clv_seq[:0].tolist())
        panel.set_title(title)

    # Right column: error of each competitor minus the cLV error.
    error_diffs = []
    for row, seq in enumerate((glv_seq, alr_seq, ra_seq), start=1):
        diff = seq[:, 1] - clv_seq[:, 1]
        ax[row][1].scatter(seq[:, 0], diff)
        # The first row is left out of the shared y-limits.
        error_diffs.append(diff[1:])

    error_diffs = np.array(error_diffs)
    ymin = np.min(error_diffs) - 0.15
    ymax = np.max(error_diffs) + 0.15

    for row in range(1, 4):
        ax[row][1].set_ylim(ymin, ymax)
        ax[row][1].set_xlim(ax[row][0].get_xlim())
        ax[row][1].axhline(y=0, linestyle=":", color="black", linewidth=0.5)
        ax[row][0].set_yticklabels([])
    ax[0][0].set_yticklabels([])
    ax[0][1].set_yticklabels([])

    ax[1][1].set_ylabel("Sqr Err(gLV) $-$ Sqr Err(cLV)", fontsize=9)
    ax[2][1].set_ylabel("Sqr Err(alr-linear) $-$ Sqr Err(cLV)", fontsize=9)
    ax[3][1].set_ylabel("Sqr Err(ra-linear) $-$ Sqr Err(cLV)", fontsize=9)

    plt.tight_layout()
    plt.savefig(
        "plots/bucci-diet_prediction_comparison-test-{}.pdf".format(idx))
    # One figure is created per test sequence; close it so figures do not
    # accumulate in memory across the loop.
    plt.close(fig)


def prediction_experiment(Y, U, T):
    """Compare cLV against the other models using 7-fold cross validation.

    For every fold the predictions of cLV, linear ALR, linear relative
    abundance and gLV are loaded from ``tmp_c2b2/bucci_diet_predictions-*``
    (or fit with ``folds=3`` and cached there). Errors are then compared
    against cLV, a box plot is written to
    ``plots/bucci-diet_prediction-comparison.pdf`` and one figure per test
    sequence to ``plots/bucci-diet_prediction_comparison-test-<idx>.pdf``.
    The gLV relative-abundance variant is not part of this comparison.

    Parameters
    ----------
    Y, U, T : sequence
        Parallel sequences indexed by the same positions. ``Y[0]`` must be
        2-D; its second dimension is used as the number of observed
        components when plotting.

    Returns
    -------
    tuple
        ``(baseline, baseline_p), (linear, linear_p),
        (rel_abun, rel_abun_p), (glv, glv_p)``. Each first element is a
        NumPy array of error differences (model error minus cLV error) and
        each second element is element ``[1]`` of ``wilcoxon_exact`` applied
        to the per-fold summed errors with ``alternative="greater"``, which
        is used as the p-value in the plot labels.
    """
    # plot fit on test data with cross validation
    baseline_err_cv = []
    clv_err_cv = []
    linear_err_cv = []
    rel_abun_err_cv = []
    glv_err_cv = []

    clv_err_stratified_cv = []
    linear_err_stratified_cv = []
    rel_abun_err_stratified_cv = []
    glv_err_stratified_cv = []

    folds = 7
    for fold in range(folds):
        print("running fold", fold)
        (train_Y, train_U, train_T), (test_Y, test_U, test_T) = \
            _holdout_split(Y, U, T, fold, folds)
        fit_args = (train_Y, train_T, train_U, test_Y, test_T, test_U)
        cache_prefix = "tmp_c2b2/bucci_diet_predictions-{}".format(fold)

        print("cLV")
        pred_clv = _cached_prediction(
            cache_prefix + "-clv", fit_clv, *fit_args, folds=3)

        print("Linear ALR")
        pred_alr = _cached_prediction(
            cache_prefix + "-alr", fit_linear_alr, *fit_args, folds=3)

        print("Linear Rel Abun")
        pred_lra = _cached_prediction(
            cache_prefix + "-lra", fit_linear_rel_abun, *fit_args, folds=3)

        print("gLV")
        pred_glv = _cached_prediction(
            cache_prefix + "-glv", fit_glv, *fit_args, folds=3)

        # One entry per fold.
        baseline_err_cv.append(compute_baseline_errors(test_Y))
        clv_err_cv.append(compute_errors(test_Y, pred_clv))
        linear_err_cv.append(compute_errors(test_Y, pred_alr))
        rel_abun_err_cv.append(compute_errors(test_Y, pred_lra))
        glv_err_cv.append(compute_errors(test_Y, pred_glv))

        # compute_errors_by_time returns a list that is concatenated across
        # folds; each item is later plotted as one test sequence.
        clv_err_stratified_cv += compute_errors_by_time(test_Y, pred_clv)
        linear_err_stratified_cv += compute_errors_by_time(test_Y, pred_alr)
        rel_abun_err_stratified_cv += compute_errors_by_time(test_Y, pred_lra)
        glv_err_stratified_cv += compute_errors_by_time(test_Y, pred_glv)

    baseline = []
    linear = []
    rel_abun = []
    glv = []

    # compute p-values for difference in total error per sample
    baseline_sum = []
    linear_sum = []
    rel_abun_sum = []
    glv_sum = []
    clv_sum = []
    for cl, bl, ln, ra, gl in zip(clv_err_cv, baseline_err_cv, linear_err_cv,
                                  rel_abun_err_cv, glv_err_cv):
        baseline += (bl - cl).tolist()
        linear += (ln - cl).tolist()
        rel_abun += (ra - cl).tolist()
        glv += (gl - cl).tolist()

        baseline_sum.append(np.sum(bl))
        linear_sum.append(np.sum(ln))
        rel_abun_sum.append(np.sum(ra))
        glv_sum.append(np.sum(gl))
        clv_sum.append(np.sum(cl))

    baseline = np.array(baseline)
    linear = np.array(linear)
    rel_abun = np.array(rel_abun)
    glv = np.array(glv)

    baseline_p = wilcoxon_exact(baseline_sum, clv_sum,
                                alternative="greater")[1]
    linear_p = wilcoxon_exact(linear_sum, clv_sum, alternative="greater")[1]
    rel_abun_p = wilcoxon_exact(rel_abun_sum, clv_sum,
                                alternative="greater")[1]
    glv_p = wilcoxon_exact(glv_sum, clv_sum, alternative="greater")[1]

    df = pd.DataFrame(
        np.array([baseline, glv, linear, rel_abun]).T,
        columns=[
            "baseline\n$p={:.3f}$".format(baseline_p),
            "gLV\n$p={:.3f}$".format(glv_p),
            "alr-linear\n$p={:.3f}$".format(linear_p),
            "ra-linear\n$p={:.3f}$".format(rel_abun_p)
        ])
    ax = df.boxplot(showmeans=True)
    ax.set_ylabel("Square Error(X) $-$ Square Error(cLV)")
    ax.set_title("Diet Dataset")
    plt.savefig("plots/bucci-diet_prediction-comparison.pdf")

    # The per-sequence arrays get their own names: reusing ``glv`` and
    # ``linear`` as loop variables would overwrite the difference arrays
    # that are returned below.
    per_sequence = zip(clv_err_stratified_cv, glv_err_stratified_cv,
                       linear_err_stratified_cv, rel_abun_err_stratified_cv)
    for idx, (clv_seq, glv_seq, alr_seq, ra_seq) in enumerate(per_sequence):
        obs_dim = Y[0].shape[1]
        _plot_test_sequence(idx, obs_dim, clv_seq, glv_seq, alr_seq, ra_seq)

    return (baseline, baseline_p), (linear, linear_p), \
        (rel_abun, rel_abun_p), (glv, glv_p)