plt.title('Optimal lambda: 1e{0}'.format(np.round(np.log10(opt_lambda), 4)))
plt.loglog(lambdas,train_err_vs_lambda.T,'b.-',lambdas,test_err_vs_lambda.T,'r.-')
plt.xlabel('Regularization factor')
plt.ylabel('Squared error (cross-validation)')
plt.legend(['Train error','Validation error'])
plt.grid()

print("Optimal regularization strenght is: {0}".format(round(opt_lambda, 4)))


#%%
#------- REGULARIZED MULTINOMIAL REGRESSION ---------------------------
# Parameters
lambdas = np.logspace(-5, 5, 20)
cvf = 10
opt_val_err, opt_lambda, mean_w_vs_lambda, train_err_vs_lambda, test_err_vs_lambda = regmultinominal_regression(xIn, y_class, lambdas, cvf=cvf)

# Display the results for the last cross-validation fold
plt.figure(1, figsize=(12,8))
plt.subplot(1,2,1)
plt.semilogx(lambdas,mean_w_vs_lambda.T[:,1:],'.-') # Don't plot the bias term
plt.xlabel('Regularization factor')
plt.ylabel('Mean Coefficient Values')
plt.grid()
plt.legend(attributeNames[1:], loc='best')

plt.subplot(1,2,2)
plt.title('Optimal lambda: 1e{0}'.format(np.round(np.log10(opt_lambda), 4)))
plt.loglog(lambdas,train_err_vs_lambda.T,'b.-',lambdas,test_err_vs_lambda.T,'r.-')
plt.xlabel('Regularization factor')
plt.ylabel('Squared error (cross-validation)')
plt.legend(['Train error','Validation error'])
plt.grid()


#%%
#------- TWO-LEVEL CROSS-VALIDATION: CLASSIFICATION -------------------
def twoLevelCV_classification(xIn, yIn, models, K1, K2, lambdas, hidden_units,
                              CV_ann, n_replicates, max_iter, tolerance):
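    """
    Two-level (nested) cross-validation for the three classification models
    compared in this script: regularized multinomial regression (s == 0),
    an ANN multi-class classifier (s == 1) and a majority-class baseline (s == 2).

    Returns
    -------
    error_test : (K1, len(models)) test misclassification rate per outer fold and model
    outer_lambdas : optimal regularization strength selected in each outer fold
    outer_hidden_units : optimal number of hidden units selected in each outer fold
    r : (K1, 3) per-fold differences in test error, used for the correlated t-test
    estimatedGenError : mean test error per model across the outer folds
    """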

    M = xIn.shape[1]
    CV_outer = model_selection.KFold(n_splits=K1, shuffle=True)
    CV_inner = model_selection.KFold(n_splits=K2, shuffle=True)

    # Initialize variables
    error_train = np.empty((K2, len(models)))
    error_val = np.empty((K2, len(models)))
    error_test = np.empty((K1, len(models)))

    inner_lambdas = np.zeros(K2)  # Inner loop values for optimal lambda
    outer_lambdas = np.zeros(K1)  # Outer loop values for optimal lambda

    inner_hidden_units = np.zeros(K2)  # Inner loop values for optimal number of hidden units
    outer_hidden_units = np.zeros(K1)  # Outer loop values for optimal number of hidden units

    best_models_idx = np.empty((1, len(models)))
    estimatedGenError = np.empty((1, len(models)))

    # r: per-fold differences in test error between model pairs, used for the correlated t-test
    r = np.empty((K1, len(models)))

    # Outer cross-validation loop. Performance Evaluation
    k1 = 0
    for par_index, test_index in CV_outer.split(xIn):

        # extract par and test set for current CV fold
        X_par = xIn[par_index, :]
        y_par = yIn[par_index]
        X_test = xIn[test_index, :]
        y_test = yIn[test_index]

        # Inner cross-validation loop. Model selection and parameter optimization
        k2 = 0
        models_rmr = []
        models_ann = []
        models_baseline = []

        for train_index, val_index in CV_inner.split(X_par):
            print("\nOuter Iteration {0}/{1} -----------------------------".
                  format(k1 + 1, K1))
            print("\nInner Iteration {0}/{1} -----------------------------".
                  format(k2 + 1, K2))

            # Extract train and validation set for the current inner CV fold
            X_train = X_par[train_index, :]
            y_train = y_par[train_index]
            X_val = X_par[val_index, :]
            y_val = y_par[val_index]

            for s, model in enumerate(models):

                if s == 0:  # REGULARIZED MULTINOMIAL LOGISTIC REGRESSION

                    print(
                        "\nInner {}/{} - Regularized Multinomial Regression".
                        format(k2 + 1, K2))

                    # Select the regularization strength on the inner training data only
                    # (selecting on the full data set would leak information into model selection)
                    opt_lambda = regmultinominal_regression(X_train,
                                                            y_train,
                                                            lambdas,
                                                            cvf=10)[1]
                    # Save the values of the optimal regularization strength
                    inner_lambdas[k2] = opt_lambda
                    print("Optimal Lambda = {}".format(np.round(opt_lambda,
                                                                3)))
                    # Fit multinomial logistic regression model
                    modelRMR = lm.LogisticRegression(solver='lbfgs',
                                                     multi_class='multinomial',
                                                     tol=1e-4,
                                                     random_state=1,
                                                     penalty='l2',
                                                     C=1 / opt_lambda,
                                                     max_iter=1000)
                    m = modelRMR.fit(X_train, y_train)

                    # Save the trained model
                    models_rmr.append(m)

                    # Compute error rates: misclassified observations / total observations
                    error_train[k2, s] = np.sum(
                        m.predict(X_train) != y_train) / len(y_train)
                    error_val[k2, s] = np.sum(
                        m.predict(X_val) != y_val) / len(y_val)

                if s == 1:  # ANN MULTI-CLASSIFICATION

                    print("\nInner {}/{} - ANN MultiClassification".format(
                        k2 + 1, K2))

                    opt_n_hidden_units = ann_multiclass_validate(
                        X_train,
                        y_train,
                        3,
                        hidden_units,
                        CV_ann,
                        n_replicates=n_replicates,
                        max_iter=max_iter,
                        tolerance=tolerance)[0]
                    inner_hidden_units[k2] = opt_n_hidden_units

                    # Define the ANN with the selected number of hidden units
                    # (named ann_model to avoid shadowing the loop variable)
                    ann_model = lambda: torch.nn.Sequential(
                        torch.nn.Linear(M, opt_n_hidden_units),
                        torch.nn.Tanhshrink(),
                        torch.nn.Linear(opt_n_hidden_units, 3),
                        torch.nn.Softmax(dim=1))

                    # Training the ANN model with the optimal number of hidden units
                    print("\n\tTraining the model with the optimal number of hidden units")
                    loss_fn = torch.nn.CrossEntropyLoss()
                    net = train_neural_net(
                        ann_model,
                        loss_fn,
                        X=torch.from_numpy(X_train).float(),
                        y=torch.from_numpy(y_train).long().squeeze(),
                        n_replicates=n_replicates,
                        max_iter=max_iter,
                        tolerance=tolerance)[0]
                    # Save the trained model
                    models_ann.append(net)

                    # Determine probability of each class using trained network
                    softmax_logits_train = net(
                        torch.from_numpy(X_train).float())
                    softmax_logits_val = net(torch.from_numpy(X_val).float())

                    # Get the estimated class as the class with highest probability (argmax on softmax_logits)
                    y_train_est = (torch.max(softmax_logits_train,
                                             dim=1)[1]).data.numpy()
                    y_val_est = (torch.max(softmax_logits_val,
                                           dim=1)[1]).data.numpy()

                    # Compute error rates: misclassified observations / total observations
                    e_train = (y_train_est != y_train)
                    e_val = (y_val_est != y_val)

                    error_train[k2, s] = np.sum(e_train) / len(y_train)
                    error_val[k2, s] = np.sum(e_val) / len(y_val)

                if s == 2:  # BASELINE CLASSIFICATION

                    print("\nInner {}/{} - Baseline Classification".format(
                        k2 + 1, K2))
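                    # The baseline counts the training observations in each class
                    # and always predicts the most frequent class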
                    baseline_class = np.array((np.sum(y_train.squeeze() == 0),
                                               np.sum(y_train.squeeze() == 1),
                                               np.sum(y_train.squeeze() == 2)))
                    models_baseline.append(baseline_class)

                    # Compute error rate: misclassified observations / total observations
                    baseline_prediction = np.argmax(baseline_class) * np.ones(
                        y_val.shape[0])
                    error_val[k2, s] = np.sum(
                        (baseline_prediction != y_val)) / len(y_val)

                print("Validation error - Model {0}: {1}".format(
                    s + 1, np.round(error_val[k2, s], 4)))

            k2 += 1

        print("\nSummary Optimal models Outer {}/{}".format(k1 + 1, K1))
        for s, model in enumerate(models):

            # Find the CV index of optimal models
            best_models_idx[0, s] = error_val[:, s].argmin()
            print("\n- The best model {0} was: CV number {1}".format(
                s + 1, int(best_models_idx[0, s] + 1)))

            if s == 0:  # Save the optimal lambda of the optimal model

                # Trace back the model according to its CV fold index
                modelrmr_opt = models_rmr[int(best_models_idx[0, s])]

                # Compute the test error rate for the optimal model
                error_test[k1, s] = np.sum(
                    modelrmr_opt.predict(X_test) != y_test) / len(y_test)
                outer_lambdas[k1] = inner_lambdas[int(best_models_idx[0, s])]

            if s == 1:  # Save the optimal number of hidden units of the optimal model

                # Trace back the model according to its CV fold index
                net_opt = models_ann[int(best_models_idx[0, s])]

                # Compute the test error rate for the optimal ANN model
                softmax_logits_test = net_opt(torch.from_numpy(X_test).float())
                y_test_est = (torch.max(softmax_logits_test,
                                        dim=1)[1]).data.numpy()
                error_test[k1, s] = np.sum(
                    (y_test_est != y_test)) / len(y_test)
                outer_hidden_units[k1] = inner_hidden_units[int(
                    best_models_idx[0, s])]

            if s == 2:  # Baseline: compute the test error

                # Trace back the model according to its CV fold index
                modelbaseline_opt = models_baseline[int(best_models_idx[0, s])]

                # Compute the test error rate for the optimal baseline
                baseline_prediction_opt = np.argmax(
                    modelbaseline_opt) * np.ones(y_test.shape[0])
                error_test[k1, s] = np.sum(
                    baseline_prediction_opt != y_test) / len(y_test)

        # Store the per-fold differences in test error between pairs of models.
        # r is a matrix of K1 rows and 3 columns:
        # column 0: ann vs lrl - column 1: ann vs baseline - column 2: lrl vs baseline  (same notation as the project description)
        r[k1, 0] = error_test[k1, 1] - error_test[k1, 0]
        r[k1, 1] = error_test[k1, 1] - error_test[k1, 2]
        r[k1, 2] = error_test[k1, 0] - error_test[k1, 2]

        k1 += 1
        print("\n")
    estimatedGenError = np.round(np.mean(error_test, axis=0), 4)

    print("\n")
    for s in range(len(models)):
        print("Estimated Generalization Error for Model {0}: {1}".format(
            s + 1, estimatedGenError[s]))

    return error_test, outer_lambdas, outer_hidden_units, r, estimatedGenError
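

#%%
#------- EXAMPLE USAGE (illustrative sketch) ---------------------------
# A minimal, hedged sketch of how twoLevelCV_classification could be called and
# how the returned r matrix could feed a correlated t-test (setup II). The model
# labels, hidden-unit grid, CV_ann value and ANN training settings below are
# illustrative assumptions, not values prescribed elsewhere in this script; it
# also assumes xIn and y_class are defined above. The function is not called
# automatically.
def example_two_level_cv_classification():
    import scipy.stats as st

    models = ['RMR', 'ANN', 'Baseline']  # placeholder model labels (only len/order matter)
    hidden_units = [1, 5, 10]            # assumed candidate hidden-unit grid
    error_test, outer_lambdas, outer_hidden_units, r, est_gen_error = \
        twoLevelCV_classification(xIn, y_class, models, K1=10, K2=10,
                                  lambdas=np.logspace(-5, 5, 20),
                                  hidden_units=hidden_units, CV_ann=3,
                                  n_replicates=1, max_iter=10000, tolerance=1e-6)

    # Correlated t-test (setup II) on the per-fold test-error differences in r
    J = r.shape[0]
    rho = 1 / J  # correlation heuristic for K-fold cross-validation
    comparisons = ['ANN vs RMR', 'ANN vs baseline', 'RMR vs baseline']
    for col, name in enumerate(comparisons):
        rhat = np.mean(r[:, col])
        sigmatilde = np.std(r[:, col], ddof=1) * np.sqrt(1 / J + rho / (1 - rho))
        p_value = 2 * st.t.cdf(-np.abs(rhat) / sigmatilde, df=J - 1)
        ci = st.t.interval(0.95, df=J - 1, loc=rhat, scale=sigmatilde)
        print("{0}: mean diff = {1:.4f}, 95% CI = {2}, p = {3:.4f}".format(
            name, rhat, np.round(ci, 4), p_value))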