    errors.append(error_rate)  # store error rate for current CV fold

    # Make a subplot for current cross validation fold that displays the
    # decision boundary over the original data, "background color" corresponds
    # to the output of the sigmoidal transfer function (i.e. before threshold),
    # white areas are areas of uncertainty, and a deeper red/blue means
    # that the network "is more sure" of a given class.
    plt.figure(decision_boundaries.number)
    plt.subplot(subplot_size_1, subplot_size_2, k + 1)
    plt.title('CV fold {0}'.format(k + 1), color=color_list[k])
    predict = lambda x: net(torch.tensor(x, dtype=torch.float)).data.numpy()
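    # The lambda bridges numpy and torch: visualize_decision_boundary calls
    # predict on numpy grid points, which are converted to a float tensor,
    # passed through the net, and returned as a numpy array of outputs.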
    visualize_decision_boundary(
        predict,
        X,
        y,  # provide data, along with function for prediction
        attributeNames,
        classNames,  # provide information on attribute and class names
        train=train_index,
        test=test_index,  # provide information on partitioning
        show_legend=k == (K - 1))  # only display legend for last plot
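    # Passing the fold's index arrays via train= and test= lets the plot
    # distinguish training points from test points in the current fold.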

    # Display the learning curve for the best net in the current fold
    h, = summaries_axes[0].plot(learning_curve, color=color_list[k])
    h.set_label('CV fold {0}'.format(k + 1))
    summaries_axes[0].set_xlabel('Iterations')
    summaries_axes[0].set_xlim((0, max_iter))
    summaries_axes[0].set_ylabel('Loss')
    summaries_axes[0].set_title('Learning curves')

# Display the error rate across folds
summaries_axes[1].bar(np.arange(1, K + 1), np.squeeze(np.asarray(errors)),
                      color=color_list)
summaries_axes[1].set_xlabel('Fold')
summaries_axes[1].set_xticks(np.arange(1, K + 1))
summaries_axes[1].set_ylabel('Error rate')
summaries_axes[1].set_title('Test misclassification rates')

    # Define the model: one hidden layer with a ReLU transfer function and a
    # softmax output layer (M = number of attributes, C = number of classes,
    # n_hidden_units = size of the hidden layer; all set with the data loading)
    model = lambda: torch.nn.Sequential(
        torch.nn.Linear(M, n_hidden_units),  # M features to hidden units
        torch.nn.ReLU(),  # hidden-layer transfer function
        torch.nn.Linear(n_hidden_units, C),  # hidden units to C class logits
        torch.nn.Softmax(
            dim=1)  # final transfer function, normalisation of logit output
    )
    # Since this is a multiclass problem, binary cross-entropy does not apply;
    # we use the general cross-entropy loss instead:
    loss_fn = torch.nn.CrossEntropyLoss()
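    # Note: torch.nn.CrossEntropyLoss applies log-softmax internally and so
    # nominally expects raw logits; with the explicit Softmax output layer
    # above, the outputs are effectively normalised twice. Training still
    # works, but this is worth knowing when adapting the model.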
    # Train the network:
    net, _, _ = train_neural_net(model,
                                 loss_fn,
                                 X=torch.tensor(X_train, dtype=torch.float),
                                 y=torch.tensor(y_train, dtype=torch.long),
                                 n_replicates=3,
                                 max_iter=10000)
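    # train_neural_net restarts training n_replicates times from fresh random
    # initialisations and keeps the best replicate (lowest loss), reducing
    # the risk of reporting a poor local minimum.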
    # Determine probability of each class using trained network
    softmax_logits = net(torch.tensor(X_test, dtype=torch.float))
    # Get the estimated class as the class with highest probability (argmax on softmax_logits)
    y_test_est = (torch.max(softmax_logits, dim=1)[1]).data.numpy()
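    # torch.max(..., dim=1) returns a (values, indices) tuple, so [1] picks
    # the class indices; torch.argmax(softmax_logits, dim=1) is equivalent.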
    # Determine errors
    e = (y_test_est != y_test)
    print('Number of misclassifications for ANN:\n\t {0} out of {1}'.format(
        sum(e), len(e)))
    error[i] = np.sum(e) / len(y_test)
    predict = lambda x: (torch.max(net(torch.tensor(x, dtype=torch.float)),
                                   dim=1)[1]).data.numpy()
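    # Unlike the earlier predict (which returns the continuous transfer-
    # function output), this predictor returns hard class labels via argmax,
    # so the plot shows discrete class regions rather than shaded confidence.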
    figure(i, figsize=(9, 9))
    visualize_decision_boundary(predict, [X_train, X_test], [y_train, y_test],
                                attributeNames, classNames)
    title('ANN decision boundaries')

show()
print(error)
print('Ran Exercise 8.3.1')