Example #1
def generate_train_test(N, task):
    # generates a training and a test set with the same
    # data distribution from two possibilities: easy dataset with low class
    # overlap, or hard dataset with high class overlap
    #
    # Input:
    #
    # N             - Number of samples per class
    # task          - String, either 'easy' or 'hard'

    if task == 'easy':
        mu1 = [0, 0]
        mu2 = [4, 2]
        sigma1 = [[1, 0], [0, 1]]
        sigma2 = [[1, -1], [-1, 3]]

    if task == 'hard':
        mu1 = [0, 0]
        mu2 = [1, 1]
        sigma1 = [[3, 0], [0, 2]]
        sigma2 = [[2, 0], [0, 3]]

    trainX, trainY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    testX, testY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    return trainX, trainY, testX, testY
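A quick usage sketch (assuming the same seg import as elsewhere in these examples):

trainX, trainY, testX, testY = generate_train_test(100, 'easy')  # 100 samples per class, low class overlap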
Example #2
def generate_train_test(N, task):
    # generates a training and a test set with the same
    # data distribution from two possibilities: easy dataset with low class
    # overlap, or hard dataset with high class overlap
    #
    # Input:
    #
    # N             - Number of samples per class
    # task          - String, either 'easy' or 'hard'

    if task == 'easy':
        #-------------------------------------------------------------------#
        #TODO: modify these values to create an easy train/test dataset pair
        #-------------------------------------------------------------------#
        pass


    if task == 'hard':
        #-------------------------------------------------------------------#
        #TODO: modify these values to create a difficult train/test dataset pair
        #-------------------------------------------------------------------#
        pass

    trainX, trainY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    testX, testY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    return trainX, trainY, testX, testY
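A possible completion of the two TODOs above, reusing the parameter values from Example #1 (one reasonable choice, not the only valid answer):

def generate_train_test(N, task):
    # N    - number of samples per class
    # task - 'easy' (low class overlap) or 'hard' (high class overlap)
    if task == 'easy':
        mu1, mu2 = [0, 0], [4, 2]      # well-separated means
        sigma1 = [[1, 0], [0, 1]]
        sigma2 = [[1, -1], [-1, 3]]
    elif task == 'hard':
        mu1, mu2 = [0, 0], [1, 1]      # nearby means, wide spread
        sigma1 = [[3, 0], [0, 2]]
        sigma2 = [[2, 0], [0, 3]]
    else:
        raise ValueError("task must be 'easy' or 'hard'")

    trainX, trainY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    testX, testY = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    return trainX, trainY, testX, testY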
Example #3
def distance_classification_test():
    train_data, train_labels = seg.generate_gaussian_data(2)
    test_data, test_labels = seg.generate_gaussian_data(1)

    D = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    predicted_labels = train_labels[np.argmin(D, axis=1)]  # label of the nearest training point

    print(predicted_labels)
Example #4
def nn_classifier_test_samples():
    train_data, train_labels = seg.generate_gaussian_data(20)
    test_data, test_labels = seg.generate_gaussian_data(10)
    predicted_labels = seg.nn_classifier(train_data, train_labels, test_data)

    # predicted_labels = predicted_labels.astype(bool)
    # test_labels = test_labels.astype(bool)
    err = util.classification_error(test_labels, predicted_labels)

    print('True labels:\n{}'.format(test_labels))
    print('Predicted labels:\n{}'.format(predicted_labels))
    print('Error:\n{}'.format(err))
Example #5
def covariance_matrix_test():

    N=100
    mu1=[0,0]
    mu2=[0,0]
    sigma1=[[3,1],[1,1]]
    sigma2=[[3,1],[1,1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
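The snippet breaks off after generating the data; the natural continuation, which Examples #12, #16 and #17 below actually implement, is to estimate the mean and covariance from the samples and compare them with the input parameters:

mu_est = np.mean(X, axis=0)            # should be close to mu1 = mu2 = [0, 0]
sigma_est = np.cov(X, rowvar=False)    # should be close to [[3, 1], [1, 1]]
print(mu_est)
print(sigma_est)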
Example #6
def distance_classification_test():
    #------------------------------------------------------------------#
    # TODO: Use the provided code to generate training and testing data
    #  Classify the points in test_data, based on their distances d to the points in train_data
    train_data, train_labels = seg.generate_gaussian_data(2)
    test_data, test_labels = seg.generate_gaussian_data(1)

    d = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    min_index = np.argmin(d, axis=1)
    print(d)

    # Assign to each test point the label of its nearest training point
    predicted_labels = train_labels[min_index]

    return predicted_labels
Example #7
def distance_test():
    #------------------------------------------------------------------#
    # TODO: Generate a Gaussian dataset, with 100 samples per class, and compute the distances.
    #  Use plt.imshow() to visualize the distance matrix as an image.
    X, Y = seg.generate_gaussian_data(
        100)  # Generates 100 samples per Gaussian class
    D = scipy.spatial.distance.cdist(X, X, metric='euclidean')
    plt.imshow(D)
Example #8
def distance_classification_test():
    #------------------------------------------------------------------#
    # Use the provided code to generate training and testing data
    #  Classify the points in test_data, based on their distances d to the points in train_data
    train_data, train_labels = seg.generate_gaussian_data(2)
    test_data, test_labels = seg.generate_gaussian_data(1)
    # train_data=np.array([[1,1],[0,0],[0,1],[1,0]]);
    # train_labels=np.array([[0],[1],[0],[1]])
    util.scatter_data(train_data, train_labels, 0, 1)
    util.scatter_data(test_data, test_labels, 0, 1)

    D = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean')
    min_index = np.argmin(D, axis=1)
    new_labels = train_labels[min_index]

    print(test_data)
    print(new_labels)
    util.scatter_data(test_data, new_labels, 0, 1)
Example #9
def small_samples_distance_test():
    #------------------------------------------------------------------#
    # TODO: Generate a small sample Gaussian dataset X,
    #  create dataset C as per the instructions,
    #  and calculate and plot the distances between the datasets.
    X, Y = seg.generate_gaussian_data(2)
    C = np.array([[0, 0], [1, 1]])
    D = scipy.spatial.distance.cdist(X, C, metric='euclidean')
    plt.imshow(D)

    return X, Y, C, D
Example #10
def distance_test():
    #------------------------------------------------------------------#
    # Generate a Gaussian dataset, with 100 samples per class, and compute the distances.
    #  Use plt.imshow() to visualize the distance matrix as an image.

    X, Y = seg.generate_gaussian_data(100)
    X = np.round(X * 3)  # Stretch and round the numbers
    D = scipy.spatial.distance.cdist(X, X, metric='euclidean')
    plt.imshow(D)

    #------------------------------------------------------------------#
    pass
Example #11
def test_mypca():

    #Generates some toy data in 2D, computes PCA, and plots both datasets
    N = 100
    mu1 = [0, 0]
    mu2 = [2, 0]
    sigma1 = [[2, 1], [1, 1]]
    sigma2 = [[2, 1], [1, 1]]

    XG, YG = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    fig = plt.figure(figsize=(15, 6))
    ax1 = fig.add_subplot(121)

    util.scatter_data(XG, YG, ax=ax1)
    sigma = np.cov(XG, rowvar=False)
    w, v = np.linalg.eig(sigma)
    ax1.plot([0, v[0, 0]], [0, v[1, 0]],
             c='g',
             linewidth=3,
             label='Eigenvector1')
    ax1.plot([0, v[0, 1]], [0, v[1, 1]],
             c='k',
             linewidth=3,
             label='Eigenvector2')
    ax1.set_title('Original data')
    ax_settings(ax1)

    ax2 = fig.add_subplot(122)
    X_pca, v, w, fraction_variance = seg.mypca(XG)
    util.scatter_data(X_pca, YG, ax=ax2)
    sigma2 = np.cov(X_pca, rowvar=False)
    w2, v2 = np.linalg.eig(sigma2)
    ax2.plot([0, v2[0, 0]], [0, v2[1, 0]],
             c='g',
             linewidth=3,
             label='Eigenvector1')
    ax2.plot([0, v2[0, 1]], [0, v2[1, 1]],
             c='k',
             linewidth=3,
             label='Eigenvector2')
    ax2.set_title('My PCA')
    ax_settings(ax2)

    handles, labels = ax2.get_legend_handles_labels()
    plt.figlegend(handles,
                  labels,
                  loc='upper center',
                  bbox_to_anchor=(0.5, -0.05),
                  bbox_transform=plt.gcf().transFigure,
                  ncol=4)

    print(fraction_variance)
Example #12
def covariance_matrix_test():
    N = 100
    mu1 = [0, 0]
    mu2 = [0, 0]
    sigma1 = [[3, 1], [1, 1]]
    sigma2 = [[3, 1], [1, 1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

    sigma = np.cov(X.T)
    mu = np.mean(X, axis=0)

    return X, Y, sigma
Example #13
def small_samples_distance_test():
    #------------------------------------------------------------------#
    # Generate a small sample Gaussian dataset X,
    #  create dataset C as per the instructions,
    #  and calculate and plot the distances between the datasets.

    C = np.array([[0, 0], [1, 1]])
    X, Y = seg.generate_gaussian_data(2)  # Generates 2 samples per Gaussian class
    X = np.round(X * 3)  # Stretch and round the numbers
    D = scipy.spatial.distance.cdist(X, C, metric='euclidean')
    plt.imshow(D)
    #------------------------------------------------------------------#
    return X, Y, C, D
Example #14
def initialize_cluster_centers(N=100, num_clusters=2):
    # Generate 100 samples per Gaussian class
    X, Y = seg.generate_gaussian_data(N)

    # Select num_clusters consecutive rows of X, starting at a random
    # index, and store them in w_initial
    start = np.random.randint(0, X.shape[0] - num_clusters + 1)
    w_initial = X[start:(start + num_clusters), :]

    ax1 = util.scatter_data(X, Y)
    ax1.scatter(w_initial[:, 0], w_initial[:, 1], c='y')
    plt.show()

    return X, w_initial
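If the intent is to pick the initial centers uniformly at random rather than as a consecutive slice, np.random.choice without replacement is a common alternative (a sketch, not part of the original):

idx = np.random.choice(X.shape[0], num_clusters, replace=False)
w_initial = X[idx, :]   # num_clusters distinct random rows of X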
Example #15
def distance_classification_test():
    #------------------------------------------------------------------#
    # TODO: Use the provided code to generate training and testing data
    #  Classify the points in test_data, based on their distances d to the points in train_data
    train_data, trainlabels = seg.generate_gaussian_data(2)
    test_data, testlabels = seg.generate_gaussian_data(1)
    
    D = scipy.spatial.distance.cdist(test_data, train_data, metric='euclidean') #distances between test and training points
    min_index = np.argmin(D, axis=1)
    min_dist = np.zeros((len(min_index),1))
    for i in range(len(min_index)):
        min_dist[i,0] = D.item((i, min_index[i]))
    
    # Sort the training points by their first feature (the analogue of
    # sorting cluster centers by intensity)
    sorted_order = np.argsort(train_data[:,0], axis=0)
    
    # Update the cluster indices based on the sorted order and return results in
    # predicted_labels
    predicted_labels = np.empty(min_index.shape)
    predicted_labels[:] = np.nan
    
    for i in np.arange(len(sorted_order)):
        predicted_labels[min_index==sorted_order[i]] = i
    return predicted_labels
Example #16
def covariance_matrix_test():

    N=100
    mu1=[0,0]
    mu2=[0,0]
    sigma1=[[3,1],[1,1]]
    sigma2=[[3,1],[1,1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    #------------------------------------------------------------------#
    # TODO: Calculate the mean and covariance matrix of the data,
    #  and compare them to the parameters you used as input.
    matrix_mean = np.mean(X, axis=0)
    matrix_cov = np.cov(X, rowvar=False)
    print("Mean: {}".format(matrix_mean))
    print("Covariance matrix: {}".format(matrix_cov))
    return X, Y, matrix_cov
Example #17
def covariance_matrix_test():
    N = 100
    mu1 = [0, 0]
    mu2 = [0, 0]
    sigma1 = [[3, 1], [1, 1]]
    sigma2 = [[3, 1], [1, 1]]
    X, Y = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)
    #------------------------------------------------------------------#
    #  Calculate the mean and covariance matrix of the data,
    #  and compare them to the parameters you used as input.
    mn = np.mean(X, axis=0)
    print(mn)
    co = np.cov(X, rowvar=False)
    print(co)
    print(co.shape)
    return X, Y, co
Example #18
def small_samples_distance_test():
    #------------------------------------------------------------------#
    # TODO: Generate a small sample Gaussian dataset X,
    #  create dataset C as per the instructions,
    #  and calculate and plot the distances between the datasets.
    X, Y = seg.generate_gaussian_data(100) 
    C = np.array([[0,0],[1,1]])
    
    D1 = scipy.spatial.distance.cdist(X, C, metric='euclidean')  # distances between X and C
    D2 = scipy.spatial.distance.cdist(C, X, metric='euclidean')  # same distances, inputs swapped
    
    
    fig = plt.figure(figsize=(10,10))
    ax1  = fig.add_subplot(121)
    ax1.imshow(D1)
    ax2  = fig.add_subplot(122)
    ax2.imshow(D2)  
    # the distance matrix is transposed when the order of the inputs is swapped

    #------------------------------------------------------------------#
    return X, Y, C, D1
Example #19
def distance_test():
    X, Y = seg.generate_gaussian_data()
    D = scipy.spatial.distance.cdist(X, X, metric='euclidean')
    plt.imshow(D)
Example #20
def logistic_regression():
    # dataset preparation
    num_training_samples = 300
    num_validation_samples = 100

    # here we reuse the function from the segmentation practicals
    m1 = [2, 3]
    m2 = [-0, -4]
    s1 = [[8, 7], [7, 8]]
    s2 = [[8, 6], [6, 8]]

    [trainingX,
     trainingY] = seg.generate_gaussian_data(num_training_samples, m1, m2, s1,
                                             s2)
    r, c = trainingX.shape
    print('Training sample shape: {}'.format(trainingX.shape))

    # we need a validation set to monitor for overfitting
    [validationX,
     validationY] = seg.generate_gaussian_data(num_validation_samples, m1, m2,
                                               s1, s2)
    r_val, c_val = validationX.shape
    print('Validation sample shape: {}'.format(validationX.shape))

    validationXones = util.addones(validationX)

    # train a logistic regression model:
    # the learning rate for the gradient descent method
    # (the same as in intensity-based registration)
    mu = 0.001

    # we are actually using stochastic gradient descent
    batch_size = 30

    # initialize the parameters of the model with small random values,
    # we need one parameter for each feature and a bias
    Theta = 0.02 * np.random.rand(c + 1, 1)

    # number of gradient descent iterations
    num_iterations = 300

    # variables to keep the loss and gradient at every iteration
    # (needed for visualization)
    iters = np.arange(num_iterations)
    loss = np.full(iters.shape, np.nan)
    validation_loss = np.full(iters.shape, np.nan)

    # Create base figure
    fig = plt.figure(figsize=(15, 8))
    ax1 = fig.add_subplot(121)
    im1, Xh_ones, num_range_points = util.plot_lr(trainingX, trainingY, Theta,
                                                  ax1)
    seg_util.scatter_data(trainingX, trainingY, ax=ax1)
    ax1.grid()
    ax1.set_xlabel('x_1')
    ax1.set_ylabel('x_2')
    ax1.legend()
    ax1.set_title('Training set')
    text_str1 = '{:.4f};  {:.4f};  {:.4f}'.format(0, 0, 0)
    txt1 = ax1.text(0.3,
                    0.95,
                    text_str1,
                    bbox={
                        'facecolor': 'white',
                        'alpha': 1,
                        'pad': 10
                    },
                    transform=ax1.transAxes)

    ax2 = fig.add_subplot(122)
    ax2.set_xlabel('Iteration')
    ax2.set_ylabel('Loss (average per sample)')
    ax2.set_title('mu = ' + str(mu))
    h1, = ax2.plot(iters, loss, linewidth=2, label='Training loss')
    h2, = ax2.plot(iters,
                   validation_loss,
                   linewidth=2,
                   label='Validation loss')
    ax2.set_ylim(0, 0.7)
    ax2.set_xlim(0, num_iterations)
    ax2.grid()
    ax2.legend()

    text_str2 = 'iter.: {}, loss: {:.3f}, val. loss: {:.3f}'.format(0, 0, 0)
    txt2 = ax2.text(0.3,
                    0.95,
                    text_str2,
                    bbox={
                        'facecolor': 'white',
                        'alpha': 1,
                        'pad': 10
                    },
                    transform=ax2.transAxes)

    # iterate
    for k in np.arange(num_iterations):
        # pick a batch at random
        idx = np.random.randint(r, size=batch_size)

        # the loss function for this particular batch
        loss_fun = lambda Theta: cad.lr_nll(util.addones(trainingX[idx, :]),
                                            trainingY[idx], Theta)

        # gradient descent:
        # here we reuse the code for numerical computation of the gradient
        # of a function
        Theta = Theta - mu * reg.ngradient(loss_fun, Theta)

        # compute the loss for the current model parameters for the
        # training and validation sets
        # note that the loss is divided by the number of samples so
        # that it is comparable across different sample counts
        loss[k] = loss_fun(Theta) / batch_size
        validation_loss[k] = cad.lr_nll(validationXones, validationY,
                                        Theta) / r_val

        # update the visualization
        ph = cad.sigmoid(Xh_ones.dot(Theta)) > 0.5
        decision_map = ph.reshape(num_range_points, num_range_points)
        decision_map_trns = np.flipud(decision_map)
        im1.set_data(decision_map_trns)
        text_str1 = '{:.4f};  {:.4f};  {:.4f}'.format(Theta[0, 0], Theta[1, 0],
                                                      Theta[2, 0])
        txt1.set_text(text_str1)
        h1.set_ydata(loss)
        h2.set_ydata(validation_loss)
        text_str2 = 'iter.={}, loss={:.3f}, val. loss={:.3f} '.format(
            k, loss[k], validation_loss[k])
        txt2.set_text(text_str2)

        display(fig)
        clear_output(wait=True)
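cad.lr_nll itself is not shown in these examples; under the standard logistic-regression negative log-likelihood it would look roughly like this sketch (a hypothetical re-implementation, names assumed):

import numpy as np

def lr_nll_sketch(X_ones, y, Theta):
    # Summed negative log-likelihood of a logistic regression model.
    # X_ones: (N, d+1) data with a column of ones, y: (N, 1) labels in {0, 1},
    # Theta: (d+1, 1) parameters.
    p = 1.0 / (1.0 + np.exp(-X_ones.dot(Theta)))   # sigmoid predictions
    eps = 1e-10                                    # guard against log(0)
    return -np.sum(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))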
Example #21
import segmentation as seg
import matplotlib.pyplot as plt
import cad
import cad_util as util  # assumed: the utility module that provides addones(), used below
from IPython.display import display, clear_output, HTML
import numpy as np

num_training_samples = 300
num_validation_samples = 100

# here we reuse the function from the segmentation practicals
m1 = [2, 3]
m2 = [-0, -4]
s1 = [[8, 7], [7, 8]]
s2 = [[8, 6], [6, 8]]

[trainingX, trainingY] = seg.generate_gaussian_data(num_training_samples, m1,
                                                    m2, s1, s2)
r, c = trainingX.shape
print('Training sample shape: {}'.format(trainingX.shape))

# we need a validation set to monitor for overfitting
[validationX,
 validationY] = seg.generate_gaussian_data(num_validation_samples, m1, m2, s1,
                                           s2)
r_val, c_val = validationX.shape
print('Validation sample shape: {}'.format(validationX.shape))

validationXones = util.addones(validationX)

# train a logistic regression model:
# the learning rate for the gradient descent method
# (the same as in intensity-based registration)
Example #22
def learning_curve():

    # Load training and test data
    train_data, train_labels = seg.generate_gaussian_data(1000)
    test_data, test_labels = seg.generate_gaussian_data(1000)
    [train_data, test_data] = seg.normalize_data(train_data, test_data)

    #Define parameters
    train_sizes = np.array([1, 3, 10, 30, 100, 300])
    k = 1
    num_iter = 3  #How often to repeat the experiment

    #Store errors
    test_error = np.empty([len(train_sizes),num_iter])
    test_error[:] = np.nan
    test_dice = np.empty([len(train_sizes),num_iter])
    test_dice[:] = np.nan

    #------------------------------------------------------------------#
    #TODO: Store errors for training data
    #------------------------------------------------------------------#

    ## Train and test with different values
    for i in np.arange(len(train_sizes)):
        for j in np.arange(num_iter):
            print('train_size = {}, iter = {}'.format(train_sizes[i], j))
            #Subsample training set
            ix = np.random.randint(len(train_data), size=train_sizes[i])
            subset_train_data = train_data[ix,:]
            subset_train_labels = train_labels[ix,:]

            #Train classifier
            neigh = KNeighborsClassifier(n_neighbors=k)
            neigh.fit(subset_train_data, subset_train_labels.ravel())
            #Evaluate
            predicted_test_labels = neigh.predict(test_data)

            test_labels = test_labels.astype(bool)
            predicted_test_labels = predicted_test_labels.astype(bool)

            test_error[i,j] = util.classification_error(test_labels, predicted_test_labels)
            test_dice[i,j] = util.dice_overlap(test_labels, predicted_test_labels)

            #------------------------------------------------------------------#
            #TODO: Predict training labels and evaluate
            #------------------------------------------------------------------#

    ## Display results
    fig = plt.figure(figsize=(8,8))
    ax1 = fig.add_subplot(111)
    x = np.log(train_sizes)
    y_test = np.mean(test_error,1)
    yerr_test = np.std(test_error,1)
    p1 = ax1.errorbar(x, y_test, yerr=yerr_test, label='Test error')

    #------------------------------------------------------------------#
    #TODO: Plot training size
    #------------------------------------------------------------------#

    ax1.set_xlabel('Number of training samples (k)')
    ax1.set_ylabel('error')
    ticks = list(x)
    ax1.set_xticks(ticks)
    tick_lbls = [str(i) for i in train_sizes]
    ax1.set_xticklabels(tick_lbls)
    ax1.grid()
    ax1.legend()
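One way to fill in the three TODOs above, reusing the same util helpers as for the test set (a sketch to be spliced into the function at the marked spots; depending on the shapes generate_gaussian_data returns, a ravel() may be needed):

# before the loops: storage for the training errors
train_error = np.empty([len(train_sizes), num_iter])
train_error[:] = np.nan

# inside the double loop, after evaluating on the test set:
predicted_train_labels = neigh.predict(subset_train_data).astype(bool)
train_error[i, j] = util.classification_error(
    subset_train_labels.astype(bool).ravel(), predicted_train_labels)

# in the plotting section, next to the test-error curve:
y_train = np.mean(train_error, 1)
yerr_train = np.std(train_error, 1)
ax1.errorbar(x, y_train, yerr=yerr_train, label='Training error')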
Example #23
import numpy as np
import segmentation_util as util
import matplotlib.pyplot as plt
import segmentation as seg
from scipy import ndimage, stats
import scipy
from sklearn.neighbors import KNeighborsClassifier
import timeit
from IPython.display import display, clear_output

N = 100
mu1 = [0, 0]
mu2 = [2, 0]
sigma1 = [[2, 1], [1, 1]]
sigma2 = [[2, 1], [1, 1]]

XG, YG = seg.generate_gaussian_data(N, mu1, mu2, sigma1, sigma2)

X_pca, v, w, fraction_variance = seg.mypca(XG)

X = XG
X = X - np.mean(X, axis=0)

#------------------------------------------------------------------#
#TODO: Calculate covariance matrix of X, find eigenvalues and eigenvectors,
# sort them, and rotate X using the eigenvectors
cov_matrix = np.cov(X, rowvar=False)
w, v = np.linalg.eig(cov_matrix)
ix = np.argsort(w)[::-1]   # indices of the eigenvalues, largest first
w = w[ix]
v = v[:, ix]               # reorder the eigenvectors to match
X_rotated = X.dot(v)       # rotate X onto its principal axes
print(w)
print(v)
Example #24
def small_samples_distance_test():
    X, Y = seg.generate_gaussian_data(50)
    C = np.array([[0, 0], [1, 1]])
    D = scipy.spatial.distance.cdist(X, C, metric='euclidean')

    return X, Y, C, D
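As in Examples #9 and #18, the returned distance matrix can then be inspected visually (assuming matplotlib.pyplot is imported as plt, as in the other snippets):

X, Y, C, D = small_samples_distance_test()
plt.imshow(D)   # rows: the samples in X, columns: the two reference points in C
plt.show()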