Example #1
def adaboostLSLC(X, Y, K, nSamples):
    # Adaboost with least squares linear classifier as weak classifier
    # for a D-dim dataset
    #
    # INPUT:
    # X         : the dataset (numSamples x numDim)
    # Y         : labeling    (numSamples x 1)
    # K         : number of weak classifiers (iteration number of Adaboost) (scalar)
    # nSamples  : number of data points drawn by weighted sampling (scalar)
    #
    # OUTPUT:
    # alphaK    : voting weights (K x 1)
    # para      : parameters of least square classifier (K x 3)
    #             For a D-dim dataset each least square classifier has D+1 parameters
    #             w0, w1, w2........wD

    #####Start Subtask 1e#####
    N, D = X.shape
    alphaK = np.zeros(K)
    para = np.ndarray((K, D + 1))
    W = (1 / N * np.ones(N)).reshape(N, 1)

    for k in range(K):
        # Train classifier
        sIdx = choice(N, nSamples, True, W.ravel())

        weight, bias = leastSquares(X[sIdx, :], Y[sIdx])

        para[k, :] = [bias, weight[0], weight[1]]

        # Calculate labeled classification vector
        C = np.sign(X.dot(weight) + bias).reshape(N, 1)

        C = C * Y
        # Indicator of misclassified points (prediction * label < 0 means a mistake)
        mask = (C < 0).ravel()
        # Compute weighted error of classifier, floored to avoid division by zero
        I = np.zeros(N)
        I[mask] = 1
        epsilon = max(float(I.dot(W)), 0.001)

        # Calculate voting weight
        alpha = 0.5 * np.log((1 - epsilon) / epsilon)
        alphaK[k] = alpha

        # Update weights and normalize
        W = W * np.exp((-alpha) * C)
        W = W / sum(W)
    #####End Subtask#####

    return [alphaK, para]
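All of the examples in this listing call leastSquares(X, Y) (and numpy's random choice) without showing them; leastSquares is imported from the leastSquares module in Example #3. The following is only a minimal sketch of such a weak learner, assuming it fits y ≈ X·w + b via the pseudo-inverse; the actual module used in the exercise may differ.

import numpy as np

def leastSquares_sketch(X, Y):
    # Hypothetical stand-in for leastSquares(X, Y): fit Y ~ X*w + b by solving
    # the least-squares problem with the pseudo-inverse of the augmented data matrix.
    N = X.shape[0]
    Xaug = np.hstack((np.ones((N, 1)), X))          # prepend a ones column for the bias
    theta = np.linalg.pinv(Xaug).dot(Y.reshape(N))  # theta = [b, w1, ..., wD]
    return theta[1:], theta[0]                      # (weight, bias), as the examples expect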
Example #2
def adaboostLSLC(X, Y, K, nSamples):
    # Adaboost with least squares linear classifier as weak classifier
    # for a D-dim dataset
    #
    # INPUT:
    # X         : the dataset (numSamples x numDim)
    # Y         : labeling    (numSamples x 1)
    # K         : number of weak classifiers (iteration number of Adaboost) (scalar)
    # nSamples  : number of data points drawn by weighted sampling (scalar)
    #
    # OUTPUT:
    # alphaK    : voting weights (K x 1)
    # para      : parameters of least square classifier (K x 3)
    #             For a D-dim dataset each least square classifier has D+1 parameters
    #             w0, w1, w2........wD
    numSamples, numDim = np.shape(X)
    X = np.reshape(X, (numSamples, numDim))
    Y = np.reshape(Y, (numSamples, 1))
    W = np.ones([numSamples, 1]) * 1 / numSamples
    alphaK = np.zeros([K, 1])
    para = np.zeros([K, numDim + 1])
    for i in range(K):
        my_randomorder = choice(numSamples, nSamples, replace=False)
        training_set = X[my_randomorder]
        training_lables = Y[my_randomorder]
        W_nsample = W[my_randomorder]
        result_lables = np.ones([nSamples, 1])
        weight, bias = leastSquares(training_set, training_lables)
        para[i, 0] = bias
        para[i, 1:] = np.reshape(weight, (1, numDim))
        result_lables[np.dot(training_set, para[i, 1:]) + bias <= 0] = -1
        error_temp = W_nsample[result_lables != training_lables]
        if np.size(error_temp) == 0 or sum(error_temp) == 0:
            alphaK[i, 0] = 100
        else:
            error = sum(error_temp) / sum(W_nsample)
            alphaK[i, 0] = 0.5 * np.log((1 - error) / error)
            W_nsample = W_nsample * (
                result_lables == training_lables) + W_nsample * (
                    result_lables != training_lables) * np.exp(alphaK[i, 0])
        W[my_randomorder] = W_nsample
    #####Insert your code here for subtask 1e#####

    return [alphaK, para]
Example #3
import numpy as np
from linclass import linclass
from leastSquares import leastSquares
from plot_ import plot_

train = {}
test = {}
## Load the data
train.update({'data': np.loadtxt('lc_train_data.dat')})
train.update({'label': np.loadtxt('lc_train_label.dat')})
test.update({'data': np.loadtxt('lc_test_data.dat')})
test.update({'label': np.loadtxt('lc_test_label.dat')})

## Train the classifier using the training dataset
weight, bias = leastSquares(train['data'], train['label'])

## Evaluate the classifier on the training dataset
train.update({'prediction': linclass(weight, bias, train['data'])})

# Print and show the performance of the classifier
train.update(
    {'acc': sum(train['prediction'] == train['label']) / len(train['label'])})
print('Accuracy on train set: {0}'.format(train['acc']))
plot_(train['data'], train['label'], weight, bias, 'Train Set')

# Test the classifier on the test dataset
test.update({'prediction': linclass(weight, bias, test['data'])})

# Print and show the performance of the classifier
test.update(
    {'acc': sum(test['prediction'] == test['label']) / len(test['label'])})
print('Accuracy on test set: {0}'.format(test['acc']))
plot_(test['data'], test['label'], weight, bias, 'Test Set')
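Example #3 relies on linclass(weight, bias, data) from the linclass module, whose body is not part of this listing. A minimal sketch consistent with how it is used above (an assumption, not the actual module) could look like this:

import numpy as np

def linclass_sketch(weight, bias, data):
    # Hypothetical stand-in for linclass: predict +/-1 labels with a linear decision rule.
    labels = np.sign(data.dot(weight) + bias)
    labels[labels == 0] = 1   # map points exactly on the decision boundary to +1
    return labels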
Example #4
def adaboostUSPS(X, Y, K, nSamples, percent):
    # Adaboost with least squares linear classifier as weak classifier on USPS data
    # for a high dimensional dataset
    #
    # INPUT:
    # X         : the dataset (numSamples x numDim)
    # Y         : labeling    (numSamples x 1)
    # K         : number of weak classifiers (scalar)
    # nSamples  : number of data points obtained by weighted sampling (scalar)
    #
    # OUTPUT:
    # alphaK    : voting weights (1 x k)
    # para      : parameters of simple classifier (K x (D+1))
    #             For a D-dim dataset each simple classifier has D+1 parameters
    # error     : training error (1 x k)
    numSamples, numDim = np.shape(X)
    test_num = int(numSamples * percent)
    training_num = numSamples - test_num
    if training_num < nSamples:
        print("Error: 测试集错误")

    X = np.reshape(X, (numSamples, numDim))
    Y = np.reshape(Y, (numSamples, 1))
    # initialization
    W = np.ones([training_num, 1]) * 1 / training_num
    alphaK = np.zeros([K, 1])
    para = np.zeros([K, numDim + 1])
    error = np.zeros([K, 1])
    choice_test = choice(numSamples, test_num, replace=False)
    testX = X[choice_test]
    testY = Y[choice_test]
    restpos = np.setdiff1d(np.arange(numSamples), choice_test)
    trainingX = X[restpos, :]
    trainingY = Y[restpos, :]
    for i in range(K):
        my_randomorder = choice(training_num, nSamples, replace=False)
        training_set = trainingX[my_randomorder]
        training_lables = trainingY[my_randomorder]
        W_nsample = W[my_randomorder]
        result_lables = np.ones([nSamples, 1])
        weight, bias = leastSquares(training_set, training_lables)
        para[i, 0] = bias
        para[i, 1:] = np.reshape(weight, (1, numDim))
        result_lables[np.dot(training_set, para[i, 1:]) + bias <= 0] = -1
        error_temp = W_nsample[result_lables != training_lables]
        if np.size(error_temp) == 0 or sum(error_temp) == 0:
            alphaK[i, 0] = 100
        else:
            eps = sum(error_temp) / sum(W_nsample)
            alphaK[i, 0] = 0.5 * np.log((1 - eps) / eps)
            W_nsample = W_nsample * (
                result_lables == training_lables) + W_nsample * (
                    result_lables != training_lables) * np.exp(alphaK[i, 0])
        W[my_randomorder] = W_nsample
        # estimate the validation of  round i in K
        classLabels, test_result = eval_adaBoost_leastSquare(
            testX, alphaK[:i + 1, :], para[:i + 1, :])
        error[i, 0] = sum((np.reshape(classLabels,
                                      (test_num, 1)) != testY) * 1) / test_num
    error = error[:, 0]
    #####Insert your code here for subtask 1f#####
    # Randomly sample a percentage of the data as the test set
    return [alphaK, para, error]
Example #5
def adaboostLSLC(X, Y, K, nSamples):
    # Adaboost with least squares linear classifier as weak classifier
    # for a D-dim dataset
    #
    # INPUT:
    # X         : the dataset (numSamples x numDim)
    # Y         : labeling    (numSamples x 1)
    # K         : number of weak classifiers (iteration number of Adaboost) (scalar)
    # nSamples  : number of data points drawn by weighted sampling (scalar)
    #
    # OUTPUT:
    # alphaK    : voting weights (K x 1)
    # para      : parameters of least square classifier (K x 3)
    #             For a D-dim dataset each least square classifier has D+1 parameters
    #             w0, w1, w2........wD

    #####Insert your code here for subtask 1e#####
    """
    least squares classifier returns weight w and bias b
    such that y = w*x + b
    """
    N, D = X.shape

    new_dim = D + 1

    # weight is the factor used to regulate the choosing probability of each data point
    w = (np.ones(N) / N).reshape(N, 1)

    alphaK = np.zeros(K)
    para = np.zeros((K, new_dim))

    #X = np.insert(X, 0, 1, axis = 1)

    for k in range(K):

        index = choice(N, nSamples, True, w.ravel())

        sX = X[index, :]
        sY = Y[index]

        weight, bias = leastSquares(sX, sY)

        para[k, :] = [bias, weight[0], weight[1]]

        cX = np.sign(X.dot(weight) + bias).reshape(N, 1)

        cX = cX * Y

        # Indicator of misclassified points (prediction * label < 0 means a mistake)
        mask = (cX < 0).ravel()
        # Compute weighted error of classifier, floored to avoid division by zero
        I = np.zeros(N)
        I[mask] = 1
        ek = max(float(I.dot(w)), 0.001)

        alphaK[k] = 0.5 * np.log((1 - ek) / ek)
        w = w * np.exp((-alphaK[k]) * cX)
        w = w / np.sum(
            w)  # normalization, otherwise, the weights grow exponentially

    return [alphaK, para]
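A short usage sketch for the adaboostLSLC variants above, reusing the lc_train/lc_test files from Example #3 and evaluating the boosted classifier by hand. It assumes the para layout of Examples #1 and #5 (row = [w0, w1, w2] with w0 the bias), that the labels in the .dat files are +/-1, and the values K=20 and nSamples=30 are picked arbitrarily for illustration.

import numpy as np

X_train = np.loadtxt('lc_train_data.dat')
Y_train = np.loadtxt('lc_train_label.dat').reshape(-1, 1)
X_test = np.loadtxt('lc_test_data.dat')
Y_test = np.loadtxt('lc_test_label.dat').reshape(-1, 1)

alphaK, para = adaboostLSLC(X_train, Y_train, K=20, nSamples=30)

# Weighted vote of the K weak classifiers: H(x) = sign(sum_k alpha_k * sign(w_k . x + b_k))
votes = np.zeros((X_test.shape[0], 1))
for k in range(len(alphaK)):
    b, w = para[k, 0], para[k, 1:]
    votes += alphaK[k] * np.sign(X_test.dot(w).reshape(-1, 1) + b)
prediction = np.sign(votes)
print('Boosted accuracy on test set: {0}'.format(np.mean(prediction == Y_test)))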
Example #6
def adaboostUSPS(X, Y, K, nSamples, percent):
    # Adaboost with least squares linear classifier as weak classifier on USPS data
    # for a high dimensional dataset
    #
    # INPUT:
    # X         : the dataset (numSamples x numDim)
    # Y         : labeling    (numSamples x 1)
    # K         : number of weak classifiers (scalar)
    # nSamples  : number of data points obtained by weighted sampling (scalar)
    #
    # OUTPUT:
    # alphaK    : voting weights (1 x k) 
    # para      : parameters of simple classifier (K x (D+1))            
    #             For a D-dim dataset each simple classifier has D+1 parameters
    # error     : training error (1 x k)

    #####Start Subtask 1f#####
    # Randomly sample a percentage of the data as the test set
    N = len(X)
    numb = round(N * percent)
    pos = choice(N, numb, False)
    allpos = range(N)
    restpos = np.setdiff1d(allpos, pos)

    testX = X[pos]
    testY = Y[pos]
    newX = X[restpos]
    newY = Y[restpos]
    X = newX
    Y = newY

    # Initialization
    n = N - numb

    w = (np.ones(n)/n).reshape(n, 1)
    alphaK = np.ones(K)
    error = np.ones(K)
    para = np.ndarray((K, X.shape[1]+1))

    #initialize loop
    for k in range(K):
        
        # weight sampling of data
        index = choice(n, nSamples, True, w.ravel())
        X_sampled = X[index]
        Y_sampled = Y[index]

       # Train the weak classifier Ck
        weights, bias = leastSquares(X_sampled, Y_sampled)

        para[k, :] = np.append(weights, [bias])

        # classify the full training set with the weak classifier
        # (use weights and bias directly; para[k] stores them as [w1, ..., wD, bias])
        cY = np.sign(X.dot(weights) + bias).reshape(n, 1)

        # calculate the weighted error of the weak classifier
        temp = (cY != Y.reshape(n, 1)).astype(int)
        ek = np.sum(w * temp)

        # If the weighted error is very small (< 0.1), treat the data as correctly
        # classified and stop boosting early
        if ek < 1.0e-01:
            alphaK[k] = 1
            break

        # Compute the voting weight for the weak classifier alphak
        alphaK[k] = 0.5 * np.log((1 - ek) / ek)

        # recalculate the weights
        w = w * np.exp(-alphaK[k] * (Y.reshape(n, 1) * cY))  # down-weight correct points, up-weight mistakes
        w = w / sum(w)

        # calculate error for boosted classifier
        classlabels, _ = eval_adaBoost_leastSquare(testX, alphaK[:k+1], para[:k+1])
        classlabels = classlabels.reshape(len(classlabels), 1)

        error[k] = np.mean(classlabels != testY.reshape(-1, 1))

    #####End Subtask#####
    return [alphaK, para, error]
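Examples #4 and #6-#8 score the ensemble with eval_adaBoost_leastSquare, which is defined elsewhere in the exercise. Below is only a minimal sketch of such an evaluator, assuming the para layout [w1, ..., wD, bias] stored by Examples #6-#8 (Examples #2 and #4 store [bias, w1, ..., wD] instead, so the slicing would have to be swapped); the real function and its second return value may differ.

import numpy as np

def eval_adaBoost_leastSquare_sketch(X, alphaK, para):
    # Hypothetical evaluator: weighted vote over the K weak least-squares classifiers.
    # Assumes each row of para is [w1, ..., wD, bias].
    result = np.zeros(X.shape[0])
    for k in range(len(alphaK)):
        w, b = para[k, :-1], para[k, -1]
        result += alphaK[k] * np.sign(X.dot(w) + b)
    classLabels = np.sign(result)
    return classLabels, result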
Example #7
def adaboostUSPS(X, Y, K, nSamples, percent):
    # Adaboost with least squares linear classifier as weak classifier on USPS data
    # for a high dimensional dataset
    #
    # INPUT:
    # X         : the dataset (numSamples x numDim)
    # Y         : labeling    (numSamples x 1)
    # K         : number of weak classifiers (scalar)
    # nSamples  : number of data points obtained by weighted sampling (scalar)
    #
    # OUTPUT:
    # alphaK    : voting weights (1 x k)
    # para      : parameters of simple classifier (K x (D+1))
    #             For a D-dim dataset each simple classifier has D+1 parameters
    # error     : training error (1 x k)

    N, D = X.shape

    #this number will be used for the number of  training data
    index_number = round(percent * N)

    #this indicates the number of validation dataset
    test_number = N - index_number

    index = choice(N, index_number, False)
    allpos = range(N)
    restpos = np.setdiff1d(allpos, index)

    trainingX = X[index, :]
    trainingY = Y[index]

    testX = X[restpos, :]
    testY = Y[restpos]

    w = (np.ones(index_number)/index_number).reshape(index_number, 1)
    error = np.zeros(K)
    para = np.zeros((K, D+1))
    alphaK = np.zeros(K)

    for k in range(K):

        #choose the index randomly
        idx = choice(index_number, nSamples, True, w.ravel())
        #samples from the training data according to the idx
        sX = trainingX[idx, :]
        #sampled target label according to the idx
        sY = trainingY[idx]

        weight, bias = leastSquares(sX, sY)

        para[k, :] = np.append(weight, [bias])

        # classify the whole training split with the weak classifier
        # (use weight and bias directly; para[k] stores them as [w1, ..., wD, bias])
        cX = np.sign(trainingX.dot(weight) + bias).reshape(-1, 1)

        # weighted error of the weak classifier
        cY = (cX != trainingY.reshape(-1, 1)).astype(int)
        ek = np.sum(cY * w)

        if ek < 1.0e-01:
            alphaK[k] = 1
            break

        alphaK[k] = 0.5 * np.log((1 - ek) / ek)
        w = w * np.exp((-alphaK[k]) * (trainingY*(cX.reshape(len(cX),1))))
        w = w / np.sum(w)  # normalization, otherwise, the weights grow exponentially

        classlabel, _ = eval_adaBoost_leastSquare(testX, alphaK[:k+1], para[:k+1])

        classlabel = classlabel.reshape(len(classlabel), 1)

        error[k] = np.mean(classlabel != testY.reshape(-1, 1))

    return [alphaK, para, error]
Example #8
def adaboostUSPS(X, Y, K, nSamples, percent):
    # Adaboost with least squares linear classifier as weak classifier on USPS data
    # for a high dimensional dataset
    #
    # INPUT:
    # X         : the dataset (numSamples x numDim)
    # Y         : labeling    (numSamples x 1)
    # K         : number of weak classifiers (scalar)
    # nSamples  : number of data points obtained by weighted sampling (scalar)
    #
    # OUTPUT:
    # alphaK    : voting weights (1 x k)
    # para      : parameters of simple classifier (K x (D+1))
    #             For a D-dim dataset each simple classifier has D+1 parameters
    # error     : training error (1 x k)

    # Use the least-squares based AdaBoost on real data, i.e. the USPS data
    # (provided in usps.mat). The dataset consists of a matrix X and a label
    # vector Y. Each row of the matrix X is an image of size 20 x 14. The first
    # 5000 rows of X contain the images of the digit 2, and the rest contains
    # the images of the digit 9. Perform a random split of the 10000 data points
    # into two equally sized subsets, one for training and one for validation.
    # Run this at least three times and plot the cross validation error
    # estimates vs. the number k of iterations.

    # Randomly sample a percentage of the data as the test set
    N = len(X)
    numb = round(N * percent)
    pos = choice(N, numb, False)
    allpos = range(N)
    restpos = np.setdiff1d(allpos, pos)

    testX = X[pos]
    testY = Y[pos]
    newX = X[restpos]
    newY = Y[restpos]
    X = newX
    Y = newY

    # Initialization
    n = N - numb

    w = (np.ones(n) / n).reshape(n, 1)
    alphaK = np.ones(K)
    error = np.ones(K)
    para = np.ndarray((K, X.shape[1] + 1))

    #initialize loop
    for k in range(K):

        # weight sampling of data
        #print(w.shape)
        index = choice(n, nSamples, True, w.ravel())
        X_sampled = X[index]
        Y_sampled = Y[index]

        # Train the weak classifier Ck
        weights, bias = leastSquares(X_sampled, Y_sampled)

        para[k, :] = np.append(weights, [bias])

        # classify
        cY = np.sign(
            np.append(np.ones(n).reshape(n, 1), X, axis=1).dot(para[k].T)).T

        # calculate error for given classifier
        temp = np.where([Y[i] != cY[i] for i in range(n)], 1, 0).reshape(n, 1)
        ek = np.sum(w * temp)

        # If the weighted error is very small (< 0.1), treat the data as correctly
        # classified and stop boosting early
        if ek < 1.0e-01:
            alphaK[k] = 1
            break

        # Compute the voting weight for the weak classifier alphak
        alphaK[k] = 0.5 * np.log((1 - ek) / ek)

        # recalculate the weights
        w = w * np.exp(-alphaK[k] * (Y.reshape(n, 1) * cY))  # down-weight correct points, up-weight mistakes
        w = w / sum(w)

        # calculate error for boosted classifier
        classlabels, _ = eval_adaBoost_leastSquare(testX, alphaK[:k + 1],
                                                   para[:k + 1])
        classlabels = classlabels.reshape(len(classlabels), 1)

        error[k] = np.mean(classlabels != testY.reshape(-1, 1))

    return [alphaK, para, error]
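A possible driver for the USPS experiment described in the comment of Example #8: load usps.mat, run the boosting with a 50/50 split at least three times, and plot the validation error against the number of weak classifiers k. The key names 'X' and 'Y' inside the .mat file and the values of K and nSamples are assumptions for illustration, not taken from the exercise files.

import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat

usps = loadmat('usps.mat')
X = usps['X']                    # assumed key: 10000 x 280 matrix of 20 x 14 images
Y = usps['Y'].reshape(-1, 1)     # assumed key: +/-1 labels for the digits 2 and 9

K = 40          # number of weak classifiers (assumed value)
nSamples = 100  # size of each weighted sample (assumed value)

plt.figure()
for run in range(3):  # at least three random splits, as the assignment text asks
    alphaK, para, error = adaboostUSPS(X, Y, K, nSamples, percent=0.5)
    plt.plot(range(1, K + 1), error, label='run {0}'.format(run + 1))
plt.xlabel('number of weak classifiers k')
plt.ylabel('validation error')
plt.legend()
plt.show()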