def cv_test():
    """
        Tests the cross validation. Needs a working krr class!

        Fits KRR on a noisy sinc sample twice and draws both fits side by
        side: the left panel searches a grid of regularization candidates,
        the right panel passes ``[0]`` as the only regularization candidate
        (presumably the switch for efficient LOOCV inside ``imp.cv``, as the
        panel title suggests -- confirm against ``imp.cv``).

        The selected kernel parameter and regularization are printed for
        each panel. Nothing is returned.
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]

    # One estimator instance is shared by both cross-validation runs.
    krr = imp.krr()

    # (regularization candidates, subplot title) for the left/right panel.
    panels = [
        (np.logspace(-2, 2, 10), "CV with fixed regularization"),
        ([0], "CV with efficient LOOCV"),
    ]

    pl.figure()
    for position, (regularization, title) in enumerate(panels, 1):
        pl.subplot(1, 2, position)
        params = ["kernel", ["gaussian"],
                  "kernelparam", np.logspace(-2, 2, 10),
                  "regularization", regularization]
        cvkrr = imp.cv(Xtr, Ytr, krr, params,
                       loss_function=squared_error_loss, nrepetitions=2)
        cvkrr.predict(Xte)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(cvkrr.kernelparameter)
        print(cvkrr.regularization)

        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, cvkrr.ypred.T)
        pl.title(title)
    print("\n(time the test takes on my notebook: approx. 6 seconds)")
示例#2
0
 def fold_cross_validation(self):
     """
     Exercise 3c: choose KRR hyperparameters by five-fold cross validation.

     Draws 2500 random indices out of ``range(0, 5000)``, selects those rows
     of ``self.Xtr`` / ``self.Ytr`` and cross-validates a gaussian kernel
     whose width candidates are the 0.1/0.5/0.9 quantiles of the pairwise
     distances of the subset. The selected model is stored in ``self.cvkrr``,
     evaluated on ``self.Xte`` / ``self.Yte`` and the results are printed.

     Run split_data beforehand so the train/test attributes exist.
     """
     # NOTE(review): the index range is hard-coded to 5000 -- presumably the
     # size of self.Xtr; confirm against split_data.
     subset = random.sample(range(0, 5000), 2500)
     x_subset = self.Xtr[subset]
     y_subset = self.Ytr[subset]

     # Pairwise euclidean distances between all subset rows.
     differences = x_subset[np.newaxis, :] - x_subset[:, np.newaxis]
     distances = np.linalg.norm(differences, axis=2)

     search_space = {
         'kernel': ['gaussian'],
         'kernelparameter': np.quantile(distances, [0.1, 0.5, 0.9]),
         'regularization': np.logspace(-7, 0, 10)
     }
     self.cvkrr = imp.cv(x_subset, y_subset, imp.krr, search_space,
                         loss_function=mean_absolute_error, nfolds=5)

     test_prediction = self.cvkrr.predict(self.Xte)
     test_error = mean_absolute_error(self.Yte, test_prediction)

     report = [
         "The mean absolute error is: {} ".format(round(test_error, 2)),
         "The best regularzation parameter C is: {}".format(
             self.cvkrr.regularization),
         "The best kernelparameter sigma is: {}".format(
             self.cvkrr.kernelparameter),
     ]
     for line in report:
         print(line)
示例#3
0
    def test_cv(self):
        """Plot KRR fits on noisy sinc data for two cross-validated searches.

        The left panel searches a grid of gaussian kernel parameters and
        regularization values; the right panel passes ``[0]`` as the only
        regularization candidate. The parameters selected by each search are
        printed and the figure is shown at the end.
        """
        x_train, y_train = noisysincfunction(100, 0.1)
        x_eval = np.arange(-np.pi, np.pi, 0.01).reshape(-1, 1)

        def fit_and_predict(search_space):
            # Cross-validate over the grid, then predict on the dense x grid.
            model = imp.cv(x_train, y_train, imp.krr, search_space,
                           loss_function=squared_error_loss, nrepetitions=2)
            return model, model.predict(x_eval)

        pl.figure()

        # Left panel: grid over kernel parameter and regularization.
        pl.subplot(1, 2, 1)
        model, prediction = fit_and_predict({
            'kernel': ['gaussian'],
            'kernelparameter': np.logspace(-4, 4, 20),
            'regularization': np.logspace(-2, 2, 10)
        })
        print('Regularization range: 10**-4 .. 10**4')
        print('Gaussian kernel parameter: ', model.kernelparameter)
        print('Regularization paramter: ', model.regularization)
        pl.plot(x_train, y_train)
        pl.plot(x_eval, prediction)

        # Right panel: regularization candidate list is just [0].
        pl.subplot(1, 2, 2)
        model, prediction = fit_and_predict({
            'kernel': ['gaussian'],
            'kernelparameter': np.logspace(-2, 2, 10),
            'regularization': [0]
        })
        print('Regularization via efficient leave on out')
        print('Kernel parameter: ', model.kernelparameter)
        print('Regularization paramter: ', model.regularization)
        pl.plot(x_train, y_train)
        pl.plot(x_eval, prediction)

        pl.show()
示例#4
0
def cv_test():
    '''
        Tests the cross validation. Needs a working krr class!

        Draws two panels of KRR fits on noisy sinc data: the first uses a
        grid of regularization candidates, the second only the candidate 0.
    '''
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]

    # The same estimator instance is handed to both cross-validation runs.
    estimator = imp.krr()

    pl.figure()
    regularization_grids = (np.logspace(-2, 2, 10), [0])
    for panel, regularization in enumerate(regularization_grids, 1):
        pl.subplot(1, 2, panel)
        search = ['kernel', ['gaussian']]
        search += ['kernelparam', np.logspace(-2, 2, 10)]
        search += ['regularization', regularization]
        fitted = imp.cv(Xtr, Ytr, estimator, search,
                        loss_function=squared_error_loss, nrepetitions=2)
        fitted.predict(Xte)

        # Training sample first, then the prediction on the dense grid.
        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, fitted.ypred.T)
def krr_app(reg=False):
    ''' Applies krr to all data sets and returns the best result per data set.

        For every data set the training/test files are read from
        ``ps3_datasets/`` and three kernels (gaussian, polynomial, linear)
        are cross-validated with ``[0]`` as the only regularization
        candidate. The result with the smallest cv loss is kept.

        Input:
            reg : not used by this function; kept so existing callers that
                pass it keep working.

        Returns:
            dict mapping the data set name to a dict with the keys 'cvloss',
            'kernel', 'kernelparameter', 'regularization' and 'ypred' of the
            best kernel. (Previously the dict was built and then dropped.)
    '''
    datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']
    path = 'ps3_datasets/'
    results = dict()
    for data in datasets:
        prefix = path + 'U04_' + data
        Xtr = np.loadtxt(prefix + '-xtrain.dat')
        Ytr = np.loadtxt(prefix + '-ytrain.dat')
        Xte = np.loadtxt(prefix + '-xtest.dat')
        d, _ = Xtr.shape
        # Single formatted argument: identical output on Python 2 and 3
        # (the double space reproduces the old `print data, ' was ...'`).
        print('%s  was loaded with %d dimensions' % (data, d))

        # One estimator instance is reused for all kernels of a data set.
        krr = imp.krr()
        kernels = ['gaussian', 'polynomial', 'linear']
        kernel_params = [np.logspace(-2, 2, 10), np.arange(10), np.arange(10)]

        candidates = []
        for kernel, kernel_param in zip(kernels, kernel_params):
            params = ['kernel', [kernel], 'kernelparam', kernel_param,
                      'regularization', [0]]
            cvkrr = imp.cv(Xtr, Ytr.reshape(1, -1), krr, params,
                           loss_function=squared_error_loss, nrepetitions=2)
            cvkrr.predict(Xte)
            # Snapshot the attributes right away: the estimator object is
            # shared, so a later kernel may overwrite them.
            candidates.append({
                'cvloss': cvkrr.cvloss,
                'kernel': kernel,
                'kernelparameter': cvkrr.kernelparameter,
                'regularization': cvkrr.regularization,
                'ypred': cvkrr.ypred,
            })
            print('finished %s kernel on %s' % (kernel, data))

        CVloss = np.zeros(len(candidates))
        for i, candidate in enumerate(candidates):
            CVloss[i] = candidate['cvloss']
        print('CVloss for dataset %s %s' % (data, CVloss))
        results[data] = candidates[int(np.argmin(CVloss))]
    return results
def apply_krr(reg=False):
    ''' This function applies the krr to the provided data set in order
        to find a good classification. The results are stored in a dictionary
        which is at the end pickled.

        For every data set found in the data directory a linear, a polynomial
        and a gaussian kernel are cross-validated; the model with the smallest
        cv loss is used to predict the test set. A second cross validation
        with ``roc_fun`` as loss yields the ROC curve, which is plotted
        together with its area under the curve.

        Usage:
            It is important to adapt the path of the datasets.

        Input:
            reg : boolean variable indicating whether the regularization constant
                shall be estimated by LOOCV or drawn from a provided range. True
                means that the provided range is used and False means that the
                LOOCV will be used. Any truthy value is treated like True.

        Output:
            Nothing is returned. The result dictionary is pickled to
            'results.p' (reg truthy) or 'resultsLOOCV.p' (reg falsy).

        Author:
            Till Rohrmann, [email protected]
    '''

    # IMPORTANT: Adapt path to where the data sets have been stored
    path = 'ps3_datasets'
    test_suffix = 'xtest'
    train_x_suffix = 'xtrain'
    train_y_suffix = 'ytrain'
    files = [f for f in os.listdir(path)
             if os.path.isfile(os.path.join(path, f))]

    # Collect the data set names from files called U04_<name>-<suffix>.dat.
    dataset_names = set()
    for filename in files:
        m = re.search(r'U04_([^\.]*)\.dat', filename)
        if m is None:
            continue
        name, dash, _ = m.group(1).rpartition('-')
        # Files without a '-<suffix>' part do not belong to a data set.
        if dash:
            dataset_names.add(name)

    result = {}

    if reg:
        regularization = np.logspace(-2, 2, 10)
    else:
        regularization = [0]

    gaussian_kernel_params = np.logspace(-2, 2, 10)
    polynomial_kernel_params = np.arange(1, 10)

    nrep = 5
    nfs = 10

    # Kernel candidates in the order they are tried: (name, parameter grid).
    kernel_candidates = (
        ('linear', [0]),
        ('polynomial', polynomial_kernel_params),
        ('gaussian', gaussian_kernel_params),
    )

    # Sorted so that the processing (and figure) order is reproducible.
    for dataset in sorted(dataset_names):
        print('Dataset: ' + dataset)
        # training phase
        file_prefix = 'U04_' + dataset + '-'
        X = np.loadtxt(os.path.join(path, file_prefix + train_x_suffix + '.dat'),
                       dtype=float)
        Y = np.loadtxt(os.path.join(path, file_prefix + train_y_suffix + '.dat'),
                       dtype=float)[np.newaxis, :]
        testX = np.loadtxt(os.path.join(path, file_prefix + test_suffix + '.dat'),
                           dtype=float)

        print('Shape: ' + str(X.shape))

        # Cross-validate one fresh estimator per kernel and time each run.
        # imp.cv is called for its effect on the estimator passed in; its
        # return value is not used here.
        models = []
        timings = []
        for kernel_name, kernel_params in kernel_candidates:
            start_time = time.time()
            model = imp.krr()
            cv_params = ['kernel', [kernel_name], 'kernelparam', kernel_params,
                         'regularization', regularization]
            imp.cv(X, Y, model, cv_params, nrepetitions=nrep, nfolds=nfs)
            timings.append(time.time() - start_time)
            models.append(model)

        # Keep the model with the smallest cv loss and its run time.
        best = int(np.argmin([model.cvloss for model in models]))
        krr = models[best]
        min_time = timings[best]

        krr.predict(testX)

        dictionary = dict()
        dictionary['kernel'] = krr.kernel
        dictionary['kernelparameter'] = krr.kernelparameter
        dictionary['regularization'] = krr.regularization
        dictionary['cvloss'] = krr.cvloss
        dictionary['ypred'] = krr.ypred

        result[dataset] = dictionary

        # plot ROC curve and calculate AUC
        params = ['kernel', [krr.kernel], 'kernelparam', [krr.kernelparameter],
                  'regularization', [krr.regularization]]
        roc_krr = imp.krr()
        imp.cv(X, Y, roc_krr, params, loss_function=roc_fun,
               nrepetitions=nrep, nfolds=nfs)

        true_positive_rate = roc_krr.cvloss[0]
        false_positive_rate = roc_krr.cvloss[1]

        # Trapezoidal rule for integration: interval width times the mean
        # of the two end point heights.
        xdiff = false_positive_rate[1:] - false_positive_rate[:-1]
        ysum = (true_positive_rate[1:] + true_positive_rate[:-1]) / 2
        AUC = np.dot(ysum, xdiff)

        pl.figure()
        pl.plot(false_positive_rate, true_positive_rate)
        pl.ylabel('True positive rate')
        pl.xlabel('False positive rate')

        # Same truthiness test as for the regularization grid and the output
        # file name, so title and file can never disagree.
        title_prefix = '' if reg else 'LOOCV '
        pl.title(title_prefix + 'ROC-Curve Dataset:' + dataset +
                 ' AUC=' + ('%.3f' % AUC) +
                 ' cvloss:' + ('%.3f' % dictionary['cvloss']) +
                 ' time:' + ('%.1f' % min_time) + 's' +
                 '\n Kernel:' + dictionary['kernel'] +
                 ' parameter:' + ('%.3f' % dictionary['kernelparameter']) +
                 ' regularization:' + ('%.3f' % dictionary['regularization']))

        print('Dataset:' + dataset + ' kernel:' + dictionary['kernel'] +
              ' cvloss:' + str(dictionary['cvloss']) + ' AUC:' + str(AUC) +
              ' time:' + ('%.1f' % min_time))

    if reg:
        filename = 'results.p'
    else:
        filename = 'resultsLOOCV.p'

    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as out_file:
        pickle.dump(result, out_file)
示例#7
0
def _a4_load_dat(data_dir, filename):
    """Load one ``.dat`` file from *data_dir* with ``np.loadtxt``."""
    path_to_data = os.path.join(data_dir, filename)
    if not os.path.exists(path_to_data):
        raise FileNotFoundError(
            "The path does not exist: {}".format(path_to_data))
    return np.loadtxt(path_to_data)


def _a4_print_shapes(*labelled_arrays):
    """Print the shape of every ``(label, array)`` pair under its label."""
    for label, array in labelled_arrays:
        print(label + '\n', array.shape)


def _a4_store_result(entry, cvkrr, y_pred):
    """Copy the selected model's parameters and prediction into *entry*."""
    entry['y_pred'] = y_pred
    entry['kernel'] = cvkrr.kernel
    entry['kernelparameter'] = cvkrr.kernelparameter
    entry['regularization'] = cvkrr.regularization
    entry['cvloss'] = cvkrr.cvloss


def assignment_4():
    """Run assignment 4 (parts b, c and d) on the five benchmark data sets.

    4b: cross-validates two parameter grids per data set (linear/polynomial,
        then linear/gaussian) with ``[0]`` as the only regularization
        candidate, keeps the result with the smaller cv loss and pickles the
        result dictionary to ``results.p``.
    4c: re-runs the cross validation of the selected model with ``roc_fun``
        as loss and plots the resulting curve per data set.
    4d: cross-validates the regularization of the selected kernel over
        ``np.logspace(-5, 5, 11)`` and collects the cv losses.

    The data files are read from ``<current working directory>/data``.

    Returns:
        list with the 4d cv loss of each data set, in the order banana,
        diabetis, flare-solar, image, ringnorm.

    Raises:
        FileNotFoundError: if one of the expected data files is missing.
    """
    data_dir = os.path.join(os.getcwd(), 'data')

    all_datasets = ['banana', 'diabetis', 'flare-solar', 'image', 'ringnorm']

    folds = [10, 9, 9, 10, 10]

    # 4b - load data
    xtrain_data = []
    ytrain_data = []
    xtest_data = []
    ytest_data = []
    buckets = (('xtrain', xtrain_data), ('ytrain', ytrain_data),
               ('xtest', xtest_data), ('ytest', ytest_data))
    for dataset in all_datasets:
        for suffix, bucket in buckets:
            filename = 'U04_{}-{}.dat'.format(dataset, suffix)
            bucket.append(_a4_load_dat(data_dir, filename))

    # 4b - GENERATE DICTIONARY RESULTS FOR EACH DATASET

    # Placeholder entries; every field is overwritten by the first grid.
    results = {
        dataset: {
            'cvloss': [0],
            'kernel': [0],
            'kernelparameter': [0],
            'regularization': [0],
            'y_pred': [0]
        }
        for dataset in all_datasets
    }

    # Known bug: "setting an array element with a sequence" is raised if the
    # number of training points is not evenly divisible by nfolds. Fixing it
    # would mean padding the unequal folds, and the padded indices would
    # either fail or reuse data points. So nfolds is chosen per data set such
    # that len(xtrain) % nfolds == 0 (see `folds` above).
    param_grids = [
        {
            'kernel': ['linear', 'polynomial'],
            'kernelparameter': [1, 2, 3],
            'regularization': [0]
        },
        {
            'kernel': ['linear', 'gaussian'],
            'kernelparameter': [0.1, 0.5, 0.9],
            'regularization': [0]
        },
    ]

    for grid_index, params in enumerate(param_grids):
        for (xtrain, ytrain, xtest, ytest, dataset,
             fold) in zip(xtrain_data, ytrain_data, xtest_data, ytest_data,
                          all_datasets, folds):
            _a4_print_shapes(('Xtrain', xtrain), ('ytrain', ytrain),
                             ('Xtest', xtest), ('ytest', ytest))

            cvkrr = imp.cv(xtrain.T,
                           ytrain,
                           imp.krr,
                           params,
                           loss_function=zero_one_loss,
                           nfolds=fold,
                           nrepetitions=5)
            y_pred = cvkrr.predict(xtest.T)

            # The first grid always fills the entry; the second grid only
            # replaces it when its cv loss is strictly smaller.
            if grid_index == 0 or results[dataset]['cvloss'] > cvkrr.cvloss:
                _a4_store_result(results[dataset], cvkrr, y_pred)

    # Remove the kernelparameter from flare-solar's linear solution. Guarded
    # so that the parameter of a non-linear winner is never wiped.
    if results['flare-solar']['kernel'] == 'linear':
        results['flare-solar']['kernelparameter'] = None

    # Store the results; the context manager closes the file even on error.
    with open('results.p', 'wb') as results_file:
        pickle.dump(results, results_file)

    # 4C - PLOT ROC CURVES FOR VARYING BIASES

    for (xtrain, ytrain, dataset, fold) in zip(xtrain_data, ytrain_data,
                                               all_datasets, folds):
        _a4_print_shapes(('Xtrain', xtrain), ('ytrain', ytrain))

        params = {
            'kernel': [str(results[dataset]['kernel'])],
            'kernelparameter': [results[dataset]['kernelparameter']],
            'regularization': [results[dataset]['regularization']]
        }
        cvkrr = imp.cv(xtrain.T,
                       ytrain,
                       imp.krr,
                       params,
                       loss_function=roc_fun,
                       nfolds=fold,
                       nrepetitions=4)

        loss = cvkrr.cvloss

        # Close the curve: prepend the point (1, 1) and append (0, 0).
        fpr = np.insert(np.append(loss[0], 0), 0, 1)
        tpr = np.insert(np.append(loss[1], 0), 0, 1)

        # plot ROC fun
        plt.figure(figsize=(4.5, 4.5))
        plt.plot(fpr, tpr, label='KRR algorithm')
        plt.plot(np.arange(0, 1.1, 0.1),
                 np.arange(0, 1.1, 0.1),
                 label='Random guess')
        plt.ylabel('True Positive Rate (TPR)')
        plt.xlabel('False Positive Rate (FPR)')
        plt.title('%s dataset\'s average ROC curve from a varying bias' %
                  dataset)
        plt.legend()

    # 4.d - COMPARE LOOCV TO CV REGULARISATION
    cv_regularisation = []

    for (xtrain, ytrain, xtest, ytest, dataset,
         fold) in zip(xtrain_data, ytrain_data, xtest_data, ytest_data,
                      all_datasets, folds):
        _a4_print_shapes(('Xtrain', xtrain), ('ytrain', ytrain),
                         ('Xtest', xtest), ('ytest', ytest))

        params = {
            'kernel': [results[dataset]['kernel']],
            'kernelparameter': [results[dataset]['kernelparameter']],
            'regularization': np.logspace(-5, 5, 11)
        }

        cvkrr = imp.cv(xtrain.T,
                       ytrain,
                       imp.krr,
                       params,
                       loss_function=zero_one_loss,
                       nfolds=fold,
                       nrepetitions=5)
        # The prediction itself is not used in 4d; the call is kept in case
        # predict has effects beyond its return value.
        cvkrr.predict(xtest.T)

        cv_regularisation.append(cvkrr.cvloss)

    return cv_regularisation
示例#8
0
    def plot_energies_for_1000(self):
        """
        Exercise 3e: show an under-, a well- and an overfit for 1000 training
        samples.

        Splits ``self.X`` / ``self.y`` with a fixed-seed shuffle into 1000
        training rows and the remaining test rows, selects gaussian kernel
        parameters by five-fold cross validation, prints the outcome and then
        fits three models (linear, gaussian with the selected parameters,
        gaussian with kernel parameter 1 and regularization 0). For each
        model, predicted values are plotted against true values.
        """
        # split data
        # Random Partitioning
        # Positions 0 .. len(self.X) - 1 as floats; cast to int when indexing.
        X_pos = np.linspace(0, len(self.X) - 1, len(self.X))
        # Fixed seed 4 so that the shuffle is the same on every run.
        random.Random(4).shuffle(X_pos)
        Xtr1000 = self.X[X_pos[:1000].astype('int')]
        Xte1000 = self.X[X_pos[1000:].astype('int')]
        Ytr1000 = self.y[X_pos[:1000].astype('int')]
        Yte1000 = self.y[X_pos[1000:].astype('int')]

        # get parameter for good fit
        # Fivefold Cross validation

        # Pairwise euclidean distances between the training rows; their
        # 0.1/0.5/0.9 quantiles are the kernel parameter candidates.
        D1000 = np.linalg.norm(Xtr1000[None, :] - Xtr1000[:, None], axis=2)
        quantiles = np.quantile(D1000, [0.1, 0.5, 0.9])
        params = {
            'kernel': ['gaussian'],
            'kernelparameter': quantiles,
            'regularization': np.logspace(-7, 0, 10)
        }
        cvkrr = imp.cv(Xtr1000,
                       Ytr1000,
                       imp.krr,
                       params,
                       loss_function=mean_absolute_error,
                       nfolds=5)
        y_pred1000 = cvkrr.predict(Xte1000)
        MAE = mean_absolute_error(Yte1000, y_pred1000)

        # result of CV
        print("The mean absolute error is: {} ".format(round(MAE, 2)))
        print("The best regularzation parameter C is: {}".format(
            cvkrr.regularization))
        print("The best kernelparameter sigma is: {}".format(
            cvkrr.kernelparameter))
        print("The cvloss: {}".format(cvkrr.cvloss))

        # define parameters for training
        # Index i of each list describes model i: linear with the selected
        # regularization, gaussian with the selected parameters, gaussian
        # with kernel parameter 1 and regularization 0.
        params = {
            'kernel': ['linear', 'gaussian', 'gaussian'],
            'kernelparameter': [False, cvkrr.kernelparameter, 1],
            'regularization': [cvkrr.regularization, cvkrr.regularization, 0]
        }

        # plot
        plt.figure(figsize=(10, 6))
        for i in [0, 1, 2]:
            model = imp.krr(params['kernel'][i], params['kernelparameter'][i],
                            params['regularization'][i])
            model.fit(Xtr1000, Ytr1000)
            y_pred_train = model.predict(Xtr1000)
            # NOTE(review): the test points plotted here come from
            # self.Xte / self.Yte, not from the Xte1000 / Yte1000 split made
            # above -- confirm this is intended.
            y_pred = model.predict(self.Xte)
            plt.subplot(1, 3, i + 1)
            plt.plot(self.Yte, y_pred, 'bo')
            plt.plot(Ytr1000, y_pred_train, 'ro')
            plt.xlabel("y_true")
            plt.ylabel("y_pred")
            plt.legend(labels=['test', 'train'])
        plt.tight_layout(pad=3.0)