예제 #1
0
    def test_krr(self):
        '''
            Plot kernel ridge regression fits on a noisy sinc sample.

            A 2x3 grid of subplots is drawn, one column per kernel
            (gaussian, polynomial, linear). The top row fits with the fixed
            regularization constant; the bottom row passes regularization=0,
            which the plot labels as regularization chosen by efficient
            cross-validation. Each panel shows the training data and the
            prediction on an evenly spaced grid over [-pi, pi).
        '''
        Xtr, Ytr = noisysincfunction(100, 0.1)
        Xte = np.arange(-np.pi, np.pi, 0.01).reshape(-1, 1)

        pl.figure()
        kernel_names = ['gaussian', 'polynomial', 'linear']
        kernel_params = [0.5, 6, 0]
        fixed_regs = [0.01, 0.01, 0.01]
        row_labels = ['fixed regularization', 'reg. by efficent cv']
        for col, (name, kparam, fixed_reg) in enumerate(
                zip(kernel_names, kernel_params, fixed_regs)):
            # Row 0 uses the fixed constant, row 1 passes 0 instead.
            for row, reg in enumerate((fixed_reg, 0)):
                pl.subplot(2, 3, 1 + col + 3 * row)
                model = imp.krr(kernel=name,
                                kernelparameter=kparam,
                                regularization=reg)
                model.fit(Xtr, Ytr)
                prediction = model.predict(Xte)
                pl.plot(Xtr, Ytr)
                pl.plot(Xte, prediction)
                if col == 0:
                    # Only the left-most column carries the row label.
                    pl.ylabel(row_labels[row])
                pl.title(name)
        pl.show()
예제 #2
0
def krr_test():
    """
    Plot kernel ridge regression fits on a noisy sinc sample.

    For each kernel (gaussian, polynomial, linear) two models are fitted:
    one with the fixed regularization constant (top row) and one with
    regularization=0 (bottom row, reported as "reg_loocv" and labelled
    'reg. by efficent cv'). The regularization stored on the fitted model
    is printed for every fit.

    The print calls use the single-argument parenthesised form so the
    function runs, with identical output, on both Python 2 and Python 3.
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]

    pl.figure()
    kernels = ["gaussian", "polynomial", "linear"]
    params = [0.5, 4, 0]
    regularizations = [0.01, 0.01, 0.01]
    # (console tag, y-axis label) for the top and the bottom subplot row.
    rows = [
        ("reg_fixed: ", "fixed regularization"),
        ("reg_loocv: ", "reg. by efficent cv"),
    ]
    for i, (kernel, param, fixed_reg) in enumerate(
            zip(kernels, params, regularizations)):
        for j, reg in enumerate((fixed_reg, 0)):
            tag, ylabel = rows[j]
            pl.subplot(2, 3, 1 + i + 3 * j)
            krr = imp.krr()
            krr.fit(Xtr, Ytr, kernel=kernel, kernelparameter=param,
                    regularization=reg)
            # Same text as the former `print tag, value` statement.
            print("%s %s" % (tag, krr.regularization))
            krr.predict(Xte)
            pl.plot(Xtr.T, Ytr.T)
            pl.plot(Xte.T, krr.ypred.T)
            if i == 0:
                # Only the left-most column carries the row label.
                pl.ylabel(ylabel)
            pl.title(kernel)
    print("\n(time the test takes on my notebook: approx. 400 milliseconds)")
예제 #3
0
 def plot_MAE_for_different_nsamples(self):
     """
     Exercise 3d: plot the test MAE against the number of training samples.

     For every sample count a random subset of the training data is drawn,
     a krr model with the kernel, kernel parameter and regularization
     stored on ``self.cvkrr`` is fitted on it, and the mean absolute error
     on ``self.Xte`` / ``self.Yte`` is recorded and plotted.

     Requires ``self.cvkrr`` to be set beforehand (the original note says
     to perform fold_cross_validation first).
     """
     MAE = []
     n_samples = [
         100, 300, 600, 900, 1200, 1700, 2000, 2700, 3000, 3900, 4200, 4500,
         4700, 4800, 4900, 4950, 5000
     ]
     for i in tqdm(n_samples):
         # Indices are drawn from 0..4999, so self.Xtr / self.Ytr are
         # assumed to hold at least 5000 samples -- TODO confirm.
         train_samples = random.sample(range(0, 5000), i)
         Xtr_nsample = self.Xtr[train_samples]
         Ytr_nsample = self.Ytr[train_samples]
         # Pass the regularization as the stored value itself; it used to
         # be wrapped in a one-element list, unlike the other two arguments.
         model = imp.krr(self.cvkrr.kernel,
                         self.cvkrr.kernelparameter,
                         self.cvkrr.regularization)
         model.fit(Xtr_nsample, Ytr_nsample)
         y_pred = model.predict(self.Xte)
         MAE.append(mean_absolute_error(self.Yte, y_pred))
     plt.figure(figsize=(8, 6))
     plt.plot(n_samples, MAE, 'bo')
     plt.xlabel("n training samples")
     plt.ylabel("Mean Absolute Error [kcal/mol]")
예제 #4
0
def cv_test():
    """
    Plot cross-validated krr fits on a noisy sinc sample (needs a working
    krr class).

    Two panels are drawn side by side. Both search the gaussian kernel
    parameter over ``np.logspace(-2, 2, 10)``; the left panel also searches
    the regularization over the same grid, the right panel offers only
    regularization 0 (titled "CV with efficient LOOCV"). The selected
    kernel parameter and regularization are printed for each panel.

    The print calls use the single-argument parenthesised form so the
    function runs, with identical output, on both Python 2 and Python 3.
    """
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]

    # The same krr instance is handed to both cross-validation runs.
    krr = imp.krr()

    pl.figure()
    panels = [
        (np.logspace(-2, 2, 10), "CV with fixed regularization"),
        ([0], "CV with efficient LOOCV"),
    ]
    for position, (regularization, title) in enumerate(panels, 1):
        pl.subplot(1, 2, position)
        params = ["kernel", ["gaussian"],
                  "kernelparam", np.logspace(-2, 2, 10),
                  "regularization", regularization]
        cvkrr = imp.cv(Xtr, Ytr, krr, params,
                       loss_function=squared_error_loss, nrepetitions=2)
        cvkrr.predict(Xte)
        print(cvkrr.kernelparameter)
        print(cvkrr.regularization)

        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, cvkrr.ypred.T)
        pl.title(title)
    print("\n(time the test takes on my notebook: approx. 6 seconds)")
예제 #5
0
def cv_test():
    '''
        Plot cross-validated krr fits on a noisy sinc sample (needs a
        working krr class).

        Draws two side-by-side panels. Both search the gaussian kernel
        parameter over np.logspace(-2, 2, 10); the first also searches the
        regularization over that grid, the second offers only the value 0.
        Each panel shows the training data and the prediction of the
        cross-validated model on a grid over [-pi, pi).
    '''
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]

    # One krr instance is shared by both cross-validation runs.
    krr = imp.krr()

    pl.figure()
    regularization_grids = (np.logspace(-2, 2, 10), [0])
    for panel, reg_grid in enumerate(regularization_grids, 1):
        pl.subplot(1, 2, panel)
        search_space = [
            'kernel', ['gaussian'],
            'kernelparam', np.logspace(-2, 2, 10),
            'regularization', reg_grid,
        ]
        cvkrr = imp.cv(Xtr, Ytr, krr, search_space,
                       loss_function=squared_error_loss,
                       nrepetitions=2)
        cvkrr.predict(Xte)

        pl.plot(Xtr.T, Ytr.T)
        pl.plot(Xte.T, cvkrr.ypred.T)
예제 #6
0
def krr_test():
    '''
        Plot kernel ridge regression fits on a noisy sinc sample.

        For each kernel (gaussian, polynomial, linear) two models are
        fitted: one with the fixed regularization constant (top row) and
        one with regularization=0 (bottom row, labelled
        'reg. by efficent cv'). For the bottom row the regularization
        stored on the fitted model is printed.

        The print call uses the single-argument parenthesised form so the
        function runs, with identical output, on Python 2 and Python 3.
    '''
    Xtr, Ytr = noisysincfunction(100, 0.1)
    Xte = np.arange(-np.pi, np.pi, 0.01)[np.newaxis, :]

    pl.figure()
    kernels = ['gaussian', 'polynomial', 'linear']
    params = [0.5, 4, 0]
    regularizations = [0.01, 0.01, 0.01]
    ylabels = ['fixed regularization', 'reg. by efficent cv']
    for i, (kernel, param, fixed_reg) in enumerate(
            zip(kernels, params, regularizations)):
        for j, reg in enumerate((fixed_reg, 0)):
            pl.subplot(2, 3, 1 + i + 3 * j)
            krr = imp.krr()
            krr.fit(Xtr,
                    Ytr,
                    kernel=kernel,
                    kernelparameter=param,
                    regularization=reg)
            if j == 1:
                # Report the value the model ended up with after being
                # given regularization=0.
                print(krr.regularization)
            krr.predict(Xte)
            pl.plot(Xtr.T, Ytr.T)
            pl.plot(Xte.T, krr.ypred.T)
            if i == 0:
                # Only the left-most column carries the row label.
                pl.ylabel(ylabels[j])
            pl.title(kernel)
def krr_app(reg=False):
    ''' Apply cross-validated krr to all data sets and return the best fits.

    For every data set the gaussian, polynomial and linear kernels are
    cross-validated (regularization candidates: [0]) and the result with
    the smallest cross-validation loss is kept.

    Input:
        reg : currently unused; kept so existing callers keep working.

    Returns:
        dict mapping data set name to a dict with the keys 'cvloss',
        'kernel', 'kernelparameter', 'regularization' and 'ypred'.
        (Previously the collected results were discarded when the
        function ended.)

    The print calls use the single-argument parenthesised form so the
    function runs, with identical output, on Python 2 and Python 3.
    '''
    datasets = ['banana','diabetis','flare-solar','image','ringnorm']
    # NOTE: for computing the results via console the data set list was
    # changed manually, e.g. to ['image'].
    path = 'ps3_datasets/'
    kernels = ['gaussian','polynomial','linear']
    kernel_params = [np.logspace(-2,2,10),np.arange(10),np.arange(10)]
    results = dict()
    for data in datasets:
        prefix = path+'U04_'+data
        Xtr = np.loadtxt(prefix+'-xtrain.dat')
        Ytr = np.loadtxt(prefix+'-ytrain.dat')
        Xte = np.loadtxt(prefix+'-xtest.dat')
        d = Xtr.shape[0]
        print('%s  was loaded with %d dimensions' % (data, d))
        krr = imp.krr()
        candidates = []
        for kernel, kernel_grid in zip(kernels, kernel_params):
            params = ['kernel', [kernel], 'kernelparam', kernel_grid,
                      'regularization', [0]]
            cvkrr = imp.cv(Xtr, Ytr.reshape(1,-1), krr, params,
                           loss_function=squared_error_loss, nrepetitions=2)
            cvkrr.predict(Xte)
            # Copy the attributes out right away: the same krr instance is
            # handed to the next cross-validation run.
            candidates.append({
                'cvloss': cvkrr.cvloss,
                'kernel': kernel,
                'kernelparameter': cvkrr.kernelparameter,
                'regularization': cvkrr.regularization,
                'ypred': cvkrr.ypred,
            })
            print('finished %s kernel on %s' % (kernel, data))
        CVloss = np.zeros(len(candidates))
        for i, candidate in enumerate(candidates):
            CVloss[i] = candidate['cvloss']
        print('CVloss for dataset %s %s' % (data, CVloss))
        results[data] = candidates[int(np.argmin(CVloss))]
    return results
예제 #8
0
    yTrain = np.loadtxt("ps3_datasets/U04_banana-ytrain.dat")

    n = xTest.shape[1]

    lables = np.ones((1, n))

    lables[yTest >= 0] = 0.5
    #
    #     pl.scatter(xTest[0,:],xTest[1,:],c=lables, cmap = cm.jet);
    #     pl.title('Prediction banana test data set');
    #
    #     pl.figure();
    #     pl.scatter(xTrain[0,:],xTrain[1,:],c=yTrain, cmap=cm.jet);
    #     pl.title('Banana training data set');

    krr = imp.krr(file['banana']['kernel'], file['banana']['kernelparameter'],
                  file['banana']['regularization'])

    krr.fit(xTrain, yTrain)
    xInput = np.linspace(-3, 3)
    (x, y) = np.meshgrid(xInput, xInput)

    x = x.reshape((1, x.size))
    y = y.reshape((1, y.size))
    X = np.vstack((x, y))
    krr.predict(X)
    Z = krr.ypred
    sn = np.sqrt(x.size)
    Z = Z.reshape((sn, sn))

    pl.figure()
예제 #9
0
def apply_krr(reg=False):
    ''' This function applies the krr to the provided data sets in order
        to find a good classification. The results are stored in a
        dictionary which is pickled at the end.

        For every data set found in the data directory the linear,
        polynomial and gaussian kernels are cross-validated, the model with
        the smallest cross-validation loss predicts the test set, and a
        ROC curve with its AUC is plotted for the winning configuration.

        Usage:
            It is important to adapt the path of the datasets.

        Input:
            reg : boolean variable indicating whether the regularization
                constant shall be estimated by LOOCV or drawn from a
                provided range. True means that the provided range is used
                and False means that the LOOCV will be used.

        Output:
            Nothing is returned. The per-data-set results (kernel,
            kernelparameter, regularization, cvloss, ypred) are pickled to
            'results.p' (reg true) or 'resultsLOOCV.p' (reg false).

        Author:
            Till Rohrmann
    '''

    # IMPORTANT: Adapt path to where the data sets have been stored
    path = 'ps3_datasets'
    testSuffix = 'xtest'
    trainXSuffix = 'xtrain'
    trainYSuffix = 'ytrain'

    # Data set names are recovered from file names such as
    # 'U04_<name>-xtrain.dat': the captured stem is cut at its last '-'.
    pattern = re.compile(r'U04_([^\.]*)\.dat')
    datasetNames = set()
    for filename in os.listdir(path):
        if not os.path.isfile(os.path.join(path, filename)):
            continue
        m = pattern.search(filename)
        if m is not None:
            stem = m.group(1)
            datasetNames.add(stem[:stem.rfind('-')])

    result = {}

    if reg:
        regularization = np.logspace(-2, 2, 10)
    else:
        # A single candidate of 0 selects the LOOCV estimate (see docstring).
        regularization = [0]

    # Kernel name and the kernel parameter candidates to cross-validate,
    # in the order used to break ties between equal losses.
    kernelGrids = [
        ('linear', [0]),
        ('polynomial', np.arange(1, 10)),
        ('gaussian', np.logspace(-2, 2, 10)),
    ]

    nrep = 5
    nfs = 10

    # Sorted so that console output and figures appear in a stable order.
    for dataset in sorted(datasetNames):
        print('Dataset: ' + dataset)
        # training phase
        filenameX = 'U04_' + dataset + '-' + trainXSuffix + '.dat'
        filenameY = 'U04_' + dataset + '-' + trainYSuffix + '.dat'
        filenameTestX = 'U04_' + dataset + '-' + testSuffix + '.dat'
        X = np.loadtxt(os.path.join(path, filenameX), dtype=float)
        Y = np.loadtxt(os.path.join(path, filenameY), dtype=float)[np.newaxis, :]
        testX = np.loadtxt(os.path.join(path, filenameTestX), dtype=float)

        print('Shape: ' + str(X.shape))

        # Cross-validate one fresh model per kernel and time each run.
        # imp.cv stores its outcome (cvloss, chosen parameters) on the
        # model instance that is passed in.
        candidates = []
        for kernelName, kernelParams in kernelGrids:
            startTime = time.time()
            model = imp.krr()
            cvParams = ['kernel', [kernelName], 'kernelparam', kernelParams,
                        'regularization', regularization]
            imp.cv(X, Y, model, cvParams, nrepetitions=nrep, nfolds=nfs)
            candidates.append((model, time.time() - startTime))

        best = int(np.argmin([candidate[0].cvloss for candidate in candidates]))
        krr, minTime = candidates[best]

        krr.predict(testX)

        dictionary = {
            'kernel': krr.kernel,
            'kernelparameter': krr.kernelparameter,
            'regularization': krr.regularization,
            'cvloss': krr.cvloss,
            'ypred': krr.ypred,
        }

        result[dataset] = dictionary

        # plot ROC curve and calculate AUC
        params = ['kernel', [krr.kernel], 'kernelparam', [krr.kernelparameter],
                  'regularization', [krr.regularization]]
        rocKRR = imp.krr()
        imp.cv(X, Y, rocKRR, params, loss_function=roc_fun, nrepetitions=nrep, nfolds=nfs)

        truePositiveRate = rocKRR.cvloss[0]
        falsePositiveRate = rocKRR.cvloss[1]

        # Trapezoidal rule for integration: interval width times the mean
        # of the two end-point heights.
        xdiff = falsePositiveRate[1:] - falsePositiveRate[:-1]
        ysum = (truePositiveRate[1:] + truePositiveRate[:-1]) / 2
        AUC = np.dot(ysum, xdiff)

        pl.figure()
        pl.plot(falsePositiveRate, truePositiveRate)
        pl.ylabel('True positive rate')
        pl.xlabel('False positive rate')

        # Same truthiness test as for the regularization range above, so
        # the title always matches the mode that was actually used.
        titlePrefix = 'ROC-Curve Dataset:' if reg else 'LOOCV ROC-Curve Dataset:'
        pl.title(titlePrefix + dataset + ' AUC=' + ('%.3f' % AUC) +
                 ' cvloss:' + ('%.3f' % (dictionary['cvloss'])) +
                 ' time:' + ('%.1f' % minTime) + 's' +
                 '\n Kernel:' + dictionary['kernel'] +
                 ' parameter:' + ('%.3f' % dictionary['kernelparameter']) +
                 ' regularization:' + ('%.3f' % dictionary['regularization']))

        print('Dataset:' + dataset + ' kernel:' + dictionary['kernel'] + ' cvloss:' +
              str(dictionary['cvloss']) + ' AUC:' + str(AUC) + ' time:' + ('%.1f' % minTime))

    if reg:
        filename = 'results.p'
    else:
        filename = 'resultsLOOCV.p'

    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as handle:
        pickle.dump(result, handle)
예제 #10
0
    def plot_energies_for_1000(self):
        """
        Exercise 3e: under-, well- and overfit with 1000 training samples.

        Steps:
          1. Split ``self.X`` / ``self.y`` with a seeded shuffle: the first
             1000 shuffled samples train, the remainder tests.
          2. Run five-fold cross-validation over a gaussian kernel whose
             width candidates are the 0.1 / 0.5 / 0.9 quantiles of the
             pairwise training distances, and print the resulting error
             and parameters.
          3. Fit three models on the 1000 samples (linear kernel with the
             cross-validated regularization, gaussian kernel with the
             cross-validated parameters, gaussian kernel with width 1 and
             regularization 0) and scatter predicted against true values,
             one subplot per model.
        """
        # Seeded random partitioning of the sample indices.
        order = list(range(len(self.X)))
        random.Random(4).shuffle(order)
        train_idx = np.array(order[:1000], dtype='int')
        test_idx = np.array(order[1000:], dtype='int')
        Xtr1000, Ytr1000 = self.X[train_idx], self.y[train_idx]
        Xte1000, Yte1000 = self.X[test_idx], self.y[test_idx]

        # Kernel width candidates come from the distribution of pairwise
        # distances between the training samples.
        pairwise = np.linalg.norm(Xtr1000[None, :] - Xtr1000[:, None], axis=2)
        width_candidates = np.quantile(pairwise, [0.1, 0.5, 0.9])
        search_space = {
            'kernel': ['gaussian'],
            'kernelparameter': width_candidates,
            'regularization': np.logspace(-7, 0, 10)
        }
        # Five-fold cross-validation for the well-fitting parameters.
        cvkrr = imp.cv(Xtr1000,
                       Ytr1000,
                       imp.krr,
                       search_space,
                       loss_function=mean_absolute_error,
                       nfolds=5)
        MAE = mean_absolute_error(Yte1000, cvkrr.predict(Xte1000))

        # Report the outcome of the cross-validation.
        print("The mean absolute error is: {} ".format(round(MAE, 2)))
        print("The best regularzation parameter C is: {}".format(
            cvkrr.regularization))
        print("The best kernelparameter sigma is: {}".format(
            cvkrr.kernelparameter))
        print("The cvloss: {}".format(cvkrr.cvloss))

        # (kernel, kernelparameter, regularization) of the models to compare.
        configurations = [
            ('linear', False, cvkrr.regularization),
            ('gaussian', cvkrr.kernelparameter, cvkrr.regularization),
            ('gaussian', 1, 0),
        ]

        plt.figure(figsize=(10, 6))
        for slot, (kernel, kernelparameter, regularization) in enumerate(
                configurations, 1):
            model = imp.krr(kernel, kernelparameter, regularization)
            model.fit(Xtr1000, Ytr1000)
            y_pred_train = model.predict(Xtr1000)
            # NOTE(review): the scatter uses self.Xte / self.Yte, not the
            # Xte1000 / Yte1000 split made above -- confirm this is intended.
            y_pred = model.predict(self.Xte)
            plt.subplot(1, 3, slot)
            plt.plot(self.Yte, y_pred, 'bo')
            plt.plot(Ytr1000, y_pred_train, 'ro')
            plt.xlabel("y_true")
            plt.ylabel("y_pred")
            plt.legend(labels=['test', 'train'])
        plt.tight_layout(pad=3.0)