    # Compute squared error without using the input data at all
    Error_train_nofeatures[k] = np.square(
        y_train - y_train.mean()).sum() / y_train.shape[0]
    Error_test_nofeatures[k] = np.square(y_test -
                                         y_test.mean()).sum() / y_test.shape[0]

    # Compute squared error with all features selected (no feature selection)
    m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
    Error_train[k] = np.square(y_train -
                               m.predict(X_train)).sum() / y_train.shape[0]
    Error_test[k] = np.square(y_test -
                              m.predict(X_test)).sum() / y_test.shape[0]

    # Compute squared error with feature subset selection
    #textout = 'verbose';
    textout = ''
    selected_features, features_record, loss_record = feature_selector_lr(
        X_train, y_train, internal_cross_validation, display=textout)

    Features[selected_features, k] = 1
    if len(selected_features) == 0:
        print(
            'No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
        )
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(
            X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train - m.predict(
            X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - m.predict(
            X_test[:, selected_features])).sum() / y_test.shape[0]

        figure(k)
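
Every snippet on this page relies on feature_selector_lr from the DTU 02450 course toolbox. Below is a minimal sketch of what such a forward-selection routine could look like (an assumption about its behaviour, not the toolbox implementation): it greedily adds the feature that most reduces cross-validated squared error and returns the selected indices, a per-iteration selection mask, and the per-iteration loss, matching how the call sites use it.

import numpy as np
import sklearn.linear_model as lm
from sklearn import model_selection

def feature_selector_lr_sketch(X, y, cvf=10, display=''):
    # Greedy forward selection scored by K-fold cross-validated squared error.
    # Assumed return contract (matching the call sites above): selected feature
    # indices, an (M x iterations) record of selection masks, loss per iteration.
    M = X.shape[1]
    selected = []
    loss_record = [np.square(y - y.mean()).mean()]  # baseline: mean predictor
    features_record = [np.zeros(M)]
    improved = True
    while improved:
        improved = False
        best_loss, best_f = loss_record[-1], None
        for f in range(M):
            if f in selected:
                continue
            trial = selected + [f]
            CV = model_selection.KFold(n_splits=cvf, shuffle=True)
            fold_loss = []
            for tr, te in CV.split(X):
                m = lm.LinearRegression().fit(X[tr][:, trial], y[tr])
                fold_loss.append(np.square(y[te] - m.predict(X[te][:, trial])).mean())
            if np.mean(fold_loss) < best_loss:
                best_loss, best_f = np.mean(fold_loss), f
        if best_f is not None:
            selected.append(best_f)
            loss_record.append(best_loss)
            mask = np.zeros(M)
            mask[selected] = 1
            features_record.append(mask)
            improved = True
            if display == 'verbose':
                print('Added feature {0}, CV loss {1:.4f}'.format(best_f, best_loss))
    return np.array(selected), np.array(features_record).T, np.array(loss_record)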
Example #2
def forwardSelection(X,y,N,K,attributeNames, classNames):
    # Add offset attribute
    X2 = np.concatenate((np.ones((X.shape[0],1)),X),1)
    attributeNames2 = [u'Offset']+attributeNames
    M2 = len(attributeNames)+1
    
    
    #X3 = np.copy(X)
    # Square the attribute in column 2 (simple basis expansion)
    X2[:,2] = np.power(X2[:,2],2)
    
    ## Crossvalidation
    # Create crossvalidation partition for evaluation

    CV = model_selection.KFold(n_splits=K, shuffle=True)  # sklearn.cross_validation was removed in 0.20
    
    # Initialize variables
    Features = np.zeros((M2,K))
    Error_train = np.empty((K,1))
    Error_test = np.empty((K,1))
    Error_train_fs = np.empty((K,1))
    Error_test_fs = np.empty((K,1))
    Error_train_nofeatures = np.empty((K,1))
    Error_test_nofeatures = np.empty((K,1))
    
    k=0
    for train_index, test_index in CV.split(X2):
        
        # extract training and test set for current CV fold
        X_train = X2[train_index]
        y_train = y[train_index]
        X_test = X2[test_index]
        y_test = y[test_index]
        internal_cross_validation = 5
        
        
        
        # Compute squared error without using the input data at all
        Error_train_nofeatures[k] = np.square(y_train-y_train.mean()).sum()/y_train.shape[0]
        Error_test_nofeatures[k] = np.square(y_test-y_test.mean()).sum()/y_test.shape[0]
        
        # Compute squared error with all features selected (no feature selection)
        m = lm.LinearRegression().fit(X_train, y_train)
        Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0]
        Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0]


        # Compute squared error with feature subset selection
        selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train, internal_cross_validation)
        Features[selected_features,k]=1
        # .. alternatively you could use module sklearn.feature_selection
        m = lm.LinearRegression().fit(X_train[:,selected_features], y_train)
        Error_train_fs[k] = np.square(y_train-m.predict(X_train[:,selected_features])).sum()/y_train.shape[0]
        Error_test_fs[k] = np.square(y_test-m.predict(X_test[:,selected_features])).sum()/y_test.shape[0]

        
        figure()
        subplot(1,2,1)
        plot(range(1,len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')    
        
        subplot(1,3,3)
        bmplot(attributeNames2, range(1,features_record.shape[1]), -features_record[:,1:])
        clim(-1.5,0)
        xlabel('Iteration')
    
        print('Cross validation fold {0}/{1}'.format(k+1,K))
    
        k+=1
    
    
    # Display results
    print('\n')
    print('Linear regression without feature selection:\n')
    print('- Training error: {0}'.format(Error_train.mean()))
    print('- Test error:     {0}'.format(Error_test.mean()))
    print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum()))
    print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum()))
    print('\n')
    print('Linear regression with feature selection:\n')
    print('- Training error: {0}'.format(Error_train_fs.mean()))
    print('- Test error:     {0}'.format(Error_test_fs.mean()))
    print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train_fs.sum())/Error_train_nofeatures.sum()))
    print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test_fs.sum())/Error_test_nofeatures.sum()))
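    # R^2 is computed here as 1 - SSE_model / SSE_baseline, pooling the squared
    # errors over all K folds; the baseline (Error_*_nofeatures) is the
    # mean-only predictor fitted without any input features.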
    
    figure()
    subplot(1,3,2)
    bmplot(attributeNames2, range(1,Features.shape[1]+1), -Features)
    clim(-1.5,0)
    xlabel('Crossvalidation fold')
    ylabel('Attribute')
    
    # Inspect selected feature coefficients effect on the entire dataset and
    # plot the fitted model residual error as function of each attribute to
    # inspect for systematic structure in the residual
    f=2 # cross-validation fold to inspect
    ff=Features[:,f-1].nonzero()[0]
    m = lm.LinearRegression().fit(X2[:,ff], y)
    
    y_est= m.predict(X2[:,ff])
    residual=y-y_est
    
    figure()
    title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
    for i in range(0,len(ff)):
       subplot(2, int(np.ceil(len(ff)/2.0)), i+1)
       plot(X2[:,ff[i]], residual, '.')
       xlabel(attributeNames2[ff[i]])
       ylabel('residual error')
    
    
    show()    
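
A quick way to exercise forwardSelection on synthetic data might look like the following (a hypothetical smoke test; it assumes feature_selector_lr and bmplot are importable, as in the course toolbox, and the classNames argument may be empty since the residual plots do not use it):

import numpy as np

# Hypothetical smoke test for forwardSelection on synthetic data.
N, M = 200, 5
rng = np.random.default_rng(0)
X = rng.standard_normal((N, M))
y = 3.0 * X[:, 0] - 2.0 * X[:, 3] + 0.5 * rng.standard_normal(N)
attributeNames = ['x{0}'.format(i) for i in range(M)]
forwardSelection(X, y, N, 5, attributeNames, classNames=[])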
Example #3
def linear_reg(input_matrix, index, outer_cross_number, inner_cross_number):
    X, y = split_train_test(input_matrix, index)
    N, M = X.shape
    K = outer_cross_number
    # CV = model_selection.KFold(K,True)

    attributeNames = [
        'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight (lbs)',
        'Acceleration (MPH)', 'Model year', 'Origin'
    ]
    temp = attributeNames[index]
    attributeNamesShorter = list(attributeNames)  # copy, so the original list is not mutated
    attributeNamesShorter.remove(temp)

    neurons = 1
    learning_goal = 25
    max_epochs = 64
    show_error_freq = 65

    CV = model_selection.KFold(n_splits=K, shuffle=True)  # sklearn.cross_validation was removed in 0.20

    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_mean = np.empty((K, 1))
    Error_test_mean = np.empty((K, 1))
    Error_train_nn = np.empty((K, 1))
    Error_test_nn = np.empty((K, 1))
    k = 0
    for train_index, test_index in CV.split(X):
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        internal_cross_validation = inner_cross_number

        Error_train_mean[k] = np.square(
            y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_mean[k] = np.square(y_test -
                                       y_test.mean()).sum() / y_test.shape[0]

        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = np.square(y_train -
                                   m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test -
                                  m.predict(X_test)).sum() / y_test.shape[0]
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)

        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print(
                'No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
            )
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(
                X_train[:, selected_features], y_train)
            Error_train_fs[k] = np.square(y_train - m.predict(
                X_train[:, selected_features])).sum() / y_train.shape[0]
            Error_test_fs[k] = np.square(y_test - m.predict(
                X_test[:, selected_features])).sum() / y_test.shape[0]

            y_train_2 = y_train.reshape(-1, 1)  # neurolab expects 2-D targets
            y_test_2 = y_test.reshape(-1, 1)
            ann = nl.net.newff(
                [[-3, 3]] * M, [neurons, 1],
                [nl.trans.TanSig(), nl.trans.PureLin()])

            ann.train(X_train,
                      y_train_2,
                      goal=learning_goal,
                      epochs=max_epochs,
                      show=show_error_freq)
            y_est_train = ann.sim(X_train)
            y_est_test = ann.sim(X_test)

            Error_train_nn[k] = np.square(y_est_train -
                                          y_train_2).sum() / y_train.shape[0]
            Error_test_nn[k] = np.square(y_est_test -
                                         y_test_2).sum() / y_test.shape[0]

            figure()
            subplot(2, 1, 1)
            plot(y_train_2, y_est_train, '.')
            subplot(2, 1, 2)
            plot(y_test_2, y_est_test, '.')
            xlabel('MPG (true, normalized)')
            ylabel('MPG (estimated, normalized)')

        print('Cross validation fold {0}/{1}'.format(k + 1, K))
        print('Features no: {0}\n'.format(selected_features.size))

        k += 1

        figure(k)
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')

        subplot(1, 3, 3)
        bmplot(attributeNamesShorter, range(1, features_record.shape[1]),
               -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')

    print('Feature_select vs. ANN:')
    significant_differnece(Error_1=Error_test_fs, Error_2=Error_test_nn, K=K)
    print('Mean vs. ANN:')
    significant_differnece(Error_1=Error_test_mean, Error_2=Error_test_nn, K=K)
    print('Linear vs. ANN:')
    significant_differnece(Error_1=Error_test, Error_2=Error_test_nn, K=K)

    figure()
    plt.boxplot(
        np.concatenate((Error_test_nn, Error_test_fs, Error_test, Error_train_mean), axis=1))
    title('Normalized input/output')
    xlabel('ANN vs. Feature_selected vs. clean vs. mean')
    ylabel('Mean squared error')

    show()
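
Example #3 calls significant_differnece, which is not shown on this page. A plausible sketch, assuming it performs a paired t-test on the per-fold test errors (the misspelled name is kept to match the call sites; this is an assumption, not the original helper):

import numpy as np
from scipy import stats

def significant_differnece(Error_1, Error_2, K):
    # Paired comparison of two models over the K cross-validation folds:
    # confidence interval on the mean per-fold error difference.
    z = (np.asarray(Error_1) - np.asarray(Error_2)).ravel()
    zb = z.mean()
    sig = 0.05
    nu = K - 1
    sem = z.std(ddof=1) / np.sqrt(K)
    zL = zb + sem * stats.t.ppf(sig / 2, nu)
    zH = zb + sem * stats.t.ppf(1 - sig / 2, nu)
    print('Mean error difference: {0:.4f}, {1:.0%} CI: [{2:.4f}, {3:.4f}]'.format(
        zb, 1 - sig, zL, zH))
    if zL <= 0 <= zH:
        print('No significant difference between the two models.')
    else:
        print('The difference is statistically significant.')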
    X_test = X[test_index,:]
    y_test = y[test_index]

    print('--------------START LINEAR ON FOLD--------------')
    LINEAR_INTERNAL_CROSS_VALIDATION = 10
    # Compute squared error without using the input data at all
    LINEAR_ERROR_TRAIN_NOFEATURES[k] = np.square(y_train-y_train.mean()).sum()/y_train.shape[0]
    LINEAR_ERROR_TEST_NOFEATURES[k] = np.square(y_test-y_test.mean()).sum()/y_test.shape[0]

    # Compute squared error with all features selected (no feature selection)
    model = lm.LinearRegression().fit(X_train, y_train)
    LINEAR_ERROR_TRAIN[k] = np.square(y_train-model.predict(X_train)).sum()/y_train.shape[0]
    LINEAR_ERROR_TEST[k] = np.square(y_test-model.predict(X_test)).sum()/y_test.shape[0]

    # Compute squared error with feature subset selection
    selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train, LINEAR_INTERNAL_CROSS_VALIDATION)
    LINEAR_FEATURES[selected_features,k]=1

    model = lm.LinearRegression().fit(X_train[:,selected_features], y_train)
    LINEAR_ERROR_TRAIN_FS[k] = np.square(y_train-model.predict(X_train[:,selected_features])).sum()/y_train.shape[0]
    LINEAR_ERROR_TEST_FS[k] = np.square(y_test-model.predict(X_test[:,selected_features])).sum()/y_test.shape[0]

    print('MODEL COEFFICIENTS: ')
    print('Selected Features: ' + str(selected_features))
    params = [attributeNames[i] for i in selected_features]
    for ind in range(len(selected_features)):
        print(params[ind] + ": " + str(model.coef_[:, ind]))

    figure(k)
    subplot(1,2,1)
    plot(range(1,len(loss_record)), loss_record[1:])
    combinations = np.zeros((N, M**2))
    labelcombinations = [None] * M**2

    for i in range(M):
        for j in range(M):
            combinations[:, i + j * M] = np.multiply(X[:, i],
                                                     X[:, j]).reshape(1, -1)
            labelcombinations[i + j * M] = labels[i] + " | " + labels[j]

    # Add all combinations of attributes
    X = np.hstack((X, combinations))
    labels = np.hstack((labels, labelcombinations))

Error_nofeatures = np.square(y - y.mean()).sum() / y.shape[0]

selected_features, features_record, loss_record = feature_selector_lr(X, y, 10)

model = lm.LinearRegression(fit_intercept=True).fit(X[:, selected_features], y)

y_pred = model.predict(X[:, selected_features])

equation = "y = {0:.2e}".format(model.intercept_[0])

for i in range(len(model.coef_[0])):
    if (model.coef_[0][i] < 0):
        equation += " - "
    else:
        equation += " + "
    equation += "{0:.2e} * {1}".format(abs(model.coef_[0][i]),
                                       labels[selected_features[i]])
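
The "# .. alternatively you could use module sklearn.feature_selection" comments above refer to scikit-learn's built-in selector. A sketch using SequentialFeatureSelector (available in scikit-learn 0.24+); unlike feature_selector_lr it selects a fixed number of features by cross-validated score rather than stopping automatically:

import numpy as np
import sklearn.linear_model as lm
from sklearn.feature_selection import SequentialFeatureSelector

def forward_select_sklearn(X, y, n_features=3, cv=10):
    # Forward selection with a linear regression scorer; returns the indices
    # of the selected feature columns.
    sfs = SequentialFeatureSelector(lm.LinearRegression(),
                                    n_features_to_select=n_features,
                                    direction='forward', cv=cv)
    sfs.fit(X, y)
    return np.flatnonzero(sfs.get_support())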
Example #6
def lreg(x, y):

    X = x

    N, M = X.shape

    ## Crossvalidation
    # Create crossvalidation partition for evaluation
    K = 5
    CV = model_selection.KFold(n_splits=K, shuffle=True)

    # Initialize variables
    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_nofeatures = np.empty((K, 1))
    Error_test_nofeatures = np.empty((K, 1))

    k = 0
    for train_index, test_index in CV.split(X):

        # extract training and test set for current CV fold
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        internal_cross_validation = 10

        # Compute squared error without using the input data at all
        Error_train_nofeatures[k] = np.square(
            y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_nofeatures[k] = np.square(
            y_test - y_test.mean()).sum() / y_test.shape[0]

        # Compute squared error with all features selected (no feature selection)
        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = np.square(y_train -
                                   m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test -
                                  m.predict(X_test)).sum() / y_test.shape[0]

        # Compute squared error with feature subset selection
        #textout = 'verbose';
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)

        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print(
                'No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
            )
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(
                X_train[:, selected_features], y_train)
            Error_train_fs[k] = np.square(y_train - m.predict(
                X_train[:, selected_features])).sum() / y_train.shape[0]
            Error_test_fs[k] = np.square(y_test - m.predict(
                X_test[:, selected_features])).sum() / y_test.shape[0]

            #figure(k)
            #subplot(1,2,1)
            #plot(range(1,len(loss_record)), loss_record[1:])
            #xlabel('Iteration')
            #ylabel('Squared error (crossvalidation)')

            #subplot(1,3,3)
            #bmplot(attributeNames, range(1,features_record.shape[1]), -features_record[:,1:])
            #clim(-1.5,0)
            #xlabel('Iteration')

        #print('Cross validation fold {0}/{1}'.format(k+1,K))
        #print('Train indices: {0}'.format(train_index))
        #print('Test indices: {0}'.format(test_index))
        #print('Features no: {0}\n'.format(selected_features.size))

        k += 1

    # Display results
    #print('\n')
    #print('parameters: {0}'.format(m.get_params()))

    #print('\n')
    #print('Linear regression without feature selection:\n')
    #print('- Training error: {0}'.format(Error_train.mean()))
    #print('- Test error:     {0}'.format(Error_test.mean()))
    #print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train.sum())/Error_train_nofeatures.sum()))
    #print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test.sum())/Error_test_nofeatures.sum()))
    #print('Linear regression with feature selection:\n')
    #print('- Training error: {0}'.format(Error_train_fs.mean()))
    #print('- Test error:     {0}'.format(Error_test_fs.mean()))
    #print('- R^2 train:     {0}'.format((Error_train_nofeatures.sum()-Error_train_fs.sum())/Error_train_nofeatures.sum()))
    #print('- R^2 test:     {0}'.format((Error_test_nofeatures.sum()-Error_test_fs.sum())/Error_test_nofeatures.sum()))

    #figure(k)
    #subplot(1,3,2)
    #bmplot(attributeNames, range(1,Features.shape[1]+1), -Features)
    #clim(-1.5,0)
    #xlabel('Crossvalidation fold')
    #ylabel('Attribute')

    # Inspect selected feature coefficients effect on the entire dataset and
    # plot the fitted model residual error as function of each attribute to
    # inspect for systematic structure in the residual

    f = np.argmin(Error_test_fs)  # cross-validation fold to inspect (0-based index)
    ff = Features[:, f].nonzero()[0]
    if len(ff) == 0:
        print(
            '\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
        )
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)

        y_est = m.predict(X[:, ff])
        residual = y - y_est

        #figure(k+1, figsize=(12,6))
        #title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
        #for i in range(0,len(ff)):
        #subplot(2,np.ceil(len(ff)/2.0),i+1)
        #plot(X[:,ff[i]],residual,'.')
        #xlabel(attributeNames[ff[i]])
        #ylabel('residual error')

    #show()
    def predict(data):
        return m.predict(data[:, ff])

    return (predict, ff)
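    # Hypothetical usage of lreg and the closure it returns (assumes X, y as above):
    #   predict_fn, ff = lreg(X, y)
    #   y_new = predict_fn(X)   # predictions using only the selected feature columns
    #   print('Selected feature indices:', ff)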
    ##################################################################################
    m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
    LR_Error_train[k] = np.square(y_train -
                                  m.predict(X_train)).sum() / y_train.shape[0]
    LR_Error_test[k] = np.square(y_test -
                                 m.predict(X_test)).sum() / y_test.shape[0]

    ##################################################################################
    #                                                                                #
    #                     LINEAR REGRESSION WITH FEATURE SELECTION                   #
    #                                                                                #
    ##################################################################################
    print('\nLINEAR REGRESSION MODEL')
    K_internal = 10
    textout = ''
    selected_features, features_record, loss_record = feature_selector_lr(
        X_train, y_train, K_internal, display=textout)
    LR_Features_fs[selected_features, k] = 1
    m = lm.LinearRegression(fit_intercept=True).fit(
        X_train[:, selected_features], y_train)
    LR_Params_fs.append(m.coef_)

    LR_Error_train_fs[k] = np.square(y_train - m.predict(
        X_train[:, selected_features])).sum() / y_train.shape[0]
    y_est = m.predict(X_test[:, selected_features])
    LR_Error_test_fs[k] = np.square(y_test - y_est).sum() / y_test.shape[0]

    figure()
    plot(y_test, y_est)
    title('Linear regression with forward feature selection')
    xlabel('Real values')
    y_train = y[train_index]
    X_test = X[test_index]
    y_test = y[test_index]
    internal_cross_validation = 10
    
    # Compute squared error without using the input data at all
    Error_train_nofeatures[k] = np.square(y_train-y_train.mean()).sum()/y_train.shape[0]
    Error_test_nofeatures[k] = np.square(y_test-y_test.mean()).sum()/y_test.shape[0]

    # Compute squared error with all features selected (no feature selection)
    m = lm.LinearRegression().fit(X_train, y_train)
    Error_train[k] = np.square(y_train-m.predict(X_train)).sum()/y_train.shape[0]
    Error_test[k] = np.square(y_test-m.predict(X_test)).sum()/y_test.shape[0]

    # Compute squared error with feature subset selection
    selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train, internal_cross_validation)
    Features[selected_features,k]=1
    # .. alternatively you could use module sklearn.feature_selection
    m = lm.LinearRegression().fit(X_train[:,selected_features], y_train)
    Error_train_fs[k] = np.square(y_train-m.predict(X_train[:,selected_features])).sum()/y_train.shape[0]
    Error_test_fs[k] = np.square(y_test-m.predict(X_test[:,selected_features])).sum()/y_test.shape[0]

    figure(k)
    subplot(1,2,1)
    plot(range(1,len(loss_record)), loss_record[1:])
    xlabel('Iteration')
    ylabel('Squared error (crossvalidation)')    
    
    subplot(1,3,3)
    bmplot(attributeNames, range(1,features_record.shape[1]), -features_record[:,1:])
    clim(-1.5,0)
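
bmplot also comes from the course toolbox. A minimal stand-in (an assumption, not the toolbox code) that renders an indicator matrix the way the calls above use it:

from matplotlib.pyplot import cm, imshow, xticks, yticks

def bmplot_sketch(yt, xt, X):
    # Render a boolean/indicator matrix as a black-and-white image, with yt as
    # row labels and xt as column tick labels.
    imshow(X, interpolation='none', cmap=cm.gray, aspect='auto')
    xticks(range(0, len(xt)), xt)
    yticks(range(0, len(yt)), yt)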
Example #10
def linear_reg(input_matrix, index, outer_cross_number, inner_cross_number):
    X, y = split_train_test(input_matrix, index)
    N, M = X.shape
    K = outer_cross_number
    # CV = model_selection.KFold(K,True)

    neurons = 50
    learning_goal = 10
    max_epochs = 64 * 5
    show_error_freq = 65

    temp = attributeNames[index]
    attributeNamesShorter = list(attributeNames)  # copy, so the original list is not mutated
    attributeNamesShorter.remove(temp)
    CV = model_selection.KFold(n_splits=K, shuffle=True)  # sklearn.cross_validation was removed in 0.20

    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_nofeatures = np.empty((K, 1))
    Error_test_nofeatures = np.empty((K, 1))
    Error_train_nn = np.empty((K, 1))
    Error_test_nn = np.empty((K, 1))
    k = 0
    for train_index, test_index in CV.split(X):
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        internal_cross_validation = inner_cross_number

        Error_train_nofeatures[k] = np.square(
            y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_nofeatures[k] = np.square(
            y_test - y_test.mean()).sum() / y_test.shape[0]

        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = np.square(y_train -
                                   m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test -
                                  m.predict(X_test)).sum() / y_test.shape[0]
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)

        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print(
                'No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
            )
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(
                X_train[:, selected_features], y_train)
            Error_train_fs[k] = np.square(y_train - m.predict(
                X_train[:, selected_features])).sum() / y_train.shape[0]
            Error_test_fs[k] = np.square(y_test - m.predict(
                X_test[:, selected_features])).sum() / y_test.shape[0]

            y_train_2 = y_train.reshape(-1, 1)  # neurolab expects 2-D targets
            y_test_2 = y_test.reshape(-1, 1)
            ann = nl.net.newff(
                [[-3, 3]] * M, [neurons, 1],
                [nl.trans.TanSig(), nl.trans.PureLin()])
            # Train the network (the normalization below is left disabled)
            '''X_train = (X_train - np.mean(X_train)) / np.std(X_train)
            y_train_2 = (y_train_2 - np.mean(y_train_2)) / np.std(y_train_2)
            X_test = (X_test - np.mean(X_test)) / np.std(X_test)
            y_test_2 = (y_test_2 - np.mean(y_test_2)) / np.std(y_test_2)'''

            ann.train(X_train,
                      y_train_2,
                      goal=learning_goal,
                      epochs=max_epochs,
                      show=show_error_freq)
            y_est_train = ann.sim(X_train)
            y_est_test = ann.sim(X_test)

            Error_train_nn[k] = np.square(y_est_train -
                                          y_train_2).sum() / y_train.shape[0]
            Error_test_nn[k] = np.square(y_est_test -
                                         y_test_2).sum() / y_test.shape[0]

            # figure(k)
            # subplot(1, 2, 1)
            # plot(range(1, len(loss_record)), loss_record[1:])
            # xlabel('Iteration')
            # ylabel('Squared error (crossvalidation)')

            # subplot(1, 3, 3)
            # bmplot(attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:])
            # clim(-1.5, 0)
            # xlabel('Iteration')

        print('Cross validation fold {0}/{1}'.format(k + 1, K))
        # print('Train indices: {0}'.format(train_index))
        # print('Test indices: {0}'.format(test_index))
        print('Features no: {0}\n'.format(selected_features.size))

        k += 1

    print('\n')
    print('Linear regression without feature selection:\n')
    print('- Training error: {0}'.format(Error_train.mean()))
    print('- Test error:     {0}'.format(Error_test.mean()))
    print('- R^2 train:     {0}'.format(
        (Error_train_nofeatures.sum() - Error_train.sum()) /
        Error_train_nofeatures.sum()))
    print('- R^2 test:     {0}'.format(
        (Error_test_nofeatures.sum() - Error_test.sum()) /
        Error_test_nofeatures.sum()))
    print('Linear regression with feature selection:\n')
    print('- Training error: {0}'.format(Error_train_fs.mean()))
    print('- Test error:     {0}'.format(Error_test_fs.mean()))
    print('- R^2 train:     {0}'.format(
        (Error_train_nofeatures.sum() - Error_train_fs.sum()) /
        Error_train_nofeatures.sum()))
    print('- R^2 test:     {0}'.format(
        (Error_test_nofeatures.sum() - Error_test_fs.sum()) /
        Error_test_nofeatures.sum()))
    print('Neural network:\n')
    print('- Training error: {0}'.format(Error_train_nn.mean()))
    print('- Test error:     {0}'.format(Error_test_nn.mean()))
    print('- R^2 train:     {0}'.format(
        (Error_train_nofeatures.sum() - Error_train_nn.sum()) /
        Error_train_nofeatures.sum()))
    print('- R^2 test:     {0}'.format(
        (Error_test_nofeatures.sum() - Error_test_nn.sum()) /
        Error_test_nofeatures.sum()))
    '''figure(k)
    subplot(1, 3, 2)
    bmplot(attributeNamesShorter, range(1, Features.shape[1] + 1), -Features)
    clim(-1.5, 0)
    xlabel('Crossvalidation fold')
    ylabel('Attribute')'''

    # Inspect selected feature coefficients effect on the entire dataset and
    # plot the fitted model residual error as function of each attribute to
    # inspect for systematic structure in the residual

    f = 2  # cross-validation fold to inspect
    ff = Features[:, f - 1].nonzero()[0]
    if len(ff) == 0:
        print(
            '\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
        )
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)
        y_est = m.predict(X[:, ff])
        residual = y - y_est

        figure(k + 1)
        title(
            'Residual error vs. Attributes for features selected in cross-validation fold {0}'
            .format(f))
        for i in range(0, len(ff)):
            subplot(2, int(np.ceil(len(ff) / 2.0)), i + 1)
            plot(X[:, ff[i]], residual, '.')
            xlabel(attributeNamesShorter[ff[i]])
            ylabel('residual error')

        show()
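
Example #10 leaves input normalization commented out, even though newff is configured for inputs in [-3, 3]. A sketch of per-column standardization inside the fold loop, using training-fold statistics only to avoid test-set leakage (X_train, X_test, y_train, y_test as in the loop above):

import numpy as np

# Standardize features and targets with statistics from the training fold only.
mu, sigma = X_train.mean(axis=0), X_train.std(axis=0)
sigma[sigma == 0] = 1.0                       # guard against constant columns
X_train_std = (X_train - mu) / sigma
X_test_std = (X_test - mu) / sigma
y_mu, y_sigma = y_train.mean(), y_train.std()
y_train_std = ((y_train - y_mu) / y_sigma).reshape(-1, 1)
y_test_std = ((y_test - y_mu) / y_sigma).reshape(-1, 1)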