# Compute squared error without using the input data at all (predict the mean)
Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

# Compute squared error with all features selected (no feature selection)
m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

# Compute squared error with feature subset selection
# textout = 'verbose'
textout = ''
selected_features, features_record, loss_record = feature_selector_lr(
    X_train, y_train, internal_cross_validation, display=textout)
Features[selected_features, k] = 1
if len(selected_features) == 0:
    print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
else:
    m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
    Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
    Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

figure(k)
def forwardSelection(X, y, N, K, attributeNames, classNames):
    # Add offset (intercept) attribute
    X2 = np.concatenate((np.ones((X.shape[0], 1)), X), 1)
    attributeNames2 = [u'Offset'] + attributeNames
    M2 = len(attributeNames) + 1
    # Square the second original attribute (column 2 after the offset) to add
    # a nonlinear term
    X2[:, 2] = np.power(X2[:, 2], 2)

    ## Crossvalidation
    # Create crossvalidation partition for evaluation
    CV = model_selection.KFold(n_splits=K, shuffle=True)

    # Initialize variables
    Features = np.zeros((M2, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_nofeatures = np.empty((K, 1))
    Error_test_nofeatures = np.empty((K, 1))

    k = 0
    for train_index, test_index in CV.split(X2):
        # Extract training and test set for current CV fold
        X_train = X2[train_index]
        y_train = y[train_index]
        X_test = X2[test_index]
        y_test = y[test_index]
        internal_cross_validation = 5

        # Compute squared error without using the input data at all
        Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

        # Compute squared error with all features selected (no feature selection)
        m = lm.LinearRegression().fit(X_train, y_train)
        Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

        # Compute squared error with feature subset selection
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation)
        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        m = lm.LinearRegression().fit(X_train[:, selected_features], y_train)
        Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
        Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

        figure()
        subplot(1, 2, 1)
        plot(range(1, len(loss_record)), loss_record[1:])
        xlabel('Iteration')
        ylabel('Squared error (crossvalidation)')
        subplot(1, 3, 3)
        bmplot(attributeNames2, range(1, features_record.shape[1]),
               -features_record[:, 1:])
        clim(-1.5, 0)
        xlabel('Iteration')

        print('Cross validation fold {0}/{1}'.format(k + 1, K))
        k += 1

    # Display results
    print('\n')
    print('Linear regression without feature selection:\n')
    print('- Training error: {0}'.format(Error_train.mean()))
    print('- Test error: {0}'.format(Error_test.mean()))
    print('- R^2 train: {0}'.format((Error_train_nofeatures.sum() - Error_train.sum()) / Error_train_nofeatures.sum()))
    print('- R^2 test: {0}'.format((Error_test_nofeatures.sum() - Error_test.sum()) / Error_test_nofeatures.sum()))
    print('\n')
    print('Linear regression with feature selection:\n')
    print('- Training error: {0}'.format(Error_train_fs.mean()))
    print('- Test error: {0}'.format(Error_test_fs.mean()))
    print('- R^2 train: {0}'.format((Error_train_nofeatures.sum() - Error_train_fs.sum()) / Error_train_nofeatures.sum()))
    print('- R^2 test: {0}'.format((Error_test_nofeatures.sum() - Error_test_fs.sum()) / Error_test_nofeatures.sum()))

    figure()
    subplot(1, 3, 2)
    bmplot(attributeNames2, range(1, Features.shape[1] + 1), -Features)
    clim(-1.5, 0)
    xlabel('Crossvalidation fold')
    ylabel('Attribute')

    # Inspect the selected feature coefficients' effect on the entire dataset
    # and plot the fitted model residual error as a function of each attribute
    # to inspect for systematic structure in the residual
    f = 2  # cross-validation fold to inspect (one-based)
    ff = Features[:, f - 1].nonzero()[0]
    m = lm.LinearRegression().fit(X2[:, ff], y)
    y_est = m.predict(X2[:, ff])
    residual = y - y_est

    figure()
    title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
    for i in range(0, len(ff)):
        subplot(2, int(ceil(len(ff) / 2.0)), i + 1)  # int() since subplot needs integers
        plot(X2[:, ff[i]], residual, '.')
        xlabel(attributeNames2[ff[i]])
        ylabel('residual error')
    show()
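# The scripts in this section repeat the note ".. alternatively you could use
# module sklearn.feature_selection". A minimal sketch of that alternative,
# assuming scikit-learn >= 0.24 (where SequentialFeatureSelector was added);
# it mirrors the forward search of feature_selector_lr, scoring candidate
# subsets by cross-validated mean squared error:
from sklearn.feature_selection import SequentialFeatureSelector

def forward_selection_sklearn(X_train, y_train, n_features, cv=5):
    sfs = SequentialFeatureSelector(lm.LinearRegression(),
                                    n_features_to_select=n_features,
                                    direction='forward',
                                    scoring='neg_mean_squared_error',
                                    cv=cv)
    sfs.fit(X_train, y_train)
    # Convert the boolean support mask to column indices, comparable to the
    # selected_features arrays used above
    return np.flatnonzero(sfs.get_support())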
def linear_reg(input_matrix, index, outer_cross_number, inner_cross_number):
    X, y = split_train_test(input_matrix, index)
    N, M = X.shape
    K = outer_cross_number

    attributeNames = [
        'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight (lbs)',
        'Acceleration (MPH)', 'Model year', 'Origin'
    ]
    # Drop the predicted attribute from a copy of the name list, so the names
    # match the M remaining columns
    temp = attributeNames[index]
    attributeNamesShorter = list(attributeNames)
    attributeNamesShorter.remove(temp)

    # ANN training parameters
    neurons = 1
    learning_goal = 25
    max_epochs = 64
    show_error_freq = 65

    CV = model_selection.KFold(n_splits=K, shuffle=True)

    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_mean = np.empty((K, 1))
    Error_test_mean = np.empty((K, 1))
    Error_train_nn = np.empty((K, 1))
    Error_test_nn = np.empty((K, 1))

    k = 0
    for train_index, test_index in CV.split(X):
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        internal_cross_validation = inner_cross_number

        # Baseline: squared error when always predicting the training mean
        Error_train_mean[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_mean[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

        # Linear regression on all features
        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

        # Linear regression with forward feature selection
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)
        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
            Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
            Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

        # Artificial neural network (neurolab expects column-vector targets)
        y_train_2 = np.asmatrix([[x] for x in y_train])
        y_test_2 = np.asmatrix([[x] for x in y_test])
        ann = nl.net.newff([[-3, 3]] * M, [neurons, 1],
                           [nl.trans.TanSig(), nl.trans.PureLin()])
        ann.train(X_train, y_train_2, goal=learning_goal,
                  epochs=max_epochs, show=show_error_freq)
        y_est_train = ann.sim(X_train)
        y_est_test = ann.sim(X_test)
        Error_train_nn[k] = np.square(y_est_train - y_train_2).sum() / y_train.shape[0]
        Error_test_nn[k] = np.square(y_est_test - y_test_2).sum() / y_test.shape[0]

        figure()
        subplot(2, 1, 1)
        plot(y_train_2, y_est_train, '.')
        subplot(2, 1, 2)
        plot(y_test_2, y_est_test, '.')
        xlabel('MPG (true, normalized)')
        ylabel('MPG (estimated, normalized)')

        print('Cross validation fold {0}/{1}'.format(k + 1, K))
        print('Features no: {0}\n'.format(selected_features.size))
        k += 1

    # Loss curve and selection history for the last fold
    figure(k)
    subplot(1, 2, 1)
    plot(range(1, len(loss_record)), loss_record[1:])
    xlabel('Iteration')
    ylabel('Squared error (crossvalidation)')
    subplot(1, 3, 3)
    bmplot(attributeNamesShorter, range(1, features_record.shape[1]),
           -features_record[:, 1:])
    clim(-1.5, 0)
    xlabel('Iteration')

    # Pairwise significance tests on the per-fold test errors
    print('Feature_select vs. ANN:')
    significant_differnece(Error_1=Error_test_fs, Error_2=Error_test_nn, K=K)
    print('Mean vs. ANN:')
    significant_differnece(Error_1=Error_test_mean, Error_2=Error_test_nn, K=K)
    print('Linear vs. ANN:')
    significant_differnece(Error_1=Error_test, Error_2=Error_test_nn, K=K)

    figure()
    plt.boxplot(np.bmat('Error_test_nn, Error_test_fs, Error_test, Error_train_mean'))
    title('Normalized input/output')
    xlabel('ANN vs. Feature_selected vs. clean vs. mean')
    ylabel('Mean squared error')
    show()
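# significant_differnece is defined elsewhere in this project. A minimal
# sketch of the comparison it presumably performs (an assumption, not the
# project's actual implementation): a paired t-test over the K per-fold test
# errors of the two models.
from scipy import stats

def paired_error_ttest(Error_1, Error_2):
    # H0: the two models have the same expected test error across folds
    t_stat, p_value = stats.ttest_rel(np.ravel(Error_1), np.ravel(Error_2))
    print('t = {0:.3f}, p = {1:.4f}'.format(t_stat, p_value))
    return p_value < 0.05  # True if significant at the 5% level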
X_test = X[test_index, :]
y_test = y[test_index]

print('--------------START LINEAR ON FOLD--------------')
LINEAR_INTERNAL_CROSS_VALIDATION = 10

# Compute squared error without using the input data at all
LINEAR_ERROR_TRAIN_NOFEATURES[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
LINEAR_ERROR_TEST_NOFEATURES[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

# Compute squared error with all features selected (no feature selection)
model = lm.LinearRegression().fit(X_train, y_train)
LINEAR_ERROR_TRAIN[k] = np.square(y_train - model.predict(X_train)).sum() / y_train.shape[0]
LINEAR_ERROR_TEST[k] = np.square(y_test - model.predict(X_test)).sum() / y_test.shape[0]

# Compute squared error with feature subset selection
selected_features, features_record, loss_record = feature_selector_lr(
    X_train, y_train, LINEAR_INTERNAL_CROSS_VALIDATION)
LINEAR_FEATURES[selected_features, k] = 1
model = lm.LinearRegression().fit(X_train[:, selected_features], y_train)
LINEAR_ERROR_TRAIN_FS[k] = np.square(y_train - model.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
LINEAR_ERROR_TEST_FS[k] = np.square(y_test - model.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

print('MODEL COEFFICIENTS: ')
print('Selected Features: ' + str(selected_features))
params = attributeNames[selected_features]
for ind in range(len(selected_features)):
    print(params[ind] + ": " + str(model.coef_[:, ind]))

figure(k)
subplot(1, 2, 1)
plot(range(1, len(loss_record)), loss_record[1:])
# Generate all pairwise products of the attributes
combinations = np.zeros((N, M**2))
labelcombinations = [None] * M**2
for i in range(M):
    for j in range(M):
        combinations[:, i + j * M] = np.ravel(np.multiply(X[:, i], X[:, j]))
        labelcombinations[i + j * M] = labels[i] + " | " + labels[j]

# Add all combinations of attributes
X = np.hstack((X, combinations))
labels = np.hstack((labels, labelcombinations))

# Baseline squared error of always predicting the mean
Error_nofeatures = np.square(y - y.mean()).sum() / y.shape[0]

selected_features, features_record, loss_record = feature_selector_lr(X, y, 10)
model = lm.LinearRegression(fit_intercept=True).fit(X[:, selected_features], y)
y_pred = model.predict(X[:, selected_features])

# Build a human-readable equation string from the fitted coefficients
equation = "y = {0:.2e}".format(model.intercept_[0])
for i in range(len(model.coef_[0])):
    if model.coef_[0][i] < 0:
        equation += " - "
    else:
        equation += " + "
    equation += "{0:.2e} * {1}".format(abs(model.coef_[0][i]),
                                       labels[selected_features[i]])
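# The double loop above generates every ordered pair, so each product
# X_i * X_j appears twice ("i | j" and "j | i"). If only unordered
# interactions and squares are wanted, sklearn's PolynomialFeatures builds the
# same expansion without duplicates. A sketch, assuming scikit-learn >= 1.0
# (for get_feature_names_out); X_orig and orig_labels stand for the matrix and
# label list *before* the hstack above:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=False)
X_expanded = poly.fit_transform(X_orig)  # x_i plus x_i * x_j for i <= j
expanded_labels = poly.get_feature_names_out(orig_labels)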
def lreg(x, y):
    X = x  # keep the toolbox naming convention
    N, M = X.shape

    ## Crossvalidation
    # Create crossvalidation partition for evaluation
    K = 5
    CV = model_selection.KFold(n_splits=K, shuffle=True)

    # Initialize variables
    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_nofeatures = np.empty((K, 1))
    Error_test_nofeatures = np.empty((K, 1))

    k = 0
    for train_index, test_index in CV.split(X):
        # Extract training and test set for current CV fold
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        internal_cross_validation = 10

        # Compute squared error without using the input data at all
        Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

        # Compute squared error with all features selected (no feature selection)
        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

        # Compute squared error with feature subset selection
        # textout = 'verbose'
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)
        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
            Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
            Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

        # (Per-fold loss plots, bmplot diagnostics and progress printing are
        # left commented out here; forwardSelection above shows the same code.)
        k += 1

    # (The summary printing of training/test error and R^2, and the
    # feature-selection overview plot, are likewise left commented out; they
    # are identical to the report block in forwardSelection.)
    # Inspect the selected feature coefficients' effect on the entire dataset
    # and plot the fitted model residual error as a function of each attribute
    # to inspect for systematic structure in the residual
    f = np.argmin(Error_test_fs)  # best cross-validation fold (zero-based index)
    ff = Features[:, f].nonzero()[0]  # argmin is already zero-based, so no f-1 here
    if len(ff) == 0:
        print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)
        y_est = m.predict(X[:, ff])
        residual = y - y_est
        # (Residual-vs-attribute plotting left commented out; see the same
        # block in forwardSelection.)

    def predict(data):
        return m.predict(data[:, ff])

    return (predict, ff)
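# Usage sketch for the (predict, ff) pair returned by lreg. X_full, y_full and
# X_new are placeholder names, not variables defined in this project; X_new
# must have the same M columns as the training matrix:
predict_fn, selected_cols = lreg(X_full, y_full)
y_hat = predict_fn(X_new)  # the closure indexes the selected columns itself
print('Selected columns: {0}'.format(selected_cols))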
##############################################################################
m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
LR_Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
LR_Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

##############################################################################
#                                                                            #
#                  LINEAR REGRESSION WITH FEATURE SELECTION                  #
#                                                                            #
##############################################################################
print('\nLINEAR REGRESSION MODEL')
K_internal = 10
textout = ''
selected_features, features_record, loss_record = feature_selector_lr(
    X_train, y_train, K_internal, display=textout)
LR_Features_fs[selected_features, k] = 1
m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
LR_Params_fs.append(m.coef_)
LR_Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
y_est = m.predict(X_test[:, selected_features])
LR_Error_test_fs[k] = np.square(y_test - y_est).sum() / y_test.shape[0]

figure()
plot(y_test, y_est)
title('Linear regression with forward feature selection')
xlabel('Real values')
y_train = y[train_index]
X_test = X[test_index]
y_test = y[test_index]
internal_cross_validation = 10

# Compute squared error without using the input data at all
Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

# Compute squared error with all features selected (no feature selection)
m = lm.LinearRegression().fit(X_train, y_train)
Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

# Compute squared error with feature subset selection
selected_features, features_record, loss_record = feature_selector_lr(
    X_train, y_train, internal_cross_validation)
Features[selected_features, k] = 1
# .. alternatively you could use module sklearn.feature_selection
m = lm.LinearRegression().fit(X_train[:, selected_features], y_train)
Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

figure(k)
subplot(1, 2, 1)
plot(range(1, len(loss_record)), loss_record[1:])
xlabel('Iteration')
ylabel('Squared error (crossvalidation)')
subplot(1, 3, 3)
bmplot(attributeNames, range(1, features_record.shape[1]),
       -features_record[:, 1:])
clim(-1.5, 0)
def linear_reg(input_matrix, index, outer_cross_number, inner_cross_number):
    X, y = split_train_test(input_matrix, index)
    N, M = X.shape
    K = outer_cross_number

    # ANN training parameters
    neurons = 50
    learning_goal = 10
    max_epochs = 64 * 5
    show_error_freq = 65

    # Drop the predicted attribute from a copy of the name list, so the
    # global list is not mutated as a side effect
    temp = attributeNames[index]
    attributeNamesShorter = list(attributeNames)
    attributeNamesShorter.remove(temp)

    CV = model_selection.KFold(n_splits=K, shuffle=True)

    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_nofeatures = np.empty((K, 1))
    Error_test_nofeatures = np.empty((K, 1))
    Error_train_nn = np.empty((K, 1))
    Error_test_nn = np.empty((K, 1))

    k = 0
    for train_index, test_index in CV.split(X):
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]
        internal_cross_validation = inner_cross_number

        # Baseline: squared error when always predicting the training mean
        Error_train_nofeatures[k] = np.square(y_train - y_train.mean()).sum() / y_train.shape[0]
        Error_test_nofeatures[k] = np.square(y_test - y_test.mean()).sum() / y_test.shape[0]

        # Linear regression on all features
        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = np.square(y_train - m.predict(X_train)).sum() / y_train.shape[0]
        Error_test[k] = np.square(y_test - m.predict(X_test)).sum() / y_test.shape[0]

        # Linear regression with forward feature selection
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)
        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(X_train[:, selected_features], y_train)
            Error_train_fs[k] = np.square(y_train - m.predict(X_train[:, selected_features])).sum() / y_train.shape[0]
            Error_test_fs[k] = np.square(y_test - m.predict(X_test[:, selected_features])).sum() / y_test.shape[0]

        # Artificial neural network (neurolab expects column-vector targets)
        y_train_2 = np.asmatrix([[x] for x in y_train])
        y_test_2 = np.asmatrix([[x] for x in y_test])
        ann = nl.net.newff([[-3, 3]] * M, [neurons, 1],
                           [nl.trans.TanSig(), nl.trans.PureLin()])
        # A standardization pass was tried here to help training converge and
        # left disabled:
        # X_train = (X_train - np.mean(X_train)) / np.std(X_train)
        # y_train_2 = (y_train_2 - np.mean(y_train_2)) / np.std(y_train_2)
        # X_test = (X_test - np.mean(X_test)) / np.std(X_test)
        # y_test_2 = (y_test_2 - np.mean(y_test_2)) / np.std(y_test_2)
        ann.train(X_train, y_train_2, goal=learning_goal,
                  epochs=max_epochs, show=show_error_freq)
        y_est_train = ann.sim(X_train)
        y_est_test = ann.sim(X_test)
        Error_train_nn[k] = np.square(y_est_train - y_train_2).sum() / y_train.shape[0]
        Error_test_nn[k] = np.square(y_est_test - y_test_2).sum() / y_test.shape[0]

        # (Per-fold loss/feature plots and index printing left commented out;
        # see forwardSelection above.)
        print('Cross validation fold {0}/{1}'.format(k + 1, K))
        print('Features no: {0}\n'.format(selected_features.size))
        k += 1

    # Display results
    print('\n')
    print('Linear regression without feature selection:\n')
    print('- Training error: {0}'.format(Error_train.mean()))
    print('- Test error: {0}'.format(Error_test.mean()))
    print('- R^2 train: {0}'.format(
        (Error_train_nofeatures.sum() - Error_train.sum()) /
        Error_train_nofeatures.sum()))
    print('- R^2 test: {0}'.format(
        (Error_test_nofeatures.sum() - Error_test.sum()) /
        Error_test_nofeatures.sum()))

    print('Linear regression with feature selection:\n')
    print('- Training error: {0}'.format(Error_train_fs.mean()))
    print('- Test error: {0}'.format(Error_test_fs.mean()))
    print('- R^2 train: {0}'.format(
        (Error_train_nofeatures.sum() - Error_train_fs.sum()) /
        Error_train_nofeatures.sum()))
    print('- R^2 test: {0}'.format(
        (Error_test_nofeatures.sum() - Error_test_fs.sum()) /
        Error_test_nofeatures.sum()))

    print('Neural network:\n')
    print('- Training error: {0}'.format(Error_train_nn.mean()))
    print('- Test error: {0}'.format(Error_test_nn.mean()))
    print('- R^2 train: {0}'.format(
        (Error_train_nofeatures.sum() - Error_train_nn.sum()) /
        Error_train_nofeatures.sum()))
    print('- R^2 test: {0}'.format(
        (Error_test_nofeatures.sum() - Error_test_nn.sum()) /
        Error_test_nofeatures.sum()))

    # (Feature-selection overview plot left disabled:)
    # figure(k)
    # subplot(1, 3, 2)
    # bmplot(attributeNamesShorter, range(1, Features.shape[1] + 1), -Features)
    # clim(-1.5, 0)
    # xlabel('Crossvalidation fold')
    # ylabel('Attribute')

    # Inspect the selected feature coefficients' effect on the entire dataset
    # and plot the fitted model residual error as a function of each attribute
    # to inspect for systematic structure in the residual
    f = 2  # cross-validation fold to inspect (one-based)
    ff = Features[:, f - 1].nonzero()[0]
    if len(ff) == 0:
        print('\nNo features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
    else:
        m = lm.LinearRegression(fit_intercept=True).fit(X[:, ff], y)
        y_est = m.predict(X[:, ff])
        residual = y - y_est

        figure(k + 1)
        title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
        for i in range(0, len(ff)):
            subplot(2, int(np.ceil(len(ff) / 2.0)), i + 1)  # int() since subplot needs integers
            plot(X[:, ff[i]], residual, '.')
            xlabel(attributeNamesShorter[ff[i]])
            ylabel('residual error')
        show()
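# All report blocks above compute R^2 with the same convention:
#   R^2 = (E_baseline - E_model) / E_baseline
# where E_baseline is the squared error of always predicting the mean. A small
# helper capturing that formula (a convenience sketch, not part of the
# original scripts):
def r_squared(err_model, err_baseline):
    # Fraction of the mean-predictor's squared error explained by the model
    return (np.sum(err_baseline) - np.sum(err_model)) / np.sum(err_baseline)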