Error_train_fs[k] = np.square(y_train - m.predict( X_train[:, selected_features])).sum() / y_train.shape[0] Error_test_fs[k] = np.square(y_test - m.predict( X_test[:, selected_features])).sum() / y_test.shape[0] figure(k) suptitle("Cross-validation fold #{}".format(k + 1), fontsize=12, fontweight='bold') subplot(1, 2, 1) plot(range(1, len(loss_record)), loss_record[1:]) xlabel('Iteration') ylabel('Squared error (crossvalidation)') subplot(1, 3, 3) bmplot(attributeNames, range(1, features_record.shape[1]), -features_record[:, 1:]) clim(-1.5, 0) xlabel('Iteration') savefig( "project-2-forward-selection-squared-error-{}-cv-fold.png".format( k + 1), bbox_inches='tight') print('Cross validation fold {0}/{1}'.format(k + 1, K)) print('Train indices: {0}'.format(train_index)) print('Test indices: {0}'.format(test_index)) print('Features no: {0}'.format(selected_features.size)) print("Weights: {}\n".format(m.coef_)) k += 1
def forwardSelection(X, y, N, K, attributeNames, classNames):
    """Compare plain linear regression with forward feature selection via K-fold CV.

    An offset column of ones is prepended to ``X`` and column 2 of the
    augmented matrix (i.e. column 1 of ``X``) is squared.  For every outer
    fold three squared-error figures are computed: the mean predictor, a
    linear model on all columns, and a linear model on the columns returned
    by ``feature_selector_lr``.  Summary statistics are printed and several
    matplotlib figures are produced; the function ends by calling ``show()``.

    Parameters
    ----------
    X : 2-D numpy array
        Input data, one row per observation.
    y : numpy array
        Target values, indexable by the fold index arrays.
    N : int
        Number of observations handed to ``cross_validation.KFold``.
    K : int
        Number of outer cross-validation folds.
    attributeNames : list of str
        Names of the columns of ``X`` (concatenated with ``[u'Offset']``).
    classNames : sequence
        Unused.  Kept only so existing callers keep working; the original
        code looped over it but never used the per-class mask it computed.

    Returns
    -------
    None
    """

    def mse(y_true, y_pred):
        # Sum of squared errors divided by the number of observations.
        return np.square(y_true - y_pred).sum() / y_true.shape[0]

    # Add offset attribute
    X2 = np.concatenate((np.ones((X.shape[0], 1)), X), 1)
    attributeNames2 = [u'Offset'] + attributeNames
    M2 = len(attributeNames) + 1
    # Replace column 2 of the augmented matrix by its square.
    X2[:, 2] = np.power(X2[:, 2], 2)

    ## Crossvalidation
    # Create crossvalidation partition for evaluation
    CV = cross_validation.KFold(N, K, shuffle=True)

    # Initialize variables
    Features = np.zeros((M2, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_nofeatures = np.empty((K, 1))
    Error_test_nofeatures = np.empty((K, 1))

    # Number of inner folds passed to feature_selector_lr.
    internal_cross_validation = 5

    for k, (train_index, test_index) in enumerate(CV):
        # extract training and test set for current CV fold
        X_train = X2[train_index]
        y_train = y[train_index]
        X_test = X2[test_index]
        y_test = y[test_index]

        # Compute squared error without using the input data at all
        Error_train_nofeatures[k] = mse(y_train, y_train.mean())
        Error_test_nofeatures[k] = mse(y_test, y_test.mean())

        # Compute squared error with all features selected (no feature selection)
        m = lm.LinearRegression().fit(X_train, y_train)
        Error_train[k] = mse(y_train, m.predict(X_train))
        Error_test[k] = mse(y_test, m.predict(X_test))

        # Compute squared error with feature subset selection
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation)
        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            # Fitting on a zero-column matrix raises, so fall back to the
            # intercept-only model: predict the training mean everywhere.
            print('No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).')
            Error_train_fs[k] = mse(y_train, y_train.mean())
            Error_test_fs[k] = mse(y_test, y_train.mean())
        else:
            m = lm.LinearRegression().fit(X_train[:, selected_features], y_train)
            Error_train_fs[k] = mse(y_train, m.predict(X_train[:, selected_features]))
            Error_test_fs[k] = mse(y_test, m.predict(X_test[:, selected_features]))

            # Selection trace for this fold: loss per iteration and the
            # feature-inclusion record (first column of each is skipped).
            figure()
            subplot(1, 2, 1)
            plot(range(1, len(loss_record)), loss_record[1:])
            xlabel('Iteration')
            ylabel('Squared error (crossvalidation)')

            subplot(1, 3, 3)
            bmplot(attributeNames2, range(1, features_record.shape[1]), -features_record[:, 1:])
            clim(-1.5, 0)
            xlabel('Iteration')

        print('Cross validation fold {0}/{1}'.format(k + 1, K))

    # Display results
    baseline_train = Error_train_nofeatures.sum()
    baseline_test = Error_test_nofeatures.sum()

    print('\n')
    print('Linear regression without feature selection:\n')
    print('- Training error: {0}'.format(Error_train.mean()))
    print('- Test error: {0}'.format(Error_test.mean()))
    print('- R^2 train: {0}'.format((baseline_train - Error_train.sum()) / baseline_train))
    print('- R^2 test: {0}'.format((baseline_test - Error_test.sum()) / baseline_test))
    print('\n')
    print('Linear regression with feature selection:\n')
    print('- Training error: {0}'.format(Error_train_fs.mean()))
    print('- Test error: {0}'.format(Error_test_fs.mean()))
    print('- R^2 train: {0}'.format((baseline_train - Error_train_fs.sum()) / baseline_train))
    print('- R^2 test: {0}'.format((baseline_test - Error_test_fs.sum()) / baseline_test))

    # Overview of which attributes were selected in which fold.
    figure()
    subplot(1, 3, 2)
    bmplot(attributeNames2, range(1, Features.shape[1] + 1), -Features)
    clim(-1.5, 0)
    xlabel('Crossvalidation fold')
    ylabel('Attribute')

    # Inspect selected feature coefficients effect on the entire dataset and
    # plot the fitted model residual error as function of each attribute to
    # inspect for systematic structure in the residual
    f = 2  # cross-validation fold to inspect
    ff = Features[:, f - 1].nonzero()[0]
    if len(ff) == 0:
        # Nothing to fit or plot when the inspected fold selected no feature.
        print('No features were selected in cross-validation fold {0}; skipping residual plot.'.format(f))
    else:
        m = lm.LinearRegression().fit(X2[:, ff], y)
        y_est = m.predict(X2[:, ff])
        residual = y - y_est

        figure()
        title('Residual error vs. Attributes for features selected in cross-validation fold {0}'.format(f))
        # Integer ceil(len(ff) / 2): subplot() needs integer grid dimensions.
        n_cols = (len(ff) + 1) // 2
        for i, col in enumerate(ff):
            subplot(2, n_cols, i + 1)
            plot(X2[:, col], residual, '.')
            xlabel(attributeNames2[col])
            ylabel('residual error')

    show()
def linear_reg(input_matrix, index, outer_cross_number, inner_cross_number):
    """Compare mean predictor, linear regression, feature selection and an ANN.

    ``split_train_test(input_matrix, index)`` supplies ``X`` and ``y``
    (presumably column ``index`` becomes the target -- confirm against that
    helper).  For each of ``outer_cross_number`` folds the function records
    the squared error of: the mean predictor, a linear model on all columns,
    a linear model on the columns chosen by ``feature_selector_lr`` and a
    neurolab feed-forward network.  Test errors are then compared pairwise
    with ``significant_differnece`` and shown in a boxplot.

    Parameters
    ----------
    input_matrix :
        Data passed unchanged to ``split_train_test``.
    index : int
        Position of the target attribute; that name is dropped from the
        attribute-name list used for plotting.
    outer_cross_number : int
        Number of outer cross-validation folds (``K``).
    inner_cross_number : int
        Number of inner folds handed to ``feature_selector_lr``.

    Returns
    -------
    None

    Notes
    -----
    NOTE(review): the original nesting of this function was not recoverable.
    The ANN training is placed at loop level (run for every fold) and the
    feature-selection trace plot after the loop; please confirm.
    """

    def mse(y_true, y_pred):
        # Sum of squared errors divided by the number of observations.
        return np.square(y_true - y_pred).sum() / y_true.shape[0]

    X, y = split_train_test(input_matrix, index)
    N, M = X.shape
    K = outer_cross_number

    # Attribute names without the target attribute.
    attributeNames = [
        'MPG', 'Cylinders', 'Displacment', 'Horsepower', 'Weight (lbs)',
        'Acceleration (MPH)', 'Model year', 'Origin'
    ]
    del attributeNames[index]

    # ANN hyper-parameters.
    neurons = 1
    learning_goal = 25
    max_epochs = 64
    show_error_freq = 65

    CV = cross_validation.KFold(N, K, shuffle=True)

    Features = np.zeros((M, K))
    Error_train = np.empty((K, 1))
    Error_test = np.empty((K, 1))
    Error_train_fs = np.empty((K, 1))
    Error_test_fs = np.empty((K, 1))
    Error_train_mean = np.empty((K, 1))
    Error_test_mean = np.empty((K, 1))
    Error_train_nn = np.empty((K, 1))
    Error_test_nn = np.empty((K, 1))

    internal_cross_validation = inner_cross_number

    for k, (train_index, test_index) in enumerate(CV):
        X_train = X[train_index, :]
        y_train = y[train_index]
        X_test = X[test_index, :]
        y_test = y[test_index]

        # Baseline: predict the mean of the respective set.
        Error_train_mean[k] = mse(y_train, y_train.mean())
        Error_test_mean[k] = mse(y_test, y_test.mean())

        # Linear regression on all attributes.
        m = lm.LinearRegression(fit_intercept=True).fit(X_train, y_train)
        Error_train[k] = mse(y_train, m.predict(X_train))
        Error_test[k] = mse(y_test, m.predict(X_test))

        # Linear regression on the forward-selected attributes.
        textout = ''
        selected_features, features_record, loss_record = feature_selector_lr(
            X_train, y_train, internal_cross_validation, display=textout)
        Features[selected_features, k] = 1
        # .. alternatively you could use module sklearn.feature_selection
        if len(selected_features) == 0:
            print(
                'No features were selected, i.e. the data (X) in the fold cannot describe the outcomes (y).'
            )
            # Without this the np.empty() slots would keep arbitrary memory
            # contents and corrupt the comparison below.  An intercept-only
            # model predicts the training mean everywhere.
            Error_train_fs[k] = mse(y_train, y_train.mean())
            Error_test_fs[k] = mse(y_test, y_train.mean())
        else:
            m = lm.LinearRegression(fit_intercept=True).fit(
                X_train[:, selected_features], y_train)
            Error_train_fs[k] = mse(
                y_train, m.predict(X_train[:, selected_features]))
            Error_test_fs[k] = mse(
                y_test, m.predict(X_test[:, selected_features]))

        # Artificial neural network; targets are reshaped to column matrices.
        y_train_2 = np.asmatrix([[x] for x in y_train])
        y_test_2 = np.asmatrix([[x] for x in y_test])
        ann = nl.net.newff(
            [[-3, 3]] * M, [neurons, 1],
            [nl.trans.TanSig(), nl.trans.PureLin()])
        ann.train(X_train, y_train_2, goal=learning_goal, epochs=max_epochs,
                  show=show_error_freq)
        y_est_train = ann.sim(X_train)
        y_est_test = ann.sim(X_test)
        Error_train_nn[k] = np.square(
            y_est_train - y_train_2).sum() / y_train.shape[0]
        Error_test_nn[k] = np.square(
            y_est_test - y_test_2).sum() / y_test.shape[0]

        # True vs. estimated target: training set on top, test set below.
        figure()
        subplot(2, 1, 1)
        plot(y_train_2, y_est_train, '.')
        subplot(2, 1, 2)
        plot(y_test_2, y_est_test, '.')
        xlabel('MPG (true, normalized)')
        ylabel('MPG (estimated, normalized)')

        print('Cross validation fold {0}/{1}'.format(k + 1, K))
        print('Features no: {0}\n'.format(selected_features.size))

    # Feature-selection trace of the last fold, drawn on figure number K
    # (the value the original fold counter had at this point).
    figure(K)
    subplot(1, 2, 1)
    plot(range(1, len(loss_record)), loss_record[1:])
    xlabel('Iteration')
    ylabel('Squared error (crossvalidation)')
    subplot(1, 3, 3)
    bmplot(attributeNames, range(1, features_record.shape[1]),
           -features_record[:, 1:])
    clim(-1.5, 0)
    xlabel('Iteration')

    print('Feature_select vs. ANN:')
    significant_differnece(Error_1=Error_test_fs, Error_2=Error_test_nn, K=K)
    print('Mean vs. ANN:')
    significant_differnece(Error_1=Error_test_mean, Error_2=Error_test_nn, K=K)
    print('Linear vs. ANN:')
    significant_differnece(Error_1=Error_test, Error_2=Error_test_nn, K=K)

    # One box per model, all on test error so the columns are comparable
    # (the mean baseline previously used its *training* error here).
    figure()
    plt.boxplot(
        np.hstack([Error_test_nn, Error_test_fs, Error_test, Error_test_mean]))
    title('Normalized input/output')
    xlabel('ANN vs. Feature_selected vs. clean vs. mean')
    ylabel('Mean squared error')
    show()
if addCombinations: plt.xticks(range(len(selected_features) + 1), ["", "Ca", "Si", "Al", "Ba", "K | Si"]) plt.xlabel('iteration (attribute added)', fontsize=12) else: plt.xticks(range(len(selected_features) + 1)) plt.xlabel('iteration') plt.ylabel('R^2 (crossvalidation)', fontsize=12) plt.ylim(0, 1) plt.grid('on') if not addCombinations: plt.subplot(1, 3, 3) #Add the constant (no feature) evaluation to data bmplot(labels, range(features_record.shape[1]), -features_record) plt.clim(-1.5, 0) plt.xlabel('Iteration') filename = "reg" if addCombinations: filename += "_allCombi" filename += "_Trans" if transformMean: filename += "_Mean" if transformStd: filename += "_Std"
else: m = lm.LinearRegression(fit_intercept=True).fit( X_train[:, selected_features], y_train) Error_train_fs[k] = np.square(y_train - m.predict( X_train[:, selected_features])).sum() / y_train.shape[0] Error_test_fs[k] = np.square(y_test - m.predict( X_test[:, selected_features])).sum() / y_test.shape[0] figure(k) subplot(1, 2, 1) plot(range(1, len(loss_record)), loss_record[1:]) xlabel('Iteration') ylabel('Squared error (crossvalidation)') subplot(1, 3, 3) bmplot(label, range(1, features_record.shape[1]), -features_record[:, 1:]) clim(-1.5, 0) xlabel('Iteration') print('Cross validation fold {0}/{1}'.format(k + 1, K)) print('Train indices: {0}'.format(train_index)) print('Test indices: {0}'.format(test_index)) print('Features no: {0}\n'.format(selected_features.size)) k += 1 # Display results print('\n') print('Linear regression without feature selection:\n') print('- Training error: {0}'.format(Error_train.mean())) print('- Test error: {0}'.format(Error_test.mean()))
print('- R^2 train: {0}'.format((LINEAR_ERROR_TRAIN_NOFEATURES.sum()-LINEAR_ERROR_TRAIN.sum())/LINEAR_ERROR_TRAIN_NOFEATURES.sum())) print('- R^2 test: {0}'.format((LINEAR_ERROR_TEST_NOFEATURES.sum()-LINEAR_ERROR_TEST.sum())/LINEAR_ERROR_TEST_NOFEATURES.sum())) # print('- Error rate train: {0}%'.format(100*mean(LINEAR_ERROR_TRAIN))) # print('- Error rate test: {0}%'.format(100*mean(LINEAR_ERROR_TEST))) print('Linear regression with feature selection:\n') print('- Training error: {0}'.format(LINEAR_ERROR_TRAIN_FS.mean())) print('- Test error: {0}'.format(LINEAR_ERROR_TEST_FS.mean())) print('- R^2 train: {0}'.format((LINEAR_ERROR_TRAIN_NOFEATURES.sum()-LINEAR_ERROR_TRAIN_FS.sum())/LINEAR_ERROR_TRAIN_NOFEATURES.sum())) print('- R^2 test: {0}'.format((LINEAR_ERROR_TEST_NOFEATURES.sum()-LINEAR_ERROR_TEST_FS.sum())/LINEAR_ERROR_TEST_NOFEATURES.sum())) # print('- Error rate train: {0}%'.format(100*mean(LINEAR_ERROR_TRAIN_FS))) # print('- Error rate test: {0}%'.format(100*mean(LINEAR_ERROR_TEST_FS))) figure(k) subplot(1,3,2) bmplot(attributeNames, range(1,LINEAR_FEATURES.shape[1]+1), -LINEAR_FEATURES) clim(-1.5,0) xlabel('Crossvalidation fold') ylabel('Attribute') f=2 # cross-validation fold to inspect ff=LINEAR_FEATURES[:,f-1].nonzero()[0] m = lm.LinearRegression().fit(X[:,ff], y) # print "ff: " + str(ff) # params = attributeNames[ff] # coefficients = m.coef_ # # for ind in range(len(ff)): # print params[ind] + ": " + str(coefficients[ind])
# Compute squared error with feature subset selection selected_features, features_record, loss_record = feature_selector_lr(X_train, y_train, internal_cross_validation) Features[selected_features,k]=1 # .. alternatively you could use module sklearn.feature_selection m = lm.LinearRegression().fit(X_train[:,selected_features], y_train) Error_train_fs[k] = np.square(y_train-m.predict(X_train[:,selected_features])).sum()/y_train.shape[0] Error_test_fs[k] = np.square(y_test-m.predict(X_test[:,selected_features])).sum()/y_test.shape[0] figure(k) subplot(1,2,1) plot(range(1,len(loss_record)), loss_record[1:]) xlabel('Iteration') ylabel('Squared error (crossvalidation)') subplot(1,3,3) bmplot(attributeNames, range(1,features_record.shape[1]), -features_record[:,1:]) clim(-1.5,0) xlabel('Iteration') print('Cross validation fold {0}/{1}'.format(k+1,K)) print('Train indices: {0}'.format(train_index)) print('Test indices: {0}'.format(test_index)) print('Features no: {0}\n'.format(selected_features.size)) k+=1 # Display results print('\n') print('Linear regression without feature selection:\n') print('- Training error: {0}'.format(Error_train.mean()))