def findBadWords():
    '''
    Plot and return per-feature errors for the test samples that were guessed wrong.

    Loads the weights stored in "w_lse_dim299.mtx", projects the module-level
    fmri_train / fmri_test data with pcaData(299, ...), and asks
    test_suite.word_guesser which test samples were guessed correctly. For every
    sample flagged as not guessed, the per-semantic-feature error
    sqrt((target - prediction)^2) is computed against both the correct word's
    features and the incorrect word's features, and the two words are printed
    together with their mean error.

    Three figures are shown: the errors of the first stored row (legend labels
    are fixed to 'bear' / 'airplane'), a histogram of the semantic features where
    the correct word's error exceeds the incorrect word's, and the summed
    difference per semantic feature.

    Row k of each returned array belongs to the k-th incorrectly guessed sample
    (not to test sample k); rows past the number of incorrect guesses stay zero.

    :return: [rmsecorrect,rmseincorrect]
    '''
    weights = io.mmread("w_lse_dim299.mtx")
    _, test_pca = pcaData(299, fmri_train, fmri_test)
    truth, decoy = test_suite.prepareData(wordid_test, wordfeature_std)
    guessed, _ = test_suite.word_guesser(test_pca, weights, 0, truth, decoy)
    # Elementwise comparison on purpose: selects the samples flagged False.
    missed_samples = np.where(guessed == False)[0].tolist()  # noqa: E712

    n_samples, n_features = truth.shape[:2]
    rmsecorrect = np.zeros((n_samples, n_features))
    rmseincorrect = np.zeros((n_samples, n_features))

    for row, sample in enumerate(missed_samples):
        predicted = test_pca[sample].dot(weights.T)
        word_ids = wordid_test[sample]

        # Word ids are shifted by one before indexing into the dictionary.
        correct_word = dictionary[int(word_ids[0] - 1)]
        rmsecorrect[row] = np.sqrt(np.square(truth[sample] - predicted))

        incorrect_word = dictionary[int(word_ids[1] - 1)]
        rmseincorrect[row] = np.sqrt(np.square(decoy[sample] - predicted))

        print(
            correct_word,
            sum(rmsecorrect[row]) / n_features,
            incorrect_word,
            sum(rmseincorrect[row]) / n_features,
        )

    # Figure 1: per-feature error of the first stored incorrect guess.
    (correct_line,) = plt.plot(rmsecorrect[0], label='bear')
    (incorrect_line,) = plt.plot(rmseincorrect[0], label='airplane')
    plt.legend(handles=[correct_line, incorrect_line])
    plt.xlabel('semantic feature')
    plt.ylabel('RMSE')
    plt.show()

    # Figure 2: a positive difference means the correct word fitted worse than
    # the incorrect one on that feature, which is the bad case.
    difference_rmse = rmsecorrect - rmseincorrect
    bad_positions = np.where(difference_rmse > 0)
    plt.hist(bad_positions[1], bins=range(n_features))
    plt.xlabel('semantic feature')
    plt.ylabel('Counts of Words')
    plt.show()

    # Figure 3: difference accumulated over all rows, per semantic feature.
    plt.plot(difference_rmse.sum(axis=0))
    plt.show()

    return [rmsecorrect, rmseincorrect]
def findBadWords():
    '''
    Plot and return per-feature errors for the test samples that were guessed wrong.

    Loads the weights stored in "w_lse_dim299.mtx", projects the module-level
    fmri_train / fmri_test data with pcaData(299, ...), and asks
    test_suite.word_guesser which test samples were guessed correctly. For every
    sample flagged as not guessed, the per-semantic-feature error
    sqrt((target - prediction)^2) is computed against both the correct word's
    features and the incorrect word's features, and the two words are printed
    together with their mean error.

    Three figures are shown: the errors of the first stored row (legend labels
    are fixed to 'bear' / 'airplane'), a histogram of the semantic features where
    the correct word's error exceeds the incorrect word's, and the summed
    difference per semantic feature.

    Row k of each returned array belongs to the k-th incorrectly guessed sample
    (not to test sample k); rows past the number of incorrect guesses stay zero.

    :return: [rmsecorrect,rmseincorrect]
    '''
    weights = io.mmread("w_lse_dim299.mtx")
    _, test_pca = pcaData(299, fmri_train, fmri_test)
    truth, decoy = test_suite.prepareData(wordid_test, wordfeature_std)
    guessed, _ = test_suite.word_guesser(test_pca, weights, 0, truth, decoy)
    # Elementwise comparison on purpose: selects the samples flagged False.
    missed_samples = np.where(guessed == False)[0].tolist()  # noqa: E712

    n_samples, n_features = truth.shape[:2]
    rmsecorrect = np.zeros((n_samples, n_features))
    rmseincorrect = np.zeros((n_samples, n_features))

    for row, sample in enumerate(missed_samples):
        predicted = test_pca[sample].dot(weights.T)
        word_ids = wordid_test[sample]

        # Word ids are shifted by one before indexing into the dictionary.
        correct_word = dictionary[int(word_ids[0] - 1)]
        rmsecorrect[row] = np.sqrt(np.square(truth[sample] - predicted))

        incorrect_word = dictionary[int(word_ids[1] - 1)]
        rmseincorrect[row] = np.sqrt(np.square(decoy[sample] - predicted))

        print(
            correct_word,
            sum(rmsecorrect[row]) / n_features,
            incorrect_word,
            sum(rmseincorrect[row]) / n_features,
        )

    # Figure 1: per-feature error of the first stored incorrect guess.
    (correct_line,) = plt.plot(rmsecorrect[0], label='bear')
    (incorrect_line,) = plt.plot(rmseincorrect[0], label='airplane')
    plt.legend(handles=[correct_line, incorrect_line])
    plt.xlabel('semantic feature')
    plt.ylabel('RMSE')
    plt.show()

    # Figure 2: a positive difference means the correct word fitted worse than
    # the incorrect one on that feature, which is the bad case.
    difference_rmse = rmsecorrect - rmseincorrect
    bad_positions = np.where(difference_rmse > 0)
    plt.hist(bad_positions[1], bins=range(n_features))
    plt.xlabel('semantic feature')
    plt.ylabel('Counts of Words')
    plt.show()

    # Figure 3: difference accumulated over all rows, per semantic feature.
    plt.plot(difference_rmse.sum(axis=0))
    plt.show()

    return [rmsecorrect, rmseincorrect]
def _append_pairwise_products(components):
    '''
    Return the given columns followed by the product of every distinct column pair.

    Pair columns are ordered (0,1), (0,2), ..., (0,d-1), (1,2), ... so the same
    layout is produced for any matrix with the same number of columns.

    :param components: 2-D array with one row per sample and d columns
    :return: float array with d + d*(d-1)/2 columns
    '''
    components = np.asarray(components)
    n_rows, d = components.shape
    # Integer arithmetic: array shapes must be ints (true division gives a float).
    expanded = np.empty((n_rows, d + d * (d - 1) // 2))
    expanded[:, :d] = components
    start = d
    for i in range(d - 1):
        stop = start + (d - 1 - i)
        # Column i times every later column, written as one contiguous slice.
        expanded[:, start:stop] = components[:, i:i + 1] * components[:, i + 1:]
        start = stop
    return expanded


def nonlinearFeatures(dimensions):
    '''
    Fit least squares on principal components plus their pairwise products.

    Applies PCA for the given amount of dimensions to the module-level
    fmri_train / fmri_test data. Each principal component is then multiplied
    with every other principal component to create non linear features, and one
    least squares problem is solved per semantic feature (column of the
    module-level ytrain). The weights are written to "w_lse_nonlinear.mtx" and
    evaluated through test_suite.main; the accuracy it reports is printed.

    :param dimensions: amount of principal components to keep
    :return: [accuracy,rmsetrain,rmsetest,rmsetestwrong]
    '''
    # The test projection has to come from the test set: it is scored against
    # wordid_test below.
    [xtrainPCA, xtestPCA] = pcaData(dimensions, fmri_train, fmri_test)
    xNonlinear = _append_pairwise_products(xtrainPCA)
    xtestNonlinear = _append_pairwise_products(xtestPCA)

    num_features = ytrain.shape[1]
    ntotdata, d = xNonlinear.shape
    bestw = np.zeros([num_features, d])
    # Left at zero and handed to test_suite.main unchanged.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xNonlinear, y)
        bestw[i, :] = w.reshape(d)

    wfile = "w_lse_nonlinear.mtx"
    io.mmwrite(wfile, bestw)

    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std,
                                 xtestNonlinear)
    print(accuracy)

    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestNonlinear, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xNonlinear, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestNonlinear, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
def findw_PCA_LSE(dimensions, train_data, test_data, wordid_test, ytrain,
                  wfile="w_lse_dim299.mtx"):
    '''
    Fit one least squares model per semantic feature on PCA-reduced data.

    Projects the training and test data with pcaData, solves least squares for
    each column of ytrain, and writes the resulting weight matrix (one row per
    semantic feature) to wfile. The weights are then evaluated with
    test_suite.main, which reports the accuracy on a guess out of 2 words; that
    accuracy is printed. The RMSE per semantic feature is also computed on the
    training data, on the test data, and on the test data against the wrong
    word's features.

    The semantic feature table wordfeature_std is read from module scope.

    :param dimensions: number of pca components
    :param train_data: training data
    :param test_data: test data
    :param wordid_test: ids of the words in the test data set
    :param ytrain: semantic features values for training data
    :param wfile: name of file where w's will be written
    :return: [accuracy,rmsetrain,rmsetest,rmsetestwrong]
    '''
    [xtrainPCA, xtestPCA] = pcaData(dimensions, train_data, test_data)
    num_features = ytrain.shape[1]
    ntotdata, d = xtrainPCA.shape
    bestw = np.zeros([num_features, d])
    # Left at zero and handed to test_suite.main unchanged.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        print('looking at feature ', i)
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xtrainPCA, y)
        # Use the actual projected width so the row always matches bestw, even
        # if pcaData returns a different number of columns than requested.
        bestw[i, :] = w.reshape(d)

    io.mmwrite(wfile, bestw)

    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std,
                                 xtestPCA)
    print(accuracy)

    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestPCA, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xtrainPCA, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestPCA, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
def _append_pairwise_products(components):
    '''
    Return the given columns followed by the product of every distinct column pair.

    Pair columns are ordered (0,1), (0,2), ..., (0,d-1), (1,2), ... so the same
    layout is produced for any matrix with the same number of columns.

    :param components: 2-D array with one row per sample and d columns
    :return: float array with d + d*(d-1)/2 columns
    '''
    components = np.asarray(components)
    n_rows, d = components.shape
    # Integer arithmetic: array shapes must be ints (true division gives a float).
    expanded = np.empty((n_rows, d + d * (d - 1) // 2))
    expanded[:, :d] = components
    start = d
    for i in range(d - 1):
        stop = start + (d - 1 - i)
        # Column i times every later column, written as one contiguous slice.
        expanded[:, start:stop] = components[:, i:i + 1] * components[:, i + 1:]
        start = stop
    return expanded


def nonlinearFeatures(dimensions):
    '''
    Fit least squares on principal components plus their pairwise products.

    Applies PCA for the given amount of dimensions to the module-level
    fmri_train / fmri_test data. Each principal component is then multiplied
    with every other principal component to create non linear features, and one
    least squares problem is solved per semantic feature (column of the
    module-level ytrain). The weights are written to "w_lse_nonlinear.mtx" and
    evaluated through test_suite.main; the accuracy it reports is printed.

    :param dimensions: amount of principal components to keep
    :return: [accuracy,rmsetrain,rmsetest,rmsetestwrong]
    '''
    # The test projection has to come from the test set: it is scored against
    # wordid_test below.
    [xtrainPCA, xtestPCA] = pcaData(dimensions, fmri_train, fmri_test)
    xNonlinear = _append_pairwise_products(xtrainPCA)
    xtestNonlinear = _append_pairwise_products(xtestPCA)

    num_features = ytrain.shape[1]
    ntotdata, d = xNonlinear.shape
    bestw = np.zeros([num_features, d])
    # Left at zero and handed to test_suite.main unchanged.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xNonlinear, y)
        bestw[i, :] = w.reshape(d)

    wfile = "w_lse_nonlinear.mtx"
    io.mmwrite(wfile, bestw)

    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std,
                                 xtestNonlinear)
    print(accuracy)

    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestNonlinear, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xNonlinear, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestNonlinear, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]
def findw_PCA_LSE(dimensions, train_data, test_data, wordid_test, ytrain,
                  wfile="w_lse_dim299.mtx"):
    '''
    Fit one least squares model per semantic feature on PCA-reduced data.

    Projects the training and test data with pcaData, solves least squares for
    each column of ytrain, and writes the resulting weight matrix (one row per
    semantic feature) to wfile. The weights are then evaluated with
    test_suite.main, which reports the accuracy on a guess out of 2 words; that
    accuracy is printed. The RMSE per semantic feature is also computed on the
    training data, on the test data, and on the test data against the wrong
    word's features.

    The semantic feature table wordfeature_std is read from module scope.

    :param dimensions: number of pca components
    :param train_data: training data
    :param test_data: test data
    :param wordid_test: ids of the words in the test data set
    :param ytrain: semantic features values for training data
    :param wfile: name of file where w's will be written
    :return: [accuracy,rmsetrain,rmsetest,rmsetestwrong]
    '''
    [xtrainPCA, xtestPCA] = pcaData(dimensions, train_data, test_data)
    num_features = ytrain.shape[1]
    ntotdata, d = xtrainPCA.shape
    bestw = np.zeros([num_features, d])
    # Left at zero and handed to test_suite.main unchanged.
    bestw0 = np.zeros(num_features)
    for i in range(num_features):
        print('looking at feature ', i)
        y = ytrain[:, i].reshape(ntotdata, 1)
        w = least_squares(xtrainPCA, y)
        # Use the actual projected width so the row always matches bestw, even
        # if pcaData returns a different number of columns than requested.
        bestw[i, :] = w.reshape(d)

    io.mmwrite(wfile, bestw)

    [accuracy] = test_suite.main(bestw, bestw0, wordid_test, wordfeature_std,
                                 xtestPCA)
    print(accuracy)

    [ytest, ywrong] = test_suite.prepareData(wordid_test, wordfeature_std)
    rmsetest = rmse_per_semantic_feature(xtestPCA, ytest, bestw)
    rmsetrain = rmse_per_semantic_feature(xtrainPCA, ytrain, bestw)
    rmsetestwrong = rmse_per_semantic_feature(xtestPCA, ywrong, bestw)
    return [accuracy, rmsetrain, rmsetest, rmsetestwrong]