def __init__(self, tx=None, y=None, z=None, num_components=50):
    # Fit PCA on the training samples and keep the basis (WW) and mean (mu).
    [D, self.WW, self.mu] = pca(tx, num_components)
    self.y = y
    self.tx = tx
    self.z = z
    self.projections = []  # must exist before appending below
    for xi in tx:
        self.projections.append(
            project(self.WW, xi.reshape(1, -1), self.mu))
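# A minimal sketch of the pca/project helpers assumed above (hypothetical
# signatures; the real implementations may differ). pca() returns the top
# eigenvalues D, the eigenvector matrix W, and the sample mean mu; project()
# maps a sample into that subspace.
import numpy as np

def pca(X, num_components):
    X = np.asarray(X, dtype=float)
    mu = X.mean(axis=0)
    Xc = X - mu
    # eigendecomposition of the covariance matrix, largest eigenvalues first
    eigvals, eigvecs = np.linalg.eigh(np.cov(Xc, rowvar=False))
    order = np.argsort(eigvals)[::-1][:num_components]
    return [eigvals[order], eigvecs[:, order], mu]

def project(W, x, mu):
    # coordinates of x in the PCA subspace spanned by the columns of W
    return np.dot(x - mu, W)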
def mnistProcess():
    """
    Analyze mnist-5; the target image is a handwritten digit 5.
    Save the results for different pcnum values to data/mnistprocess.
    Here each row is treated as one sample, i.e. 28 samples of 28 dimensions.
    Other possible ways to split the samples:
    1. Add the row index: row numbers are highly regular, so PCA will usually not
       pick them up as a principal component; including them makes little difference.
    2. 28*28 three-dimensional samples of (row, column, gray value): the row/column
       indices have little effect on PCA, and only one dimension remains, which
       cannot be reduced by PCA.
    """
    im = readData()
    for i in range(50):
        # At i=17 vs. i=18 the reconstructions are already very hard to tell apart by eye.
        lowD, newD, topdvects = pca(im, i)
        PSNR = analyzePSNR(im, newD)
        print(i, PSNR)
        plt.imshow(np.matrix.tolist(newD), cmap='gray')
        plt.savefig("data/mnistprocess/" + str(i) + ".png")
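# A minimal sketch of the analyzePSNR helper used above (assumed, not the original
# implementation): PSNR = 10 * log10(MAX^2 / MSE) between the original image and its
# PCA reconstruction, with MAX taken as 255 here.
import numpy as np

def analyzePSNR(original, reconstructed):
    original = np.asarray(original, dtype=float)
    reconstructed = np.asarray(reconstructed, dtype=float)
    mse = np.mean((original - reconstructed) ** 2)
    if mse == 0:
        return float('inf')  # identical images
    return 10.0 * np.log10(255.0 ** 2 / mse)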
def look_rawdata(origin_num):
    fin = open('data/microarray.original.txt', 'r')
    lines = []
    fin.readline()
    fout = open('output/pca/pca%.2f.txt' % PCA_PERCENTAGE, 'w')
    dataset_np = np.zeros([ALL_DATA, PCA[str(PCA_PERCENTAGE)]])
    # read raw data
    for i in range(22283):
        line = fin.readline()
        line = line.split('\t')
        line = line[1:]
        lines.append(list(map(float, line)))
    print("Data has been read successfully.")
    # do PCA
    data = np.array(lines).T
    print("Now reducing dimension...")
    lowDData = pca(data, PCA_PERCENTAGE)
    #print(lowDData[0][0])
    print("Finished, the new dimension is :" + str(len(lowDData[0])))
    # save pca results (.txt file and .npy)
    print("Start writing new data...")
    j = 0
    for k in origin_num:
        for num in k[1]:
            for i in range(len(lowDData[num])):
                dataset_np[j][i] = lowDData[num][i].real  # the number will be xxx+0j without .real
                fout.write(str(lowDData[num][i].real) + '\t')
            j += 1
            fout.write('\n')
    np.save('output/pca/pca%.2f.npy' % PCA_PERCENTAGE, dataset_np)
    print("Finished the whole work.")
    fin.close()
    fout.close()
    return dataset_np
def Train():
    global MLPObj, PrepareObj, RBFObj, PCAObj, sc_x
    MLPObj = MLP()
    PrepareObj = Preparation()
    RBFObj = RBF()
    PCAObj = pca()
    x_train, y_train, x_test, y_test, Original_x_train, Original_x_test = PrepareObj.GetDataset(
        "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Training",
        "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Testing")
    if NNPCAVar.get():
        PCAObj.LoadWeights()
        x_train = PCAObj.transform(Original_x_train)
        x_test = PCAObj.transform(Original_x_test)
        from sklearn.preprocessing import StandardScaler
        sc_x = StandardScaler()
        x_train = sc_x.fit_transform(x_train)
        x_test = sc_x.transform(x_test)
    if LoadTrainVar.get():
        if AlgoVar.get():
            MLPObj.TrainTheModel(Hidden_Entry.get(), epochs_Entry.get(),
                                 LearningRate_Entry.get(), Neurons_Entry.get(),
                                 Activation_Entry.get(), MSE_Entry.get(),
                                 var.get(), x_train, y_train, x_test, y_test)
        else:
            RBFObj.TrainTheModel_rbf(Neurons_Entry.get(), LearningRate_Entry.get(),
                                     MSE_Entry.get(), epochs_Entry.get(), 5,
                                     x_train, y_train, x_test, y_test)
    else:
        if AlgoVar.get():
            MLPObj.LoadWeights(Hidden_Entry.get(), epochs_Entry.get(),
                               LearningRate_Entry.get(), Neurons_Entry.get(),
                               Activation_Entry.get(), MSE_Entry.get(),
                               var.get(), x_train, y_train, x_test, y_test)
        else:
            RBFObj.LoadWeights(Neurons_Entry.get(), LearningRate_Entry.get(),
                               MSE_Entry.get(), epochs_Entry.get(), 5,
                               x_train, y_train, x_test, y_test)
print("h:%s erro:%s"%(i,NaiveBayes.classificarParzen(wpdcCopia, mp1, mp2, i, ["N","R"]))) print(confusion_matrix(WpdcOri.classes, wpdcCopia.classes,["N","R"])) print("naive com janela de parzen retangular - wbdc") for i in h: wbdcCopia = Base(copy.deepcopy(wbdcOri.classes),copy.deepcopy(wbdcOri.atributos)) print("h:%s erro:%s"%(i,NaiveBayes.classificarParzen(wbdcCopia, m1, m2, i, ["M","B"],"r"))) print(confusion_matrix(wbdcOri.classes, wbdcCopia.classes,["M","B"])) print("naive com janela de parzen retangular - wpdc") for i in h: wpdcCopia = Base(copy.deepcopy(WpdcOri.classes),copy.deepcopy(WpdcOri.atributos)) print("h:%s erro:%s"%(i,NaiveBayes.classificarParzen(wpdcCopia, mp1, mp2, i, ["N","R"]))) print(confusion_matrix(WpdcOri.classes, wpdcCopia.classes,["N","R"])) #Q6 print("---------------------Sexta Questao------------------------\n") wbdcPCA = pca(wbdcOri, len(wbdcOri.atributos[0])-1) m1,m2 = separarElementosPorClasse(wbdcPCA, ["M","B"]) v1 = np.var(m1) v2 = np.var(m2) m1 = np.mean(m1, axis=0) m2 = np.mean(m2, axis=0) print("erro wbdc naiveBayes univariado:%s"%NaiveBayes.classificar(m1, m2, v1, v2, wbdcPCA,["M","B"],"u")) print(confusion_matrix(wbdcOriSort.classes, wbdcPCA.classes)) WpdcPCA = pca(WpdcOri,len(WpdcOri.atributos[0])-1) m1,m2 = separarElementosPorClasse(WpdcPCA, ["N","R"]) v1 = np.var(m1) v2 = np.var(m2) m1 = np.mean(m1, axis=0) m2 = np.mean(m2, axis=0) print("erro wpdc naiveBayes univariado:%s"%NaiveBayes.classificar(m1, m2, v1, v2, WpdcPCA,["N","R"],"u"))
from PCA import pca

TrainingDatasetPath = "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Training"
TestingDatasetPath = "C:\\Users\\Lenovo-PC\\Desktop\\neural-network-course\\Project\\Data set\\Testing"
classes = []
tmp_x_train = np.full((25, 2500), 0)
y_train = np.full((25, 5), 0)
idx = 0
for filename in glob.glob(TrainingDatasetPath + '/*.jpg'):
    img = cv2.imread(filename, 0)
    GrayImage = cv2.resize(img, (50, 50))
    tmp_x_train[idx, :] = np.array(GrayImage).reshape((1, 2500))
    image = filename[len(TrainingDatasetPath) + 2:]
    if image.split("- ")[1][:-4] not in classes:
        classes.append(image.split("- ")[1][:-4])
    y_train[idx, classes.index(image.split("- ")[1][:-4])] = 1
    idx = idx + 1

tmp_x_test = np.full((26, 2500), 0)
y_test = np.full((26, 5), 0)
idx = 0
for filename in glob.glob(TestingDatasetPath + '/*.jpg'):
    img = cv2.imread(filename, 0)
    GrayImage = cv2.resize(img, (50, 50))
    tmp_x_test[idx, :] = np.array(GrayImage).reshape((1, 2500))
    image = filename[len(TestingDatasetPath) + 2:]  # strip the testing-folder prefix, not the training one
    y_test[idx, classes.index(image.split("- ")[1][:-4])] = 1
    idx = idx + 1

o = pca()
w = o.fit(23, tmp_x_train, 50, 0.00000000000001)
            cf_rating[indx] = res
            # For every item rated by the user "user_id", we now also have the
            # collaborative filtering rating of the neighborhood.
        # end for user_id
        return cf_rating

if __name__ == "__main__":
    os.chdir("..")
    ratings_train = pd.load('proc_data/ratings_train.pda')
    accuracyList = list()  # a list of (embedding, deviation_from_rating, misclassification_error) tuples
    for k in range(2, 1677, 20):
        # Get user embedding
        userEmbedding = pca(ratings_train[['userid', 'itemid', 'rating']], k, 0).real
        print "Computed %d-dimensional user embedding." % (k)
        # Calculate collaborative filtering ratings
        cf = CFilter(ratings_train, userEmbedding, size=20)
        print "Built CFilter object"
        predictedRatings = cf.get_user_cf_rating(ratings_train)
        # Estimate training error in terms of two metrics: average deviation from true
        # rating and average misclassification error in terms of classifying a movie
        # as "good" or "bad".
        predictedLabels = np.array([1 if rat > 3 else 0 for rat in predictedRatings])
        # make the "isgood" column of the data map to {0,1} instead of {-1, 1}
        # so that average squared loss works correctly
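# A minimal sketch (assumed, not the original helpers) of the two training-error
# metrics described above: mean absolute deviation from the true rating, and the
# misclassification rate after thresholding ratings at 3 into "good"/"bad".
import numpy as np

def rating_deviation(true_ratings, predicted_ratings):
    return np.mean(np.abs(np.asarray(true_ratings) - np.asarray(predicted_ratings)))

def misclassification_error(true_ratings, predicted_ratings, threshold=3):
    true_labels = np.asarray(true_ratings) > threshold
    predicted_labels = np.asarray(predicted_ratings) > threshold
    return np.mean(true_labels != predicted_labels)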
def main(lib_docs, con_docs, lib_test_docs, con_test_docs, num_evecs, num_words,
         cutoff=False, cutoff_rate=1.0):
    ### MODEL CONSTRUCTION ###
    # tokenize the docs
    lib_tokenized_docs = []
    con_tokenized_docs = []
    for sentence in lib_docs:
        tokens = toknize_article(sentence)
        lib_tokenized_docs.append(tokens)
    for sentence in con_docs:
        tokens = toknize_article(sentence)
        con_tokenized_docs.append(tokens)

    # concat tokenized_docs lists
    all_tokenized_docs = lib_tokenized_docs + con_tokenized_docs

    # use all_tokenized_docs so that the matrix's # of features matches
    dict = corpora.Dictionary(all_tokenized_docs)

    # create matrix for each category
    lib_docs_matrix = vectorize_articles(lib_tokenized_docs, dict)
    con_docs_matrix = vectorize_articles(con_tokenized_docs, dict)

    # stack them and use the result to do PCA on the whole training data
    all_docs_matrix = np.vstack((lib_docs_matrix, con_docs_matrix))
    (proj_matrix, e_vecs, e_vals) = pca(all_docs_matrix)
    print lib_docs_matrix.shape
    print con_docs_matrix.shape

    # project each category matrix onto the transpose of the eigenvector matrix
    if cutoff:
        cutoff_index = pca_cutoff(e_vals, cutoff_rate)
        lib_proj_matrix = np.dot(lib_docs_matrix, e_vecs.T)[:, :cutoff_index]
        con_proj_matrix = np.dot(con_docs_matrix, e_vecs.T)[:, :cutoff_index]
    else:
        lib_proj_matrix = np.dot(lib_docs_matrix, e_vecs.T)
        con_proj_matrix = np.dot(con_docs_matrix, e_vecs.T)
    print lib_proj_matrix.shape
    print con_proj_matrix.shape

    # take the mean of all rows to get a vector representing the average sentence
    # for each category
    lib_mean_vector = lib_proj_matrix.mean(axis=0)
    con_mean_vector = con_proj_matrix.mean(axis=0)
    print lib_mean_vector
    print lib_mean_vector.shape
    print con_mean_vector
    print con_mean_vector.shape

    X = np.vstack((lib_mean_vector, con_mean_vector))
    kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

    ### TESTING ###
    # tokenize the test docs
    lib_tokenized_test_docs = []
    con_tokenized_test_docs = []
    for sentence in lib_test_docs:  # iterate over the *test* docs, not the training docs
        tokens = toknize_article(sentence)
        lib_tokenized_test_docs.append(tokens)
    for sentence in con_test_docs:
        tokens = toknize_article(sentence)
        con_tokenized_test_docs.append(tokens)

    # create matrix for each category
    lib_test_docs_matrix = vectorize_articles(lib_tokenized_test_docs, dict)
    con_test_docs_matrix = vectorize_articles(con_tokenized_test_docs, dict)

    # project each matrix to eigenspace
    if cutoff:
        lib_proj_test_matrix = np.dot(lib_test_docs_matrix, e_vecs.T)[:, :cutoff_index]
        con_proj_test_matrix = np.dot(con_test_docs_matrix, e_vecs.T)[:, :cutoff_index]
    else:
        lib_proj_test_matrix = np.dot(lib_test_docs_matrix, e_vecs.T)
        con_proj_test_matrix = np.dot(con_test_docs_matrix, e_vecs.T)

    lib_result = kmeans.predict(lib_proj_test_matrix)
    con_result = kmeans.predict(con_proj_test_matrix)
    lib_hit = float(np.count_nonzero(lib_result == 0))
    con_hit = float(np.count_nonzero(con_result == 1))
    lib_accuracy = lib_hit / len(lib_result)
    con_accuracy = con_hit / len(con_result)
    print "Liberal Accuracy: ", lib_accuracy
    print "Conservative Accuracy: ", con_accuracy

    # find the top n words for the top m eigenvectors
    for row in range(num_evecs):
        print find_topn_words(e_vecs[row, :], dict, num_words)

    # plot along eigenvectors
    lib_x = lib_proj_test_matrix[:, 0]
    lib_y = lib_proj_test_matrix[:, 1]
    con_x = con_proj_test_matrix[:, 0]
    con_y = con_proj_test_matrix[:, 1]
    lib_z = lib_proj_test_matrix[:, 2]
    con_z = con_proj_test_matrix[:, 2]
    lib_xyz = [lib_x, lib_y, lib_z]
    con_xyz = [con_x, con_y, con_z]  # use con_z here, not lib_z
    two_dim_eigenplot(lib_xyz[:2], con_xyz[:2])
    three_dim_eigenplot(lib_xyz, con_xyz)
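# A minimal sketch of the pca_cutoff helper referenced above (assumed signature),
# assuming the eigenvalues are ordered from largest to smallest: return the number
# of components needed to retain at least cutoff_rate of the total variance.
import numpy as np

def pca_cutoff(e_vals, cutoff_rate):
    e_vals = np.asarray(e_vals, dtype=float)
    explained = np.cumsum(e_vals) / np.sum(e_vals)
    # index of the first component where the cumulative ratio reaches the cutoff
    return int(np.searchsorted(explained, cutoff_rate) + 1)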
    h, w = ggray.shape
    thresh = np.array([[255 if pixel > 0 else 0 for pixel in row] for row in ggray])
    b = np.array(get_boundry_img_matrix(thresh, bval=1), dtype=np.float32)
    perameter = np.sum(b) / (h * w)
    area = np.sum(np.sum([[1.0 for j in range(w) if ggray[i, j]] for i in range(h)]))
    mean_area = area / (h * w)
    r, b, g = np.sum([gcolor[i, j]
                      for j in range(gcolor.shape[1])
                      for i in range(gcolor.shape[0])], axis=0) / (area * 256)
    _, _, eigen_value = pca(ggray)
    eccentricity = eigen_value[0] / eigen_value[1]
    l = [mean_area, perameter, r, b, g, eigen_value[0], eigen_value[1], eccentricity]
    ftrain.append(np.array(l))

for gi in range(len(xctest)):
    gcolor = xctest[gi]
    ggray = xgtest[gi]
    h, w = ggray.shape
    thresh = np.array([[255 if pixel > 0 else 0 for pixel in row] for row in ggray])
    b = np.array(get_boundry_img_matrix(thresh, bval=1), dtype=np.float32)
#                                        [-1  3 -3  1]   [P0]
#  B(t) = T(t)*CP = [t^3 t^2 t^1 t^0] *  | 3 -6  3  0| * |P1|
#                                        |-3  3  0  0|   |P2|
#                                        [ 1  0  0  0]   [P3]
#
#  t_j = sum(i = 1 : j){d(p_i, p_(i-1))} / sum(i = 1 : N){d(p_i, p_(i-1))}
#  d(p_i, p_j) = sqrt((u_i - u_j)^2 + (v_i - v_j)^2)
# ===============================================================================================
#
N_select = 10

# calculate the principal direction; eigenvalues are sorted in ascending order
# set z_value to 0
points = np.array(points)
points[:, 2] = 0.
eigen_values, eigen_vectors = pca(points)
principal = eigen_vectors[:, -1]
normal = eigen_vectors[:, -2]
print("principal direction is:\n", principal)
print('normal direction is\n', normal)
# show_pca(points, normal)

sampled_points = subsample_along_principal(points, principal, N_select)
show_samples(points, sampled_points)

figure_title = 'curve fitting'
# plot_scatters(points, figure_title)
# plot_1st_order_curve(points, figure_title)
# plot_2nd_order_curve(points, figure_title)
plot_3rd_order_curve(points, figure_title)
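# A minimal sketch (not the original code) of the chord-length parameterization t_j
# described in the comment above: each t_j is the cumulative point-to-point distance
# up to p_j, normalized by the total polyline length, so t_0 = 0 and t_N = 1.
import numpy as np

def chord_length_parameterization(points):
    points = np.asarray(points, dtype=float)
    seg = np.linalg.norm(np.diff(points[:, :2], axis=0), axis=1)  # d(p_i, p_(i-1)) over (u, v)
    t = np.concatenate(([0.0], np.cumsum(seg)))
    return t / t[-1]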
    #         n = count
    #         break
    # print("eigValIndice=\n", eigValIndice)
    n_eigValIndice = eigValIndice[-1:-(n + 1):-1]  # slice [-1, -(n+1)): indices of the largest n eigenvalues
    # print("n_eigValIndice=\n", n_eigValIndice)
    n_eigVect = eigVects[:, n_eigValIndice]  # eigenvector matrix reduced to n columns
    # print("n_eigVect=\n", n_eigVect)
    lowDataMat = newData * n_eigVect
    reconMat = (lowDataMat * n_eigVect.T) + meanVal
    return lowDataMat, reconMat

if '__main__' == __name__:
    data = scipy.io.loadmat("BU3D_feature.mat")
    dataMat = data.get("data")
    y = np.array(dataMat)[:, -1]
    dataMat = np.delete(dataMat, -1, axis=1)
    dataMat, reconMat = pca(dataMat=dataMat, n=2)
    # plotData(dataMat=dataMat, reconMat=reconMat)
    plt.scatter(dataMat[:, 0].tolist(), dataMat[:, 1].tolist(), marker='o', c=y)
    plt.title('PCA')
    plt.show()
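# A minimal sketch (assumed) of the full pca() routine whose tail is shown above:
# center the data, take the covariance, eigendecompose it, keep the top-n
# eigenvectors, then project and reconstruct exactly as in the tail.
import numpy as np

def pca(dataMat, n):
    dataMat = np.mat(dataMat, dtype=float)
    meanVal = np.mean(dataMat, axis=0)
    newData = dataMat - meanVal                      # mean-centered data
    covMat = np.cov(newData, rowvar=False)           # covariance of the columns
    eigVals, eigVects = np.linalg.eig(np.mat(covMat))
    eigValIndice = np.argsort(eigVals)               # ascending order of eigenvalues
    n_eigValIndice = eigValIndice[-1:-(n + 1):-1]    # indices of the largest n eigenvalues
    n_eigVect = eigVects[:, n_eigValIndice]
    lowDataMat = newData * n_eigVect                 # project onto the top-n basis
    reconMat = (lowDataMat * n_eigVect.T) + meanVal  # reconstruct in the original space
    return lowDataMat, reconMat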
def call(options):
    if options == 1:
        # option 1: reduce dimensionality with PCA
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        pca(rowdata, 1)
        openfile(2, 1)
        open_label_file(1)
        appendFile(1)
        convert_to_float(2)
        write_processed_file(1)
        openfile(3, 1)
        strip_empty_chars(2)
        convert_to_float(3)
        pca(testdata, 2)
        openfile(4, 1)
        open_label_file(2)
        appendFile(2)
        convert_to_float(4)
        write_to_new_file(1)
    elif options == 2:
        # option 2: correlation-based feature filtering
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        corFil(rowdata, 1, attr)
        openfile(2, 2)
        open_label_file(1)
        appendFile(1)
        convert_to_float(2)
        write_processed_file(2)
        openfile(3, 2)
        strip_empty_chars(2)
        convert_to_float(3)
        corFil(testdata, 2, attr)
        openfile(4, 2)
        open_label_file(2)
        appendFile(2)
        convert_to_float(4)
        write_to_new_file(2)
    elif options == 3:
        # option 3: variance-based feature filtering
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        varFil(rowdata, 1, attr)
        openfile(2, 3)
        open_label_file(1)
        appendFile(1)
        convert_to_float(2)
        write_processed_file(3)
        openfile(3, 3)
        strip_empty_chars(2)
        convert_to_float(3)
        varFil(testdata, 2, attr)
        openfile(4, 3)
        open_label_file(2)
        appendFile(2)
        convert_to_float(4)
        write_to_new_file(3)
    else:
        # any other option: keep all attributes (no reduction)
        openfile(1, 1)
        strip_empty_chars(1)
        convert_to_float(1)
        open_label_file(1)
        open_label_file(2)
        appendfile_woreduction(1)
        convert_to_float(1)
        write_processed_file(4)
        openfile(3, 1)
        strip_empty_chars(2)
        convert_to_float(3)
        appendfile_test(1)
        convert_to_float(3)
        write_to_new_file(4)
import os
import pandas as pd
import pickle as pkl

if __name__ == "__main__":
    os.chdir("..")
    ratings_train = pd.load('proc_data/ratings_train.pda')
    accuracyList = list()  # a list of (embedding, deviation_from_rating, misclassification_error) tuples
    for k in range(2, 941, 20):
        # Get item embedding
        itemEmbedding = pca(ratings_train[['userid', 'itemid', 'rating']], k, 1).real
        # print itemEmbedding
        print "Computed %d-dimensional item embedding." % (k)
        # Calculate collaborative filtering ratings
        cf = CFilter_item(ratings_train, itemEmbedding, size=20)
        print "Built CFilter object"
        predictedRatings = cf.get_item_cf_rating(ratings_train)
        # Estimate training error in terms of two metrics: average deviation from true
        # rating and average misclassification error in terms of classifying a movie
        # as "good" or "bad".
        predictedLabels = np.array([1 if rat > 3 else 0 for rat in predictedRatings])
    (predictions,) = tuple(
        LSTM.predict(input_fn=tf.contrib.timeseries.predict_continuation_input_fn(
            evaluation, steps=5)))
    observed_times = evaluation["times"][0]
    observed = evaluation["observed"][0, :, :]
    evaluated_times = evaluation["times"][0]
    evaluated = evaluation["mean"][0]
    predicted_times = predictions['times']
    predicted = predictions["mean"]
    return observed, evaluated, predicted

_ = pca()
data = _.start()
group_id = data.Group_ID.unique()
jsn_dict = {}
jsn_Dict = {}
jsn_Dict['Data'] = []
for i in group_id:
    tf.reset_default_graph()
    obsList = []
    evaList = []
    preList = []
    data_group = data[data.Group_ID == i]['Grade']
    if data_group.shape[0] > 50:
        o, e, p = model_h(data_group)
        label_true = label_test[index_test]
        # count correct predictions for the accuracy
        if label_vote == label_true:
            count = count + 1
        index_test = index_test + 1
    # print(count)
    accuracy = float(count) / len(dataset_test)
    return accuracy

if __name__ == '__main__':
    # load the datasets
    dataset_train, label_train = load_dataset('../two datasets/sonar-train.txt', ',')
    dataset_test, label_test = load_dataset('../two datasets/sonar-test.txt', ',')
    # compute the projection matrix from the training set
    K = [10, 20, 30]
    for k in K:
        W = pca(dataset_train, k)
        # compute the dimension-reduced samples
        dataset_train_K = transform(dataset_train, W)
        dataset_test_K = transform(dataset_test, W)
        # compute the accuracy with the 1-NN method
        accuracy = oneNN(dataset_train_K, dataset_test_K, label_train, label_test)
        print "k= %d, accuracy= %f" % (k, accuracy)
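# A minimal sketch of the transform helper assumed above: project the samples onto
# the PCA projection matrix W. Whether the data are centered with the training mean
# here or inside pca() depends on the original implementation.
import numpy as np

def transform(dataset, W):
    X = np.asarray(dataset, dtype=float)
    return np.dot(X, W)  # (n_samples, d) x (d, k) -> (n_samples, k)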
# ... was duplicated, which amounts to computing the covariance twice, so the features
# were effectively extracted from the covariance of data rather than from data itself!
# That is why the U and V matrices kept coming out wrong: both ended up 1024x1024, when
# in fact U should be m x m and V should be 1024x1024. Their roles: U reduces the rows
# (keep its first k columns as the main features) and V reduces the columns (keep its
# first k rows as the main features). In this example, using U for the reduction, as was
# done before, is wrong; V should be used instead.
#
import numpy as np
import matplotlib.pyplot as plt
import scipy.io as scio
from PCA import pca, normalize, display
import cap

# ************* Exercise 1: 2D to 1D *************
# Set k = 1 in the PCA algorithm and add visualization
data = scio.loadmat('ex7data1.mat')['X']
data, _ = normalize(data)  # preprocessing: feature scaling
u, v, _ = pca(data)  # run the PCA algorithm
k = 1
v_reduce = v[:k, :]
z = np.matmul(data, v_reduce.T)
data_re = np.matmul(z, v_reduce)
# visualization
plt.plot(data[:, 0], data[:, 1], '.b')  # original samples
plt.plot([v_reduce[0][0], 0], [v_reduce[0][1], 0], '--r')  # eigenvector
plt.plot(data_re[:, 0], data_re[:, 1], 'or')  # reconstruction of the compressed input
plt.show()

# **************** Exercise 2: image compression ****************
# The dataset has 1000 images in total; only the first 100 are used for training
data = scio.loadmat('ex7faces.mat')['X'][:100, :]  # load the data
data, data_mean = normalize(data)  # normalization
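# A minimal sketch (assumed, not necessarily the PCA module imported above) of an
# SVD-based pca() consistent with the comment: running SVD on the centered data itself
# (not on its covariance) gives U of shape m x m and V of shape d x d, and the first k
# rows of V are the projection directions used for column reduction, as in v[:k, :].
import numpy as np

def pca(data):
    # data is assumed to be already mean-normalized (m samples x d features)
    u, s, v = np.linalg.svd(data)  # full SVD: u is m x m, v is d x d
    return u, v, s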
        x, y = get_info(info, n)
        X = get_root(fa, x)
        Y = get_root(fa, y)
        if X != Y:
            fa[X] = Y
            cluster_num = cluster_num - 1
    for i in range(n):
        get_root(fa, i)
    return fa

if __name__ == '__main__':
    clusters = 700
    data, label_family, label_genus, label_species, label_record = data_reader.read_frog_data()
    data = pca(data, 10)
    data = data / data.max(axis=0)
    res = [0 for _ in range(data.shape[0])]
    start_time = time.time()
    cluster_result = agnes(data, clusters)
    cluster_set = set(cluster_result)
    ii = 0
    for index in cluster_set:
        cluster_indexs = [i for i, x in enumerate(cluster_result) if x == index]
        for cluster_index in cluster_indexs:
            res[cluster_index] = ii
        ii = ii + 1
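# A minimal sketch (assumed) of the get_root helper used above: find the root of a
# node in the union-find forest, with path compression so repeated lookups stay cheap.
def get_root(fa, i):
    root = i
    while fa[root] != root:
        root = fa[root]
    while fa[i] != root:
        # path compression: point every node on the path directly at the root
        fa[i], i = root, fa[i]
    return root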