def main():
    # get dataset
    data = pd.read_pickle("cluster.pkl")
    data_np = data.values
    # get mean
    mean = PCA.calc_mean(data_np)
    U = PCA.getU("PCA_eigen_cluster.pkl")
    # get error for data space
    error = []
    featureSpace = []
    prevError = sys.maxint
    reconstructError = 0.0
    k = 0
    # find smallest feature space to reduce data set
    for k in range(10):
        print "k: " + str(k)
        prevError = reconstructError
        newSpace, eigen_vectors = PCA.reduce(data_np, k, U.values, mean)
        reconstructError = PCA.reconstruction_error(newSpace, data_np, eigen_vectors, mean, k)
        print "reconstr error: " + str(reconstructError)
        error.append(reconstructError)
        featureSpace.append(k)
    print "Smallest feature space size: " + str(k)
    plt.plot(featureSpace, error, marker=".")
    plt.ylabel("Reconstruction Error")
    plt.xlabel("Size of Reduced Feature Space")
    plt.title("Size of Reduced Feature Space vs Reconstruction Error")
    plt.savefig("Error for PCA Cluster")
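# The sweep above calls PCA.reconstruction_error, which is not included in this snippet.
# The sketch below is an assumption about what such a helper might compute; the name,
# signature, and the convention of components stored as columns are guesses, not the
# project's actual API.

import numpy as np

def reconstruction_error_sketch(reduced, original, components, mean, k):
    """Mean squared reconstruction error of a rank-k PCA approximation."""
    # map the k-dimensional coordinates back into the original feature space
    reconstructed = reduced.dot(components[:, :k].T) + mean.reshape(1, -1)
    # average squared distance between each sample and its reconstruction
    return np.mean(np.sum((original - reconstructed) ** 2, axis=1))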
def main():
    global group_num
    # read k from the command line
    k = int(sys.argv[1])
    print 1
    print 2
    train = pd.read_pickle("cluster.pkl")
    reduced_train = PCA.reduce(train.values, 50,
                               PCA.getU("PCA_eigen_cluster.pkl").values,
                               PCA.calc_mean(train.values))
    print 3
    cluster_center, cluster_idx = cluster(reduced_train, k)
    print 4
    print cluster_center
    print cluster_center.shape
    print cluster_idx
    print 5
    articles = train.index.values
    groupings = {}
    for i in range(k):
        group_num = i
        b = np.apply_along_axis(isInGroup, 0, cluster_idx)
        groupings[i] = articles[b]
    print(groupings)
    for key in groupings:
        print groupings[key].shape
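# This snippet (and the spectral-clustering one further down) relies on a helper
# isInGroup together with the module-level group_num; the helper itself is not shown.
# A minimal sketch, assuming cluster_idx holds one integer cluster label per article
# (hypothetical stand-in, not the original code):

def isInGroup_sketch(idx):
    """Boolean mask that is True wherever the cluster label equals group_num."""
    return idx == group_num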
def confirm_PCA(self, event):
    self.fig_PCA.clear()
    self.checkedPCAStrings = self.PCA_selection.GetCheckedStrings()
    self.pca_color = self.color_on_pca.GetValue()
    self.pca_shape = self.shape_on_pca.GetValue()
    self.key_headers_PCA = [
        self.x_axis_selection.GetValue(),
        self.y_axis_selection.GetValue(),
        self.checkedPCAStrings,
        self.pca_color,
        self.size_slider_PCA.GetValue(),
        self.pca_shape,
        self.label_points_pca.GetValue()
    ]
    if self.data_been_filtered:
        self.fig_PCA = pca.pca_(self.dataFiltered, self.key_headers_PCA,
                                self.fig_PCA, self.CLR_check.GetValue(),
                                self.arrows_check.GetValue(),
                                self.samples_check.GetValue(),
                                self.colordict, self.shapedict)
    else:
        self.fig_PCA = pca.pca_(self.data, self.key_headers_PCA,
                                self.fig_PCA, self.CLR_check.GetValue(),
                                self.arrows_check.GetValue(),
                                self.samples_check.GetValue(),
                                self.colordict, self.shapedict)
    self.PCA_plot = self.fig_PCA
    self.canvas2.draw()
    self.confirm_btn_PCA.SetLabelText("Update Graph")
    self.PCA_button.Enable(True)
def eigenface(trainData, testData, dataVariety):
    # standardize train data
    dropTrainData = trainData.drop("variety", axis=1)
    trainMean = dropTrainData.sum()
    trainMean = trainMean.values.reshape([dropTrainData.shape[1], 1])
    trainMean = trainMean / dropTrainData.shape[0]
    newtrainData = PCA.normalize(trainData, trainMean)
    # calculate xT * x and its eigenvector
    normTrainData = newtrainData.drop("variety", axis=1)
    normTrainData = np.array(normTrainData)
    X = np.transpose(normTrainData)
    tempMat = np.zeros([X.shape[1], X.shape[1]])
    np.matmul(np.transpose(X), X, tempMat)
    eigValX, eigVecX = np.linalg.eigh(tempMat)
    # calculate X * eigenvector
    newEigVecX = np.zeros([X.shape[0], eigVecX.shape[1]])
    newEigVecX = np.matmul(X, eigVecX)
    # normalize eigenvector
    newEigVecX = np.transpose(newEigVecX)
    length = np.linalg.norm(newEigVecX, axis=1)
    for i in range(newEigVecX.shape[0]):
        newEigVecX[i] /= length[i]
    normEigVec = np.transpose(newEigVecX)
    # calculate A
    L = 20
    maxEigIdx = np.argsort(-eigValX)
    A = []
    for i in range(L):
        A.append(normEigVec[:, maxEigIdx[i]])
    A = np.array(A)
    A = np.transpose(A)
    newtestData = PCA.normalize(testData, trainMean)
    # projection of train data
    projTrainFrame = PCA.project(A, newtrainData)
    # projection of test data
    projTestFrame = PCA.project(A, newtestData)
    # # classify test data by likelihood
    # g1, testIdx1, success1, confusion_mat1 = Likelihood.likelihood(projTrainFrame, projTestFrame, dataVariety)
    # Header.calAccuracy(success1, projTestFrame)
    # Header.ROC_AUC(projTestFrame, dataVariety, g1, testIdx1)
    # Header.drawConfusionMat(confusion_mat1, dataVariety)
    # classify test data by bayes
    names = []
    for i in range(projTestFrame.shape[1] - 1):
        names.append('0')
    names.append('variety')
    g2, testIdx2, success2, confusion_mat2 = Bayes.bayes(projTrainFrame, projTestFrame, dataVariety, names)
    Header.calAccuracy(success2, projTestFrame)
    Header.drawConfusionMat(confusion_mat2, dataVariety)
def main(runIndex=None):
    print("Starting Main.main()")
    # if the required directory structure doesn't exist, create it
    makeDirectoryStructure(address)
    # now start the GMM process
    Load.main(address, filename_raw_data, runIndex, subsample_uniform,
              subsample_random, subsample_inTime, grid, conc,
              fraction_train, inTime_start, inTime_finish,
              fraction_nan_samples, fraction_nan_depths, cov_type,
              run_bic=False)
    # loads data, selects train, cleans, centres/standardises, prints
    PCA.create(address, runIndex, n_dimen, use_fPCA)
    GMM.create(address, runIndex, n_comp, cov_type)
    PCA.apply(address, runIndex)
    GMM.apply(address, runIndex, n_comp)
    # reconstruction (back into depth space)
    Reconstruct.gmm_reconstruct(address, runIndex, n_comp)
    Reconstruct.full_reconstruct(address, runIndex)
    Reconstruct.train_reconstruct(address, runIndex)
    # calculate properties
    mainProperties(address, runIndex, n_comp)
def pca(self, X, dim=25):
    """
    Perform PCA dimensionality reduction.
    :param X: images
    :param dim: dimensionality of the images after reduction
    """
    pca = PCA(X)
    output = pca.reduction(dim=dim)  # use the dim argument rather than a hard-coded 25
    return output
def pcaOnMnist(training, dimension=700):
    principalComponents = PCA.pca(training, dimension)
    low, same = PCA.reduce(principalComponents, training)
    image2DInitial = vectorToImage(training[0], (28, 28))
    print same[0].shape
    image2D = vectorToImage(same[0], (28, 28))
    plt.imshow(image2DInitial, cmap=plt.cm.gray)
    plt.show()
    plt.imshow(image2D, cmap=plt.cm.gray)
    plt.show()
    print "done"
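# vectorToImage is not defined in this snippet; for 28x28 MNIST vectors it is
# presumably just a reshape. A minimal stand-in (an assumption, not the original helper):

import numpy as np

def vectorToImage_sketch(vec, size):
    """Reshape a flat pixel vector into a 2-D image of the given (rows, cols) size."""
    return np.asarray(vec).reshape(size)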
def prepareFMNISTData(scale=0, PCA_threshold=-1, Whitening=0, PCA_p=None):
    mndata = MNIST('fashion_data')
    imagesTrain, labelsTrain = mndata.load_training()
    imagesTest, labelsTest = mndata.load_testing()
    X_test = np.array(imagesTest)
    y_test = np.array(labelsTest)
    n = len(imagesTrain)
    np.random.seed(RANDOM_SEED)
    indices = np.random.permutation(n)
    trainingIndex = indices[:int(4 * n / 5)]
    validationIndex = indices[int(4 * n / 5):]
    X_train = np.array(imagesTrain)[trainingIndex]
    y_train = np.array(labelsTrain)[trainingIndex]
    X_val = np.array(imagesTrain)[validationIndex]
    y_val = np.array(labelsTrain)[validationIndex]
    if (PCA_threshold != -1):
        [Z_train, p, Xr, U, W] = PCA(X_train, PCA_threshold)
        if PCA_p is not None:
            p = PCA_p
        [Z_test, Xr] = project(X_test, U, p)
        [Z_val, Xr] = project(X_val, U, p)
        X_train = Z_train[:, :p]
        X_val = Z_val[:, :p]
        X_test = Z_test[:, :p]
        print("PCA_Threshold = " + str(PCA_threshold) + ", P = " + str(p))
    if (scale == 1):
        mean = np.mean(X_train, axis=0)
        X_train = X_train - mean
        X_test = X_test - mean
        X_val = X_val - mean
        variance = np.var(X_train, axis=0)
        X_train = X_train / np.sqrt(variance)
        X_test = X_test / np.sqrt(variance)
        X_val = X_val / np.sqrt(variance)
    if (Whitening == 1):
        [Z, p, X3, U, W] = PCA(X_train, 1.0)
        X_train = whiteningTransform(X_train, W, U)
        X_test = whiteningTransform(X_test, W, U)
        X_val = whiteningTransform(X_val, W, U)
    return (X_train, y_train, X_val, y_val, X_test, y_test)
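# whiteningTransform(X, W, U) is called above but not shown. Assuming U holds the
# principal directions as columns and W the corresponding per-component variances
# (both assumptions about this codebase), a PCA-whitening step typically looks like:

import numpy as np

def whitening_transform_sketch(X, W, U, eps=1e-8):
    """Rotate into the PCA basis and rescale each direction to unit variance."""
    Z = X.dot(U)                 # project onto the principal directions
    return Z / np.sqrt(W + eps)  # divide by each component's standard deviation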
def main():
    train = pd.read_pickle("cluster.pkl")
    reduced_data = PCA.reduce(train.values, 50,
                              PCA.getU("PCA_eigen_cluster.pkl").values,
                              PCA.calc_mean(train.values))
    heterogeneity_k_means = []
    heterogeneity_spectral = []
    ks = range(1, 51)
    spectral_laplacian = spectral.setup(train.values)
    for k in ks:
        print "k: " + str(k)
        bestSSD_k_means = sys.maxint
        bestSSD_spectral = sys.maxint
        spectral_eigen = spectral.computeEigen(spectral_laplacian, k)
        # do clustering 5 times for each k and keep the best run
        for i in range(5):
            print "i: " + str(i)
            print "k_means"
            cluster_center_k_means, cluster_idx_k_means = k_means.cluster(
                reduced_data, k)
            ssd_k_means = SSD(reduced_data, cluster_center_k_means,
                              cluster_idx_k_means)
            if ssd_k_means < bestSSD_k_means:
                bestSSD_k_means = ssd_k_means
            print "Spectral"
            cluster_center_spectral, cluster_idx_spectral = spectral.cluster(
                spectral_eigen, k)
            ssd_spectral = SSD(spectral_eigen, cluster_center_spectral,
                               cluster_idx_spectral)
            if ssd_spectral < bestSSD_spectral:
                bestSSD_spectral = ssd_spectral
        # append best ssd
        heterogeneity_k_means.append(bestSSD_k_means)
        heterogeneity_spectral.append(bestSSD_spectral)
    plt.figure(1)
    plt.plot(ks, heterogeneity_k_means, marker=".")
    plt.ylabel("Heterogeneity")
    plt.xlabel("k")
    plt.title("k vs Heterogeneity for k means")
    plt.xticks(np.arange(0, max(ks), 2.0))
    plt.savefig("heterogeneity_k_means_cluster.png")
    plt.figure(2)
    plt.plot(ks, heterogeneity_spectral, marker=".")
    plt.ylabel("Heterogeneity")
    plt.xlabel("k")
    plt.title("k vs Heterogeneity for spectral")
    plt.xticks(np.arange(0, max(ks), 2.0))
    plt.savefig("heterogeneity_spectral_cluster.png")
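# SSD, the heterogeneity measure used above, is not defined in this snippet. A minimal
# sketch, assuming cluster_idx assigns each row of the data to one centre (hypothetical
# helper name and signature):

import numpy as np

def SSD_sketch(data, centers, cluster_idx):
    """Sum of squared distances from every point to its assigned cluster centre."""
    idx = np.asarray(cluster_idx).ravel().astype(int)
    diffs = np.asarray(data) - np.asarray(centers)[idx]
    return float(np.sum(diffs * diffs))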
def loadProjectPCA():
    saveFileName = askopenfilename(initialdir="/", title="Select file",
                                   filetypes=(("Phenotype Files", "*"), ("all files", "*.*")))
    saveFileObject = open(saveFileName)
    i = 0
    next = saveFileObject.readline()
    PCADataRead = next
    PCADataRead = PCADataRead.strip()
    # reads through the text file and takes out the save data
    while i < 5:
        if (i == 0):
            PCAPhenoRead = saveFileObject.readline()
            PCAPhenoRead = PCAPhenoRead.rstrip('\n')
        if (i == 1):
            columnPCAEvec1read = saveFileObject.readline()
            columnPCAEvec1read = columnPCAEvec1read.strip()
        if (i == 2):
            columnPCAEvec2read = saveFileObject.readline()
            columnPCAEvec2read = columnPCAEvec2read.strip()
        if (i == 3):
            columnPCAEvec3read = saveFileObject.readline()
            columnPCAEvec3read = columnPCAEvec3read.strip()
        if (i == 4):
            columnPCAPhenoRead = saveFileObject.readline()
            columnPCAPhenoRead = columnPCAPhenoRead.strip()
        i = i + 1
    # Creates a new plot when loaded
    PCAPlotterLoad = PCA.PCAPlotter()
    PCAPlotterLoad.readFile1(PCAPhenoRead)
    PCAPlotterLoad.readFile2(PCADataRead)
    PCAPlotterLoad.connectFilesAddColour(int(columnPCAEvec1read), int(columnPCAEvec2read),
                                         int(columnPCAEvec3read), int(columnPCAPhenoRead))
    PCAPlotterLoad.plotGraph()
def prepare_data(self, test_data_perc=0.2):
    self.data = shuffle(self.data)
    self.data_np_arr1 = self.data.values
    self.features = np.shape(self.data_np_arr1)[1] - 1
    if self.pca_decompose:
        self.PCAObj = PCA.PCADecompose(self.num_feature_to_decompose)
        d_x = self.data_np_arr1[:, 0:self.features]
        d_y = self.data_np_arr1[:, self.features:]
        data_new = self.PCAObj.transform_data(d_x, d_y)
        self.data_np_arr = data_new
        self.features = np.shape(self.data_np_arr)[1] - 1
    else:
        self.data_np_arr = self.data_np_arr1
    train, test = train_test_split(self.data_np_arr, test_size=test_data_perc)
    self.X_train = train[:, 0:self.features]
    self.Y_train = train[:, self.features:]
    validation, test = train_test_split(test, test_size=0.5)
    self.X_validation = validation[:, 0:self.features]
    self.Y_validation = validation[:, self.features:]
    self.X_test = test[:, 0:self.features]
    self.Y_test = test[:, self.features:]
    print("X Train shape ", np.shape(self.X_train))
    print("Y Train shape ", np.shape(self.Y_train))
    print("X Validation shape ", np.shape(self.X_validation))
    print("Y Validation shape ", np.shape(self.Y_validation))
    print("X Test shape ", np.shape(self.X_test))
    print("Y Test shape ", np.shape(self.Y_test))
def treat_data(self, data):
    # split off the participant names, keep only the numeric columns
    # (.values replaces the deprecated .as_matrix())
    data, name = data.drop(['participant'], axis=1).values, data['participant'].tolist()
    data = PCA.PCA(data)  # PCA it
    return data, name
def callPCA(dataset, attrNum, k):
    X, y = g.splitXandY(dataset, attrNum, len(dataset))
    print(k)
    finalData, reconMat = PCA.pca(X, k)
    # PCA.plotBestFit(finalData, reconMat, y)
    return np.hstack((finalData, y)), np.hstack((reconMat, y))
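# g.splitXandY is imported from a helper module that is not shown; presumably it
# separates the first attrNum columns (features) from the trailing label column.
# A hypothetical sketch consistent with the hstack calls above:

import numpy as np

def splitXandY_sketch(dataset, attrNum, n_rows):
    """Split the first n_rows rows into a feature block X and a label column block y."""
    arr = np.asarray(dataset)[:n_rows]
    return arr[:, :attrNum], arr[:, attrNum:]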
def read(PCA_v=True, covariances=None, begin=0, end=10):
    prev = None
    X = None
    Y = None
    for i in range(begin, end):
        sample = dirList_2[i][-10:]
        mat_string = dirList_2[i] + sample + '.mat'
        arousal_string = dirList_2[i] + sample
        print("Working on sample nr: ", i)
        x = sio.loadmat(mat_string)['val']
        if (PCA_v):
            ann = wfdb.rdann(arousal_string, 'arousal')
            prev = PCA.get_matrices(x, ann, prev)
        else:
            arousal_string += '-arousal.mat'
            f = h5py.File(arousal_string, 'r')
            y = f['data']['arousals'][:]
            X, Y = extract_features(x, np.transpose(y), covariances, X, Y)
        print("-----------------------")
    if (PCA_v):
        return prev
    return X, Y
def construct_mnist():
    # number of principal components
    K = 1
    # handwritten digit to reconstruct
    num = 9
    # number of samples
    N = 100
    print('read from MNIST_test.txt...')
    data = np.loadtxt('dataset/MNIST_test.txt', delimiter=',')
    # split labels and features
    Y = data[:, 0]
    X = data[:, 1:]
    ###### single digit ######
    # get all indices of the chosen handwritten digit
    indices = np.argwhere(Y == num)
    # take the first N samples of that digit
    X_n = X[indices][:N]
    # show the original images
    slice_imgs(X_n, 'original')
    # PCA and feature reconstruction
    X_n_k, re_X_n = PCA(np.asarray(X_n).reshape((N, 784)), K)
    # show the reconstructed images
    slice_imgs(np.real(re_X_n), 'reconstruct')
    # signal-to-noise ratio of each picture
    print('SNR of each picture...')
    print([compute_SNR(X_n[i], re_X_n[i]) for i in range(N)])
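# compute_SNR is not included in this snippet. A common definition of a reconstruction
# signal-to-noise ratio in dB, offered as an assumption about what the helper does:

import numpy as np

def compute_SNR_sketch(original, reconstructed):
    """Signal-to-noise ratio (dB) between an image and its PCA reconstruction."""
    s = np.asarray(original, dtype=float).ravel()
    r = np.asarray(reconstructed, dtype=float).ravel()
    noise = s - r
    return 10.0 * np.log10(np.sum(s ** 2) / np.sum(noise ** 2))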
def preprocess(feature_abstract_method):
    # X_raw = raw_data.iloc[:, 1:]
    # y_raw = raw_data['label']
    # X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2)
    # X_train.to_csv('x_train.csv')
    # X_test.to_csv('x_test.csv')
    # y_train.to_csv('y_train.csv')
    # y_test.to_csv('y_test.csv')
    X_train = pd.read_csv('x_train.csv', index_col=0)
    X_test = pd.read_csv('x_test.csv', index_col=0)
    y_train = pd.read_csv('y_train.csv', index_col=0, header=None)
    y_test = pd.read_csv('y_test.csv', index_col=0, header=None)
    if (feature_abstract_method == 'LBP'):
        X_train = LBP.lbp_extract(X_train)
        X_test = LBP.lbp_extract(X_test)
    elif (feature_abstract_method == 'PCA'):
        X_train, X_test = PCA.PCA_extract(X_train, X_test)
    elif (feature_abstract_method == 'skeleton'):
        X_train = SKELETON.skeleton_extract(X_train)
        X_test = SKELETON.skeleton_extract(X_test)
    elif (feature_abstract_method == 'grid'):
        X_train = GRID.grid_extract(X_train)
        X_test = GRID.grid_extract(X_test)
    elif (feature_abstract_method == 'hog'):
        X_train = HOG.hog_extract(X_train)
        X_test = HOG.hog_extract(X_test)
    return X_train, X_test, y_train, y_test
def get_feature_mattrix():
    meal_files = [
        'MealNoMealData/mealData1.csv',
        'MealNoMealData/mealData2.csv',
        'MealNoMealData/mealData3.csv',
        'MealNoMealData/mealData4.csv',
        'MealNoMealData/mealData5.csv',
    ]
    meal_data = parse_and_interpolate(meal_files)
    data = meal_data[0]
    fft_features = get_fft_features(data)
    entropy_feature = get_entropy(data)
    moving_avg_features = moving_avg(data)
    normal_skew_feature = normal_skew(data)
    for index in range(1, len(meal_data)):
        data = meal_data[index]
        fft_features = np.concatenate((fft_features, get_fft_features(data)), axis=0)
        moving_avg_features = np.concatenate((moving_avg_features, moving_avg(data)), axis=0)
        entropy_feature = np.concatenate((entropy_feature, get_entropy(data)), axis=0)
        normal_skew_feature = np.concatenate((normal_skew_feature, normal_skew(data)), axis=0)
    feature_mattrix = np.concatenate((moving_avg_features, entropy_feature,
                                      fft_features, normal_skew_feature), axis=1)
    np.set_printoptions(suppress=True)
    PCA = p.cal_PCA()
    feature_mattrix = PCA.performPCA(feature_mattrix)
    return feature_mattrix, PCA
def getPC(coords, outFileName='ligBox.pdb'):
    size = coords.size
    shape = coords.shape
    if shape != (1, 3):
        if size != 0:
            eigenVectors, eigenValues = PCA.princomp(coords.T, numpc=3, getEigenValues=True)
            com = coords.mean(axis=0)
            projection = numpy.dot(coords - com, eigenVectors)
            signs = numpy.sign(numpy.sign(projection).sum(axis=0))
            signs2 = numpy.sign(
                projection[numpy.abs(projection).argmax(axis=0)].diagonal())
            signs[signs == 0] = signs2[signs == 0]
            eigenVectors = eigenVectors * signs
            vectors = com + eigenVectors.T * numpy.atleast_2d(
                numpy.sqrt(eigenValues)).T
        elif size == 0:
            com = numpy.zeros((3))
            vectors = numpy.zeros((3, 3))
    else:
        com = coords.flatten()
        vectors = numpy.zeros((3, 3))
    # pdbBoxWriter(com, vectors, outFileName)
    return com, vectors
def draw_2d():
    x2 = PCA(data_set.x, 2)
    plt.figure()
    plt.scatter(x2[0, :50], x2[1, :50], marker='x', color='m', s=30, label='Iris-setosa')
    plt.scatter(x2[0, 50:100], x2[1, 50:100], marker='+', color='c', s=50, label='Iris-versicolor')
    plt.scatter(x2[0, 100:150], x2[1, 100:150], marker='o', color='r', s=15, label='Iris-virginica')
    plt.legend()
    plt.title('PCA of IRIS k = 2')
    plt.show()
def small_data_test():
    test_1 = PCA.pca_function([[1, 0, 1, 1], [0, 1, 2, 0], [1, 1, 2, 0], [0, 1, 2, 1]], 2)
    result = [[0.8333, -0.5, -0.1666, -0.1666],
              [0.0, 0.0, 0.7071, -0.7071]]
    # compare the computed components against the expected values within a tolerance
    if np.allclose(test_1, result, atol=1e-3):
        return "All OK"
    else:
        return "something went wrong"
def main():
    data = pd.read_csv('/Users/bytedance/Desktop/AI/data/wine.data.csv')
    label = data["0"].to_numpy()
    del data["0"]
    data = data / data.max(axis=0)  # normalize
    data = data.to_numpy()
    # PCA
    K = 3
    for thresh in [0.9, 0.8, 0.7, 0.6, 0.5]:
        new_data, _, _ = PCA.PCA(data.T, 2, True, thresh)
        ndim = new_data.shape[1]
        print(
            f"======== kmeans, K = {K}, ndim = {ndim}, thresh = {thresh} ========="
        )
        if ndim == 2:
            plt.figure(1)
            plt.scatter(new_data[:, 0], new_data[:, 1], s=50)
        S, RI, predicted_label = Kmeans.test_kmeans(new_data, label, K)
        df_data = pd.DataFrame(new_data)
        df_label = pd.DataFrame(predicted_label)
        result_df = pd.concat([df_label, df_data], axis=1)
        result_df.to_csv(f"./result_ndim{ndim}_K{K}.csv")
def plotTestSet3(filepath):
    n = 1000  # number of points to create
    xcord0 = []; ycord0 = []
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    markers = []
    colors = []
    fw = open(filepath, 'w')
    for i in range(n):
        groupNum = int(3 * numpy.random.uniform())
        [r0, r1] = numpy.random.standard_normal(2)
        if groupNum == 0:
            x = r0 + 16.0
            y = 1.0 * r1 + x
            xcord0.append(x)
            ycord0.append(y)
        elif groupNum == 1:
            x = r0 + 8.0
            y = 1.0 * r1 + x
            xcord1.append(x)
            ycord1.append(y)
        elif groupNum == 2:
            x = r0 + 0.0
            y = 1.0 * r1 + x
            xcord2.append(x)
            ycord2.append(y)
        fw.write("%f\t%f\t%d\n" % (x, y, groupNum))
    fw.close()
    fig = plt.figure()
    ax = fig.add_subplot(211)
    ax.scatter(xcord0, ycord0, marker='^', s=90)
    ax.scatter(xcord1, ycord1, marker='o', s=50, c='red')
    ax.scatter(xcord2, ycord2, marker='v', s=50, c='yellow')
    ax = fig.add_subplot(212)
    myDat = PCA.loadDataSet(filepath)
    lowDDat, reconDat = PCA.pca(myDat[:, 0:2], 1)
    label0Mat = lowDDat[numpy.nonzero(myDat[:, 2] == 0)[0], :2][0]  # get the items with label 0
    label1Mat = lowDDat[numpy.nonzero(myDat[:, 2] == 1)[0], :2][0]  # get the items with label 1
    label2Mat = lowDDat[numpy.nonzero(myDat[:, 2] == 2)[0], :2][0]  # get the items with label 2
    # ax.scatter(label0Mat[:,0], label0Mat[:,1], marker='^', s=90)
    # ax.scatter(label1Mat[:,0], label1Mat[:,1], marker='o', s=50, c='red')
    # ax.scatter(label2Mat[:,0], label2Mat[:,1], marker='v', s=50, c='yellow')
    ax.scatter(label0Mat[:, 0], numpy.zeros(numpy.shape(label0Mat)[0]), marker='^', s=90)
    ax.scatter(label1Mat[:, 0], numpy.zeros(numpy.shape(label1Mat)[0]), marker='o', s=50, c='red')
    ax.scatter(label2Mat[:, 0], numpy.zeros(numpy.shape(label2Mat)[0]), marker='v', s=50, c='yellow')
    plt.show()
def predict(data, components):
    pca, train_features, train_results, test_features, test_results, values = PCA.transform(
        components)
    clf = svm.SVC(kernel="rbf", gamma='auto', probability=True)
    PCA_data = pca.transform(data)
    clf.fit(train_features, train_results)
    outcome = clf.predict_proba(PCA_data)
    return (outcome, test_features, test_results, values)
def predict(data, components):
    pca, train_features, train_targets, test_features, test_results, values = PCA.transform(
        components)
    model = GaussianNB()
    # Train the model using the training sets
    model.fit(train_features, train_targets)
    PCA_data = pca.transform(data)
    predicted = model.predict_proba(PCA_data)
    return (predicted, test_features, test_results, values)
def main(): global group_num k = int(sys.argv[1]) train = pd.read_pickle("tfidf_small.pkl") reduced_data = PCA.reduce(train.values, 50, PCA.getU("PCA_eigen_cluster.pkl").values, PCA.calc_mean(train.values)) laplacian = setup(train.values) eigen_vectors = computeEigen(laplacian, k) cluster_center, cluster_idx = cluster(eigen_vectors, k) # display the data: articles = train.index.values groupings = {} for i in range(k): group_num = i b = np.apply_along_axis(isInGroup, 0, cluster_idx) groupings[i] = articles[b] print(groupings) for key in groupings: print groupings[key].shape
def main(run=None):
    print("Starting Main.main()")
    # Now start the GMM process
    Load.main(address, dir_raw_data, run, subsample_uniform, subsample_random,
              subsample_inTime, grid, conc, fraction_train, inTime_start,
              inTime_finish, fraction_nan_samples, fraction_nan_depths, dtype)
    #Load.main(address, filename_raw_data, run, subsample_uniform, subsample_random,\
    # Loads data, selects Train, cleans, centres/standardises, prints
    PCA.create(address, run, n_dimen)  # Uses Train to create PCA, prints results, stores object
    GMM.create(address, run, n_comp)   # Uses Train to create GMM, prints results, stores object
    PCA.apply(address, run)            # Applies PCA to test dataset
    GMM.apply(address, run, n_comp)    # Applies GMM to test dataset
    # Reconstruction
    Reconstruct.gmm_reconstruct(address, run, n_comp)  # Reconstructs the results in original space
    Reconstruct.full_reconstruct(address, run)
    Reconstruct.train_reconstruct(address, run)
    # new stuff DD 27/08/18, after seeing updates on DJ github
    #mainProperties(address, runIndex, n_comp)
    # Plotting -- first commented out DD
    #Plot.plotMapCircular(address, address_fronts, run, n_comp)
    #Plot.plotPosterior(address, address_fronts, run, n_comp, plotFronts=True)
    Plot.plotPostZonal(address, run, n_comp, dtype, plotFronts=False)  ## zonal frequencies
    #Plot.plotPosterior(address, run, n_comp, dtype, plotFronts=False)  ## works but data overlaps spatially...
    Plot.plotProfileClass(address, run, n_comp, dtype, 'uncentred')
    Plot.plotProfileClass(address, run, n_comp, dtype, 'depth')
    Plot.plotGaussiansIndividual(address, run, n_comp, dtype, 'reduced')  #uncentred')#'depth')#reduced')
    # Plot.plotGaussiansIndividual(address, run, n_comp, 'depth')      # ERROR NOT WORKING PROPERLY
    # Plot.plotGaussiansIndividual(address, run, n_comp, 'uncentred')  # ERROR NOT WORKING PROPERLY
    #Plot.plotProfile(address, run, dtype, 'original')  # these run just fine but are huge and unhelpful
    Plot.plotProfile(address, run, dtype, 'uncentred')
    Plot.plotWeights(address, run, dtype)
def confirm_scatter(self, event):
    self.fig_scatter.clear()
    self.scatter_color = self.color_on_scatter.GetValue()
    self.scatter_shape = self.shape_on_scatter.GetValue()
    size = self.size_slider_scatter.GetValue()
    self.key_headers_scatter = [
        self.scatter_color,
        self.x_name_scatter.GetValue(),
        self.y_name_scatter.GetValue(),
        self.z_name_scatter.GetValue(),
        self.x1_name_scatter.GetValue(),
        self.y1_name_scatter.GetValue(),
        self.z1_name_scatter.GetValue(),
        size,
        self.scatter_shape,
        self.label_points_scatter.GetValue()
    ]
    limits = [
        self.xLowLim.GetValue(),
        self.xUpLim.GetValue(),
        self.yLowLim.GetValue(),
        self.yUpLim.GetValue()
    ]
    log_scales = [
        self.scatter_log_x.GetValue(),
        self.scatter_log_y.GetValue()
    ]
    if self.data_been_filtered:
        if not len(self.x_name_scatter.GetValue()) == 0 and not len(
                self.y_name_scatter.GetValue()) == 0:
            self.fig_scatter = pca.blank_scatter_plot(
                self.dataFiltered, self.key_headers_scatter, limits,
                self.fig_scatter, log_scales, self.colordict, self.shapedict)
    else:
        if not len(self.x_name_scatter.GetValue()) == 0 and not len(
                self.y_name_scatter.GetValue()) == 0:
            self.fig_scatter = pca.blank_scatter_plot(
                self.data, self.key_headers_scatter, limits,
                self.fig_scatter, log_scales, self.colordict, self.shapedict)
    self.scatter_plot = self.fig_scatter
    self.canvas3.draw()
    self.confirm_btn_scatter.SetLabelText("Update Graph")
    self.scatter_button.Enable(True)
def main():
    percentages = dict()
    for PC in range(1, 274):
        PCA.init(PC)
        img = cv2.imread("c1.jpg", cv2.IMREAD_GRAYSCALE)
        kp1, des1 = get_descriptors(img)
        img2 = cv2.imread("c2.jpg", cv2.IMREAD_GRAYSCALE)
        kp2, des2 = get_descriptors(img2)
        # Matching between descriptors
        bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
        matches = sorted(bf.match(des1, des2), key=lambda match: match.distance)
        # # Plot keypoints
        # img4 = cv2.drawKeypoints(img, kp1, outImage=None)
        # img5 = cv2.drawKeypoints(img2, kp2, outImage=None)
        # f, axarr = plt.subplots(1, 2)
        # axarr[0].imshow(img4)
        # axarr[1].imshow(img5)
        # # plt.show()
        # # Plot matches
        # img3 = cv2.drawMatches(img, kp1, img2, kp2, matches, flags=2, outImg=None)
        # plt.imshow(img3)
        # # plt.show()
        # Calculate score
        score = 0
        for match in matches:
            score += match.distance
        score_threshold = 33
        k = 100 - (score / len(matches))
        if score / len(matches) < score_threshold:
            print("PC " + str(PC) + ": Matches with " + str(k) + "%")
            percentages.update({PC: (k, "Yes")})
        else:
            print("PC " + str(PC) + ": No Match with " + str(k) + "%")
            percentages.update({PC: (k, "No")})
    pickle_out = open("percentages.pickle", "wb")
    pickle.dump(percentages, pickle_out)
    pickle_out.close()
    print('It took', time.time() - start, 'seconds.')
def test_PCA():
    X = np.empty((100, 2))
    X[:, 0] = np.random.uniform(0., 100., size=100)
    X[:, 1] = 0.75 * X[:, 0] + 3. + np.random.normal(0, 10., size=100)
    pca = PCA(n_components=2)
    pca.fit(X)
    print(pca.components_)
    # reduce to one dimension
    pca = PCA(n_components=1)
    pca.fit(X)
    X_reduction = pca.transform(X)
    print(X_reduction.shape)
    X_restore = pca.inverse_transform(X_reduction)
    print(X_restore.shape)
    plt.scatter(X[:, 0], X[:, 1], color='b')
    plt.scatter(X_restore[:, 0], X_restore[:, 1], color='r', alpha=0.5)
    plt.show()
def RunTrainLDA(infile, pcaFile, ldaFile):
    import cPickle
    fp = open(infile, "r")
    dataset = cPickle.load(fp)
    subjID = cPickle.load(fp)
    fp.close()
    pca = PCA(dataset)
    pca_proj = pca.compute()
    np.save(pcaFile, pca_proj)
    lda_proj = []
    lda = LDA(dataset, subjID, pca_proj)
    projData = lda.projectData()
    lda_proj = lda.train(projData)
    np.save(ldaFile, lda_proj)
def pca_and_call(features=all_features, fn=using_distance_to_original, dim=2, k=-1):
    data = np.array([f[1] for f in features])
    # Note: this warps the variable data
    data_rescaled = PCA.PCA(data, dim)
    features = [(features[i][0], data_rescaled[i]) for i in range(len(features))]
    if k > 0:
        return fn(features, k)
    return fn(features)
def pcaOnMnist(training, dimension=700):
    mean, principalComponents = PCA.pca(training, dimension)
    low, same = PCA.reduce(principalComponents, training, mean, noSame=False)
    print "low[0].shape"
    print low[0].shape
    image2DInitial = vectorToImage(training[0], (28, 28))
    print same[0].shape
    image2D = vectorToImage(same[0], (28, 28))
    image2DLow = vectorToImage(low[0], (20, 20))
    plt.imshow(image2DLow, cmap=plt.cm.gray)
    plt.show()
    plt.imshow(image2DInitial, cmap=plt.cm.gray)
    plt.show()
    plt.imshow(image2D, cmap=plt.cm.gray)
    plt.show()
    print "done"
    return low
def pcaSklearn(training, dimension=700):
    pca = PCA(n_components=dimension)
    pca.fit(training)
    low = pca.transform(training)
    same = pca.inverse_transform(low)
    print "low[0].shape"
    print low[0].shape
    image2DInitial = vectorToImage(training[0], (28, 28))
    print same[0].shape
    image2D = vectorToImage(same[0], (28, 28))
    image2DLow = vectorToImage(low[0], (20, 20))
    plt.imshow(image2DLow, cmap=plt.cm.gray)
    plt.show()
    plt.imshow(image2DInitial, cmap=plt.cm.gray)
    plt.show()
    plt.imshow(image2D, cmap=plt.cm.gray)
    plt.show()
    print "done"
    return low
def do(Obs_ij, run_dir):
    # PCA
    N_PCs, V_nj, U_in = PCA.do_PCA(Obs_ij, run_dir)
    print '# ---------------------------------------------------'
    print '# U_in'
    # print samples
    for ii in xrange(len(U_in)):
        for nn in xrange(len(U_in.T)):
            print U_in[ii][nn],
        print ''
    print ''
    # shrink wrap
    A_mn = shrinkwrap.do_shrinkwrap(U_in, N_PCs, run_dir)
def PlotXference_AVG():
    global EyeData
    global Events
    # Change these into arrays and preallocate size
    inference = []
    noference = []
    for idx in range(0, len(Events)):
        # inference.append(FindSlices(EyeData[idx], Events[idx], 'Inference', trialTypes))
        # noference.append(FindSlices(EyeData[idx], Events[idx], 'Noference', trialTypes))
        inference.append(FindSlices(EyeData[idx], Events[idx], "Inference", "typeB", 1))
        noference.append(FindSlices(EyeData[idx], Events[idx], "Noference", "typeA", 1))
    fig = plt.figure()
    fig.suptitle("Gaze X position")
    ax = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    for trial in inference:
        ax.plot(trial)
    ax.set_ylim(0, 2000)
    ax.set_ylabel("X coordinate of gaze position")
    ax.set_xlabel("Inference trials \n x time course in ms")
    for trial in noference:
        ax2.plot(trial)
    ax2.set_ylim(0, 2000)
    ax2.set_xlabel("No inference trials \n x time course in ms")
    ticks = ax.get_xticks() * 16
    ax.set_xticklabels(ticks.astype(int))
    ax2.set_xticklabels(ticks.astype(int))
    inf_cat = [1 for i in range(1, len(inference) + 1)]
    nof_cat = [0 for i in range(1, len(noference) + 1)]
    known_cat = np.hstack((np.array(inf_cat), np.array(nof_cat)))
    ferences = np.vstack((inference, noference))
    PlotAverage_X(np.array(inference), np.array(noference))
    # components must be computed before it is used below
    components = PCA.myPCA(ferences, known_cat)
    components = components * 1000
    LOG_REG.logReg(known_cat, components)
    # components_tmp = components * 1000
    np.savetxt("eda_pcaResults.csv",
               np.hstack((known_cat.reshape(len(known_cat), 1), components)),
               delimiter=",")
def plotSecomPCA(filepath):
    dataMat = PCA.replaceNanWithMean(filepath)
    # below is a quick hack copied from pca.pca()
    meanVals = numpy.mean(dataMat, axis=0)
    meanRemoved = dataMat - meanVals  # remove mean
    covMat = numpy.cov(meanRemoved, rowvar=0)
    eigVals, eigVects = numpy.linalg.eig(numpy.mat(covMat))
    eigValInd = numpy.argsort(eigVals)  # sort goes smallest to largest
    eigValInd = eigValInd[::-1]  # reverse
    sortedEigVals = eigVals[eigValInd]
    total = sum(sortedEigVals)
    varPercentage = sortedEigVals / total * 100
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(range(1, 21), varPercentage[:20], marker='^')
    plt.xlabel('Principal Component Number')
    plt.ylabel('Percentage of Variance')
    plt.show()
import numpy as np
import sys

import PCA
import shrinkwrap

INFILE = 'data/raddata_2_norm'

#===================================================
if __name__ == "__main__":

    # input data
    Obs_ij = np.loadtxt(INFILE)
    n_slice = len(Obs_ij)

    # PCA
    N_PCs, V_nj, U_in = PCA.do_PCA(Obs_ij)
    # print 'Principal Components (V_lj) : '
    # print V_nj
    # print ''
    # print 'Coefficients (U_il) : '
    # print U_in
    # print ''
    print '# ---------------------------------------------------'
    print '# U_in'
    # print samples
    for ii in xrange(len(U_in)):
        for nn in xrange(len(U_in.T)):
            print U_in[ii][nn],
        print ''
    print ''
    print '\nERROR : Unknown slice type\n'
    sys.exit()

# Determine initial values for fitting parameters
if KNOWN_ANSWER:
    print 'Initial values from known answers'
    X0_albd_kj = np.loadtxt(ALBDFILE).T
    X0_area_lk = np.loadtxt(AREAFILE)
else:
    # PCA
    print 'Performing PCA...'
    n_pc, V_nj, U_in, M_j = PCA.do_PCA(Obs_ij, E_cutoff=1e-2, run_dir=run_dir)
    n_type = n_pc + 1
    # shrinkwrap
    print 'Performing shrink-wrapping...'
    # N ( = n_PC )     : number of principal components
    # M ( = n_PC + 1 ) : number of vertices
    A_mn, P_im = shrinkwrap.do_shrinkwrap(U_in, n_pc, run_dir=run_dir)
    X0_albd_kj = np.dot(A_mn, V_nj)
    X0_albd_kj = X0_albd_kj + M_j
    if (SLICE_TYPE == 'time'):
        X0_area_lk = P_im
    else:
        X0_area_lk = np.ones(n_slice * n_type).reshape([n_slice, n_type]) / (n_type * 1.0)

# Save initial conditions
if __name__ == "__main__": # Load input data Obs_ij = np.loadtxt( INFILE_DIR + INFILE ) Time_i = np.arange( len( Obs_ij ) ) / ( 1.0 * len( Obs_ij ) ) n_band = len( Obs_ij.T ) # Initialization of Kernel print 'Decomposition into time slices...' n_slice = len( Time_i ) Kernel_il = np.identity( n_slice ) # PCA print 'Performing PCA...' n_pc, V_nj, U_in, M_j = PCA.do_PCA( Obs_ij, E_cutoff=1e-2, output=False, run_dir=OUTFILE_DIR ) V_nj[0] = -1. * V_nj[0] U_in.T[0] = -1. * U_in.T[0] V_nj[1] = -1. * V_nj[1] U_in.T[1] = -1. * U_in.T[1] n_type = n_pc + 1 if n_type != 3 : print 'ERROR: This code is only applicable for 3 surface types!' sys.exit() U_iq = np.c_[ U_in, np.ones( len( U_in ) ) ] PC1_limit = [XMIN,XMAX] # manually set for now PC2_limit = [YMIN,YMAX] # manually set for now points_kn_list = []
data=np.vstack((x,y))
#mean, eigenvectors = cv2.PCACompute(npc, np.mean(npc, axis=0).reshape(1,-1))
#mlab_pca = mlabPCA(data.T)
sklearn_pca = PCA(n_components=2)
incPCA.append(sklearn_pca.fit_transform(data.T))'''
#print(incPCA[0].components_)

# stacks all examples of the first incisor into a matrix in which each row is the
# concatenated (x, y) landmark coordinate vector of one example, one example per row (14x80)
data1 = np.append(Persons[0].Incisors[0].normXY[:, 0], Persons[0].Incisors[0].normXY[:, 1])
for p in range(1, 14):
    data1 = np.vstack((data1, np.append(Persons[p].Incisors[0].normXY[:, 0],
                                        Persons[p].Incisors[0].normXY[:, 1])))
eigenvalues, eigenvectors, mu = pca.pcaD(data1, 3)
tEVectors = np.array((eigenvectors[0:40, :], eigenvectors[40:80, :])).T
ty = np.array((mu[0:40], mu[40:80])).T

'''
# stacks all examples of the first incisor into a single (x, y) matrix of shape 560x2
x = np.array([])
y = np.array([])
for p in range(0, 14):
    a = Persons[p].Incisors[0].normXY
    x = np.append(x, (a[:, 0]))
    y = np.append(y, a[:, 1])
data = np.vstack((x, y)).T
# Part One: Load Example Dataset
print 'One: ======== Load Example Dataset1 ... '
plt.plot(X[:, 0], X[:, 1], 'bo')
plt.axis(xmin=0.5, xmax=6.5, ymin=2, ymax=8)
plt.title('Example Dataset1')

# Part Two: Principal Component Analysis
print 'Two: ================ Running PCA on example dataset...'
result = FN.featureNormalize(X)
X_norm = result[0]
mu = result[1]
res = PCA.pca(X_norm)
U = res[0]
S = res[1]
S = np.eye(S.shape[0]) * S
print 'Top eigenvector: '
print 'U[:,0] = %f %f ' % (U[0, 0], U[1, 0])
print '(You should expect to see -0.707107, -0.707107)'
tmp1 = mu + 1.5 * np.dot(S[0, 0], U[:, 0].transpose())
tmp2 = mu + 1.5 * np.dot(S[1, 1], U[:, 1].transpose())
DL.drawLine(mu, tmp1, color='k', linewidth=2)
DL.drawLine(mu, tmp2, color='b', linewidth=2)
plt.show()
    fig.savefig(file_name)
    if display == True:
        plt.show()
    plt.clf()

def graph2d(data, display=True, file_name=None, verbose=True):
    fig, ax = plt.subplots()
    for code in np.unique(data[:, 2]):
        x, y = zip(*data[data[:, 2] == code][:, 0:2])
        ax.scatter(x, y, c=color_convert(code), marker='o')
    if file_name != None:
        fig.savefig(file_name)
    if display == True:
        plt.show()
    plt.clf()

'''
Examples, in order: 3d plot of data, PCA, Isomap, LLE, LapEig
'''
# graph3d(np.column_stack((npdata, color_code)))
graph2d(np.column_stack((PCA.pca(npdata, dim=2), color_code)), False, 'PCA')
graph2d(np.column_stack((Isomap.isomap(npdata, load='C.npy'), color_code)), False, 'Isomap')
graph2d(np.column_stack((LLE.lle(npdata), color_code)), False, 'LLE')
graph2d(np.column_stack((LaplacianEigenmap.le(npdata), color_code)), False, 'LaplacianEigenmap')

# Just a sanity check
# from sklearn import manifold
# x = manifold.SpectralEmbedding().fit_transform(X= npdata)
# graph2d(np.column_stack((x, color_code)))
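# graph2d uses color_convert to map a numeric class code to a matplotlib colour; that
# mapping is not shown in this snippet. A minimal stand-in (the palette itself is an
# assumption):

def color_convert_sketch(code):
    """Map an integer class code to a fixed matplotlib colour string."""
    palette = ['b', 'r', 'g', 'c', 'm', 'y', 'k']
    return palette[int(code) % len(palette)]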
import PCA
from numpy import *


def loadData(fileAddress):
    """Load whitespace-separated numeric data from a text file into a matrix."""
    file = open(fileAddress).readlines()
    data = []
    for line in file:
        data.append(map(float, line.strip().split()))
    return mat(data)


if __name__ == '__main__':
    dataSet = loadData('testSet.txt')
    lowDimData, newData = PCA.pca(dataSet, 1)
    PCA.showPca(dataSet, lowDimData, newData)
if 0:
    #
    # PCAT magic: lifting the following from GMM.py in PCAT
    #
    import PCA, GMM

    score_matrix, principal_components, means, stds, eigenvalues = \
        PCA.PCA(catalogue, components_number=10)

    principal_components_number = 10
    reduced_score_matrix = score_matrix[:, :principal_components_number]

    mat, tmp, tmp1 = PCA.matrix_whiten(reduced_score_matrix, std=True)

    #labels = GMM.gaussian_mixture(mat, upper_bound=5)
    labels = GMM.gaussian_mixture(reduced_score_matrix, upper_bound=5)
    colored_clusters = GMM.color_clusters(score_matrix, labels)
    GMM.print_cluster_info(colored_clusters)

    #sys.exit()

#
# PCA
#
#H = np.matrix(waveform_catalogue)
H = np.matrix(catalogue)
    'robberies', 'robbbPerPop', 'assaults', 'assaultPerPop',
    'burglaries', 'burglPerPop', 'larcenies', 'larcPerPop',
    # 'autoTheft', 'autoTheftPerPop', 'arsons', 'arsonsPerPop',
    'violentPerPop', 'nonViolPerPop',
]

from DataSet import *
from PCA import *

dataset = DataSet(data, names, drop_columns=drop_columns,
                  fix_missing=FixMissing.DROPATTRIBUTES,
                  rescale=Rescale.NORMALIZE)
print(dataset.X)
print(dataset.X.iloc[5, 10])

pca = PCA(dataset)
pca.plot_rho()
pca.show()
plt.show()

print("\n\nstd:", dataset.X.std())
print("\n\nmean:", dataset.X.mean())
print("\n\nrange:", dataset.X.max() - dataset.X.min())
import vectorizeFiles as VF
import getFileNames as gf
import matplotlib.pyplot as plot
import numpy as np
import PCA  # needed for PCA.getPCAMat below

# from feature_extractor import FeatureExtractor
# fe = FeatureExtractor(1)
# featurized = fe.featurizeFiles('../data')
# classNames, repubAndDemMatrix, labels = featurized[:3]

[repubAndDemMatrix, vectorizerRepubDem, labels] = VF.extractWordCounts(True, True, False)
k = 3
files = gf.getFileNames()
transformed = PCA.getPCAMat(repubAndDemMatrix, k)
repub = np.array([list(x) for i, x in enumerate(transformed) if labels[i] == 1])
dem = np.array([list(x) for i, x in enumerate(transformed) if labels[i] == 0])
plot.figure()
plot.scatter(repub[:, 0], repub[:, 1], c='r', marker='x')
plot.scatter(dem[:, 0], dem[:, 1], c='b', marker='x')
##plot.annotate(s=files[0], xy=transformed[0])
plot.savefig('results/images/VFPCA.png')
# plot.savefig('results/images/PCA.png')

'''
transformedWords=PCA.getPCAMat(repubAndDemMatrix.T, k)
vocab=vectorizerRepubDem.vocabulary_
indicesOfInterest=[]
f=open('wordsInterest.txt','r')
wordsOfInterest=[line.split()[0] for line in f]
import PCA
import EXTRAS
import numpy

dataMat = PCA.loadDataSet("E:/TestDatas/MachineLearningInAction/Ch13/testSet.txt")
lowDMat, reconMat = PCA.pca(dataMat, 1)
PCA.plotPCA(dataMat, reconMat)

"""
dataMat = PCA.replaceNanWithMean("E:/TestDatas/MachineLearningInAction/Ch13/secom.data")
meanVals = numpy.mean(dataMat, axis=0)
meanRemoved = dataMat - meanVals
covMat = numpy.cov(meanRemoved, rowvar=0)
eigVals, eigVects = numpy.linalg.eig(numpy.mat(covMat))
print eigVals
"""

# EXTRAS.plotTestSet("E:/TestDatas/MachineLearningInAction/Ch13/testSet.txt")
# EXTRAS.plotTestSet3("E:/TestDatas/MachineLearningInAction/Ch13/testSet3.txt")
# EXTRAS.plotSecomPCA("E:/TestDatas/MachineLearningInAction/Ch13/secom.data")
SGMean = PCA.MLmean(SGNorm)
SGCov = PCA.MLcov(SGNorm, SGMean)
eigw, eigv = np.linalg.eig(SGCov)

"""
numpy.linalg.eig does not return the eigenvalues/eigenvectors in sorted order,
so we pair them up and sort them in descending order, then take the two
eigenvectors with the largest eigenvalues as the principal components.
"""
SGVectors = []
for i in range(len(eigw)):
    SGVectors.append((eigw[i], eigv[:, i]))
SGVectors = sorted(SGVectors, reverse=True, key=lambda tup: tup[0])
SGPC = [SGVectors[0][1], SGVectors[1][1]]

# Projection via dot product
new_SGX, new_SGY = PCA.transform(SGNorm, SGPC)

# Plotting the eigenspectrum
plt.plot(range(1, len(eigw) + 1), eigw, 'r-')
plt.xlabel('Eigenvector number')
plt.ylabel('Eigenvalue')
plt.title('Eigenspectrum')
plt.show()

# Plotting the projection onto the first 2 Principal Components
plt.plot(new_SGX, new_SGY, "x")
plt.xlabel('First principal component')
plt.ylabel('Second principal component')
plt.title("The SGdata projected onto Principal Components")
plt.show()
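# PCA.transform above returns two coordinate arrays; presumably it projects the
# normalised data onto the two chosen principal components via dot products. A sketch
# of that projection (helper name and return convention are assumptions):

import numpy as np

def transform_sketch(data, components):
    """Project each sample onto two principal components; return the two coordinate arrays."""
    comps = np.asarray(components)        # shape (2, d): one component per row
    proj = np.asarray(data).dot(comps.T)  # shape (n, 2)
    return proj[:, 0], proj[:, 1]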
# Load input data
Obs_ij = np.loadtxt(INFILE_DIR + INFILE)
Time_i = np.arange(len(Obs_ij)) / (1.0 * len(Obs_ij))
n_band = len(Obs_ij.T)

# Initialization of Kernel
print 'Decomposition into time slices...'
n_slice = len(Time_i)
Kernel_il = geometry.kernel(Time_i, n_slice, N_SIDE, GEOM)
print 'Kernel_il', Kernel_il
Kernel_il[np.where(Kernel_il < 1e-3)] = 0.
print 'Kernel_il', Kernel_il

# PCA
print 'Performing PCA...'
n_pc, V_nj, U_in, M_j = PCA.do_PCA(Obs_ij, E_cutoff=1e-2, output=True)
# V_nj[0] = -1. * V_nj[0]
# U_in.T[0] = -1. * U_in.T[0]
# V_nj[1] = -1. * V_nj[1]
# U_in.T[1] = -1. * U_in.T[1]
n_type = n_pc + 1
if n_type != 3:
    print 'ERROR: This code is only applicable for 3 surface types!'
    sys.exit()
U_iq = np.c_[U_in, np.ones(len(U_in))]

PC1_limit = [-0.4, 0.2]  # manually set for now
PC2_limit = [-0.1, 0.4]  # manually set for now
# Ignore the new feature as it messes up PCA
data_dict = pickle.load(open("data/own_data_dict.pkl", "r"))
features_list = getallFeatures(data_dict)
data = featureFormat(data_dict, features_list, sort_keys=True)

# Scale features:
mins = np.min(data, axis=0)
maxs = np.max(data, axis=0)
data = (data - mins) / (maxs - mins)

labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = \
    stratifiedShuffleSplit(features, labels)

### Do some PCA
pca = PCA.doPCA(features_train, n=4)
transformed_train = pca.transform(features_train)

# Do some hyperparam validation:
best_svc, svc_grid_scores = ClassifySVM.gridsearch(transformed_train, labels_train)
svmfit = ClassifySVM.train(transformed_train, labels_train, best_svc)

test_classifier(svmfit, data)
dump_classifier_and_data(svmfit, data_dict, features_list)