def test_sparse_randomized_pca_inverse():
    """Test that RandomizedPCA is invertible on sparse data"""
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001  # make middle component relatively small
    # no large means because the sparse version of randomized pca does not do
    # centering to avoid breaking the sparsity
    X = csr_matrix(X)

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DeprecationWarning)
        pca = RandomizedPCA(n_components=2, random_state=0).fit(X)
        assert_equal(len(w), 1)
        assert_equal(w[0].category, DeprecationWarning)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X.todense(), Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DeprecationWarning)
        pca = RandomizedPCA(n_components=2, whiten=True,
                            random_state=0).fit(X)
        assert_equal(len(w), 1)
        assert_equal(w[0].category, DeprecationWarning)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X.todense() - Y_inverse)
                          / np.abs(X).mean()).max()
    # XXX: this does not seem to work as expected:
    assert_almost_equal(relative_max_delta, 0.91, decimal=2)
def test_sparse_randomized_pca_inverse():
    """Test that RandomizedPCA is invertible on sparse data"""
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    # no large means because the sparse version of randomized pca does not do
    # centering to avoid breaking the sparsity
    X = csr_matrix(X)

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = RandomizedPCA(n_components=2, random_state=0)
    assert_warns(DeprecationWarning, pca.fit, X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X.toarray(), Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = assert_warns(
        DeprecationWarning,
        RandomizedPCA(n_components=2, whiten=True, random_state=0).fit, X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X.toarray() - Y_inverse)
                          / np.abs(X.toarray()).mean()).max()
    # XXX: this does not seem to work as expected:
    assert_almost_equal(relative_max_delta, 0.91, decimal=2)
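# RandomizedPCA was deprecated in scikit-learn 0.18 and later removed, which
# is the DeprecationWarning the two tests above assert on. For sparse input
# the closest modern replacement is TruncatedSVD, which also round-trips
# through inverse_transform. A minimal sketch of the same check against the
# current API (the tolerance is illustrative, not from the original tests):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
X[:, 1] *= 0.00001  # make the middle component relatively small
X = csr_matrix(X)

# TruncatedSVD accepts sparse input directly and, like RandomizedPCA,
# does not center the data, so sparsity is preserved.
svd = TruncatedSVD(n_components=2, random_state=0)
Y = svd.fit_transform(X)
X_back = svd.inverse_transform(Y)
assert np.allclose(X.toarray(), X_back, atol=1e-2)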
def open_img():
    x = filedialog.askopenfilenames(
        parent=root, initialdir='/', initialfile='tmp',
        filetypes=[("All files", "*")])
    img = Image.open(x[0])
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
    img = img.resize((250, 250), Image.LANCZOS)
    img = ImageTk.PhotoImage(img)
    panel = tk.Label(root, image=img)
    panel.image = img  # keep a reference so Tk does not garbage-collect it
    panel.grid(row=70, column=1)

    image = cv2.imread(x[0])
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.imwrite("grey.jpeg", gray)

    img = mpimg.imread("grey.jpeg")
    f = compo()  # number of components, chosen elsewhere
    ipca = RandomizedPCA(f)
    ipca.fit(img)
    img_c = ipca.transform(img)
    print(img_c.shape)
    temp = ipca.inverse_transform(img_c)
    print(temp.shape)
    cv2.imwrite("pca1.jpg", temp)
    print(np.sum(ipca.explained_variance_ratio_))
    plt.plot(np.cumsum(ipca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.savefig("graph.jpg")
def test_randomized_pca_inverse():
    """Test that RandomizedPCA is invertible on dense data"""
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = RandomizedPCA(n_components=2, random_state=0).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = RandomizedPCA(n_components=2, whiten=True, random_state=0).fit(X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X - Y_inverse) / np.abs(X).mean()).max()
    assert_almost_equal(relative_max_delta, 0.11, decimal=2)
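# The same dense round-trip against the current scikit-learn API, where the
# randomized solver is an option of PCA rather than a separate class. A
# minimal sketch mirroring the test above (the tolerance is illustrative):

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
X[:, 1] *= 0.00001  # make the middle component relatively small
X += [5, 4, 3]  # a large mean is fine here: PCA centers the data

pca = PCA(n_components=2, svd_solver='randomized', random_state=0).fit(X)
X_back = pca.inverse_transform(pca.transform(X))
assert np.allclose(X, X_back, atol=1e-2)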
def gap_statistic(x, random_datasets=64):
    """
    Returns the gap statistic of the data set.

    Keeps increasing the number of clusters until the gap statistic either
    falls below a configured fraction of the best value seen so far or stops
    improving for a configured number of clusters.

    http://blog.echen.me/2011/03/19/counting-clusters/
    """
    assert isinstance(x, np.ndarray)
    assert len(x.shape) == 2
    # compare the feature count (not the shape tuple) against the threshold
    if x.shape[1] > SETTINGS.GAP_STATISTIC.RANDOMIZED_PCA_THRESHOLD:
        pca = RandomizedPCA(SETTINGS.GAP_STATISTIC.RANDOMIZED_PCA_THRESHOLD)
    else:
        pca = PCA()
    pca.fit(x)
    transformed = pca.transform(x)
    reference_datasets = [
        pca.inverse_transform(generate_random_dataset(transformed))
        for _ in range(random_datasets)
    ]
    max_gap_statistic = -1
    best_num_clusters = 1
    for num_clusters in range(1, x.shape[0] + 1):
        kmeans = MiniBatchKMeans(num_clusters)
        kmeans.fit(x)
        trained_dispersion = dispersion(kmeans, x)
        random_dispersions = [
            dispersion(kmeans, data) for data in reference_datasets
        ]
        gap = (np.log(sum(random_dispersions) / random_datasets)
               - np.log(trained_dispersion))
        if gap > max_gap_statistic:
            max_gap_statistic = gap
            best_num_clusters = num_clusters
        if gap < max_gap_statistic * SETTINGS.GAP_STATISTIC.MAXIMUM_DECLINE:
            break
        if num_clusters > (best_num_clusters +
                           SETTINGS.GAP_STATISTIC.NUM_CLUSTERS_WITHOUT_IMPROVEMENT):
            break
    return best_num_clusters
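# The helpers generate_random_dataset and dispersion are not shown in the
# snippet above. A minimal sketch of what they plausibly do, following the
# gap-statistic recipe from the linked post; both bodies are assumptions,
# not the original implementations:

import numpy as np

def generate_random_dataset(transformed):
    # Hypothetical: draw a reference dataset uniformly within the per-feature
    # bounding box of the PCA-transformed data.
    mins, maxs = transformed.min(axis=0), transformed.max(axis=0)
    return np.random.uniform(mins, maxs, size=transformed.shape)

def dispersion(kmeans, data):
    # Hypothetical: within-cluster sum of squared distances to the assigned
    # centroids, the W_k of the gap-statistic paper.
    labels = kmeans.predict(data)
    centers = kmeans.cluster_centers_[labels]
    return ((data - centers) ** 2).sum()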
def callRandomizedPCA(X, n, type):
    # type = 1 for Energy data to avoid 1D plot, 2 for others
    rpca = RandomizedPCA(n_components=n)
    rpca.fit(X)
    transformed = rpca.transform(X)
    print("original shape: ", X.shape)
    print("transformed shape after Randomized PCA:", transformed.shape)
    X_recons = rpca.inverse_transform(transformed)
    print("reconstruct shape after Randomized PCA:", X_recons.shape)
    if type == 2:  # Gstore data
        myplot(transformed[:, 0:2], np.transpose(rpca.components_[0:2, :]))
        plt.show()
        myplot(X_recons[:, 0:2], np.transpose(rpca.components_[0:2, :]))
        plt.show()
    return transformed
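# The myplot helper called above is not shown. A hypothetical sketch
# consistent with its call signature (scores, transposed loadings): a
# standard PCA biplot that scatters the first two scores and overlays each
# feature's loading as an arrow. The body is an assumption, not the original.

import matplotlib.pyplot as plt

def myplot(score, coeff):
    xs, ys = score[:, 0], score[:, 1]
    # scale scores so points and loading arrows share roughly one frame
    plt.scatter(xs / (xs.max() - xs.min()), ys / (ys.max() - ys.min()), s=5)
    for i in range(coeff.shape[0]):
        plt.arrow(0, 0, coeff[i, 0], coeff[i, 1], color='r', alpha=0.5)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.grid()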
def PCA():  # note: shadows sklearn.decomposition.PCA if that name is imported
    img = mpimg.imread('imagep.png')
    print("Real Image Shape:", img.shape)
    a = img.shape[0]
    b = img.shape[1]
    c = img.shape[2]
    img_r = np.reshape(img, (a, b * c))  # one row per image row, RGB flattened
    print("Reshaped Image Shape:", img_r.shape)
    ipca = RandomizedPCA(1000).fit(img_r)
    img_c = ipca.transform(img_r)
    print(img_c.shape)
    print(np.sum(ipca.explained_variance_ratio_))
    temp = ipca.inverse_transform(img_c)
    temp = np.reshape(temp, (a, b, c))
    print(temp.shape, a, b, c)
    plt.axis('off')
    plt.imshow(temp)
    plt.show()
kmeans = KMeans(n_clusters=49, n_init=1)  # older sklearn spelled this k=49
if not os.path.exists(MODEL_NAME):
    print("Training on ", train_set_x.shape)
    print("Fitting PCA")
    X_tr = pca.fit_transform(X_tr)
    X_tst = pca.transform(X_tst)
    print("Fitted PCA")
    # Train KMeans on whitened data
    print("Transforming data")
    X_tr_white = X_tr  # already whitened by the fit_transform above
    print("Fitting KMEANS")
    kmeans.fit(X_tr_white)
    # map the centroids back to pixel space for visualisation
    filters_kmeans = pca.inverse_transform(kmeans.cluster_centers_)
else:
    pca, kmeans = pickle.load(open(MODEL_NAME, "rb"))

import matplotlib.pylab as plt

if Visualise:
    for i, f in enumerate(F):  # F: filters to draw, defined elsewhere
        plt.subplot(7, 7, i + 1)
        plt.imshow(f.reshape(ImageSideFinal, ImageSideFinal), cmap="gray")
        plt.axis("off")
    plt.show()

N = min(1000, test_set_x.shape[0])
x_plt, y_plt, clr_plt = [0] * int(N), [0] * int(N), [0] * int(N)
# range(2, 74) means it goes from col 2 to col 73
df_input_data = df_input[list(range(2, 74))]
df_input_target = df_input[list(range(0, 1))]
colors = numpy.random.rand(len(df_input_target))

# Randomized PCA
from sklearn.decomposition import RandomizedPCA
pca = RandomizedPCA(n_components=6)  # from optimal pca components chart, n_components=6
proj1 = pca.fit_transform(df_input_data)

# Relative weights on features
print(pca.explained_variance_ratio_)
print(pca.components_)

# Plotting
mpyplot.figure(1)
p1 = mpyplot.scatter(proj1[:, 0], proj1[:, 1], c=colors)
mpyplot.colorbar(p1)
mpyplot.show()

# Randomized PCA: project back to the original feature space with the
# inverse transform
proj2 = pca.inverse_transform(proj1)

# Plotting
mpyplot.figure(2)
# p1 = mpyplot.scatter(proj1[:, 0], proj1[:, 1], c=colors, alpha=0.2)
p2 = mpyplot.scatter(proj2[:, 0], proj2[:, 1], c=colors, alpha=0.8)
mpyplot.colorbar(p2)
mpyplot.show()
print('reshaping image into 2 dimensions for PCA')
img_r = np.reshape(img, (img.shape[0], img.shape[1] * img.shape[2]))
print(img_r.shape)

number_of_components = 64
print('transforming image with ' + str(number_of_components) + ' components')
ipca = RandomizedPCA(number_of_components).fit(img_r)
img_c = ipca.transform(img_r)
print('new shape of image after transformation')
print(img_c.shape)
print('Randomized PCA with 64 components:')
print(np.sum(ipca.explained_variance_ratio_))

print('inverting the transformation back to an image')
temp = ipca.inverse_transform(img_c)
print('reshaping back to three dimensions')
temp = np.reshape(temp, (img.shape[0], img.shape[1], img.shape[2]))
print(temp.shape)

m = interp1d([temp.min(), temp.max()], [0, 1])
print('rescaling image, this may take some time....')
for i in range(temp.shape[0]):
    for j in range(temp.shape[1]):
        for k in range(temp.shape[2]):
            temp[i][j][k] = float(m(temp[i][j][k]))

fig = plt.figure()
plt.axis('off')
plt.imshow(temp)
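# A vectorized alternative to the elementwise interp1d loop above: the same
# min-max rescaling to [0, 1] in one NumPy expression (this assumes temp is
# not constant, i.e. temp.max() > temp.min()).

temp = (temp - temp.min()) / (temp.max() - temp.min())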
plotImageGrid(X[sample_patches_ind, ...],
              image_size=(patch_size, patch_size, 3), nrow=6, ncol=6)
#plt.savefig('patches16.png')
plt.show()

# perform whitening
# 590 components = 99% explained variance
pca = RandomizedPCA(n_components=590, whiten=True, random_state=seed)
w_X = pca.fit_transform(X)
print("==== PCA fitted =====")
print("variance explained:")
print(pca.explained_variance_ratio_)

# plot whitened patches after inverse transform
orig_X = pca.inverse_transform(w_X[sample_patches_ind, ...])
plotImageGrid(orig_X, image_size=(patch_size, patch_size, 3), nrow=6, ncol=6)
#plt.savefig('patches_whitened16.png')
plt.show()

### KMEANS
k_means = cluster.KMeans(n_clusters=50, n_jobs=3)
k_means.fit(X)
print("==== K-Means fitted ====")

# get centroids and transform them to original space
#tmp = pca.inverse_transform(k_means.cluster_centers_.copy())
tmp = k_means.cluster_centers_.copy()
plotImageGrid(tmp, image_size=(16, 16, 3), nrow=5, ncol=10)
plt.savefig('centroids_nw.png')
                  cmap='bone')
plt.show()

# In the image above, you can check that at first the principal components
# capture the base face structure, then move on to face features like the
# nose, eyes, mouth, etc.

# Let's find the cumulative sum of the variance to determine how many PCs
# are suitable for our case
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components: ')
plt.ylabel('Cumulative explained variance: ')
plt.show()

# From the plot above, we can safely assume that using 150 PCs already
# retrieves ~90% of our variance.
# Let's see it by comparing the original image with the image using only
# 150 PCs
pca = RandomizedPCA(150).fit(faces.data)
components = pca.transform(faces.data)
pca_faces = pca.inverse_transform(components)

# Plot the results
fig, ax = plt.subplots(2, 10, figsize=(10, 2.5),
                       subplot_kw={'xticks': [], 'yticks': []},
                       gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i in range(10):
    dimensionH = faces.images.shape[1]
    dimensionW = faces.images.shape[2]
    ax[0, i].imshow(faces.data[i].reshape(dimensionH, dimensionW),
                    cmap='bone')
    ax[1, i].imshow(pca_faces[i].reshape(dimensionH, dimensionW),
                    cmap='bone')
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')
plt.show()

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

# compute the components and projected faces
pca = RandomizedPCA(150).fit(faces.data)
components = pca.transform(faces.data)
projected = pca.inverse_transform(components)

# plot the results
fig, ax = plt.subplots(2, 10, figsize=(10, 2.5),
                       subplot_kw={'xticks': [], 'yticks': []},
                       gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i in range(10):
    ax[0, i].imshow(faces.data[i].reshape(62, 47), cmap='binary_r')
    ax[1, i].imshow(projected[i].reshape(62, 47), cmap='binary_r')

ax[0, 0].set_ylabel('full-dim\ninput')
ax[1, 0].set_ylabel('150-dim\nreconstruction')
plt.show()
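# For a non-whitened (Randomized)PCA, inverse_transform is just a matrix
# product plus the re-added mean. A minimal sketch of that equivalence,
# reusing the names from the snippet above:

# project the 150-dim scores back onto the principal axes and re-add the
# per-feature mean that fit() subtracted
manual = components @ pca.components_ + pca.mean_
assert np.allclose(manual, projected)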
resolution = 50
alpha_X, beta_Y = np.meshgrid(
    np.linspace(np.min(path[:, 0]) - 1.0, np.max(path[:, 0]) + 1.0,
                resolution),
    np.linspace(np.min(path[:, 1]) - 1.0, np.max(path[:, 1]) + 1.0,
                resolution))
Esurface = np.zeros((resolution, resolution))
for alpha_idx in range(resolution):  # xrange in the Python 2 original
    for beta_idx in range(resolution):
        alpha, beta = (alpha_X[alpha_idx, beta_idx],
                       beta_Y[alpha_idx, beta_idx])
        # map the 2-D PCA coordinates back to a full parameter vector
        sigma = pca.inverse_transform(np.asarray((alpha, beta)))
        Esurface[alpha_idx, beta_idx] = run_net(net, sigma, ds,
                                                gradient_postproc)

errors = [
    run_net(net, pca.inverse_transform(path[epoch]), ds, gradient_postproc,
            learn=False)
    for epoch in range(epochs)
]

fig = plt.figure()
# fig.gca(projection='3d') was removed in newer Matplotlib
ax = fig.add_subplot(projection='3d')
ax.plot_surface(alpha_X, beta_Y, Esurface)
pfd_data = pfddata(os.path.join(input_dir, fn))
nbins = 64
data = pfd_data.getdata(subbands=nbins)
print("processing %s" % fn)

fig = plt.figure()
fig.set_size_inches(5, 5)
ax = plt.Axes(fig, [0., 0., 1., 1.])
ax = plt.gca()
ax.set_axis_off()
fig.add_axes(ax)

# denoise the subband matrix by projecting onto 24 components and back
pca = PCA(n_components=24)
rd = data.reshape(nbins, int(data.shape[0] / nbins))
pca.fit(rd)
data = pca.inverse_transform(pca.transform(rd)).flatten()
data = data.reshape((nbins, data.shape[0] // nbins))  # // for an int shape
plt.imshow(data, origin='lower', interpolation='bilinear',
           cmap=plt.cm.gray_r)
plt.savefig("%s_subbands.png" % os.path.join(output_dir, fn))

intervals_data = pfd_data.getdata(intervals=64)
fig = plt.figure()
fig.set_size_inches(5, 5)
ax = plt.Axes(fig, [0., 0., 1., 1.])
ax = plt.gca()
ax.set_axis_off()
              betweenss[kIdx] / totss * 100, marker='o', markersize=12,
              markeredgewidth=2, markeredgecolor='r', markerfacecolor='None')
ax.set_ylim((0, 100))
plt.grid(True)
plt.xlabel('Number of clusters')
plt.ylabel('Percentage of variance explained (%)')
plt.title('Elbow for KMeans clustering')

# show centroids for K=10 clusters
plt.figure()
for i in range(kIdx + 1):
    img = pca.inverse_transform(centroids[kIdx][i]).reshape(8, 8)
    ax = plt.subplot(3, 4, i + 1)
    ax.set_xticks([])
    ax.set_yticks([])
    plt.imshow(img, cmap=cm.gray)
    plt.title('Cluster %d' % i)

# compare K=10 clustering vs. actual digits (PCA projections)
fig = plt.figure()
ax = fig.add_subplot(121)
for i in range(10):
    ind = (t == i)
    ax.scatter(X[ind, 0], X[ind, 1], s=35, c=clr[i],