def do_RandomizedPCA(armadillo):
    #
    # TODO: Write code to import the libraries required for
    # RandomizedPCA. Then, train your RandomizedPCA on the armadillo
    # dataframe. Finally, drop one dimension (reduce it down to 2D)
    # and project the armadillo down to the 2D principal component
    # feature space.
    #
    # NOTE: Be sure to RETURN your projected armadillo!
    # (This projection is actually stored in a NumPy NDArray and
    # not a Pandas dataframe, which is something Pandas does for
    # you automatically. =)
    #
    # NOTE: SKLearn deprecated the RandomizedPCA method, but still
    # has instructions on how to use the randomized (truncated) method
    # for the SVD solver. To find out how to use it, check out the
    # full docs here:
    # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    #
    # .. your code here ..
    from sklearn.decomposition import RandomizedPCA

    rpca = RandomizedPCA(n_components=2)
    rpca.fit(armadillo)
    return rpca.transform(armadillo)
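# RandomizedPCA was deprecated in scikit-learn 0.18 and removed in 0.20, as the
# NOTE above hints. Below is a minimal sketch of the same projection against the
# modern API, assuming the same 3D armadillo dataframe; the function name
# do_randomized_pca_modern is hypothetical and not part of the original snippet.
def do_randomized_pca_modern(armadillo):
    from sklearn.decomposition import PCA

    # PCA with svd_solver='randomized' is the documented replacement for the
    # removed RandomizedPCA class.
    pca = PCA(n_components=2, svd_solver='randomized')
    pca.fit(armadillo)
    return pca.transform(armadillo)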
def pca2(Xtrain, Xtest):
    newTrain = []
    pca = RandomizedPCA(n_components=len(Xtrain[0]) - 30)
    pca.fit(Xtrain)
    newTrain = pca.transform(Xtrain)
    newTest = pca.transform(Xtest)
    return newTrain, newTest
def pca_knn(train, test):
    y = []
    Xtrain, ytrain, Xtest, ytest = loadData(train, test)

    # PCA, fit and transform
    pca = RandomizedPCA(n_components=200)
    pca.fit(Xtrain)
    Xtrain = pca.transform(Xtrain)
    new_Xtest = pca.transform(Xtest)

    # Make classifier
    clf = KNeighborsClassifier(n_neighbors=3)
    clf.fit(Xtrain, ytrain)
    y = clf.predict(new_Xtest)

    #y1 = clf.predict(Xtrain)
    #terror = test_error(ytrain, y1)
    #print "training error for KNN, k=3:"
    #print terror

    error = test_error(ytest, y)
    print "test error for KNN, k=3:"
    print error
    print "\\\\\\\\\\\\\\\\"
    return y
def compute_pca(reception_stats, n_components=5):
    reception_mean = reception_stats.mean(axis=0)
    pca = RandomizedPCA(n_components - 1)
    pca.fit(reception_stats)
    pca_components = np.vstack([reception_mean, pca.components_])
    return pca, pca_components
def train_data():
    n_components = 256
    pca = RandomizedPCA(n_components=n_components, whiten=True)
    clf = svm.SVC(kernel='rbf', C=5., gamma=0.001)

    train_directory = 'dataset/real_train'
    images, labels = prepare_dataset(train_directory)

    training_data = []
    for i in range(len(images)):
        training_data.append(images[i].flatten())

    print("% shape of training data => ", np.array(training_data).shape)
    print('labels =>', np.array(labels).shape)

    pca.fit(np.array(training_data))
    transformed = pca.transform(np.array(training_data))

    filename = 'models/pca_model.sav'
    pickle.dump(pca, open(filename, 'wb'))

    print("% shape of transformed data => ", transformed.shape)

    clf.fit(transformed, np.array(labels))
    filename = 'models/svm_model.sav'
    pickle.dump(clf, open(filename, 'wb'))
def open_img():
    x = filedialog.askopenfilenames(
        parent=root, initialdir='/', initialfile='tmp',
        filetypes=[("All files", "*")])
    img = Image.open(x[0])
    img = img.resize((250, 250), Image.ANTIALIAS)
    img = ImageTk.PhotoImage(img)
    panel = tk.Label(root, image=img)
    panel.image = img
    panel.grid(row=70, column=1)

    image = cv2.imread(x[0])
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.imwrite("grey.jpeg", gray)
    img = mpimg.imread("grey.jpeg")

    f = compo()
    ipca = RandomizedPCA(f)
    ipca.fit(img)
    img_c = ipca.transform(img)
    print(img_c.shape)
    temp = ipca.inverse_transform(img_c)
    print(temp.shape)
    cv2.imwrite("pca1.jpg", temp)

    print(np.sum(ipca.explained_variance_ratio_))
    plt.plot(np.cumsum(ipca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.savefig("graph.jpg")
def main():
    X_train, X_test, y_train, y_test, y_encoder = get_binary_encoded_xy_split(5000)

    # reduce 1000 X 1024 dimensions to 11 (number of X columns before label binarization in table)
    X_train_randPCA = RandomizedPCA()
    X_train_randPCA.fit(X_train)
    print("pca fit")

    X_train_reduced = X_train_randPCA.transform(X_train)
    X_test_reduced = X_train_randPCA.transform(X_test)
    print("Reduced components")

    print("Begin classifier")
    clf = GradientBoostingClassifier(n_estimators=200, max_depth=4,
                                     learning_rate=0.1, random_state=1)
    print(y_train.shape, y_test.shape)
    print(y_encoder.classes_)
    print(y_encoder.transform(["Accident"]))
    print(np.where(y_encoder.classes_ == "Accident"))
    clf.fit(X_train_reduced, y_train[:, np.where(y_encoder.classes_ == "Accident")[0]])
    print("Fitted")
    print("_" * 80)

    feature_vals = y_encoder.transform(y_encoder.classes_)
    feature_labels = y_encoder.classes_
    print(feature_vals)
    print(feature_labels)

    # Partial dependence must be evaluated on the same (reduced) feature space
    # the classifier was fitted on.
    fig, axs = plot_partial_dependence(clf, X_train_reduced, [0, 1],
                                       n_jobs=4, grid_resolution=100)
    plt.show()
def pca(self, y):
    # select a random subset of Y dimensions (possibly gives robustness as well as speed)
    rand_dims = np.sort(np.random.choice(
        y.shape[1],
        np.minimum(self.tree_params['num_dims_for_pca'], y.shape[1]),
        replace=False))
    y_dim_subset = y.take(rand_dims, 1)

    pca = RandomizedPCA(n_components=1)  # compute for all components

    # optional: select a subset of exs (not so important if PCA is fast)
    if self.tree_params['sub_sample_exs_pca']:
        rand_exs = np.sort(np.random.choice(
            y.shape[0],
            np.minimum(self.tree_params['num_exs_for_pca'], y.shape[0]),
            replace=False))
        pca.fit(y_dim_subset.take(rand_exs, 0))
        return pca.transform(y_dim_subset)
    else:
        # perform PCA
        return pca.fit_transform(y_dim_subset)
def rpca(train_X, test_X, n):
    start_time = time.time()
    pca = RandomizedPCA(n_components=n)
    pca.fit(train_X.toarray())
    train_X_pca = pca.transform(train_X.toarray())
    test_X_pca = pca.transform(test_X.toarray())
    print("--- %s seconds ---" % (time.time() - start_time))
    return pca, train_X_pca, test_X_pca
def compute_PCA(n_components=5):
    spec_mean = spectra.mean(axis=0)
    print spec_mean.shape

    # Randomized PCA is faster (according to astroML):
    pca = RandomizedPCA(n_components - 1)
    pca.fit(spectra)
    pca_components = np.vstack([spec_mean, pca.components_])
    return pca_components
def RPCA(model_data, components=None, transform_data=None):
    t0 = time()
    rpca = RandomizedPCA(n_components=components)
    if transform_data is None:
        projection = rpca.fit_transform(model_data)
    else:
        rpca.fit(model_data)
        projection = rpca.transform(transform_data)
    print "Randomized PCA Explained Variance: ", rpca.explained_variance_ratio_
    print "Randomized PCA Time: %0.3f" % (time() - t0)
    return projection
def pca_knn(train, test):
    y = []
    Xtrain, ytrain, Xtest, ytest = load_data(train, test)

    dim_red = RandomizedPCA(n_components=43)
    dim_red.fit(Xtrain)
    Rtrain = dim_red.transform(Xtrain)
    Rtest = dim_red.transform(Xtest)

    clf = KNeighborsClassifier(n_neighbors=knn_para[2], weights='distance')
    clf.fit(X=Rtrain, y=ytrain)
    y = clf.predict(X=Rtest)
    #print(1 - clf.score(X=Rtest, y=ytest))
    return y
def pca_test(img_kind):
    import pylab as pl
    from mpl_toolkits.mplot3d import Axes3D

    subdir = "data/"
    classes = []
    data = []

    the_ones = glob.glob(subdir + "f_" + img_kind + "*.jpg")
    all_of_them = glob.glob(subdir + "f_*_*.jpg")
    the_others = []
    for x in all_of_them:
        if the_ones.count(x) < 1:
            the_others.append(x)

    for x in the_ones:
        classes.append(1)
        data.append(get_image_features(cv.LoadImageM(x)))
    for x in the_others:
        classes.append(-1)
        data.append(get_image_features(cv.LoadImageM(x)))

    pca = PCA(46, whiten=True)
    print 'fitting'
    pca.fit(data)
    print 'transforming'
    X_r = pca.transform(data)
    print '----'
    print X_r.shape

    x0 = [x[0] for x in X_r]
    x1 = [x[1] for x in X_r]

    pl.figure()
    for i in xrange(0, len(x0)):
        if classes[i] == 1:
            pl.scatter(x0[i], x1[i], c='r')
        else:
            pl.scatter(x0[i], x1[i], c='b')
    # for c, i, target_name in zip("rg", [1, -1], target_names):
    #     pl.scatter(X_r[classes == i, 0], X_r[classes == i, 1], c=c, label=target_name)
    pl.legend()
    pl.title('PCA of dataset')
    pl.show()
def pca_svm(train, test):
    y = []
    Xtrain, ytrain, Xtest, ytest = load_data(train, test)

    dim_red = RandomizedPCA(n_components=50)
    dim_red.fit(Xtrain)
    Rtrain = dim_red.transform(Xtrain)
    Rtest = dim_red.transform(Xtest)

    clf = SVC(kernel='poly', C=1, gamma=0.02)
    clf.fit(X=Rtrain, y=ytrain)
    y = clf.predict(X=Rtest)
    #print(1 - clf.score(X=Rtest, y=ytest))
    return y
def bootstrap_pc(seed):
    np.random.seed(seed)
    b = np.copy(zscored)
    nrows, ncols = b.shape
    for i in range(ncols):
        b[:, i] = b[:, i][np.random.permutation(nrows)]
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        pca = RandomizedPCA(n_components=1)
        pca.fit(b)
    return pca.explained_variance_[0]
def transform_PCA(k, train_X, test_X):
    pca = RandomizedPCA(n_components=k)
    pca.fit(train_X)

    # Transform test data with principal components:
    X_reduced = pca.transform(test_X)
    # Reconstruct:
    X_rec = np.dot(X_reduced, pca.components_)
    # Restore mean:
    X_rec += pca.mean_
    return X_rec
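# For reference only (not part of the original snippet): the manual
# reconstruction above matches what the estimator's own inverse_transform does
# for a non-whitened PCA, i.e. a dot product with components_ plus mean_.
# A minimal equivalent sketch; the function name is hypothetical.
def transform_PCA_via_inverse(k, train_X, test_X):
    pca = RandomizedPCA(n_components=k)
    pca.fit(train_X)
    # Project the test data and reconstruct it in one round trip.
    return pca.inverse_transform(pca.transform(test_X))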
def pca_test(X):
    # Returns the best value of n, i.e. the number of components needed for
    # the explained-variance criterion (as in Ng's lectures) to reach 99%.
    pca = RandomizedPCA()
    pca.fit(X)
    n_components = X.shape[1]
    for n in range(10, X.shape[1], 5):
        s = sum(pca.explained_variance_ratio_[:n])
        if s >= 0.99:
            n_components = n
            print n
            #print "%d is best for pca" % n_components
            break
    #pca.set_params(n_components=n_components)
    return n_components
def pca_knn(train, test):
    fid = open(train)
    tid = open(test)
    for line in fid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        train_label.append(m[0])
        train_data.append(m[1:])
    for line in tid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        test_real_label.append(m[0])
        test_data.append(m[1:])

    pca = RandomizedPCA(n_components=5)
    pca.fit(train_data)
    train_data_5 = pca.transform(train_data)
    test_data_5 = pca.transform(test_data)

    count = 0
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(train_data_5, train_label)
    y1 = neigh.predict(test_data_5)
    for i in range(2007):
        if int(float(y1[i])) == test_real_label[i]:
            count += 1
    acc1 = count * 1.0 / 2007

    pca = RandomizedPCA(n_components=20)
    pca.fit(train_data)
    train_data_20 = pca.transform(train_data)
    test_data_20 = pca.transform(test_data)

    count = 0
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(train_data_20, train_label)
    y2 = neigh.predict(test_data_20)
    for i in range(2007):
        if int(float(y2[i])) == test_real_label[i]:
            count += 1
    acc2 = count * 1.0 / 2007

    y = y2
    # acc1 = 0.7777, acc2 = 0.9337
    return y
def run_stuff():
    dataset = refactor_labels(get_data(
        "C:\\Users\\user\\PycharmProjects\\AnxietyClassifier(2)\\Alls_data_NO_specific_vars_corr.xlsx",
        "Sheet1"), "group")
    dataset = imputing_avarage(dataset)
    features_df = dataset.drop(['Age', 'group', 'PHQ9', 'Subject_Number'], 1)
    X = features_df.values
    X = StandardScaler().fit_transform(X)
    #X = array[:,3:116]

    pca = RandomizedPCA(50)
    pca.fit(X)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()
def _pca(hosts, dim=2):
    """
    Principal component analysis
    Reduce the numpy-matrix hosts to a dim-dimensional vector space
    """
    pca = RandomizedPCA(n_components=dim)
    pca.fit(hosts)

    # Return most discriminating values by axis.
    pca_indexes = []
    for ind in xrange(dim):
        vect = pca.components_[ind]
        pca_indexes.append([(idx, vect[idx]) for idx in (-abs(vect)).argsort()])
    return (pca, pca_indexes)
def do_RandomizedPCA(armadillo):
    # Import the library required for RandomizedPCA.
    from sklearn.decomposition import RandomizedPCA

    # Train the RandomizedPCA on the armadillo dataframe, then drop one
    # dimension and project the armadillo down to the 2D principal
    # component feature space.
    rpca = RandomizedPCA(n_components=2)
    rpca.fit(armadillo)
    RRarmadillo = rpca.transform(armadillo)
    return RRarmadillo
def pca_svm(train, test):
    fid = open(train)
    tid = open(test)
    for line in fid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        train_label.append(m[0])
        train_data.append(m[1:])
    for line in tid:
        line = line.strip()
        m = [int(float(x)) for x in line.split(' ')]
        test_real_label.append(m[0])
        test_data.append(m[1:])

    pca = RandomizedPCA(n_components=5)
    pca.fit(train_data)
    train_data_5 = pca.transform(train_data)
    test_data_5 = pca.transform(test_data)

    trained_model = SVC(C=100, kernel='rbf', degree=3, gamma=0.01)
    trained_model.fit(train_data_5, train_label)
    count = 0
    y1 = trained_model.predict(test_data_5)
    for i in range(2007):
        if int(float(y1[i])) == test_real_label[i]:
            count += 1
    acc1 = count * 1.0 / 2007

    pca = RandomizedPCA(n_components=20)
    pca.fit(train_data)
    train_data_20 = pca.transform(train_data)
    test_data_20 = pca.transform(test_data)

    trained_model = SVC(C=100, kernel='rbf', degree=3, gamma=0.01)
    trained_model.fit(train_data_20, train_label)
    count = 0
    y2 = trained_model.predict(test_data_20)
    for i in range(2007):
        if int(float(y2[i])) == test_real_label[i]:
            count += 1
    acc2 = count * 1.0 / 2007

    y = y2
    # acc1 = 0.7997, acc2 = 0.9417
    return y
def train(directory):
    images, labels = prepare_dataset(directory)

    n_components = 10
    pca = RandomizedPCA(n_components=n_components, whiten=True)

    param_grid = {
        'C': [1e3, 5e3, 1e4, 5e4, 1e5],
        'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1],
    }
    clf = GridSearchCV(
        SVC(kernel='rbf', class_weight='auto', probability=True), param_grid)

    testing_data = []
    for i in range(len(images)):
        print images[i].flatten().shape
        testing_data.append(images[i].flatten())

    pca = pca.fit(testing_data)
    transformed = pca.transform(testing_data)

    clf.fit(transformed, labels)
    scores = cross_val_score(clf, transformed, labels, cv=5)
    print("Mean cross-validation accuracy")
    print(sum(scores) / 5)

    joblib.dump(clf, "svm.pkl")
    joblib.dump(pca, "pca.pkl")
def do_RandomizedPCA(armadillo):
    #
    # TODO: Write code to import the libraries required for RandomizedPCA. Then, train your RandomizedPCA on the armadillo
    # dataframe. Finally, drop one dimension (reduce it down to 2D) and project the armadillo down to the 2D principal component
    # feature space.
    #
    # NOTE: Be sure to RETURN your projected armadillo!
    # (This projection is actually stored in a NumPy NDArray and not a Pandas dataframe, which is something Pandas does for
    # you automatically. =)
    #
    # .. your code here ..
    from sklearn.decomposition import RandomizedPCA

    rpca = RandomizedPCA(n_components=2)
    rpca.fit(armadillo)
    R = rpca.transform(armadillo)
    return R
def gap_statistic(x, random_datasets=64):
    """
    Returns the gap statistic of the data set.
    Keeps increasing the number of clusters until the maximum gap statistic
    is more than double the current gap statistic.
    http://blog.echen.me/2011/03/19/counting-clusters/
    """
    assert isinstance(x, np.ndarray)
    assert len(x.shape) == 2

    # Use randomized PCA when the number of features exceeds the threshold.
    if x.shape[1] > SETTINGS.GAP_STATISTIC.RANDOMIZED_PCA_THRESHOLD:
        pca = RandomizedPCA(SETTINGS.GAP_STATISTIC.RANDOMIZED_PCA_THRESHOLD)
    else:
        pca = PCA()
    pca.fit(x)
    transformed = pca.transform(x)
    reference_datasets = [
        pca.inverse_transform(generate_random_dataset(transformed))
        for _ in range(random_datasets)
    ]

    max_gap_statistic = -1
    best_num_clusters = 1
    for num_clusters in range(1, x.shape[0] + 1):
        kmeans = MiniBatchKMeans(num_clusters)
        kmeans.fit(x)
        trained_dispersion = dispersion(kmeans, x)
        random_dispersions = [
            dispersion(kmeans, data) for data in reference_datasets
        ]
        gap_statistic = np.log(sum(random_dispersions) / random_datasets) - np.log(trained_dispersion)
        if gap_statistic > max_gap_statistic:
            max_gap_statistic = gap_statistic
            best_num_clusters = num_clusters
        if gap_statistic < max_gap_statistic * SETTINGS.GAP_STATISTIC.MAXIMUM_DECLINE:
            break
        if num_clusters > best_num_clusters + SETTINGS.GAP_STATISTIC.NUM_CLUSTERS_WITHOUT_IMPROVEMENT:
            break
    return best_num_clusters
def perform_weighted_PCA(data, weights, max_components=200):
    """
    Performs Weighted PCA on the data

    Parameters
    ----------
    data : (Num_Features x Num_Samples) numpy.ndarray (or subclass)
        Matrix containing data to project into 2 dimensions
    weights : (Num_Features x Num_Samples) numpy.ndarray (or subclass)
        Matrix containing weights to use for each coordinate in data
    max_components : int
        Maximum number of components to calculate

    Returns
    -------
    pca_data : (Num_Components x Num_Samples) numpy.ndarray
        Data transformed using PCA.  Num_Components = Num_Samples
    """
    np.random.seed(RANDOM_SEED)

    proj_data = data

    # Weighted means
    wmean = np.sum(proj_data * weights, axis=1) / np.sum(weights, axis=1)
    wmean = wmean.reshape((wmean.size, 1))

    data_centered = proj_data - wmean
    weighted_data_centered = data_centered * weights

    wcov = np.dot(weighted_data_centered, weighted_data_centered.T) / np.dot(weights, weights.T)
    wcov[np.isnan(wcov)] = 0.0  # Need this when weight dot product is zero

    model = RandomizedPCA(n_components=min(proj_data.shape[0], proj_data.shape[1], max_components))
    model.fit(wcov)
    e_vec = model.components_

    wpca_data = np.dot(e_vec, data_centered)
    e_val = np.var(wpca_data, axis=1)
    total_var = np.sum(np.var(proj_data, axis=1))
    e_val /= total_var

    return wpca_data, e_val, e_vec.T
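# A minimal usage sketch for perform_weighted_PCA above, assuming it is called
# with features-by-samples arrays as the docstring describes. The array shapes
# and uniform weights are illustrative placeholders, not values from the
# original project (which defines RANDOM_SEED elsewhere).
def _weighted_pca_example():
    data = np.random.rand(50, 200)      # 50 features x 200 samples
    weights = np.ones_like(data)        # uniform weights reduce to ordinary PCA
    wpca_data, e_val, e_vec = perform_weighted_PCA(data, weights, max_components=10)
    print(wpca_data.shape)              # (10, 200): components x samples
    print(e_val.shape, e_vec.shape)     # (10,) and (50, 10)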
def callRandomizedPCA(X, n, type):
    # type = 1 for Energy data to avoid 1D plot, 2 for others
    rpca = RandomizedPCA(n_components=n)
    rpca.fit(X)
    transformed = rpca.transform(X)
    print("original shape: ", X.shape)
    print("transformed shape after Randomized PCA:", transformed.shape)

    X_recons = rpca.inverse_transform(transformed)
    print("reconstruct shape after Randomized PCA:", X_recons.shape)

    if type == 2:  # Gstore data
        myplot(transformed[:, 0:2], np.transpose(rpca.components_[0:2, :]))
        plt.show()
        myplot(X_recons[:, 0:2], np.transpose(rpca.components_[0:2, :]))
        plt.show()
    return transformed
class RandomizedPCAReduction(AbstractReduction):
    """
    Use Randomized PCA to reduce dimensionality
    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.RandomizedPCA.html
    """

    def __init__(self, n_components, **kwargs):
        self.pca = RandomizedPCA(n_components=n_components, **kwargs)

    def n_components(self):
        # Report the number of components the underlying estimator was built with.
        return self.pca.n_components

    def fit(self, X):
        self.pca.fit(X)

    def transform(self, X):
        return self.pca.transform(X)
def pca_knn(train, test):
    y = []
    xTrain, yTrain = loadData(train)
    xTest, yTest = loadData(test)

    for i in [32, 64, 128]:
        print "n_components", i
        pca = RandomizedPCA(n_components=i, random_state=1)
        pca.fit(xTrain)
        reducedXTrain = pca.transform(xTrain)
        reducedXTest = pca.transform(xTest)

        kNN = KNeighborsClassifier(n_neighbors=4, weights='distance')
        kNN.fit(reducedXTrain, yTrain)
        y = kNN.predict(reducedXTest)

        testError = 1 - kNN.score(reducedXTest, yTest)
        print 'Test error: ', testError
        print "sum of explained_variance_ratio_", pca.explained_variance_ratio_.sum()
    return y
def fit_and_save_pca(np_array, savepath):
    if parameters['pca']['subsample_length'] < np_array.shape[0]:
        idxs = np.random.choice(np_array.shape[0],
                                parameters['pca']['subsample_length'],
                                replace=False)
        np_array = np_array[idxs]

    # fit the pca model
    # NOTE that by setting copy=False, we overwrite the input data in fitting.
    # This helps on memory but could cause issues if this function is reused elsewhere.
    pca = RandomizedPCA(n_components=parameters['pca']['number_dims'], copy=False)
    pca.fit(np_array)

    with open(savepath, 'wb') as f:
        pickle.dump(pca, f, pickle.HIGHEST_PROTOCOL)