def tryLinearDiscriminantAnalysis(goFast):
    from sklearn.datasets import dump_svmlight_file, load_svmlight_file
    if goFast:
        training_data, training_labels = load_svmlight_file("dt1_1500.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1_1500.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1_1500.tst.svm", n_features=253659, zero_based=True)
    else:
        training_data, training_labels = load_svmlight_file("dt1.trn.svm", n_features=253659, zero_based=True)
        validation_data, validation_labels = load_svmlight_file("dt1.vld.svm", n_features=253659, zero_based=True)
        testing_data, testing_labels = load_svmlight_file("dt1.tst.svm", n_features=253659, zero_based=True)

    from sklearn.lda import LDA
    from sklearn.metrics import accuracy_score
    from sklearn.grid_search import ParameterGrid
    from sklearn.decomposition import RandomizedPCA

    rpcaDataGrid = [{"n_components": [10, 45, 70, 100],
                     "iterated_power": [2, 3, 4],
                     "whiten": [True]}]

    for rpca_parameter_set in ParameterGrid(rpcaDataGrid):
        rpcaOperator = RandomizedPCA(**rpca_parameter_set)
        rpcaOperator.fit(training_data, training_labels)
        new_training_data = rpcaOperator.transform(training_data, training_labels)
        new_validation_data = rpcaOperator.transform(validation_data, validation_labels)
        ldaOperator = LDA()
        ldaOperator.fit(new_training_data, training_labels)
        print "Score = " + str(accuracy_score(validation_labels, ldaOperator.predict(new_validation_data)))
def test_randomized_pca_check_list():
    """Test that the projection by RandomizedPCA on list data is correct"""
    X = [[1.0, 0.0], [0.0, 1.0]]
    X_transformed = RandomizedPCA(n_components=1,
                                  random_state=0).fit(X).transform(X)
    assert_equal(X_transformed.shape, (2, 1))
    assert_almost_equal(X_transformed.mean(), 0.00, 2)
    assert_almost_equal(X_transformed.std(), 0.71, 2)
def make_pca_datapoints(terms_map, stopwords, clusters):
    new_terms_map = {}
    raw_data = []
    target = []
    for line in open(tweets_file):
        tokens = line.split()
        terms = [terms_map[int(term)] for term in tokens[3].split(',')
                 if terms_map[int(term)] not in stopwords]
        for term in terms:
            if not term in new_terms_map:
                new_terms_map[term] = len(new_terms_map)
        new_term_ids = [new_terms_map[term] for term in terms]
        tags = [terms_map[int(term)] for term in tokens[4].split(',')]
        raw_data.append(new_term_ids)
        target.append(tags)

    data = lil_matrix((len(raw_data), len(new_terms_map)))
    count = 0
    for cur_vector in raw_data:
        for point in cur_vector:
            data[(count, point)] += 1
        count += 1

    pca = RandomizedPCA(n_components=100)
    transformed_data = pca.fit_transform(data)

    xs = []
    ys = []
    count = 0
    for datum in transformed_data:
        for tag in target[count]:
            if (len(tag) > 1) and tag[1:] in clusters:
                xs.append(datum)
                ys.append(clusters[tag[1:]])
        count += 1
    del transformed_data
    return xs, ys
def build_classifier(train_data_x_in, train_data_y, classifier_in="svc_basic"):
    print "Attempting to build classifier."
    train_data_x = train_data_x_in
    transformer = ""
    # classifier = grid_search.GridSearchCV(svm.SVC(), parameters).fit(train_data_x, train_data_y)
    if classifier_in == "svc_basic":
        classifier = svm.SVC()
        print "Selection was basic svm.SVC."
    elif classifier_in == "svc_extensive":
        classifier = svm.SVC(kernel="linear", C=0.025, gamma=0.01)
        print "Selection was extensive svm.SVC, with linear kernel, C==0.025 and gamma==0.01."
    elif classifier_in == "kneighbors_basic":
        transformer = RandomizedPCA(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        classifier = KNeighborsClassifier()
        print "Selection was KNeighbors basic, using RandomizedPCA to transform data first. n_components==2000."
    elif classifier_in == "bagging_basic":
        classifier = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
        print "Selection was Bagging basic, with max_samples==0.5 and max_features==0.5."
    elif classifier_in == "spectral_basic":
        transformer = SpectralEmbedding(n_components=2000)
        train_data_x = transformer.fit_transform(train_data_x)
        classifier = KNeighborsClassifier()
        print "Selection was Spectral basic, using svm.SVC with Spectral data fitting. n_components==2000."
    # default to SVC in case of any sort of parsing error.
    else:
        print "Error in selecting classifier class. Reverting to SVC."
        classifier = svm.SVC()
    classifier.fit(train_data_x, train_data_y)
    print "Doing classifier estimation."
    return classifier, train_data_x, transformer
def _prepare_pca(self, data, max_n_components):
    """Helper Function"""
    from sklearn.decomposition import RandomizedPCA

    # sklearn < 0.11 does not support random_state argument
    kwargs = {'n_components': max_n_components, 'whiten': False}

    aspec = inspect.getargspec(RandomizedPCA.__init__)
    if 'random_state' not in aspec.args:
        warnings.warn('RandomizedPCA does not support random_state '
                      'argument. Use scikit-learn version 0.11 '
                      'or newer to get reproducible results.')
    else:
        kwargs['random_state'] = 0

    pca = RandomizedPCA(**kwargs)
    pca_data = pca.fit_transform(data.T)

    if self._explained_var > 1.0:
        if self.n_components is not None:  # normal n case
            self._comp_idx = np.arange(self.n_components)
            to_ica = pca_data[:, self._comp_idx]
        else:  # None case
            to_ica = pca_data
            self.n_components = pca_data.shape[1]
            self._comp_idx = np.arange(self.n_components)
    else:  # float case
        expl_var = pca.explained_variance_ratio_
        self._comp_idx = np.where(expl_var.cumsum() < self._explained_var)[0]
        to_ica = pca_data[:, self._comp_idx]
        self.n_components = len(self._comp_idx)

    return to_ica, pca
def test_explained_variance():
    """Check that PCA output has unit-variance"""
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=42).fit(X)
    assert_array_almost_equal(pca.explained_variance_,
                              rpca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 3)

    # compare to empirical variances
    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_,
                              np.var(X_pca, axis=0))

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_,
                              np.var(X_rpca, axis=0))

    # Compare with RandomizedPCA using sparse data
    X = csr_matrix(X)
    rpca = assert_warns(DeprecationWarning, rpca.fit, X)
    assert_array_almost_equal(pca.explained_variance_,
                              rpca.explained_variance_, 1)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 3)
def rpca(numpy_file='../data/Paintings/two_class/Paintings_train.csv'):
    """Performs randomized PCA on a given numpy file.

    Given a numpy file of n-rows and n-cols, where the last column is
    the label and the rest are features, n-rows are the samples.

    :type numpy_file: string
    :param numpy_file: The file name of the numpy file to be analyzed.
    """
    import numpy as np
    import matplotlib.pyplot as pl
    import pandas as pd
    from sklearn.decomposition import RandomizedPCA

    all_data = np.loadtxt(numpy_file, delimiter=',')
    data = all_data[:, :-1]
    y = all_data[:, -1]
    pca = RandomizedPCA(n_components=2)
    X = pca.fit_transform(data)
    df = pd.DataFrame({"x": X[:, 0], "y": X[:, 1],
                       "label": np.where(y == 1, "realism", "abstract")})
    colors = ["red", "yellow"]
    for label, color in zip(df['label'].unique(), colors):
        mask = df['label'] == label
        pl.scatter(df[mask]['x'], df[mask]['y'], c=color, label=label)
    pl.legend()
    pl.title('Randomized PCA analysis')
    pl.show()
def SVM(X_train, y_train, X_test): print("SVM with PCA of rbf, writening all on, no normalize") preprocessing.normalize(X_train, 'max') preprocessing.normalize(X_test, 'max') #preprocessing.robust_scale(X, axis=1, with_centering = True) #bad X_train = equalize_hist(X_train) X_test = equalize_hist(X_test) '''X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=TRAIN_TEST_SPLIT_RATIO)''' n_components = 147 print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train) print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done ") param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) classifier13.fit(X_train_pca, y_train) return list(classifier13.predict(X_test_pca))
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("-i", "--image", required=True, help="Path to the image")
    args = vars(ap.parse_args())

    image = cv2.imread(args["image"])
    rects, img = detect(image)

    cropped = []
    for idx, (x1, y1, x2, y2) in enumerate(rects):
        crop_img = image[y1:y1 + (y2 - y1), x1:x1 + (x2 - x1)]
        crop_img = cv2.resize(crop_img, (100, 100), interpolation=cv2.INTER_AREA)
        cv2.imshow("image" + str(idx), crop_img)
        new_img = crop_img.reshape(crop_img.shape[0] * crop_img.shape[1], 3)
        cropped.append(new_img.flatten())

    # reduce feature size
    cropped_pca = []
    pca = RandomizedPCA(n_components=100)
    cropped_pca = pca.fit_transform(cropped)

    # training (hardcoded for now)
    clf = SVC(probability=True)
    train = cropped_pca[:7]
    test = cropped_pca[7:13]
    # clf.fit([[0,0],[1,1]], [1, 2])
    clf.fit(train, [1, 2, 2, 1, 2, 1, 1])

    for item in test:
        print clf.predict_proba(item)
        print clf.predict(item)

    cv2.waitKey(0)
def SVM(X_data, y_data): X_data = equalize_hist(X_data) preprocessing.normalize(X_data, 'max') preprocessing.scale(X_data, axis=1) # preprocessing.normalize(X_data, 'max') # X_data = equalize_hist(X_data) # divide our data set into a training set and a test set X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_data, y_data, test_size=TRAIN_TEST_SPLIT_RATIO) n_components = 120 print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done ") param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } classifier = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) classifier.fit(X_train_pca, y_train) print("====== PCA 150 ========") print('TRAIN SCORE', classifier.score(X_train_pca, y_train)) print('TEST SCORE', classifier.score(X_test_pca, y_test))
def detect(self, imageURLs, params): array = [] for param in params: img = self.img_to_matrix(param['imageURL']) data = self.flatten_image(img) array.append(data) array = np.array(array) pca = RandomizedPCA(n_components=5) n_data = pca.fit_transform(array) clf = joblib.load('src/resource/models/model.pkl') result = clf.predict(n_data).tolist() for param, r in zip(params, result): raw_img = urllib2.urlopen(param['imageURL']).read() if r == 1: cntr = len([i for i in os.listdir("test/images/rain/") if 'rain' in i]) + 1 path = "static/images/rain_" + str(cntr) + '.jpg' f = open(path, 'wb') f.write(raw_img) f.close() # イベント情報作成 when = {'type': 'timestamp', 'time':param['time']} where = { "type": "Point", "coordinates": [param['longitude'], param['latitude']]} what = {'topic': {'value':u'雨'}, 'tweet': param['value']} who = [{"type": "url", "value": param['imageURL']}, {"value": "evwh <*****@*****.**>", "type": "author"}] event = {'observation':{'what': what, 'when': when, 'where': where, 'who': who}} self.connection['event']['TwitterImageRainSensor'].insert(event)
def test_sparse_randomized_pca_inverse():
    """Test that RandomizedPCA is inversible on sparse data"""
    rng = np.random.RandomState(0)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    # no large means because the sparse version of randomized pca does not do
    # centering to avoid breaking the sparsity
    X = csr_matrix(X)

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    pca = RandomizedPCA(n_components=2, random_state=0)
    assert_warns(DeprecationWarning, pca.fit, X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    assert_almost_equal(X.todense(), Y_inverse, decimal=2)

    # same as above with whitening (approximate reconstruction)
    pca = assert_warns(DeprecationWarning,
                       RandomizedPCA(n_components=2, whiten=True,
                                     random_state=0).fit, X)
    Y = pca.transform(X)
    Y_inverse = pca.inverse_transform(Y)
    relative_max_delta = (np.abs(X.todense() - Y_inverse) /
                          np.abs(X).mean()).max()
    # XXX: this does not seem to work as expected:
    assert_almost_equal(relative_max_delta, 0.91, decimal=2)
def pca_data(test_x, train_x, params):
    print 'pcaing data ...'
    components = int(params['components'])
    pca = RandomizedPCA(components, whiten=True).fit(train_x)
    pca_train_x = pca.transform(train_x)
    pca_test_x = pca.transform(test_x)
    return pca_test_x, pca_train_x
def compute_pca(reception_stats, n_components=5):
    reception_mean = reception_stats.mean(axis=0)
    pca = RandomizedPCA(n_components - 1)
    pca.fit(reception_stats)
    pca_components = np.vstack([reception_mean, pca.components_])
    return pca, pca_components
def getPrincipleComponents(xtr, xte, n_components=50):
    train = np.array(xtr)
    test = np.array(xte)
    pca = RandomizedPCA(n_components=n_components).fit(train)
    xtrain = pca.transform(train)
    xtest = pca.transform(test)
    return xtrain, xtest
def do_pca(corr_matrix: _nested_ndarray, num_dim: int,
           min_var_explanation: float = 0.7) -> _nested_ndarray:
    '''
    This method performs PCA on a self-correlation matrix, reducing the number
    of columns to `num_dim`. If such analysis does not sufficiently explain the
    underlying variance in the data, an exception is thrown.

    Args:

    * `corr_matrix` - a square matrix of correlations
    * `num_dim` - the number of dimensions to which the data should be reduced
    * `min_var_explanation` - the minimum fraction of the underlying data
      variance that should be explained

    Returns:

    > A matrix of the PCA result on `corr_matrix`.
    '''
    num_dim = int(num_dim)
    pca = PCA(n_components=num_dim, random_state=0)
    pca_result = pca.fit_transform(corr_matrix)
    var_ratio = pca.explained_variance_ratio_
    if sum(var_ratio) < min_var_explanation:
        raise PcaAccuracyException(
            'PCA doesn\'t explain enough of the variance in the data')
    return pca_result
def fit(self): wordids_map = NameToIndex() labs_map = NameToIndex() wordscount = self._word_cluster.get_words_count() print "start compute_tfidf ..." #计算文档的词袋模型 docs = self._word_cluster.get_samples() count =0 bow = [] labs = [] for k,v in docs.iteritems(): vec = numpy.zeros(wordscount).tolist() for i in v: vec[wordids_map.map(i)]+=1 bow.append(vec) labs.append(labs_map.map(k[0])) labs = numpy.array(labs) tfidf = TfidfTransformer(smooth_idf=True, sublinear_tf=True,use_idf=True) datas = numpy.array(tfidf.fit_transform(bow).toarray()) print "compute_tfidf done" pca = RandomizedPCA(n_components=20, whiten=True).fit(datas) svc = train_svc(numpy.array(labs_map.names), labs, pca.transform(datas)) self._tfidf = tfidf self._svc = svc self._labs_map = labs_map self._wordids_map = wordids_map self._pca = pca
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def do_nbnn(train_folder, test_folder): train = load_patches(args.train_folder) test = load_patches(args.test_folder) if options.relu: get_logger().info("Applying RELU") for class_data in train: class_data.patches = class_data.patches.clip(min=0) for class_data in test: class_data.patches = class_data.patches.clip(min=0) if options.scale: get_logger().info("Applying standardization") scaler = StandardScaler(copy=False) scaler.fit(np.vstack([t.patches for t in train])) for class_data in train: class_data.patches = scaler.transform(class_data.patches) for class_data in test: class_data.patches = scaler.transform(class_data.patches) if options.pca: get_logger().info("Calculating PCA") pca = RandomizedPCA(n_components=options.pca) pca.fit(np.vstack([t.patches for t in train])) #for class_data in train: #get_logger().info("Fitting class " + class_data.name) #pca.partial_fit(class_data.patches) get_logger().info("Keeping " + str(pca.explained_variance_ratio_.sum()) + " variance (" + str(options.pca) + ") components\nApplying PCA") for class_data in train: class_data.patches = pca.transform(class_data.patches) for class_data in test: class_data.patches = pca.transform(class_data.patches) nbnn(train, test, NN_Engine())
def main():
    img_dir = 'images/'
    images = [img_dir + f for f in os.listdir(img_dir)]
    labels = [f.split('/')[-1].split('_')[0] for f in images]
    label2ids = {v: i for i, v in
                 enumerate(sorted(set(labels), key=labels.index))}
    y = np.array([label2ids[l] for l in labels])

    data = []
    for image_file in images:
        img = img_to_matrix(image_file)
        img = flatten_image(img)
        data.append(img)
    data = np.array(data)

    # training samples
    is_train = np.random.uniform(0, 1, len(data)) <= 0.7
    train_X, train_y = data[is_train], y[is_train]

    # training a classifier
    pca = RandomizedPCA(n_components=5)
    train_X = pca.fit_transform(train_X)
    multi_svm = OneVsRestClassifier(LinearSVC())
    multi_svm.fit(train_X, train_y)

    # evaluating the model
    test_X, test_y = data[is_train == False], y[is_train == False]
    test_X = pca.transform(test_X)
    print pd.crosstab(test_y, multi_svm.predict(test_X),
                      rownames=['Actual'], colnames=['Predicted'])
def test_explained_variance():
    # Check that PCA output has unit-variance
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 80

    X = rng.randn(n_samples, n_features)

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 1)

    # compare to empirical variances
    X_pca = pca.transform(X)
    assert_array_almost_equal(pca.explained_variance_,
                              np.var(X_pca, axis=0))

    X_rpca = rpca.transform(X)
    assert_array_almost_equal(rpca.explained_variance_,
                              np.var(X_rpca, axis=0), decimal=1)

    # Same with correlated data
    X = datasets.make_classification(n_samples, n_features,
                                     n_informative=n_features - 2,
                                     random_state=rng)[0]

    pca = PCA(n_components=2).fit(X)
    rpca = RandomizedPCA(n_components=2, random_state=rng).fit(X)
    assert_array_almost_equal(pca.explained_variance_ratio_,
                              rpca.explained_variance_ratio_, 5)
def reduce_features(features, var_explained=0.9, n_components=0, verbose=False):
    """
    Performs feature reduction using PCA. Automatically selects the number of
    components needed to explain at least var_explained of the variance.

    :param features: Features.
    :param var_explained: Minimal variance explained.
    :param n_components: Nr. of components.
    :param verbose: Verbosity.
    :return: Reduced feature set.
    """
    if n_components == 0:
        # Run full PCA to estimate the nr. of components needed to explain the
        # given percentage of variance.
        estimator = RandomizedPCA()
        estimator.fit_transform(features)
        variance = 0.0
        for i in range(len(estimator.explained_variance_ratio_)):
            variance += estimator.explained_variance_ratio_[i]
            if variance > var_explained:
                n_components = i + 1
                if verbose:
                    print('{} % of variance explained using {} components'.format(
                        var_explained, n_components))
                break

    # Re-run PCA with only the estimated nr. of components
    estimator = RandomizedPCA(n_components=n_components)
    features = estimator.fit_transform(features)

    return features
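# Aside: on newer scikit-learn releases the manual loop over
# explained_variance_ratio_ above can be avoided, because PCA accepts a float
# n_components and keeps the smallest number of components whose cumulative
# explained variance exceeds that fraction. A minimal sketch under that
# assumption (the random `features` matrix is only a hypothetical stand-in):
import numpy as np
from sklearn.decomposition import PCA

features = np.random.RandomState(0).rand(200, 50)

# n_components given as a fraction selects components by explained variance;
# the 'full' SVD solver supports this mode.
estimator = PCA(n_components=0.9, svd_solver='full')
reduced = estimator.fit_transform(features)
print(reduced.shape, estimator.n_components_)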
def SVM(X, y): X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=TRAIN_TEST_SPLIT_RATIO) print(len(X_train)) # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled # dataset): unsupervised feature extraction / dimensionality reduction n_components = 150 pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done ") X_train_pca = equalize_hist(X_train_pca) preprocessing.scale(X_train_pca * 1.0, axis=1) X_test_pca = equalize_hist(X_test_pca) preprocessing.scale(X_test_pca * 1.0, axis=1) # classifier = svm.SVC(kernel='poly', degree = 3) # classifier.fit(X_train, y_train) # # print("======",3,"========") # print('TRAIN SCORE', classifier.score(X_train, y_train)) # print('TEST SCORE', classifier.score(X_test, y_test)) param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } classifier2 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) classifier2.fit(X_train_pca, y_train) # print("======",3,"========") print('TRAIN SCORE', classifier2.score(X_train_pca, y_train)) print('TEST SCORE', classifier2.score(X_test_pca, y_test))
def dimentionality_reduction(train_x, test_x):
    print "Dimensionality reduction to 10D on training and test data...."
    pca = RandomizedPCA(n_components=10)
    train_x = pca.fit_transform(train_x)
    test_x = pca.transform(test_x)
    print "Done."
    return train_x, test_x
def pca_estimator(data, targets, estimator,
                  components_number=DEFAULT_COMPONENTS_NUMBER,
                  folds_number=DEFAULT_FOLDS_NUMBER):
    kf = KFold(len(targets), n_folds=folds_number)

    # 'scores' is a numpy array. Each index is a fold number; each value is the
    # fraction of correctly predicted samples on that fold's test split.
    scores = np.zeros(folds_number)

    start = time()
    index = 0
    for train, test in kf:
        x_train, x_test = data[train], data[test]
        y_train, y_test = targets[train], targets[test]

        pca = RandomizedPCA(n_components=components_number,
                            whiten=True).fit(x_train)
        x_train_pca = pca.transform(x_train)
        x_test_pca = pca.transform(x_test)

        clf = estimator.fit(x_train_pca, y_train)
        scores[index] = clf.score(x_test_pca, y_test)
        index += 1
        # print("Iteration %d from %d has done! Score: %f" % (index, folds_number,
        #                                                     scores[index - 1]))
    finish = time()

    return scores.mean(), scores.std() * 2, (finish - start)
def scatter(data, labels=None, title=None, name=None):
    """2d PCA scatter plot with optional class info

    Return the pca model to be able to introspect the components or transform
    new data with the same model.
    """
    data = atleast2d_or_csr(data)

    if data.shape[1] == 2:
        # No need for a PCA:
        data_2d = data
        pca = None  # nothing fitted; returned as-is in this case
    else:
        pca = RandomizedPCA(n_components=2)
        data_2d = pca.fit_transform(data)

    for i, c, m in zip(np.unique(labels), cycle(COLORS), cycle(MARKERS)):
        plt.scatter(data_2d[labels == i, 0], data_2d[labels == i, 1],
                    c=c, marker=m, label=i, alpha=0.5)

    plt.legend(loc='best')
    if title is None:
        title = "2D PCA scatter plot"
    if name is not None:
        title += " for " + name
    plt.xlabel('First Principal Component')
    plt.ylabel('Second Principal Component')
    plt.title(title)
    return pca
def LogisticRegressionPCA(X, y): # divide our data set into a training set and a test set X_train, X_test, y_train, y_test = cross_validation.train_test_split( X, y, test_size=TRAIN_TEST_SPLIT_RATIO) # get randomized PCA model num_components = 147 print("Extracting the top %d eigenfaces from %d faces" % (num_components, X_train.shape[0])) pca = RandomizedPCA(n_components=num_components, whiten=True).fit(X_train) # use the PCA model on our training set and test set. print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done ") h = .02 # step size in the mesh logistic_regression = linear_model.LogisticRegression(C=1e5) # we create an instance of Neighbours Classifier and fit the data. logistic_regression.fit(X, y) # print the performance of logistic regression print("====== Logistic Regression with PCA ========") print('TRAIN SCORE', logistic_regression.score(X_train, y_train)) print('TEST SCORE', logistic_regression.score(X_test, y_test))
def calc_hog(fpaths, save=False):
    '''
    Compute histogram of gradients (HOG). Saves in batches to prevent memory
    issues.

    Input:
        fpaths : files on which HOG will be computed
        save : if true, output is saved to disk
    '''
    hogs = np.empty((len(fpaths), 15876))

    for i, fpath in enumerate(fpaths):
        img = imread(os.path.join(imgdir, fpath))
        if len(img.shape) == 3:
            img = rgb2gray(img)
        # rescale so all feature vectors are the same length
        img_resize = resize(img, (128, 128))
        img_hog = hog(img_resize)
        hogs[i, :] = img_hog

    hogs_sc = scale(hogs)
    n_components = 15
    pca = RandomizedPCA(n_components=n_components)
    hogs_decomp = pca.fit_transform(hogs_sc)

    df = pd.DataFrame(hogs_decomp, index=[os.path.split(i)[1] for i in fpaths])
    df.index.name = 'fpath'
    df.columns = ['feat_hog_%2.2u' % i for i in range(1, n_components + 1)]
    if save:
        df.to_csv('hog.csv')

    return df
def pca_linear_initialization(self, data): """ We initialize the map, just by using the first two first eigen vals and eigenvectors Further, we create a linear combination of them in the new map by giving values from -1 to 1 in each X = UsigmaWT XTX = Wsigma^2WT T = XW = Usigma // Transformed by W EigenVector, can be calculated by multiplication PC matrix by eigenval too // Further, we can get lower ranks by using just few of the eigen vevtors T(2) = U(2)sigma(2) = XW(2) ---> 2 is the number of selected eigenvectors (*) Note that 'X' is the covariance matrix of original data :param data: data to use for the initialization :returns: initialized matrix with same dimension as input data """ cols = self.mapsize[1] coord = None pca_components = None if np.min(self.mapsize) > 1: coord = np.zeros((self.nnodes, 2)) pca_components = 2 for i in range(0, self.nnodes): coord[i, 0] = int(i / cols) # x coord[i, 1] = int(i % cols) # y elif np.min(self.mapsize) == 1: coord = np.zeros((self.nnodes, 1)) pca_components = 1 for i in range(0, self.nnodes): coord[i, 0] = int(i % cols) # y mx = np.max(coord, axis=0) mn = np.min(coord, axis=0) coord = (coord - mn)/(mx-mn) coord = (coord - .5)*2 me = np.mean(data, 0) data = (data - me) tmp_matrix = np.tile(me, (self.nnodes, 1)) pca = RandomizedPCA(n_components=pca_components) # Randomized PCA is scalable pca.fit(data) eigvec = pca.components_ eigval = pca.explained_variance_ norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec)) eigvec = ((eigvec.T/norms)*eigval).T for j in range(self.nnodes): for i in range(eigvec.shape[0]): tmp_matrix[j, :] = tmp_matrix[j, :] + coord[j, i]*eigvec[i, :] self.matrix = np.around(tmp_matrix, decimals=6) self.initialized = True
def rpca(train_X, test_X, n):
    start_time = time.time()
    pca = RandomizedPCA(n_components=n)
    pca.fit(train_X.toarray())
    train_X_pca = pca.transform(train_X.toarray())
    test_X_pca = pca.transform(test_X.toarray())
    print("--- %s seconds ---" % (time.time() - start_time))
    return pca, train_X_pca, test_X_pca
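# Note: calling .toarray() as above densifies the sparse matrix and can exhaust
# memory for large inputs. On current scikit-learn, TruncatedSVD performs a
# randomized decomposition directly on sparse input (it does not center the
# data, so it is not an exact PCA). A rough alternative sketch; the
# sparse_random matrices are hypothetical stand-ins for train_X / test_X:
from scipy.sparse import random as sparse_random
from sklearn.decomposition import TruncatedSVD

train_X = sparse_random(1000, 500, density=0.01, format='csr', random_state=0)
test_X = sparse_random(200, 500, density=0.01, format='csr', random_state=1)

svd = TruncatedSVD(n_components=50, random_state=0)
train_X_svd = svd.fit_transform(train_X)  # no densification needed
test_X_svd = svd.transform(test_X)
print(train_X_svd.shape, test_X_svd.shape)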
def SVM(X, y): print("SVM with PCA of rbf, writening all on, no normalize") preprocessing.normalize(X, 'max') #preprocessing.robust_scale(X, axis=1, with_centering = True) #bad X = equalize_hist(X) X_train, X_test, y_train, y_test = cross_validation.train_test_split( X, y, test_size=TRAIN_TEST_SPLIT_RATIO) n_components = 120 print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train) print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done ") param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) classifier13.fit(X_train_pca, y_train) print("====== PCA 120 ========") print('TRAIN SCORE', classifier13.score(X_train_pca, y_train)) print('TEST SCORE', classifier13.score(X_test_pca, y_test)) n_components = 130 print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train) print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done ") param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) classifier13.fit(X_train_pca, y_train) print("====== PCA 130 ========") print('TRAIN SCORE', classifier13.score(X_train_pca, y_train)) print('TEST SCORE', classifier13.score(X_test_pca, y_test)) n_components = 147 print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) pca = RandomizedPCA(n_components=n_components, whiten=False).fit(X_train) print("Projecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print("done ") param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } classifier13 = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) classifier13.fit(X_train_pca, y_train) print("====== PCA 147 ========") print('TRAIN SCORE', classifier13.score(X_train_pca, y_train)) print('TEST SCORE', classifier13.score(X_test_pca, y_test)) '''
def test_SVM(face_profile_data, face_profile_name_index, face_dim, face_profile_names): """ Testing: Build the SVM classification modle using the face_profile_data matrix (numOfFace X numOfPixel) and face_profile_name_index array, face_dim is a tuple of the dimension of each image(h,w) Returns the SVM classification modle Parameters ---------- face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image) The pca that contains the top eigenvectors extracted using approximated Singular Value Decomposition of the data face_profile_name_index : ndarray The name corresponding to the face profile is encoded in its index face_dim : tuple (int, int) The dimension of the face data is reshaped to face_profile_names: ndarray The names corresponding to the face profiles Returns ------- clf : theano object The trained SVM classification model pca : theano ojbect The pca that contains the top 150 eigenvectors extracted using approximated Singular Value Decomposition of the data """ X = face_profile_data y = face_profile_name_index X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled # dataset): unsupervised feature extraction / dimensionality reduction n_components = 150 # maximum number of components to keep print("\nExtracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) eigenfaces = pca.components_.reshape((n_components, face_dim[0], face_dim[1])) # This portion of the code is used if the data is scarce, it uses the number # of imputs as the number of features # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train) # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1])) print("\nProjecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) # Train a SVM classification model print("\nFitting the classifier to the training set") param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } # clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) # Train_pca Test Error Rate: 0.0670016750419 # Train_pca Test Recognition Rate: 0.932998324958 # clf = SVC(kernel='linear', C=1) # 2452 samples from 38 people are loaded # Extracting the top 150 eigenfaces from 1839 faces # Extracting the top 150 eigenfaces from 1790 faces # Train_pca Test Error Rate: 0.0904522613065 # Train_pca Test Recognition Rate: 0.909547738693 # clf = SVC(kernel='poly') # Train_pca Test Error Rate: 0.201005025126 # Train_pca Test Recognition Rate: 0.798994974874 # clf = SVC(kernel='sigmoid') # Train_pca Test Error Rate: 0.985318107667 # Train_pca Test Recognition Rate: 0.0146818923328 # clf = SVC(kernel='rbf').fit(X_train, y_train) # Train_pca Test Error Rate: 0.0619765494137 # Train_pca Test Recognition Rate: 0.938023450586 # Best Estimator found using Radial Basis Function Kernal: clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0, decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) # Train_pca with Alex Test Error Rate: 0.088424437299 # Train_pca with Alex Test Recognition Rate: 0.911575562701 clf = clf.fit(X_train_pca, y_train) # print("\nBest estimator found by grid search:") # 
print(clf.best_estimator_) ############################################################################### # Quantitative evaluation of the model quality on the test set print("\nPredicting people's names on the test set") t0 = time() y_pred = clf.predict(X_test_pca) print("\nPrediction took %0.8f second per sample on average" % ((time() - t0)/y_pred.shape[0]*1.0)) # print "predicated names: ", y_pred # print "actual names: ", y_test error_rate = errorRate(y_pred, y_test) print ("\nTest Error Rate: %0.4f %%" % (error_rate * 100)) print ("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100)) ############################################################################### # Testing # X_test_pic1 = X_test[0] # X_test_pic1_for_display = np.reshape(X_test_pic1, face_dim) # t0 = time() # pic1_pred_name = predict(clf, pca, X_test_pic1, face_profile_names) # print("\nPrediction took %0.3fs" % (time() - t0)) # print "\nPredicated result for picture_1 name: ", pic1_pred_name # for i in range(1,3): print ("\n") # Display the picture # plt.figure(1) # plt.title(pic1_pred_name) # plt.subplot(111) # plt.imshow(X_test_pic1_for_display) # plt.show() ############################################################################### # Qualitative evaluation of the predictions using matplotlib # import matplotlib.pyplot as plt # def plot_gallery(images, titles, face_dim, n_row=3, n_col=4): # """Helper function to plot a gallery of portraits""" # plt.figure(figsize=(1.8 * n_col, 2.4 * n_row)) # plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35) # for i in range(n_row * n_col): # plt.subplot(n_row, n_col, i + 1) # plt.imshow(images[i].reshape(face_dim), cmap=plt.cm.gray) # plt.title(titles[i], size=12) # plt.xticks(()) # plt.yticks(()) # # plot the result of the prediction on a portion of the test set # def title(y_pred, y_test, face_profile_names, i): # pred_name = face_profile_names[y_pred[i]].rsplit(' ', 1)[-1] # true_name = face_profile_names[y_test[i]].rsplit(' ', 1)[-1] # return 'predicted: %s\ntrue: %s' % (pred_name, true_name) # prediction_titles = [title(y_pred, y_test, face_profile_names, i) # for i in range(y_pred.shape[0])] # plot_gallery(X_test, prediction_titles, face_dim) # # plot the gallery of the most significative eigenfaces # eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])] # plot_gallery(eigenfaces, eigenface_titles, face_dim) # plt.show() return clf, pca
def build_SVC(face_profile_data, face_profile_name_index, face_dim): """ Build the SVM classification modle using the face_profile_data matrix (numOfFace X numOfPixel) and face_profile_name_index array, face_dim is a tuple of the dimension of each image(h,w) Returns the SVM classification modle Parameters ---------- face_profile_data : ndarray (number_of_images_in_face_profiles, width * height of the image) The pca that contains the top eigenvectors extracted using approximated Singular Value Decomposition of the data face_profile_name_index : ndarray The name corresponding to the face profile is encoded in its index face_dim : tuple (int, int) The dimension of the face data is reshaped to Returns ------- clf : theano object The trained SVM classification model pca : theano ojbect The pca that contains the top 150 eigenvectors extracted using approximated Singular Value Decomposition of the data """ X = face_profile_data y = face_profile_name_index X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled # dataset): unsupervised feature extraction / dimensionality reduction n_components = 10 # maximum number of components to keep print("\nExtracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0])) pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) eigenfaces = pca.components_.reshape((n_components, face_dim[0], face_dim[1])) # This portion of the code is used if the data is scarce, it uses the number # of imputs as the number of features # pca = RandomizedPCA(n_components=None, whiten=True).fit(X_train) # eigenfaces = pca.components_.reshape((pca.components_.shape[0], face_dim[0], face_dim[1])) print("\nProjecting the input data on the eigenfaces orthonormal basis") X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) # Train a SVM classification model print("\nFitting the classifier to the training set") param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } # clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) # Best Estimator found using Radial Basis Function Kernal: clf = SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0, decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False) # Train_pca with Alex Test Error Rate: 0.088424437299 # Train_pca with Alex Test Recognition Rate: 0.911575562701 clf = clf.fit(X_train_pca, y_train) # print("\nBest estimator found by grid search:") # print(clf.best_estimator_) ############################################################################### # Quantitative evaluation of the model quality on the test set print("\nPredicting people's names on the test set") t0 = time() y_pred = clf.predict(X_test_pca) print("\nPrediction took %s per sample on average" % ((time() - t0)/y_pred.shape[0]*1.0)) # print "predicated names: ", y_pred # print "actual names: ", y_test error_rate = errorRate(y_pred, y_test) print ("\nTest Error Rate: %0.4f %%" % (error_rate * 100)) print ("Test Recognition Rate: %0.4f %%" % ((1.0 - error_rate) * 100)) return clf, pca
mapping_targets = np.hstack(np.array(mapping_targets)) first_half = np.hstack(Data[0:no_mappings/2,:,:,:]) first_half = np.vstack(first_half) second_half = np.hstack(Data[no_mappings/2:no_mappings,:,:,:]) second_half = np.vstack(second_half) Data = np.vstack([first_half,second_half]) # for true targets uncomment next line targets = np.hstack([targets,targets]) #for random targets uncomment next line #targets = np.random.randint(1,no_locations+1,no_mappings*no_locations*no_thwacks) lda = LDA(n_components=14) pca = RandomizedPCA(n_components = 125) classifier = KNeighborsClassifier(8) proj = pca.fit_transform(Data) proj = lda.fit_transform(proj,targets) proj1 = pca.fit_transform(Data) proj1 = lda.fit_transform(proj1,mapping_targets) print(file) plt.clf() plt.scatter(proj[0:proj.shape[0]/2,0],proj[0:proj.shape[0]/2,1],c=targets[0:targets.shape[0]/2]) plt.title(file.rsplit('_')[0]+'_'+file.rsplit('_')[1]+" Before "+file.rsplit('_')[2]+" injection") plt.colorbar() plt.ylabel("LD1") plt.xlabel("LD2") plt.savefig(file.rsplit('_')[0]+'_'+file.rsplit('_')[1]+" Before "+file.rsplit('_')[2]+file[-11:-4]+" injection.svg") plt.show() plt.clf()
color='w', zorder=10) plt.title('Kmeans clustering on Pima dataset after ICA\n' 'Centroids are marked with white cross') plt.xlim(x_min, x_max) plt.ylim(y_min, y_max) plt.xticks(()) plt.yticks(()) file_name = 'Plots/Kmeans Pima after ICA ' + str(n_components) + '.png' fig.savefig(file_name) plt.close() ############################################################################## # Visualize the results on RP-reduced data reduced_data = RandomizedPCA(n_components=2).fit_transform(data) kmeans = KMeans(init="random", n_clusters=n_components, n_init=10) kmeans.fit(reduced_data) # Step size of the mesh. Decrease to increase the quality of the VQ. h = .02 # point in the mesh [x_min, m_max]x[y_min, y_max]. # Plot the decision boundary. For that, we will assign a color to each x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max() y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max() print(x_min, x_max, y_min, y_max) xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) fig = plt.figure() plt.clf()
def pca_linear_initialization(self, data): """ We initialize the map, just by using the first two first eigen vals and eigenvectors Further, we create a linear combination of them in the new map by giving values from -1 to 1 in each X = UsigmaWT XTX = Wsigma^2WT T = XW = Usigma // Transformed by W EigenVector, can be calculated by multiplication PC matrix by eigenval too // Further, we can get lower ranks by using just few of the eigen vevtors T(2) = U(2)sigma(2) = XW(2) ---> 2 is the number of selected eigenvectors (*) Note that 'X' is the covariance matrix of original data :param data: data to use for the initialization :returns: initialized matrix with same dimension as input data """ cols = self.mapsize[1] coord = None pca_components = None if np.min(self.mapsize) > 1: coord = np.zeros((self.nnodes, 2)) pca_components = 2 for i in range(0, self.nnodes): coord[i, 0] = int(i / cols) # x coord[i, 1] = int(i % cols) # y elif np.min(self.mapsize) == 1: coord = np.zeros((self.nnodes, 1)) pca_components = 1 for i in range(0, self.nnodes): coord[i, 0] = int(i % cols) # y mx = np.max(coord, axis=0) mn = np.min(coord, axis=0) coord = (coord - mn) / (mx - mn) coord = (coord - .5) * 2 me = np.mean(data, 0) data = (data - me) tmp_matrix = np.tile(me, (self.nnodes, 1)) pca = RandomizedPCA( n_components=pca_components) # Randomized PCA is scalable pca.fit(data) eigvec = pca.components_ eigval = pca.explained_variance_ norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec)) eigvec = ((eigvec.T / norms) * eigval).T for j in range(self.nnodes): for i in range(eigvec.shape[0]): tmp_matrix[ j, :] = tmp_matrix[j, :] + coord[j, i] * eigvec[i, :] self.matrix = np.around(tmp_matrix, decimals=6) self.initialized = True
def lininit(self): #X = UsigmaWT #XTX = Wsigma^2WT #T = XW = Usigma #Transformed by W EigenVector, can be calculated by #multiplication PC matrix by eigenval too #Furthe, we can get lower ranks by using just few of the eigen vevtors #T(2) = U(2)sigma(2) = XW(2) ---> 2 is the number of selected eigenvectors # This is how we initialize the map, just by using the first two first eigen vals and eigenvectors # Further, we create a linear combination of them in the new map by giving values from -1 to 1 in each #Direction of SOM map # it shoud be noted that here, X is the covariance matrix of original data msize = getattr(self, 'mapsize') rows = msize[0] cols = msize[1] nnodes = getattr(self, 'nnodes') if np.min(msize)>1: # set coordinates of the nodes in the grid (row, col) coord = np.zeros((nnodes, 2)) for i in range(0, nnodes): coord[i,0] = int(i / cols) #x coord[i,1] = int(i % cols) #y mx = np.max(coord, axis = 0) mn = np.min(coord, axis = 0) # normalize the coordinates between [-1,1] coord = (coord - mn) / (mx - mn) coord = (coord - .5) * 2 # for each column, shift data around its mean data = getattr(self, 'data') me = np.mean(data, 0) data = (data - me) # initialize codebook as a matrix dim * nnodes codebook = np.tile(me, (nnodes, 1)) # pca pca = RandomizedPCA(n_components=2) #Randomized PCA is scalable #pca = PCA(n_components=2) pca.fit(data) eigvec = pca.components_ eigval = pca.explained_variance_ # compute the norms of the eigenvectors, normalize and multiply by eigenvalue norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec)) eigvec = ((eigvec.T / norms) * eigval).T # add the normalized eigenvector for j in range(nnodes): for i in range(eigvec.shape[0]): codebook[j,:] += coord[j,i] * eigvec[i,:] return np.around(codebook, decimals = 6) elif np.min(msize) == 1: coord = np.arange(nnodes)[:, np.newaxis] mx = np.max(coord, axis = 0) mn = np.min(coord, axis = 0) # normalize the coordinates between [-1,1] coord = (coord - mn) / (mx - mn) coord = (coord - .5) * 2 # for each column, shift data around its mean data = getattr(self, 'data') me = np.mean(data, 0) data = (data - me) # initialize codebook as a matrix dim * nnodes codebook = np.tile(me, (nnodes,1)) # pca pca = RandomizedPCA(n_components=1) #Randomized PCA is scalable pca.fit(data) eigvec = pca.components_ eigval = pca.explained_variance_ # compute the norms of the eigenvectors, normalize and multiply by eigenvalue norms = np.sqrt(np.einsum('ij,ij->i', eigvec, eigvec)) eigvec = ((eigvec.T/norms)*eigval).T; eigvec.shape # add the normalized eigenvector for j in range(nnodes): for i in range(eigvec.shape[0]): codebook[j,:] += coord[j,i]*eigvec[i,:] return np.around(codebook, decimals = 6)
""" from sklearn.datasets import fetch_lfw_people from sklearn.decomposition import PCA as RandomizedPCA import matplotlib.pyplot as plt faces = fetch_lfw_people(min_faces_per_person=60) print(faces.target_names) print(faces.images.shape) n_samples, h, w = faces.images.shape print(n_samples) n_components = 150 pca = RandomizedPCA( n_components=n_components, svd_solver='randomized') ##Randomized PCA for the the first 150 components pca.fit(faces.data) print(pca.components_) ##These are the first 150 Principal Components pcacomponents25 = pca.components_[0:25] ##First 25 Principal Components eigenfaces = pca.components_.reshape( (n_components, h, w)) ##Eigenfaces for 150 PCs eigenfaces25 = pcacomponents25.reshape( (25, h, w)) ##Eigenfaces for first 25 PCs ## Plotting EigenFaces for First 25 PCs fig, axes = plt.subplots(3,
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
print(faces.target_names)
print(faces.images.shape)

from sklearn.decomposition import RandomizedPCA
pca = RandomizedPCA(150)
pca.fit(faces.data)

fig, axes = plt.subplots(3, 8, figsize=(9, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=0.1, wspace=0.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')
plt.show()

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
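# Compatibility note: `from sklearn.decomposition import RandomizedPCA` only
# works on older scikit-learn; the class was deprecated and later removed.
# On recent versions the equivalent call (as other snippets here already do
# via `PCA as RandomizedPCA`) appears to be PCA with the randomized solver.
# A hedged sketch, assuming `faces` was loaded with fetch_lfw_people as above:
from sklearn.decomposition import PCA

pca = PCA(n_components=150, svd_solver='randomized', random_state=0)
pca.fit(faces.data)
print(pca.components_.shape)  # (150, n_pixels)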
img = img_to_matrix(image) img = flatten_image(img) data.append(img) data = np.array(data) is_train = np.random.uniform(0, 1, len(data)) <= 0.7 y = np.where(np.array(labels)==query, 1, 0) train_x, train_y = data[is_train], y[is_train] test_x, test_y = data[is_train==False], y[is_train==False] #add input to specify number of components to determine UniqueImageComponents = int(raw_input("How many unique features are needed to distinguish between your image types? Choose a number between 2 and 6. ")) pca = RandomizedPCA(n_components=UniqueImageComponents) X = pca.fit_transform(data) make_plot(pd) pl.show() train_x = pca.fit_transform(train_x) test_x = pca.transform(test_x) print "Training and Test sets are created." knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform') print "Running your machine learning model on the test set." knn.fit(train_x, train_y) result = knn.predict(test_x)
def find_and(n_components, plot): print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]) t0 = time() pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) print "done in %0.3fs" % (time() - t0) eigenfaces = pca.components_.reshape((n_components, h, w)) # print 'components', pca.explained_variance_ratio_, pca.components_, print 'components1', pca.explained_variance_ratio_[0] print 'components1', pca.explained_variance_ratio_[1] print "Projecting the input data on the eigenfaces orthonormal basis" t0 = time() X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print "done in %0.3fs" % (time() - t0) ############################################################################### # Train a SVM classification model print "Fitting the classifier to the training set" t0 = time() param_grid = { 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], } # for sklearn version 0.16 or prior, the class_weight parameter value is 'auto' clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid) clf = clf.fit(X_train_pca, y_train) print "done in %0.3fs" % (time() - t0) print "Best estimator found by grid search:" print clf.best_estimator_ ############################################################################### # Quantitative evaluation of the model quality on the test set print "Predicting the people names on the testing set" t0 = time() y_pred = clf.predict(X_test_pca) print "done in %0.3fs" % (time() - t0) print classification_report(y_test, y_pred, target_names=target_names) print confusion_matrix(y_test, y_pred, labels=range(n_classes)) prediction_titles = [ title(y_pred, y_test, target_names, i) for i in range(y_pred.shape[0]) ] if plot == True: plot_gallery(X_test, prediction_titles, h, w) # plot the gallery of the most significative eigenfaces eigenface_titles = [ "eigenface %d" % i for i in range(eigenfaces.shape[0]) ] plot_gallery(eigenfaces, eigenface_titles, h, w) pl.show()
random_pca_data_50 = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') random_pca_data_25 = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') random_pca_data_10 = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') pca_data = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') sparse_pca_data = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') kernel_pca_data = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') fast_ica_data = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') nmf_data = normalization('gene_IndividualsArr.pkl', 'top10Genes_Indiv.pkl') random_pca_50 = RandomizedPCA(n_components=50) random_pca_model_50 = random_pca_50.fit(random_pca_data_50) random_X_new_50 = random_pca_50.fit_transform(X) print 'random_pca_50 explained', random_pca_50.explained_variance_ratio_ print 'random_pca_50 explained sum', sum( random_pca_50.explained_variance_ratio_) joblib.dump(random_pca_model_50, 'random_pca_model_50.pkl') joblib.dump(random_pca_50.explained_variance_ratio_, 'random_pca_50.explained_variance_ratio_.pkl') joblib.dump(random_X_new_50, 'random_X_new_50.pkl') random_pca_25 = RandomizedPCA(n_components=25) random_pca_model_25 = random_pca_25.fit(random_pca_data_25) random_X_new_25 = random_pca_25.fit_transform(X) print 'random_pca_25 explained', random_pca_25.explained_variance_ratio_ print 'random_pca_25 explained sum', sum(
print "n_features: %d" % n_features print "n_classes: %d" % n_classes ############################################################################### # Split into a training and testing set X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) ############################################################################### # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled # dataset): unsupervised feature extraction / dimensionality reduction n_components = 250 print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]) t0 = time() pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) print "done in %0.3fs" % (time() - t0) eigenfaces = pca.components_.reshape((n_components, h, w)) print "Projecting the input data on the eigenfaces orthonormal basis" t0 = time() X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print "done in %0.3fs" % (time() - t0) ############################################################################### # Train a SVM classification model print "Fitting the classifier to the training set"
print "n_features: %d" % n_features print "n_classes: %d" % n_classes ############################################################################### # Split into a training and testing set X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) ############################################################################### # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled # dataset): unsupervised feature extraction / dimensionality reduction n_components = 150 print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]) t0 = time() pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) print "done in %0.3fs" % (time() - t0) eigenfaces = pca.components_.reshape((n_components, h, w)) print "pca component variance ", pca.explained_variance_[:2] print "Projecting the input data on the eigenfaces orthonormal basis" t0 = time() X_train_pca = pca.transform(X_train) X_test_pca = pca.transform(X_test) print "done in %0.3fs" % (time() - t0) ############################################################################### # Train a SVM classification model
# Encode the dependent variable labelencoder_Y = LabelEncoder() Y = labelencoder_Y.fit_transform(Y) Y_Results = np.array(['0=Block', '1=NB-No_Block', '2=NB-Wait', '3=No_Block']) print(Y_Results) print(Y[:10]) # Part B: Run Random Component Analysis (RCA) algorithm # Scale the independent variobles sc = StandardScaler() X = sc.fit_transform(X) # Apply RCA to the independent variables rca = RCA(random_state=1) X_new = rca.fit_transform(X) var = rca.explained_variance_ratio_ print(pd.DataFrame(var[:10])) # Part C: Use dimensionally reduced dataset to cluster # Using the elbow method to find the optimal number of clusters wcss = [] for i in range(1, 15): kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 1) kmeans.fit(X_new) wcss.append(kmeans.inertia_) plt.plot(range(1, 15), wcss) plt.title('Finding the Best K: The Elbow Method') plt.xlabel('Number of clusters')
Use PCA to reconstruct some of the MNIST test digits.
"""

# My libraries
import mnist_loader

# Third-party libraries
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import RandomizedPCA

# Training
training_data, test_inputs, actual_test_results = mnist_loader.load_data_nn()
pca = RandomizedPCA(n_components=30)
nn_images = [x for (x, y) in training_data]
pca_images = np.concatenate(nn_images, axis=1).transpose()
pca_r = pca.fit(pca_images)

# Try PCA on first ten test images
test_images = np.array(test_inputs[:10]).reshape((10, 784))
test_outputs = pca_r.inverse_transform(pca_r.transform(test_images))

# Plot the first ten test images and the corresponding outputs
fig = plt.figure()
ax = fig.add_subplot(111)
images_in = [test_inputs[j].reshape(-1, 28) for j in range(10)]
images_out = [test_outputs[j].reshape(-1, 28) for j in range(10)]
image_in = np.concatenate(images_in, axis=1)
image_out = np.concatenate(images_out, axis=1)
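# The fragment above stops before the concatenated strips are drawn. A minimal
# completion sketch, assuming `image_in` and `image_out` built above, might be:
ax.imshow(np.vstack([image_in, image_out]), cmap=matplotlib.cm.gray)
ax.set_xticks([])
ax.set_yticks([])
plt.show()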
plot_digits(filtered) show() #Example Eigenfaces #get out face data from sklearn.datasets import fetch_lfw_people faces = fetch_lfw_people(min_faces_per_person=60) print(faces.target_names) print(faces.images.shape) #we will use RandomizedPCA since this is a large dataset #we will reduce from near 3000 to 150 components from sklearn.decomposition import PCA as RandomizedPCA pca = RandomizedPCA(150) pca.fit(faces.data) fig, axes = plt.subplots(3, 8, figsize=(9, 4), subplot_kw={'xticks':[], 'yticks':[]}, gridspec_kw=dict(hspace=0.1, wspace=0.1)) for i, ax in enumerate(axes.flat): ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone') show() #lets check the cumulative variance plt.plot(np.cumsum(pca.explained_variance_ratio_)) plt.xlabel('number of components') plt.ylabel('cumulative explained variance'); #150 turns out to be around 90% of variance #lets compare to the full data
## Do the tensor factorization
np.random.seed(seed)
M, cpstats, mstats = CP_APR.cp_apr(trainX, R, maxiters=outerIter, maxinner=10)
M.normalize_sort(1)

# zero out the small factors
for n in range(1, 2):
    zeroIdx = np.where(M.U[n] < zeroThr)
    M.U[n][zeroIdx] = 0
klp = KLProjection.KLProjection(M.U, M.R)
ptfFeat = klp.projectSlice(X, 0)
ptfMatrix = khatrirao.khatrirao(M.U[1], M.U[2])
dbOutput = getDBEntry("CP-APR", ptfMatrix)

## now we want to do PCA and NMF as well
flatX = sptenmat.sptenmat(X, [0]).tocsrmat()  # matricize along the first mode
pcaModel = RandomizedPCA(n_components=R)
pcaModel.fit(flatX[train, :])
pcaFeat = pcaModel.transform(flatX)
pcaBasis = pcaModel.components_
dbOutput = np.vstack((dbOutput, getDBEntry("PCA", pcaBasis)))

nmfModel = nimfa.mf(flatX[train, :], method="nmf", max_iter=outerIter, rank=R)
nmfResult = nimfa.mf_run(nmfModel)
nmfFeat = nmfTransform(R, nmfResult, flatX)
## get the basis to be stored off
nmfBasis = nmfResult.coef().transpose()
nmfBasis = preprocessing.normalize(nmfBasis, norm="l1", axis=0)
nmfBasis = nmfBasis.toarray()
zeroIdx = np.where(nmfBasis < zeroThr * zeroThr)
nmfBasis[zeroIdx] = 0
dbOutput = np.vstack((dbOutput, getDBEntry("NMF", nmfBasis)))
print(faces.target_names)
print(faces.images.shape)

fig, ax = plt.subplots(3, 5)
for i, axi in enumerate(ax.flat):
    axi.imshow(faces.images[i], cmap='bone')
    axi.set(xticks=[], yticks=[],
            xlabel=faces.target_names[faces.target[i]])

from sklearn.svm import SVC
from sklearn.decomposition import RandomizedPCA
from sklearn.pipeline import make_pipeline

pca = RandomizedPCA(n_components=150, whiten=True, random_state=42)
svc = SVC(kernel='rbf', class_weight='balanced')
model = make_pipeline(pca, svc)

from sklearn.cross_validation import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=42)

from sklearn.grid_search import GridSearchCV
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)

get_ipython().run_line_magic("time", " grid.fit(Xtrain, ytrain)")
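The snippet above relies on sklearn.cross_validation and sklearn.grid_search, both deprecated in scikit-learn 0.18 and removed in 0.20. A minimal sketch of the same split and grid search against the current sklearn.model_selection module, assuming the faces data and the model pipeline defined above:

from sklearn.model_selection import train_test_split, GridSearchCV

Xtrain, Xtest, ytrain, ytest = train_test_split(faces.data, faces.target,
                                                random_state=42)
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid)
grid.fit(Xtrain, ytrain)

Note that on such a release the RandomizedPCA step in the pipeline would likewise need to become PCA(n_components=150, svd_solver='randomized', whiten=True, random_state=42), as sketched earlier.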
def fit_deprecated(X):
    global Y
    rpca = RandomizedPCA(random_state=0)
    Y = rpca.fit_transform(X)
                cells_per_block=self.cells_per_block,
            )
            result.append(features)
        return np.array(result)


MODELS = {
    'linearsvc': (
        LinearSVC(),
        {'C': [0.01, 0.1, 1.0]},
    ),
    'linearsvc-pca': (
        Pipeline([
            ('pca', RandomizedPCA(n_components=100, whiten=True)),
            ('clf', LinearSVC(C=1.0)),
        ]),
        {
            'pca__n_components': [10, 30, 100],
            'clf__C': [0.01, 0.1, 1.0],
        },
    ),
    'linearsvc-hog': (
        Pipeline([
            ('hog', HOGFeatures(
                orientations=8,
                pixels_per_cell=(4, 4),
                cells_per_block=(3, 3),
            )),
plot_digits(noisy)

pca = PCA(.5).fit(noisy)
pca.n_components_

components = pca.transform(noisy)
filtered = pca.inverse_transform(components)
plot_digits(filtered)

from sklearn.datasets import fetch_lfw_people
faces = fetch_lfw_people(min_faces_per_person=60)
print(faces.target_names)
print(faces.images.shape)

from sklearn.decomposition import RandomizedPCA
pca = RandomizedPCA(150)
pca.fit(faces.data)

# print eigenfaces
fig, axes = plt.subplots(3, 8, figsize=(9, 4),
                         subplot_kw={'xticks': [], 'yticks': []},
                         gridspec_kw=dict(hspace=.1, wspace=.1))
for i, ax in enumerate(axes.flat):
    ax.imshow(pca.components_[i].reshape(62, 47), cmap='bone')

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

pca = RandomizedPCA(150).fit(faces.data)
test_files_count = data['test_files_count']
validationOriginalImage = data['validationOriginalImage']
#print valid_files
print train_data.shape
print valid_data.shape

# record time used for training
start = time.clock()

for i in range(0, len(superpixels)):
    print np.max(superpixels[i][0])

# Preprocessing: normalize data
scaler = StandardScaler()
scaler.fit(train_data)
train_data = scaler.transform(train_data)

# Preprocessing: RandomizedPCA
pca = RandomizedPCA(n_components=15)
pca.fit(train_data)
#train_data = pca.transform(train_data)
print train_data.shape

# set classifier and fit data
clf = chooseClassification('RF')
clf = clf.fit(train_data, train_labels.ravel())
#scores = cross_val_score(clf, train_data, train_label)
#scores.mean()

# benchmark using validation data
valid_data = scaler.transform(valid_data)
#valid_data = pca.transform(valid_data)
#print clf.predict_proba(valid_data[0])
#wait = input("PRESS ENTER TO CONTINUE.")
import input_data_svm
datasets = input_data_svm.read_data_sets()

X = np.vstack((datasets.train_set.inputs(), datasets.validation_set.inputs()))
y = np.hstack((datasets.train_set.targets(), datasets.validation_set.targets()))
# X = datasets.train_set.inputs()
# y = datasets.train_set.targets()

X = X[:]
y = y[:]

# Reduce the dimensionality of the dataset
print("Applying PCA to reduce dimensions")
pca = RandomizedPCA(n_components=PCA_COMPONENTS, whiten=True).fit(X)
# eigenfaces = pca.components_.reshape((PCA_COMPONENTS, h, w))
X = pca.transform(X)
print("Finished PCA preprocessing")

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

##############################################################################
# Train classifiers
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.
import numpy as np
import pylab as pl  # needed for the plotting calls below; missing from the original imports
from sklearn.datasets import make_circles
from sklearn.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from sklearn.decomposition import RandomizedPCA
from sklearn.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result using PCA
pca = RandomizedPCA(n_components=2)
X_reduced = pca.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)

# scatter plot of original and reduced data
fig = pl.figure(figsize=(9, 8))
ax = pl.subplot(221)
ax.scatter(X[:, 0], X[:, 1], c=y, s=50)
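A short evaluation sketch for the example above; the held-out split and accuracy printing are additions, not part of the original snippet (on the older scikit-learn releases that still ship RandomizedPCA, train_test_split lives in sklearn.cross_validation):

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# refit the hasher and both classifiers on the training half only
hasher.fit(X_train)
nb.fit(hasher.transform(X_train), y_train)
trees.fit(X_train, y_train)

print("BernoulliNB on hashed features: %.3f" % nb.score(hasher.transform(X_test), y_test))
print("ExtraTrees on raw features:     %.3f" % trees.score(X_test, y_test))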
    discretizer3=discretizer(3),
    discretizer10=discretizer(10),
    kmeans3=clusterizer(MiniBatchKMeans(3)),
    kmeans10=clusterizer(MiniBatchKMeans(10)),
    kmeans_gap=clusterizer(FitClusterer(min_clusters=3)),
    ward3=clusterizer(Ward(3)),
    ward10=clusterizer(Ward(10)),
    meanshift=clusterizer(MeanShift()),
    # spectral3=clusterizer(SpectralClustering(3)),    # FIXME
    # spectral10=clusterizer(SpectralClustering(10)),  # FIXME
    affinity_prop=clusterizer(AffinityPropagation()),
    dbscan=clusterizer(DBSCAN()),
)

BINARY_TO_NUMERICAL = dict(identity=identity, )

BINARY_TO_CATEGORICAL = dict(identity=identity, )

CATEGORICAL_TO_NUMERICAL = dict(
    noop=identity,
    binarize=binary_transformer(),
    pca1=binary_transformer(RandomizedPCA(1)),
    # ica1=binary_transformer(FastICA(1)),  # FIXME
    median_ordinal_pred=discrete_ordinal_predictor("median"),
    mean_ordinal_pred=discrete_ordinal_predictor("mean"),
    max_ordinal_pred=discrete_ordinal_predictor("max"),
    min_ordinal_pred=discrete_ordinal_predictor("min"),
)

CATEGORICAL_TO_CATEGORICAL = dict(identity=identity, )
import numpy as np

from sklearn.cross_validation import train_test_split
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1),
                     tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = make_pipeline(
    make_union(
        VotingClassifier([("est", KNeighborsClassifier(n_neighbors=5, weights="uniform"))]),
        FunctionTransformer(lambda X: X)
    ),
    RandomizedPCA(iterated_power=1),
    RandomForestClassifier(n_estimators=500)
)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features_scaled, labels, test_size=0.25, random_state=42)

##features_train_pca = pca.transform(features_train)
##features_test_pca = pca.transform(features_test)

#from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
#from sklearn.grid_search import GridSearchCV

estimator_svm = [('reduce_dim', RandomizedPCA()),
                 ('clf_svm', SVC(kernel='rbf', class_weight='balanced', gamma=0.1, C=1000))]
estimator_tree = [('reduce_dim', RandomizedPCA()),
                  ('clf_tree', DecisionTreeClassifier(criterion='entropy', max_features='sqrt', splitter='best'))]
#estimator_knn = [('reduce_dim', PCA()), ('clf_knn', KNeighborsClassifier())]
#estimator_rf = [('reduce_dim', PCA()), ('clf_rf', RandomForestClassifier())]
#estimator_ab = [('reduce_dim', PCA()), ('clf_ab', AdaBoostClassifier())]

#param_svm = {
#    'kernel': ['linear', 'poly', 'rbf'],
#    'C': [10, 100, 1000, 10000],
# Create an array with flattened images X
# and an array with the ID of the person in each image y
X = np.zeros([NUM_TRAINIMAGES, IMG_RES], dtype='int8')
names = []

# Populate training array with flattened images from subfolders of train_faces and names
c = 0
for x, folder in enumerate(folders):
    train_faces = glob.glob(folder + '/*')
    for i, face in enumerate(train_faces):
        X[c, :] = prepare_image(face)
        names.append(ID_from_filename(face))
        c = c + 1

# perform principal component analysis on the images
pca = RandomizedPCA(n_components=NUM_EIGENFACES, whiten=True).fit(X)
X_pca = pca.transform(X)

#'''
while (True):
    _, frame = cap.read()
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    # Detect faces in the image
    faces = faceCascade.detectMultiScale(gray, scaleFactor=1.3, minNeighbors=5)
    for (x, y, w, h) in faces:
        if len(faces) == 1:
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 0, 255), 2)
            gray = gray[y:y + h, x:x + w]
            s = cv2.resize(gray, (92, 112))
            Snap(s)
def doPCA(data, dimensions=2):
    from sklearn.decomposition import RandomizedPCA
    model = RandomizedPCA(n_components=dimensions)
    model.fit(data)
    return model
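A small usage sketch for doPCA; the input array below is a made-up placeholder, not data from any of the snippets above:

import numpy as np

data = np.random.rand(100, 10)            # hypothetical dataset: 100 samples, 10 features
model = doPCA(data, dimensions=2)
reduced = model.transform(data)           # project onto the first two principal components
print(model.explained_variance_ratio_)    # variance captured by each retained component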