from scikits.learn.decomposition import PCA


class PCAMODEL(object):
    """Thin wrapper around PCA for dimensionality reduction."""

    def __init__(self, n='mle', X=None, Y=None):
        # n='mle' lets PCA pick the number of components automatically
        self.n_components = n
        self.trainX = X
        self.trainY = Y
        self.testX = None
        self.model = None

    def build_model(self):
        # fit PCA on the training data
        self.model = PCA(self.n_components)
        self.model.fit(self.trainX)

    def reduce_dim(self, data):
        # project data onto the learned principal components
        return self.model.transform(data)
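# Minimal usage sketch for the wrapper above (assumed usage; the random data
# is purely illustrative and not from the original code):
import numpy as np

train_data = np.random.randn(100, 10)  # 100 samples, 10 features
new_data = np.random.randn(5, 10)

pca_model = PCAMODEL(n=3, X=train_data)
pca_model.build_model()
reduced = pca_model.reduce_dim(new_data)
print reduced.shape  # (5, 3)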
def PDBpca(pdblist_file, npcs=5, refPDB_file=None):
    # read pdblist file and fit each structure to refPDB
    pdbdata, miscs, rmsds = pynumpdb.readPDBlist(pdblist_file, refPDB_file)
    # run PCA, keeping only the first npcs components
    #v, P, PC = pynumpdb._pca.pca_train(pdbdata, npcs, do_norm=0)
    pca = PCA(n_components=npcs)
    pca.fit(pdbdata)
    v = pca.explained_variance_
    P = pca.components_
    PC = pca.transform(pdbdata)
    print v
    print P
    print len(PC), len(PC[0])
    #print PC.T
    return v, P, PC
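# Sketch (assumed usage; both file names below are hypothetical):
#   v, P, PC = PDBpca('structures.list', npcs=5, refPDB_file='reference.pdb')
# v holds the per-component variances, P the principal axes, and PC the
# projection of each structure onto those axes.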
import numpy as np
import pylab as pl

from scikits.learn.decomposition import PCA, FastICA

###############################################################################
# Generate sample data
S = np.random.standard_t(1.5, size=(2, 10000))
S[0] *= 2.

# Mix data
A = [[1, 1], [0, 2]]  # Mixing matrix
X = np.dot(A, S)  # Generate observations

pca = PCA()
S_pca_ = pca.fit(X.T).transform(X.T).T

ica = FastICA()
S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

S_ica_ /= S_ica_.std(axis=1)[:, np.newaxis]

###############################################################################
# Plot results


def plot_samples(S, axis_list=None):
    pl.scatter(S[0], S[1], s=2, marker='o', linewidths=0, zorder=10)
    if axis_list is not None:
        colors = [(0, 0.6, 0), (0.6, 0, 0)]
#!/usr/bin/env python
import os

import numpy

from scikits.learn.decomposition import PCA
from ift6266h12.utils.ift6266h12_io import load_train_input, \
    load_test_input, load_valid_input

dest_path = '/data/lisa/data/UTLC/pca'

# load the normalized sylvester splits
trainset = load_train_input('sylvester', normalize=True)
testset = load_test_input('sylvester', normalize=True)
validset = load_valid_input('sylvester', normalize=True)

# fit a 32-component PCA on the training set only, then project all splits
pca = PCA(32)
pca.fit(trainset)
numpy.save(os.path.join(dest_path, 'sylvester_train_x_pca32.npy'),
           pca.transform(trainset))
numpy.save(os.path.join(dest_path, 'sylvester_valid_x_pca32.npy'),
           pca.transform(validset))
numpy.save(os.path.join(dest_path, 'sylvester_test_x_pca32.npy'),
           pca.transform(testset))
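# Sketch (assumed follow-up, not part of the original script): downstream
# code can reload the projected features with numpy.load
train_x_pca32 = numpy.load(os.path.join(dest_path,
                                        'sylvester_train_x_pca32.npy'))
print train_x_pca32.shape  # (n_train_samples, 32)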
import numpy as np
import pylab as pl

# assumed imports (modern releases expose these as sklearn.decomposition)
from scikits.learn.decomposition import PCA, KernelPCA


def generate_clusters(n_samples=200):
    mean1 = np.array([0, 2])
    mean2 = np.array([2, 0])
    cov = np.array([[2.0, 1.0], [1.0, 2.0]])
    X_red = np.random.multivariate_normal(mean1, cov, n_samples)
    X_blue = np.random.multivariate_normal(mean2, cov, n_samples)
    return np.vstack((X_red, X_blue))


X = generate_rings()
#X = generate_clusters()

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=0.5)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
pca = PCA()
X_pca = pca.fit_transform(X)

# Plot results
pl.figure()
pl.subplot(2, 2, 1, aspect='equal')
pl.title("Original space")
pl.plot(X[:200, 0], X[:200, 1], "ro")
pl.plot(X[200:, 0], X[200:, 1], "bo")
pl.xlabel("$x_1$")
pl.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-6, 6, 50), np.linspace(-6, 6, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
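# generate_rings is called above but not defined in this excerpt. A minimal
# sketch of what it could look like, assuming it mirrors generate_clusters
# and returns 2 * n_samples stacked 2-D points (two concentric noisy rings):
def generate_rings(n_samples=200, noise=0.1):
    # uniform angles around each ring
    t_inner = 2 * np.pi * np.random.rand(n_samples)
    t_outer = 2 * np.pi * np.random.rand(n_samples)
    # inner ring of radius 1, outer ring of radius 3, plus Gaussian jitter
    X_inner = np.c_[np.cos(t_inner), np.sin(t_inner)] \
        + noise * np.random.randn(n_samples, 2)
    X_outer = 3 * np.c_[np.cos(t_outer), np.sin(t_outer)] \
        + noise * np.random.randn(n_samples, 2)
    return np.vstack((X_inner, X_outer))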
import pylab as pl

from scikits.learn import datasets
from scikits.learn.decomposition import PCA
from scikits.learn.lda import LDA

iris = datasets.load_iris()
X = iris.data
y = iris.target
target_names = iris.target_names
print target_names

pca = PCA(n_components=2)
X_r = pca.fit(X).transform(X)

lda = LDA(n_components=2)
X_r2 = lda.fit(X, y).transform(X)

# Percentage of variance explained by each component
print 'explained variance ratio (first two components):', \
    pca.explained_variance_ratio_

pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('PCA of IRIS dataset')
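# Sketch (assumed, not in the original excerpt): X_r2 is computed above but
# never plotted; the matching LDA figure follows the same plotting pattern.
pl.figure()
for c, i, target_name in zip("rgb", [0, 1, 2], target_names):
    pl.scatter(X_r2[y == i, 0], X_r2[y == i, 1], c=c, label=target_name)
pl.legend()
pl.title('LDA of IRIS dataset')
pl.show()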
    'box1.npy': 4,
    'box2.npy': 4,
    'box3.npy': 4,
    'box4.npy': 4,
    'box5.npy': 4,
    'bottle1.npy': 3,
    'bottle2.npy': 3,
    'bottle3.npy': 3,
    'bottle4.npy': 3,
    'bottle5.npy': 3
}

X, Y = load_and_pack_data(dnames, 20, 20)

# PCA and Kernel PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X)
print 'done simple pca'

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True)
X_kpca = kpca.fit_transform(X)
print 'fitted kernel pca'
X_back = kpca.inverse_transform(X_kpca)
print 'done back transforming with kpca'

# plots
reds = Y == 1
blues = Y == 2
greens = Y == 3
magentas = Y == 4
yellows = Y == 5
print "n_digits: %d" % n_digits print "n_features: %d" % n_features print "n_samples: %d" % n_samples print print "Raw k-means with k-means++ init..." t0 = time() km = KMeans(init='k-means++', k=n_digits, n_init=10).fit(data) print "done in %0.3fs" % (time() - t0) print "inertia: %f" % km.inertia_ print print "Raw k-means with random centroid init..." t0 = time() km = KMeans(init='random', k=n_digits, n_init=10).fit(data) print "done in %0.3fs" % (time() - t0) print "inertia: %f" % km.inertia_ print print "Raw k-means with PCA-based centroid init..." # in this case the seeding of the centers is deterministic, hence we run the # kmeans algorithm only once with n_init=1 t0 = time() pca = PCA(n_components=n_digits).fit(data) km = KMeans(init=pca.components_, k=n_digits, n_init=1).fit(data) print "done in %0.3fs" % (time() - t0) print "inertia: %f" % km.inertia_ print
""" print __doc__ import pylab as pl from scikits.learn import datasets from scikits.learn.decomposition import PCA from scikits.learn.lda import LDA iris = datasets.load_iris() X = iris.data y = iris.target target_names = iris.target_names pca = PCA(n_components=2) X_r = pca.fit(X).transform(X) lda = LDA(n_components=2) X_r2 = lda.fit(X, y).transform(X) # Percentage of variance explained for each components print 'explained variance ratio (first two components):', \ pca.explained_variance_ratio_ pl.figure() for c, i, target_name in zip("rgb", [0, 1, 2], target_names): pl.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=target_name) pl.legend() pl.title('PCA of IRIS dataset')
import numpy as np
import pylab as pl

from scikits.learn.decomposition import PCA, FastICA

###############################################################################
# Generate sample data
S = np.random.standard_t(1.5, size=(10000, 2))
S[:, 0] *= 2.

# Mix data
A = np.array([[1, 1], [0, 2]])  # Mixing matrix
X = np.dot(S, A.T)  # Generate observations

pca = PCA()
S_pca_ = pca.fit(X).transform(X)

ica = FastICA()
S_ica_ = ica.fit(X).transform(X)  # Estimate the sources

S_ica_ /= S_ica_.std(axis=0)

###############################################################################
# Plot results


def plot_samples(S, axis_list=None):
    pl.scatter(S[:, 0], S[:, 1], s=2, marker='o', linewidths=0, zorder=10)
    if axis_list is not None:
        colors = [(0, 0.6, 0), (0.6, 0, 0)]
import numpy as np
import pylab as pl

# assumed imports (modern releases expose these as sklearn.decomposition)
from scikits.learn.decomposition import PCA, KernelPCA


def generate_clusters(n_samples=200):
    mean1 = np.array([0, 2])
    mean2 = np.array([2, 0])
    cov = np.array([[2.0, 1.0], [1.0, 2.0]])
    X_red = np.random.multivariate_normal(mean1, cov, n_samples)
    X_blue = np.random.multivariate_normal(mean2, cov, n_samples)
    return np.vstack((X_red, X_blue))


X = generate_rings()  # see the generate_rings sketch above
#X = generate_clusters()

kpca = KernelPCA(kernel="rbf", fit_inverse_transform=True)
X_kpca = kpca.fit_transform(X)
X_back = kpca.inverse_transform(X_kpca)
pca = PCA()
X_pca = pca.fit_transform(X)

# Plot results
pl.figure()
pl.subplot(2, 2, 1, aspect='equal')
pl.title("Original space")
pl.plot(X[:200, 0], X[:200, 1], "ro")
pl.plot(X[200:, 0], X[200:, 1], "bo")
pl.xlabel("$x_1$")
pl.ylabel("$x_2$")

X1, X2 = np.meshgrid(np.linspace(-6, 6, 50), np.linspace(-6, 6, 50))
X_grid = np.array([np.ravel(X1), np.ravel(X2)]).T
# projection on the first principal component (in the phi space)
def main_train(work_dir="../results/avicenna/", corruption_level=0.3,
               nvis=75, nhid=600, tied_weights=True, act_enc="sigmoid",
               act_dec=None, max_epochs=2, learning_rate=0.001,
               batch_size=20, monitoring_batches=5, save_freq=1,
               n_components_trans_pca=7):

    conf = {
        'corruption_level': corruption_level,
        'nvis': nvis,
        'nhid': nhid,
        'tied_weights': tied_weights,
        'act_enc': act_enc,
        'act_dec': act_dec,
        'max_epochs': max_epochs,
        'learning_rate': learning_rate,
        'batch_size': batch_size,
        'monitoring_batches': monitoring_batches,
        'save_freq': save_freq,
        'n_components_trans_pca': n_components_trans_pca
    }

    start = time.clock()

    ############### TRAIN THE DAE
    train_file = work_dir + "train_pca" + str(conf['nvis']) + ".npy"
    save_path = work_dir + "train_pca" + str(conf['nvis']) + "_dae" + \
        str(conf['nhid']) + "_model.pkl"

    trainset = NpyDataset(file=train_file)
    trainset.yaml_src = 'script'
    corruptor = BinomialCorruptor(corruption_level=conf['corruption_level'])
    dae = DenoisingAutoencoder(nvis=conf['nvis'], nhid=conf['nhid'],
                               tied_weights=conf['tied_weights'],
                               corruptor=corruptor,
                               act_enc=conf['act_enc'],
                               act_dec=conf['act_dec'])
    cost = MeanSquaredReconstructionError()
    termination_criterion = EpochCounter(max_epochs=conf['max_epochs'])
    algorithm = UnsupervisedExhaustiveSGD(
        learning_rate=conf['learning_rate'],
        batch_size=conf['batch_size'],
        monitoring_batches=conf['monitoring_batches'],
        monitoring_dataset=trainset,
        cost=cost,
        termination_criterion=termination_criterion)
    train_obj = Train(dataset=trainset, model=dae, algorithm=algorithm,
                      save_freq=conf['save_freq'], save_path=save_path)
    train_obj.main_loop()

    ############### APPLY THE MODEL ON THE TRAIN DATASET
    print("Applying the model on the train dataset...")
    model = load(save_path)
    save_train_path = work_dir + "train_pca" + str(conf['nvis']) + \
        "_dae" + str(conf['nhid']) + ".npy"
    dump_obj = FeatureDump(encoder=model, dataset=trainset,
                           path=save_train_path)
    dump_obj.main_loop()

    ############### APPLY THE MODEL ON THE VALID DATASET
    print("Applying the model on the valid dataset...")
    valid_file = work_dir + "valid_pca" + str(conf['nvis']) + ".npy"
    validset = NpyDataset(file=valid_file)
    validset.yaml_src = 'script'
    save_valid_path = work_dir + "valid_pca" + str(conf['nvis']) + \
        "_dae" + str(conf['nhid']) + ".npy"
    dump_obj = FeatureDump(encoder=model, dataset=validset,
                           path=save_valid_path)
    dump_obj.main_loop()

    ############### APPLY THE MODEL ON THE TEST DATASET
    print("Applying the model on the test dataset...")
    test_file = work_dir + "test_pca" + str(conf['nvis']) + ".npy"
    testset = NpyDataset(file=test_file)
    testset.yaml_src = 'script'
    save_test_path = work_dir + "test_pca" + str(conf['nvis']) + "_dae" + \
        str(conf['nhid']) + ".npy"
    dump_obj = FeatureDump(encoder=model, dataset=testset,
                           path=save_test_path)
    dump_obj.main_loop()

    ############### COMPUTE THE ALC SCORE ON VALIDATION SET
    valid_data = ift6266h12.load_npy(save_valid_path)
    label_data = ift6266h12.load_npy(
        '/data/lisa/data/UTLC/numpy_data/avicenna_valid_y.npy')
    alc_1 = score(valid_data, label_data)

    ############### APPLY THE TRANSDUCTIVE PCA
    test_data = ift6266h12.load_npy(save_test_path)
    trans_pca = PCA(n_components=conf['n_components_trans_pca'])
    final_valid = trans_pca.fit_transform(valid_data)
    final_test = trans_pca.fit_transform(test_data)

    save_valid_path = work_dir + "valid_pca" + str(conf['nvis']) + \
        "_dae" + str(conf['nhid']) + "_tpca" + \
        str(conf['n_components_trans_pca']) + ".npy"
    save_test_path = work_dir + "test_pca" + str(conf['nvis']) + "_dae" + \
        str(conf['nhid']) + "_tpca" + \
        str(conf['n_components_trans_pca']) + ".npy"

    np.save(save_valid_path, final_valid)
    np.save(save_test_path, final_test)

    ############### COMPUTE THE NEW ALC SCORE ON VALIDATION SET
    alc_2 = score(final_valid, label_data)

    ############### OUTPUT AND RETURN THE RESULTS
    timeSpent = ((time.clock() - start) / 60.)
    print 'FINAL RESULTS (PCA-' + str(conf['nvis']) + ' DAE-' + \
        str(conf['nhid']) + ' TransPCA-' + \
        str(conf['n_components_trans_pca']) + ') ALC after DAE: ', alc_1, \
        ' FINAL ALC: ', alc_2, ' Computed in %5.2f min' % (timeSpent)

    return timeSpent, alc_1, alc_2
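# Sketch (assumed driver, not in the original file): run one configuration
# with the default hyperparameters and report the scores.
if __name__ == '__main__':
    time_spent, alc_after_dae, final_alc = main_train(
        work_dir="../results/avicenna/",
        nvis=75, nhid=600, max_epochs=2,
        n_components_trans_pca=7)
    print 'ALC after DAE: %f, final ALC: %f (computed in %.2f min)' % (
        alc_after_dae, final_alc, time_spent)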