def compute_pca(data_path=os.path.join(BASE_DIR, 'data/memmap/'), out_path=os.path.join(BASE_DIR, 'data/'),
                batch_size=500, image_size=3 * 300 * 300):
    ipca = IncrementalPCA(n_components=3, batch_size=batch_size)

    path = os.path.join(data_path, 'tn_x.dat')
    train = np.memmap(path, dtype=theano.config.floatX, mode='r+', shape=(4044, image_size))
    n_samples, _ = train.shape
    for batch_num, batch in enumerate(gen_batches(n_samples, batch_size)):
        X = train[batch, :]
        X = np.reshape(X, (X.shape[0], 3, int(image_size / 3)))
        X = X.transpose(0, 2, 1)
        X = np.reshape(X, (reduce(np.multiply, X.shape[:2]), 3))
        ipca.partial_fit(X)

    path = os.path.join(data_path, 'v_x.dat')
    valid = np.memmap(path, dtype=theano.config.floatX, mode='r+', shape=(500, image_size))
    n_samples, _ = valid.shape
    for batch_num, batch in enumerate(gen_batches(n_samples, batch_size)):
        X = valid[batch, :]
        X = np.reshape(X, (X.shape[0], 3, int(image_size / 3)))
        X = X.transpose(0, 2, 1)
        X = np.reshape(X, (reduce(np.multiply, X.shape[:2]), 3))
        ipca.partial_fit(X)

    eigenvalues, eigenvectors = np.linalg.eig(ipca.get_covariance())
    eigenvalues.astype('float32').dump(os.path.join(out_path, 'eigenvalues.dat'))
    eigenvectors.astype('float32').dump(os.path.join(out_path, 'eigenvectors.dat'))
def reduceDataset(self,nr=3,method='PCA'): '''It reduces the dimensionality of a given dataset using different techniques provided by Sklearn library Methods available: 'PCA' 'FactorAnalysis' 'KPCArbf','KPCApoly' 'KPCAcosine','KPCAsigmoid' 'IPCA' 'FastICADeflation' 'FastICAParallel' 'Isomap' 'LLE' 'LLEmodified' 'LLEltsa' ''' dataset=self.ModelInputs['Dataset'] #dataset=self.dataset[Model.in_columns] #dataset=self.dataset[['Humidity','TemperatureF','Sea Level PressureIn','PrecipitationIn','Dew PointF','Value']] #PCA if method=='PCA': sklearn_pca = sklearnPCA(n_components=nr) reduced = sklearn_pca.fit_transform(dataset) #Factor Analysis elif method=='FactorAnalysis': fa=FactorAnalysis(n_components=nr) reduced=fa.fit_transform(dataset) #kernel pca with rbf kernel elif method=='KPCArbf': kpca=KernelPCA(nr,kernel='rbf') reduced=kpca.fit_transform(dataset) #kernel pca with poly kernel elif method=='KPCApoly': kpca=KernelPCA(nr,kernel='poly') reduced=kpca.fit_transform(dataset) #kernel pca with cosine kernel elif method=='KPCAcosine': kpca=KernelPCA(nr,kernel='cosine') reduced=kpca.fit_transform(dataset) #kernel pca with sigmoid kernel elif method=='KPCAsigmoid': kpca=KernelPCA(nr,kernel='sigmoid') reduced=kpca.fit_transform(dataset) #ICA elif method=='IPCA': ipca=IncrementalPCA(nr) reduced=ipca.fit_transform(dataset) #Fast ICA elif method=='FastICAParallel': fip=FastICA(nr,algorithm='parallel') reduced=fip.fit_transform(dataset) elif method=='FastICADeflation': fid=FastICA(nr,algorithm='deflation') reduced=fid.fit_transform(dataset) elif method == 'All': self.dimensionalityReduction(nr=nr) return self self.ModelInputs.update({method:reduced}) self.datasetsAvailable.append(method) return self
def ipca(mov, components=50, batch=1000):
    # vectorize the images
    num_frames, h, w = mov.shape
    frame_size = h * w
    frame_samples = np.reshape(mov, (num_frames, frame_size)).T

    # run IPCA to approximate the SVD
    ipca_f = IncrementalPCA(n_components=components, batch_size=batch)
    ipca_f.fit(frame_samples)

    # construct the reduced version of the movie vectors using only the
    # principal component projection
    proj_frame_vectors = ipca_f.inverse_transform(ipca_f.transform(frame_samples))

    # get the temporal principal components (pixel time series) and
    # associated singular values
    eigenseries = ipca_f.components_.T

    # the rows of eigenseries are approximately orthogonal
    # so we can approximately obtain eigenframes by multiplying the
    # projected frame matrix by this transpose on the right
    eigenframes = np.dot(proj_frame_vectors, eigenseries)

    return eigenseries, eigenframes, proj_frame_vectors
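# --- Usage sketch (not from the original source): exercises the `ipca` helper above on a
# --- synthetic movie; the array shape and parameter values are illustrative assumptions only.
import numpy as np

rng = np.random.RandomState(0)
mov = rng.rand(200, 32, 32).astype(np.float32)  # 200 frames of 32x32 pixels

# keep 10 components; batch_size must be >= n_components
eigenseries, eigenframes, proj = ipca(mov, components=10, batch=50)

print(eigenseries.shape)  # (200, 10): one temporal series per component
print(eigenframes.shape)  # (1024, 10): one spatial map per component
print(proj.shape)         # (1024, 200): rank-10 reconstruction of the frame matrix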
def get_pca_array(list_chunks, topology):
    """
    Takes a list of mdtraj.Trajectory objects, featurizes them into backbone
    alpha-carbon pairwise distances, and performs 2-component incremental PCA
    on the featurized trajectory.

    Parameters
    ----------
    list_chunks: list of mdtraj.Trajectory objects
    topology: str
        Name of the topology file

    Returns
    -------
    Y: np.array, shape (frames, features)
    """
    pca = IncrementalPCA(n_components=2)
    top = md.load_prmtop(topology)
    ca_backbone = top.select("name CA")
    pairs = top.select_pairs(ca_backbone, ca_backbone)
    pair_distances = []
    for chunk in list_chunks:
        X = md.compute_distances(chunk, pairs)
        pair_distances.append(X)
    distance_array = np.concatenate(pair_distances)
    print("No. of data points: %d" % distance_array.shape[0])
    print("No. of features (pairwise distances): %d" % distance_array.shape[1])
    Y = pca.fit_transform(distance_array)
    return Y
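# --- Usage sketch (not from the original source): how get_pca_array above might be driven with
# --- mdtraj's iterload to read a trajectory in chunks; the file names are placeholders.
import mdtraj as md

trajectory_file = 'traj.nc'       # hypothetical trajectory file
topology_file = 'system.prmtop'   # hypothetical AMBER topology

# read 500-frame chunks so the full trajectory never sits in memory at once,
# then featurize and project onto the first two principal components
chunks = list(md.iterload(trajectory_file, chunk=500, top=topology_file))
Y = get_pca_array(chunks, topology_file)
print(Y.shape)  # (n_frames, 2)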
def ipca():
    train_features, test_features = gf.get_tfidf()
    vectorizer = gf.get_tfidf()
    n_components = 250
    ipca = IncrementalPCA(n_components=n_components, batch_size=1250)

    start_time = time.time()
    print 'start ipca on train'
    X_ipca = ipca.fit_transform(train_features)
    runtime = time.time() - start_time
    print '-----'
    print '%.2f seconds to ipca on train' % runtime
    print '-----'
    train_features = None
    print 'ipca train done'
    np.savetxt('train_features.csv', X_ipca, fmt='%.8e', delimiter=",")
    X_ipca = None
    print 'ipca train file done'

    test_features = gf.get_tfidf(vectorizer, False)
    Y_ipca = ipca.fit_transform(test_features)
    test_features, vectorizer = None, None
    print 'ipca test done'
    np.savetxt('test_features.csv', Y_ipca, fmt='%.8e', delimiter=",")
    svd_test_features = None
    print 'ipca test file done'
def dimensionalityReduction(self, nr=5):
    '''It applies all the dimensionality reduction techniques available in this class:
    Techniques available:
            'PCA'
            'FactorAnalysis'
            'KPCArbf', 'KPCApoly'
            'KPCAcosine', 'KPCAsigmoid'
            'IPCA'
            'FastICADeflation'
            'FastICAParallel'
            'Isomap'
            'LLE'
            'LLEmodified'
            'LLEltsa'
    '''
    dataset = self.ModelInputs['Dataset']
    sklearn_pca = sklearnPCA(n_components=nr)
    p_components = sklearn_pca.fit_transform(dataset)
    fa = FactorAnalysis(n_components=nr)
    factors = fa.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='rbf')
    rbf = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='poly')
    poly = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='cosine')
    cosine = kpca.fit_transform(dataset)
    kpca = KernelPCA(nr, kernel='sigmoid')
    sigmoid = kpca.fit_transform(dataset)
    ipca = IncrementalPCA(nr)
    i_components = ipca.fit_transform(dataset)
    fip = FastICA(nr, algorithm='parallel')
    fid = FastICA(nr, algorithm='deflation')
    ficaD = fid.fit_transform(dataset)
    ficaP = fip.fit_transform(dataset)
    '''isomap=Isomap(n_components=nr).fit_transform(dataset)
    try:
        lle1=LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
    except ValueError:
        lle1=LocallyLinearEmbedding(n_components=nr,eigen_solver='dense').fit_transform(dataset)
    try:
        lle2=LocallyLinearEmbedding(n_components=nr,method='modified').fit_transform(dataset)
    except ValueError:
        lle2=LocallyLinearEmbedding(n_components=nr,method='modified',eigen_solver='dense').fit_transform(dataset)
    try:
        lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa').fit_transform(dataset)
    except ValueError:
        lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa',eigen_solver='dense').fit_transform(dataset)'''
    values = [p_components, factors, rbf, poly, cosine, sigmoid, i_components, ficaD, ficaP]  # ,isomap,lle1,lle2,lle3]
    keys = ['PCA', 'FactorAnalysis', 'KPCArbf', 'KPCApoly', 'KPCAcosine', 'KPCAsigmoid', 'IPCA',
            'FastICADeflation', 'FastICAParallel']  # ,'Isomap','LLE','LLEmodified','LLEltsa']
    self.ModelInputs.update(dict(zip(keys, values)))
    [self.datasetsAvailable.append(key) for key in keys]
    # debug
    # dataset=pd.DataFrame(self.ModelInputs['Dataset'])
    # dataset['Output']=self.ModelOutput
    # self.debug['Dimensionalityreduction']=dataset
    ###
    return self
def get_pca(file_dir, s, t, i):
    from sklearn.decomposition import IncrementalPCA
    ipca = IncrementalPCA(n_components=48)
    for counter in range(s, t, i):
        features_file = np.load(file_dir + "/pca" + str(counter) + "_code.npy")
        ipca.partial_fit(features_file[:, 0:4096])
    return ipca
def test_incremental_pca_num_features_change():
    """Test that changing the number of features raises an error."""
    rng = np.random.RandomState(1999)
    n_samples = 100
    X = rng.randn(n_samples, 20)
    X2 = rng.randn(n_samples, 50)
    ipca = IncrementalPCA(n_components=None)
    ipca.fit(X)
    assert_raises(ValueError, ipca.partial_fit, X2)
def train_pca(file_dir, s, t, i):
    from sklearn.decomposition import IncrementalPCA
    global timer_pca
    timer_pca = Timer()
    timer_pca.tic()
    ipca = IncrementalPCA(n_components=pca_dimensions)
    for counter in range(s, t, i):
        features_file = np.load(file_dir + '/pca' + str(counter) + '_code.npy')
        ipca.partial_fit(features_file[:, 0:4096])
    timer_pca.toc()
    return ipca
def create_pool_pca_from_files(file_dir, dir_output, s, t, i):
    from sklearn.decomposition import IncrementalPCA
    ipca = IncrementalPCA(n_components=number_dim_pca)
    for counter in range(s, t, i):
        features_file = np.load(file_dir + '/pca' + str(counter) + '_code.npy')
        ipca.partial_fit(features_file[:, 0:4096])
    for counter in range(s, t, i):
        out_file = dir_output + 'pca_red_' + str(counter) + '_code.npy'
        features_file = np.load(file_dir + '/pca' + str(counter) + '_code.npy')
        features_red = ipca.transform(features_file[:, 0:4096])
        np.save(out_file, np.append(features_red, features_file[:, 4096:], axis=1))
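# --- Sketch (not from the original source): the three helpers above share the same two-pass
# --- pattern -- partial_fit over feature chunks, then transform each chunk with the fitted model.
# --- The chunk sizes and the 4096-column feature width below are illustrative assumptions.
import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
# pretend each "file" holds 4096 feature columns plus 2 extra metadata columns
chunks = [rng.rand(200, 4096 + 2).astype(np.float32) for _ in range(5)]

ipca = IncrementalPCA(n_components=48)

# pass 1: incrementally fit on the feature columns of every chunk
for chunk in chunks:
    ipca.partial_fit(chunk[:, 0:4096])

# pass 2: project each chunk and re-attach its metadata columns
reduced = [np.append(ipca.transform(c[:, 0:4096]), c[:, 4096:], axis=1) for c in chunks]
print(reduced[0].shape)  # (200, 50)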
def ipca(data, labels, new_dimension):
    print "start incremental pca..."
    if hasattr(data, "todense"):
        data = np.array(data.todense())
    start = time.time()
    pca = IncrementalPCA(n_components=new_dimension)
    reduced = pca.fit_transform(data)
    end = time.time()
    return (reduced, end - start)
def PCA_Train(data, result_fold, n_components=128):
    print_info("PCA training (n_components=%d)..." % n_components)
    pca = IncrementalPCA(n_components=n_components)
    pca.fit(data)
    joblib.dump(pca, result_fold + "pca_model.m")
    print_info("PCA done.")
    return pca
def train_pca_model(collection_name, feature_name, n_components, iterations=100, batch_size=20):
    collection = collection_from_name(collection_name)
    model = IncrementalPCA(n_components=n_components)
    partial_unpickle_data = partial(unpickle_data, feature_name=feature_name)
    for _ in range(iterations):
        feature = map(partial_unpickle_data,
                      collection.aggregate([{'$sample': {'size': batch_size}}]))
        feature = np.hstack(feature).T
        model.partial_fit(feature)
    return model
def test_incremental_pca_inverse():
    """Test that the projection of data can be inverted."""
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
    Y = ipca.transform(X)
    Y_inverse = ipca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)
def test_singular_values(): # Check that the IncrementalPCA output has the correct singular values rng = np.random.RandomState(0) n_samples = 1000 n_features = 100 X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng) pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X) ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X) assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2) # Compare to the Frobenius norm X_pca = pca.transform(X) X_ipca = ipca.transform(X) assert_array_almost_equal(np.sum(pca.singular_values_**2.0), np.linalg.norm(X_pca, "fro")**2.0, 12) assert_array_almost_equal(np.sum(ipca.singular_values_**2.0), np.linalg.norm(X_ipca, "fro")**2.0, 2) # Compare to the 2-norms of the score vectors assert_array_almost_equal(pca.singular_values_, np.sqrt(np.sum(X_pca**2.0, axis=0)), 12) assert_array_almost_equal(ipca.singular_values_, np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2) # Set the singular values and see what we get back rng = np.random.RandomState(0) n_samples = 100 n_features = 110 X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng) pca = PCA(n_components=3, svd_solver='full', random_state=rng) ipca = IncrementalPCA(n_components=3, batch_size=100) X_pca = pca.fit_transform(X) X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) X_pca[:, 0] *= 3.142 X_pca[:, 1] *= 2.718 X_hat = np.dot(X_pca, pca.components_) pca.fit(X_hat) ipca.fit(X_hat) assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14) assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
def generate_pca_compression(X, n_components=16, batch_size=100):
    """
    Compresses the data using sklearn's incremental PCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep
    :param batch_size: Batch size for incremental PCA
    :return: X_prime (the compressed representation), pca
    """
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    pca.fit(X)
    return pca.transform(X), pca
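# --- Usage sketch (not from the original source) for generate_pca_compression above;
# --- the random data and shapes are arbitrary.
import numpy as np

rng = np.random.RandomState(42)
X = rng.rand(1000, 64)

X_prime, pca = generate_pca_compression(X, n_components=16, batch_size=200)
print(X_prime.shape)                        # (1000, 16)
print(pca.explained_variance_ratio_.sum())  # fraction of variance kept by 16 components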
def __init__(self, components):
    PCAnalyzer.__init__(self)
    if isinstance(components, int):
        self.n_components = components
        self.pca = IncrementalPCA(n_components=components, batch_size=500)
        self.num_seen = 0
        self.type = 'incremental'
def test_n_components_none():
    # Ensures that n_components == None is handled correctly
    rng = np.random.RandomState(1999)
    for n_samples, n_features in [(50, 10), (10, 50)]:
        X = rng.rand(n_samples, n_features)
        ipca = IncrementalPCA(n_components=None)

        # First partial_fit call, ipca.n_components_ is inferred from
        # min(X.shape)
        ipca.partial_fit(X)
        assert ipca.n_components_ == min(X.shape)

        # Second partial_fit call, ipca.n_components_ is inferred from
        # ipca.components_ computed from the first partial_fit call
        ipca.partial_fit(X)
        assert ipca.n_components_ == ipca.components_.shape[0]
def test_incremental_pca_partial_fit():
    """Test that fit and partial_fit get equivalent results."""
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= .00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    batch_size = 10
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X)
    pipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    # Add one to make sure endpoint is included
    batch_itr = np.arange(0, n + 1, batch_size)
    for i, j in zip(batch_itr[:-1], batch_itr[1:]):
        pipca.partial_fit(X[i:j, :])
    assert_almost_equal(ipca.components_, pipca.components_, decimal=3)
class MyPCA:
    def __init__(self, filename=None):
        if not filename:
            self.model = IncrementalPCA(NUM_COMP)
        else:
            # pickle files must be opened in binary mode
            with open(filename, 'rb') as f:
                self.model = pickle.load(f)

    def train(self, X):
        self.model.partial_fit(X)

    def transform(self, X):
        return self.model.transform(X)

    def dump(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self.model, f)
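# --- Usage sketch (not from the original source) for the MyPCA wrapper above; NUM_COMP is
# --- assumed to be a module-level constant and is defined here only for illustration.
import numpy as np

NUM_COMP = 10

rng = np.random.RandomState(0)
batches = [rng.rand(50, 100) for _ in range(4)]  # 4 batches of 50 samples x 100 features

pca = MyPCA()
for batch in batches:
    pca.train(batch)        # incremental fit, one batch at a time

Z = pca.transform(batches[0])
print(Z.shape)              # (50, 10)

pca.dump('pca_model.pkl')   # persist the fitted model
restored = MyPCA('pca_model.pkl')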
def test_whitening():
    """Test that PCA and IncrementalPCA transforms match to sign flip."""
    X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
                                      effective_rank=2, random_state=1999)
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc, batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)

        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
def run_pca(n_components,n_sites,order_dict,sim_mat): output_file = open('pca_100000_100','w') ipca = IncrementalPCA(n_components=n_components,batch_size=8000) sim_mat_ipca = ipca.fit_transform(sim_mat) var_sim_ipca = ipca.explained_variance_ratio_ output_file.write(",".join(str(x) for x in var_sim_ipca)+'\n') for siteid in order_dict: stringa = ' '.join( [siteid, str(sim_mat_ipca[order_dict[siteid], 0]), str(sim_mat_ipca[order_dict[siteid], 1]), str(sim_mat_ipca[order_dict[siteid], 2]), str(sim_mat_ipca[order_dict[siteid], 3]), str(sim_mat_ipca[order_dict[siteid], 4]), str(sim_mat_ipca[order_dict[siteid], 5]), str(sim_mat_ipca[order_dict[siteid], 6]) ]) output_file.write(stringa +'\n') n_bins = 1000. binned = np.empty((n_sites,5)).astype(np.int32) for k in range(5): delta = (sim_mat_ipca[:, k].max()-sim_mat_ipca[:, k].min())/n_bins min_k = sim_mat_ipca[:, k].min() for i in range(n_sites): binned[i,k] = int((sim_mat_ipca[i, k]-min_k)/delta) f = open('pc_100000_100.csv','w') for siteid in order_dict: stringa = ' '.join( [siteid, str(binned[order_dict[siteid], 0]), str(binned[order_dict[siteid], 1]), str(binned[order_dict[siteid], 2]), str(binned[order_dict[siteid], 3]), str(binned[order_dict[siteid], 4]) ]) f.write(stringa +'\n') f.close()
def reduce_data(features, out_dir, dim=10, first_column=True):
    array = np.load(features)
    subarray = array
    if not first_column:
        subarray = array[:, 1:]
    ipca = IncrementalPCA(n_components=dim, copy=False, batch_size=500000)
    new_array = ipca.fit_transform(subarray)
    # when it cannot fit into memory do it incrementally like below
    # new_array_1 = tsvd.fit_transform(subarray[:1500000, :])
    # new_array_2 = tsvd.fit_transform(subarray[1500000:3400000, :])
    # new_array_3 = tsvd.fit_transform(subarray[3400000:, :])
    # new_array = np.vstack([new_array_1, new_array_2, new_array_3])
    if not first_column:
        new_array = np.c_[array[:, 0], new_array]
    assert new_array.shape[0] == array.shape[0]
    np.save(os.path.join(out_dir, os.path.basename(features) + "_pca"), new_array)
def PCA_train(self):
    pcafun = None
    if self.pca is None:
        (a, b) = self.descriptors.shape
        self.pca = IncrementalPCA(n_components=int(b * self.pca_ratio))
        pcafun = self.pca.fit
    else:
        pcafun = self.pca.partial_fit
    pcafun(self.descriptors)
    self.PCA_common()
def ipca(self, X, n_components=100):
    from sklearn.decomposition import IncrementalPCA
    # trials = h5py.File(self.path + "/trials.hdf5", 'r')
    # scaled_meg = trials['scaled_meg']  # it's ok, the dataset is not fetched to memory yet
    # scaled_meeg = trials['scaled_meeg']
    n1 = X.shape[0]  # how many rows we have in the dataset
    chunk_size = 1000  # how many rows we feed to IPCA at a time, the divisor of n
    ipca = IncrementalPCA(n_components=n_components)
    for i in range(0, n1 // chunk_size):
        print("{} to {} out of {}.".format(i * chunk_size, (i + 1) * chunk_size, n1))
        print(X[i * chunk_size: (i + 1) * chunk_size].shape)
        ipca.partial_fit(X[i * chunk_size: (i + 1) * chunk_size])
    x = ipca.transform(X)
    print(x.shape)
    # n_comp = sum(i > 10.0e-05 for i in ipca.explained_variance_ratio_)
    # print(n_comp)
    return x
class PCASK(AbstractFeature):
    def __init__(self, n_components):
        AbstractFeature.__init__(self)
        self.n_components = n_components
        # for key in options:
        #     setattr(self, key, options[key])

    def compute(self, X, y):
        if X.ndim == 3:
            X = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))
        self.ipca = IncrementalPCA(n_components=self.n_components, batch_size=None)
        return self.ipca.fit_transform(X)

    def extract(self, X):
        if X.ndim == 2:
            X = X.reshape((X.shape[0] * X.shape[1]))
        return list(self.ipca.transform([X])[0])

    def __repr__(self):
        return "PCASK"
def project(self, ndim=None): """ Projects the data object given to the constructor onto `ndim` dimensions Parameters ---------- ndim : int The number of dimensions we want to project the data on. Returns ------- dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the projected data Example ------- >>> gw = GWPCA(data) >>> dataproj = gw.project(5) """ from sklearn.decomposition import IncrementalPCA from htmd.progress.progress import ProgressBar from htmd.metricdata import MetricData pca = IncrementalPCA(n_components=ndim, batch_size=10000) p = ProgressBar(len(self.data.dat)) for d in self.data.dat: pca.partial_fit(d * self.weights) p.progress() p.stop() projdata = self.data.copy() p = ProgressBar(len(self.data.dat)) for i, d in enumerate(self.data.dat): projdata.dat[i] = pca.transform(d * self.weights) p.progress() p.stop() # projdataconc = pca.fit_transform(self.weighedconcat) # projdata.dat = projdata.deconcatenate(projdataconc) return projdata
def IPCA(self, components = 50, batch =1000): ''' Iterative Principal Component analysis, see sklearn.decomposition.incremental_pca Parameters: ------------ components (default 50) = number of independent components to return batch (default 1000) = number of pixels to load into memory simultaneously in IPCA. More requires more memory but leads to better fit Returns ------- eigenseries: principal components (pixel time series) and associated singular values eigenframes: eigenframes are obtained by multiplying the projected frame matrix by the projected movie (whitened frames?) proj_frame_vectors:the reduced version of the movie vectors using only the principal component projection ''' # vectorize the images num_frames, h, w = np.shape(self); frame_size = h * w; frame_samples = np.reshape(self, (num_frames, frame_size)).T # run IPCA to approxiate the SVD ipca_f = IncrementalPCA(n_components=components, batch_size=batch) ipca_f.fit(frame_samples) # construct the reduced version of the movie vectors using only the # principal component projection proj_frame_vectors = ipca_f.inverse_transform(ipca_f.transform(frame_samples)) # get the temporal principal components (pixel time series) and # associated singular values eigenseries = ipca_f.components_.T # the rows of eigenseries are approximately orthogonal # so we can approximately obtain eigenframes by multiplying the # projected frame matrix by this transpose on the right eigenframes = np.dot(proj_frame_vectors, eigenseries) return eigenseries, eigenframes, proj_frame_vectors
def PCA(source, num_components, chunk_size):
    image_paths = sorted(listdir(source), key=lambda x: (int(x.split('_')[0]), x.split('_')[1]))
    size, images = 0, []
    n_chunks = len(image_paths) // chunk_size
    pca = IncrementalPCA(n_components=num_components, batch_size=chunk_size)

    # First pass: incrementally fit the PCA model chunk by chunk.
    for i in range(n_chunks):
        print('Chunk:', i, '\tIndex:', i * chunk_size + size)
        while size < chunk_size:
            images.append(imread(source + image_paths[i * chunk_size + size]).flatten())
            size += 1
        pca.partial_fit(np.asarray(images))
        size, images = 0, []
    if i == n_chunks - 1:
        i += 1
        while i * chunk_size + size < len(image_paths):
            images.append(imread(source + image_paths[i * chunk_size + size]).flatten())
            size += 1
        pca.partial_fit(np.asarray(images))
        size, images = 0, []

    # Second pass: transform every chunk with the fitted model.
    xTransformed = None
    for i in range(n_chunks):
        while size < chunk_size:
            images.append(imread(source + image_paths[i * chunk_size + size]).flatten())
            size += 1
        print('Chunk:', i, 'Index:', i * chunk_size + size)
        transformed = pca.transform(np.asarray(images))
        xTransformed = transformed if xTransformed is None else np.vstack((xTransformed, transformed))
        size, images = 0, []
    if i == n_chunks - 1:
        i += 1
        while i * chunk_size + size < len(image_paths):
            images.append(imread(source + image_paths[i * chunk_size + size]).flatten())
            size += 1
        transformed = pca.transform(np.asarray(images))
        xTransformed = np.vstack((xTransformed, transformed))

    print("\nTransformed matrix shape:", xTransformed.shape)
    return xTransformed


if __name__ == "__main__":
    source = './train/right'
    new_size = '32x32'
    pool = Pool()
    start = time.time()
    pool.map(imageResize, zip(itertools.repeat(source), listdir(source), itertools.repeat(new_size)))
    print("Resized images in {0} seconds".format(time.time() - start))
def performPCA(source, num_components, chunk_size): image_paths = sorted(listdir(source), key=lambda x: (int(x.split('_')[0]), x.split('_')[1])) size, images = 0, [] n_chunks = len(image_paths)//chunk_size pca = IncrementalPCA(n_components=num_components, batch_size=chunk_size) # Read in all images and do a partial fit on the PCA model. for i in range(n_chunks): print 'Chunk:', i, 'Index:', i * chunk_size + size while size < chunk_size: images.append(imread(source+image_paths[i * chunk_size + size]).flatten()) size += 1 pca.partial_fit(np.asarray(images)) size, images = 0, [] if i == n_chunks - 1: i += 1 while i * chunk_size + size < len(image_paths): images.append(imread(source+image_paths[i * chunk_size + size]).flatten()) size += 1 pca.partial_fit(np.asarray(images)) # Only works with Python 3 #print("\nExplained variance ratios: {0}".format(pca.explained_variance_ratio_)) #print("Sum of variance captured by components: {0}\n".format(sum(pca.explained_variance_ratio_))) xTransformed = None # Read in all images again and transform them using the PCA model. for i in range(n_chunks): while size < chunk_size: images.append(imread(source+image_paths[i * chunk_size + size]).flatten()) size += 1 print 'Chunk:', i, 'index:', i * chunk_size + size transformed = pca.transform(np.asarray(images)) if xTransformed is None: xTransformed = transformed else: xTransformed = np.vstack((xTransformed, transformed)) size, images = 0, [] if i == n_chunks - 1: i += 1 while i * chunk_size + size < len(image_paths): images.append(imread(source+image_paths[i * chunk_size + size]).flatten()) size += 1 transformed = pca.transform(np.asarray(images)) xTransformed = np.vstack((xTransformed, transformed)) print "\nTransformed matrix shape:", xTransformed.shape return xTransformed
# from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from pprint import pprint
import numpy as np
import pickle
import os
from tqdm import tqdm

diri = '/home/dpappas/bioasq_all/bert_elmo_embeds/'
filename = '/home/dpappas/bioasq_all/pca_elmo_transformer.sav'

mat, m = None, 0
if (not os.path.exists(filename)):
    transformer = IncrementalPCA(n_components=50)
    for f in tqdm(os.listdir(diri), ascii=True):
        m += 1
        fpath = os.path.join(diri, f)
        d = pickle.load(open(fpath, 'rb'))
        #
        if (mat is None):
            mat = np.concatenate(d['title_sent_elmo_embeds'] + d['abs_sent_elmo_embeds'], axis=0)
        else:
            mat = np.concatenate([mat] + d['title_sent_elmo_embeds'] + d['abs_sent_elmo_embeds'], axis=0)
        if (mat.shape[0] > 1000):
            transformer.partial_fit(mat)
            mat = None
    pickle.dump(transformer, open(filename, 'wb'))
def decomposition(self, output_dimension, normalize_poissonian_noise=False, algorithm='PCA', signal_mask=None, navigation_mask=None, get=threaded.get, num_chunks=None, reproject=True, bounds=True, **kwargs): """Perform Incremental (Batch) decomposition on the data, keeping n significant components. Parameters ---------- output_dimension : int the number of significant components to keep normalize_poissonian_noise : bool If True, scale the SI to normalize Poissonian noise algorithm : str One of ('PCA', 'ORPCA', 'ONMF'). By default ('PCA') IncrementalPCA from scikit-learn is run. get : dask scheduler the dask scheduler to use for computations; default `dask.threaded.get` num_chunks : int the number of dask chunks to pass to the decomposition model. More chunks require more memory, but should run faster. Will be increased to contain atleast output_dimension signals. navigation_mask : {BaseSignal, numpy array, dask array} The navigation locations marked as True are not used in the decompostion. signal_mask : {BaseSignal, numpy array, dask array} The signal locations marked as True are not used in the decomposition. reproject : bool Reproject data on the learnt components (factors) after learning. bounds : {tuple, bool} The (min, max) values of the data to normalize before learning. If tuple (min, max), those values will be used for normalization. If True, extremes will be looked up (expensive), default. If False, no normalization is done (learning may be very slow). If normalize_poissonian_noise is True, this cannot be True. **kwargs passed to the partial_fit/fit functions. Notes ----- Various algorithm parameters and their default values: ONMF: lambda1=1, kappa=1, robust=False, store_r=False batch_size=None ORPCA: fast=True, lambda1=None, lambda2=None, method=None, learning_rate=None, init=None, training_samples=None, momentum=None PCA: batch_size=None, copy=True, white=False """ explained_variance = None explained_variance_ratio = None _al_data = self._data_aligned_with_axes nav_chunks = _al_data.chunks[:self.axes_manager.navigation_dimension] sig_chunks = _al_data.chunks[self.axes_manager.navigation_dimension:] num_chunks = 1 if num_chunks is None else num_chunks blocksize = np.min([multiply(ar) for ar in product(*nav_chunks)]) nblocks = multiply([len(c) for c in nav_chunks]) if blocksize / output_dimension < num_chunks: num_chunks = np.ceil(blocksize / output_dimension) blocksize *= num_chunks # LEARN if algorithm == 'PCA': from sklearn.decomposition import IncrementalPCA obj = IncrementalPCA(n_components=output_dimension) method = partial(obj.partial_fit, **kwargs) reproject = True elif algorithm == 'ORPCA': from hyperspy.learn.rpca import ORPCA kwg = {'fast': True} kwg.update(kwargs) obj = ORPCA(output_dimension, **kwg) method = partial(obj.fit, iterating=True) elif algorithm == 'ONMF': from hyperspy.learn.onmf import ONMF batch_size = kwargs.pop('batch_size', None) obj = ONMF(output_dimension, **kwargs) method = partial(obj.fit, batch_size=batch_size) else: raise ValueError('algorithm not known') original_data = self.data try: if normalize_poissonian_noise: if bounds is True: bounds = False # warnings.warn? 
data = self._data_aligned_with_axes ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension nm = da.logical_not( da.zeros(self.axes_manager.navigation_shape[::-1], chunks=nav_chunks) if navigation_mask is None else to_array(navigation_mask, chunks=nav_chunks)) sm = da.logical_not( da.zeros(self.axes_manager.signal_shape[::-1], chunks=sig_chunks) if signal_mask is None else to_array(signal_mask, chunks=sig_chunks)) ndim = self.axes_manager.navigation_dimension sdim = self.axes_manager.signal_dimension bH, aG = da.compute(data.sum(axis=range(ndim)), data.sum(axis=range(ndim, ndim + sdim))) bH = da.where(sm, bH, 1) aG = da.where(nm, aG, 1) raG = da.sqrt(aG) rbH = da.sqrt(bH) coeff = raG[(..., ) + (None, ) * rbH.ndim] *\ rbH[(None, ) * raG.ndim + (...,)] coeff.map_blocks(np.nan_to_num) coeff = da.where(coeff == 0, 1, coeff) data = data / coeff self.data = data # normalize the data for learning algs: if bounds: if bounds is True: _min, _max = da.compute(self.data.min(), self.data.max()) else: _min, _max = bounds self.data = (self.data - _min) / (_max - _min) # LEARN this_data = [] try: for chunk in progressbar(self._block_iterator( flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask), total=nblocks, leave=True, desc='Learn'): this_data.append(chunk) if len(this_data) == num_chunks: thedata = np.concatenate(this_data, axis=0) method(thedata) this_data = [] if len(this_data): thedata = np.concatenate(this_data, axis=0) method(thedata) except KeyboardInterrupt: pass # GET ALREADY CALCULATED RESULTS if algorithm == 'PCA': explained_variance = obj.explained_variance_ explained_variance_ratio = obj.explained_variance_ratio_ factors = obj.components_.T elif algorithm == 'ORPCA': _, _, U, S, V = obj.finish() factors = U * S loadings = V explained_variance = S**2 / len(factors) elif algorithm == 'ONMF': factors, loadings = obj.finish() loadings = loadings.T # REPROJECT if reproject: if algorithm == 'PCA': method = obj.transform post = lambda a: np.concatenate(a, axis=0) elif algorithm == 'ORPCA': method = obj.project obj.R = [] post = lambda a: obj.finish()[4] elif algorithm == 'ONMF': method = obj.project post = lambda a: np.concatenate(a, axis=1).T _map = map( lambda thing: method(thing), self._block_iterator(flat_signal=True, get=get, signal_mask=signal_mask, navigation_mask=navigation_mask)) H = [] try: for thing in progressbar(_map, total=nblocks, desc='Project'): H.append(thing) except KeyboardInterrupt: pass loadings = post(H) if explained_variance is not None and \ explained_variance_ratio is None: explained_variance_ratio = \ explained_variance / explained_variance.sum() # RESHUFFLE "blocked" LOADINGS ndim = self.axes_manager.navigation_dimension try: loadings = _reshuffle_mixed_blocks(loadings, ndim, (output_dimension, ), nav_chunks).reshape( (-1, output_dimension)) except ValueError: # In case the projection step was not finished, it's left # as scrambled pass finally: self.data = original_data target = self.learning_results target.decomposition_algorithm = algorithm target.output_dimension = output_dimension target._object = obj target.factors = factors target.loadings = loadings target.explained_variance = explained_variance target.explained_variance_ratio = explained_variance_ratio
def main(_): z_size = 2 (x_train, y_train), (x_test, y_test) = mnist.load_data() x_train = x_train.reshape(-1, 28**2) y_train = y_train x_test = x_test.reshape(-1, 28**2) x_train_normed, mu_train = normalize(x_train) x_test_normed, mu_test = normalize(x_test) batch_size = 4096 # numpy pure pca ##################################################################### # for PCA it is important to have 0 mean otherwise it does not work # ##################################################################### u, s, v = np.linalg.svd(x_train_normed, full_matrices=False) z_pca_train = (x_train_normed @ v.T)[:, :z_size] z_pca_test = (x_test_normed @ v.T)[:, :z_size] r_pca_train = denormalize(z_pca_train @ v[:z_size, :], mu_train) # reconstruction r_pca_test = denormalize(z_pca_test @ v[:z_size, :], mu_test) # reconstruction err_train = np.sum( (x_train - r_pca_train).astype(np.int64)**2) / r_pca_train.size err_test = np.sum( (x_test - r_pca_test).astype(np.int64)**2) / r_pca_test.size print('PCA train reconstruction error with 2 PCs: ' + str(round(err_train, 3))) print('PCA test reconstruction error with 2 PCs: ' + str(round(err_test, 3))) for i in range(z_size): plt.imshow(v.reshape(-1, 28, 28)[i], cmap="gray") plt.show() visualize_data(x_train, y_train, r_pca_train, z_pca_train, 'train_pca') visualize_data(x_test, y_test, r_pca_test, z_pca_test, 'test_pca') # # scikit-learn pca pca = PCA(n_components=z_size) z_pca_train = pca.fit_transform(x_train) z_pca_test = pca.transform(x_test) r_pca_train = pca.inverse_transform(z_pca_train) r_pca_test = pca.inverse_transform(z_pca_test) err_train = np.sum( (x_train - r_pca_train).astype(np.int64)**2) / r_pca_train.size err_test = np.sum( (x_test - r_pca_test).astype(np.int64)**2) / r_pca_test.size print('scikit-learn PCA train reconstruction error with 2 PCs: ' + str(round(err_train, 3))) print('scikit-learn PCA test reconstruction error with 2 PCs: ' + str(round(err_test, 3))) for i in range(z_size): plt.imshow(pca.components_.reshape(-1, 28, 28)[i], cmap="gray") plt.show() visualize_data(x_train, y_train, r_pca_train, z_pca_train, 'train_sklearn_pca') visualize_data(x_test, y_test, r_pca_test, z_pca_test, 'test_sklearn_pca') # scikit-learn incremental pca pca = IncrementalPCA(n_components=z_size, batch_size=100) z_pca_train = pca.fit_transform(x_train) z_pca_test = pca.transform(x_test) r_pca_train = pca.inverse_transform(z_pca_train) r_pca_test = pca.inverse_transform(z_pca_test) err_train = np.sum( (x_train - r_pca_train).astype(np.int64)**2) / r_pca_train.size err_test = np.sum( (x_test - r_pca_test).astype(np.int64)**2) / r_pca_test.size print( 'scikit-learn incremental PCA train reconstruction error with 2 PCs: ' + str(round(err_train, 3))) print( 'scikit-learn incremental PCA test reconstruction error with 2 PCs: ' + str(round(err_test, 3))) for i in range(z_size): plt.imshow(pca.components_.reshape(-1, 28, 28)[i], cmap="gray") plt.show() visualize_data(x_train, y_train, r_pca_train, z_pca_train, 'train') visualize_data(x_test, y_test, r_pca_test, z_pca_test, 'test') # keras pca using autoencoder m = Sequential() m.add( Dense(z_size, activation='linear', input_shape=(784, ), name='bottleneck')) m.add(Dense(784, activation='linear', name='decoder')) m.compile(loss='mean_squared_error', optimizer=Adam()) print(m.summary()) tensorboard = TensorBoard(log_dir='logs/ae_pca', histogram_freq=5) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=10, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) 
eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae_pca') K.clear_session() # keras autoencoder with tanh, not centered, but normalized to [-1, 1] x_train_normed, mu_train = normalize(x_train, use_mean=False) x_test_normed, mu_test = normalize(x_test, use_mean=False) m = Sequential() m.add(Dense(512, activation='elu', input_shape=(784, ))) m.add(Dense(128, activation='elu')) m.add(Dense(z_size, activation='linear', name='bottleneck')) m.add(Dense(128, activation='elu')) m.add(Dense(512, activation='elu')) m.add(Dense(784, activation='tanh', name='decoder')) m.compile(loss='mean_squared_error', optimizer=Adam()) tensorboard = TensorBoard(log_dir='logs/ae_tanh_no_mean', histogram_freq=5) print(m.summary()) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=50, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae_tanh_no_mean') K.clear_session() # keras autoencoder, centered x_train_normed, mu_train = normalize(x_train) x_test_normed, mu_test = normalize(x_test) m = Sequential() m.add(Dense(512, activation='elu', input_shape=(784, ))) m.add(Dense(128, activation='elu')) m.add(Dense(z_size, activation='linear', name='bottleneck')) m.add(Dense(128, activation='elu')) m.add(Dense(512, activation='elu')) m.add(Dense(784, activation='linear', name='decoder')) m.compile(loss='mean_squared_error', optimizer=Adam()) tensorboard = TensorBoard(log_dir='logs/ae', histogram_freq=5) print(m.summary()) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=50, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae') K.clear_session() # keras autoencoder, not centered, but normalized to [-1, 1] x_train_normed, mu_train = normalize(x_train, use_mean=False) x_test_normed, mu_test = normalize(x_test, use_mean=False) m = Sequential() m.add(Dense(512, activation='elu', input_shape=(784, ))) m.add(Dense(128, activation='elu')) m.add(Dense(z_size, activation='linear', name='bottleneck')) m.add(Dense(128, activation='elu')) m.add(Dense(512, activation='elu')) m.add(Dense(784, activation='linear', name='decoder')) m.compile(loss='mean_squared_error', optimizer=Adam()) tensorboard = TensorBoard(log_dir='logs/ae_no_mean', histogram_freq=5) print(m.summary()) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=50, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae_no_mean') K.clear_session() # keras autoencoder, not centered, but normalized to [-1, 1] x_train_normed, mu_train = normalize(x_train, use_mean=False) x_test_normed, mu_test = normalize(x_test, use_mean=False) regul_const = 10e-9 m = Sequential() m.add( Dense(512, activation='elu', input_shape=(784, ), activity_regularizer=l1(regul_const))) m.add(Dense(128, activation='elu', activity_regularizer=l1(regul_const))) m.add( Dense(z_size, activation='linear', name='bottleneck', activity_regularizer=l1(regul_const))) m.add(Dense(128, activation='elu', activity_regularizer=l1(regul_const))) m.add(Dense(512, activation='elu', activity_regularizer=l1(regul_const))) m.add( Dense(784, activation='linear', name='decoder', activity_regularizer=l1(regul_const))) 
m.compile(loss='mean_squared_error', optimizer=Adam()) tensorboard = TensorBoard(log_dir='logs/ae_no_mean_reg', histogram_freq=5) print(m.summary()) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=50, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae_no_mean_reg') K.clear_session() # keras autoencoder, regularizing only latent space x_train_normed, mu_train = normalize(x_train, use_mean=False) x_test_normed, mu_test = normalize(x_test, use_mean=False) regul_const = 10e-6 m = Sequential() m.add(Dense(512, activation='elu', input_shape=(784, ))) m.add(Dense(128, activation='elu')) m.add( Dense(z_size, activation='linear', name='bottleneck', activity_regularizer=l1(regul_const))) m.add(Dense(128, activation='elu')) m.add(Dense(512, activation='elu')) m.add(Dense(784, activation='linear', name='decoder')) m.compile(loss='mean_squared_error', optimizer=Adam()) tensorboard = TensorBoard(log_dir='logs/ae_no_mean_reg_lat_e6', histogram_freq=5) print(m.summary()) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=50, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae_no_mean_reg_lat_e6') K.clear_session() x_train_normed, mu_train = normalize(x_train, use_mean=False) x_test_normed, mu_test = normalize(x_test, use_mean=False) regul_const = 10e-7 m = Sequential() m.add(Dense(512, activation='elu', input_shape=(784, ))) m.add(Dense(128, activation='elu')) m.add( Dense(z_size, activation='linear', name='bottleneck', activity_regularizer=l1(regul_const))) m.add(Dense(128, activation='elu')) m.add(Dense(512, activation='elu')) m.add(Dense(784, activation='linear', name='decoder')) m.compile(loss='mean_squared_error', optimizer=Adam()) tensorboard = TensorBoard(log_dir='logs/ae_no_mean_reg_lat_e7', histogram_freq=5) print(m.summary()) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=50, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae_no_mean_reg_lat_e7') K.clear_session() # trying on cifar 100 z_size = 2 (x_train, y_train), (x_test, y_test) = cifar100.load_data() x_train = x_train.reshape(-1, 32**2) y_train = y_train x_test = x_test.reshape(-1, 32**2) x_train_normed, mu_train = normalize(x_train, use_mean=False) x_test_normed, mu_test = normalize(x_test, use_mean=False) regul_const = 10e-7 m = Sequential() m.add(Dense(512, activation='elu', input_shape=(32**2, ))) m.add(Dense(128, activation='elu')) m.add( Dense(z_size, activation='linear', name='bottleneck', activity_regularizer=l1(regul_const))) m.add(Dense(128, activation='elu')) m.add(Dense(512, activation='elu')) m.add(Dense(32**2, activation='linear', name='decoder')) m.compile(loss='mean_squared_error', optimizer=Adam()) tensorboard = TensorBoard(log_dir='logs/ae_cifar_100', histogram_freq=5) print(m.summary()) history = m.fit(x_train_normed, x_train_normed, batch_size=batch_size, epochs=50, verbose=1, validation_data=(x_test_normed, x_test_normed), callbacks=[tensorboard]) eval_show_network(m, mu_train, mu_test, x_train_normed, x_test_normed, y_train, y_test, history, 'ae_cifar_100', (32, 32)) K.clear_session() print('done')
dt_features = StandardScaler().fit_transform(dt_features)

x_train, x_test, y_train, y_test = train_test_split(dt_features, dt_target,
                                                    test_size=0.3, random_state=42)

print(x_train.shape)
print(y_train.shape)

# by default, n_components = min(n_samples, n_features)
pca = PCA(n_components=3)
pca.fit(x_train)

ipca = IncrementalPCA(n_components=3, batch_size=10)
ipca.fit(x_train)

plt.plot(range(len(pca.explained_variance_)), pca.explained_variance_ratio_)
# plt.show()

logistic = LogisticRegression(solver='lbfgs')

dt_train = pca.transform(x_train)
dt_test = pca.transform(x_test)
logistic.fit(dt_train, y_train)
print("SCORE PCA", logistic.score(dt_test, y_test))

dt_train = ipca.transform(x_train)
dt_test = ipca.transform(x_test)
def renyi_select(self,X,represent_points,do_pca=False): """ Takes in data and number of prototype vectors and returns the indices of the prototype vectors. The prototype vectors are selected based on maximization of quadratic renyi entropy, which can be written in terms of log sum exp which is a tightly bounded by max operator. Now for rbf kernel, the max_{ij}(-\|x_i-x_j\|^2) is equivalent to min_{ij}(\|x_i-x_j\|^2). Parameters ---------- X: np.ndarray shape = n_samples, n_features represent_points: int number of prototype vectors to return do_pca: boolean whether to perform incremental pca for dimensionality reduction before selecting prototype vectors Returns ------- sv: list list of the prototype vector indices from the data array given by X """ # do_pca = self.do_pca_in_selection N= X.shape[0] capacity=represent_points selectionset=set([]) set_full=set(list(range(N))) np.random.seed(1) if(len(selectionset)==0): selectionset = np.random.permutation(N) sv = list(selectionset)[0:capacity] else: extrainputs = represent_points - len(selectionset) leftindices =list(set_full.difference(selectionset)) info = np.random.permutation(len(leftindices)) info = info[1:extrainputs] sv = selectionset.append(leftindices[info]) if(do_pca == True): if(X.shape[1]>50): #takes more time n_components = 50 ipca = IncrementalPCA(n_components=n_components, batch_size=np.min([128,X.shape[0]])) X = ipca.fit_transform(X) svX = X[sv,:] min_info = np.zeros((capacity,2)) KsV = pairwise_distances(svX,svX)**2 #this is fast KsV[KsV==0] = np.inf min_info[:,1] = np.min(KsV,axis=1) min_info[:,0] = np.arange(capacity) minimum = np.min(min_info[:,1]) counter = 0 for i in range(N): # find for which data the value is minimum replace = np.argmin(min_info[:,1]) ids = int(min_info[min_info[:,0]==replace,0]) #Subtract from totalcrit once for row tempminimum = minimum - min_info[ids,1] #Try to evaluate kernel function tempsvX = np.zeros(svX.shape) tempsvX[:] = svX[:] inputX = X[i,:] tempsvX[replace,:] = inputX tempK = pairwise_distances(tempsvX,np.reshape(inputX,(1,X.shape[1])))**2 #this is fast tempK[tempK==0] = np.inf distance_eval = np.min(tempK) tempminimum = tempminimum + distance_eval if (minimum < tempminimum): minimum = tempminimum min_info[ids,1] = distance_eval svX[:] = tempsvX[:] sv[ids] = i counter +=1 return sv
datos_test.append((test_caras[i], 1))
datos_test.append((test_no_caras[i], 0))

# Store the test set so we can obtain lambda later
f = file("datos_test.dat", "wb")
pickle.dump(datos_test, f, 2)
f.close()

print "PCA projection and clustering (K-means)..."

# Note: input images are square
# Sub-region window (square)
l_sr = len(l_imagenes_caras[0]) / 16

# Clusters for k-means
clusters = 60

# PCA
ipca = IncrementalPCA(n_components=8)

# k-means
kmeans = MiniBatchKMeans(n_clusters=clusters, random_state=1)

# Auxiliary lists
l_aux = []
l_pos = []

# Fit the PCA model (faces)
for img_cara in l_imagenes_caras:
    # PCA preprocessing
    pos = 0
    for i in range(0, len(img_cara), l_sr):
        if i + l_sr <= len(img_cara):
            subregion = img_cara[i:i + l_sr]
            l_aux.append(subregion)
""" for color, label, class_name in zip(colors, labels, class_names): plt.scatter(X[y == label, 0], X[y == label, 1], color=color, label=class_name) plt.title(title) plt.legend(loc='best') # 转换前的可视化, 只显示前两维度的数据 plt.figure(1) plot_func('origin data') # KernelPCA 是非线性降维, LDA 只能用于分类降维 # ICA 通常不用于降低维度,而是用于分离叠加信号 models_list = [('LDA', LinearDiscriminantAnalysis(n_components=2)), ('PCA', PCA(n_components=2, random_state=0)), ('PCARand', PCA(n_components=2, random_state=0, svd_solver='randomized')), ('IncrementalPCA', IncrementalPCA(n_components=2, batch_size=10, whiten=True)), ('FactorAnalysis', FactorAnalysis(n_components=2, max_iter=500)), ('FastICA', FastICA(n_components=2, random_state=0)), ('KernelPCA', KernelPCA(n_components=2, random_state=0, kernel='rbf')), ('SparsePCA', SparsePCA(n_components=2, random_state=0, verbose=True)), ('MiniBatchSparsePCA', MiniBatchSparsePCA(n_components=2, verbose=True, batch_size=10, random_state=0)), ('DictionaryLearning', DictionaryLearning(n_components=2, verbose=True, random_state=0)), ('MiniBatchDictionaryLearning', MiniBatchDictionaryLearning(n_components=2, batch_size=5, random_state=0, alpha=0.1))] model = namedtuple('models', ['mod_name', 'mod_ins']) for i in range(len(models_list)): mod = model(*models_list[i]) if mod.mod_name == 'LDA': mod.mod_ins.fit(X, y) X_new = mod.mod_ins.transform(X) else: X_new = mod.mod_ins.fit_transform(X)
def use_incremental_pca() -> Pipeline:
    pipe = Pipeline([('cv', CountVectorizer()),
                     ('ipca', IncrementalPCA(n_components=2, batch_size=4))])
    return pipe
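# --- Usage sketch (not from the original source) for the pipeline factory above; the toy corpus
# --- is made up. CountVectorizer yields a sparse matrix, which recent scikit-learn releases let
# --- IncrementalPCA consume for fit/transform (each batch is densified internally); note that
# --- batch_size must be at least n_components.
docs = [
    "incremental pca works in batches",
    "count vectorizer builds a sparse matrix",
    "pca projects onto principal components",
    "batches keep memory usage bounded",
    "sparse input is densified batch by batch",
    "two components are kept here",
]

pipe = use_incremental_pca()
X2d = pipe.fit_transform(docs)
print(X2d.shape)  # (6, 2)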
def pca( data: Union[AnnData, np.ndarray, spmatrix], n_comps: int = N_PCS, zero_center: Optional[bool] = True, svd_solver: str = 'auto', random_state: int = 0, return_info: bool = False, use_highly_variable: Optional[bool] = None, dtype: str = 'float32', copy: bool = False, chunked: bool = False, chunk_size: Optional[int] = None, ) -> Union[AnnData, np.ndarray, spmatrix]: """Principal component analysis [Pedregosa11]_. Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of *scikit-learn* [Pedregosa11]_. Parameters ---------- data The (annotated) data matrix of shape ``n_obs`` × ``n_vars``. Rows correspond to cells and columns to genes. n_comps Number of principal components to compute. zero_center If `True`, compute standard PCA from covariance matrix. If ``False``, omit zero-centering variables (uses :class:`~sklearn.decomposition.TruncatedSVD`), which allows to handle sparse input efficiently. Passing ``None`` decides automatically based on sparseness of the data. svd_solver SVD solver to use: ``'arpack'`` for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) ``'randomized'`` for the randomized algorithm due to Halko (2009). ``'auto'`` (the default) chooses automatically depending on the size of the problem. random_state Change to use different initial states for the optimization. return_info Only relevant when not passing an :class:`~anndata.AnnData`: see “**Returns**”. use_highly_variable Whether to use highly variable genes only, stored in ``.var['highly_variable']``. By default uses them if they have been determined beforehand. dtype Numpy data type string to which to convert the result. copy If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Is ignored otherwise. chunked If ``True``, perform an incremental PCA on segments of ``chunk_size``. The incremental PCA automatically zero centers and ignores settings of ``random_seed`` and ``svd_solver``. If ``False``, perform a full PCA. chunk_size Number of observations to include in each chunk. Required if ``chunked=True`` was passed. Returns ------- X_pca : :class:`scipy.sparse.spmatrix` or :class:`numpy.ndarray` If `data` is array-like and ``return_info=False`` was passed, this function only returns `X_pca`… adata : :class:`~anndata.AnnData` …otherwise if ``copy=True`` it returns or else adds fields to ``adata``: ``.obsm['X_pca']`` PCA representation of data. ``.varm['PCs']`` The principal components containing the loadings. ``.uns['pca']['variance_ratio']``) Ratio of explained variance. ``.uns['pca']['variance']`` Explained variance, equivalent to the eigenvalues of the covariance matrix. """ # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: logg.info( 'Note that scikit-learn\'s randomized PCA might not be exactly ' 'reproducible across different computational platforms. For exact ' 'reproducibility, choose `svd_solver=\'arpack\'.` This will likely ' 'become the Scanpy default in the future.') data_is_AnnData = isinstance(data, AnnData) if data_is_AnnData: adata = data.copy() if copy else data else: adata = AnnData(data) logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4) if adata.n_vars < n_comps: n_comps = adata.n_vars - 1 logg.msg('reducing number of computed PCs to', n_comps, 'as dim of data is only', adata.n_vars, v=4) if use_highly_variable is True and 'highly_variable' not in adata.var.keys( ): raise ValueError( 'Did not find adata.var[\'highly_variable\']. 
' 'Either your data already only consists of highly-variable genes ' 'or consider running `pp.filter_genes_dispersion` first.') if use_highly_variable is None: use_highly_variable = True if 'highly_variable' in adata.var.keys( ) else False adata_comp = adata[:, adata. var['highly_variable']] if use_highly_variable else adata if chunked: if not zero_center or random_state or svd_solver != 'auto': logg.msg('Ignoring zero_center, random_state, svd_solver', v=4) from sklearn.decomposition import IncrementalPCA X_pca = np.zeros((adata_comp.X.shape[0], n_comps), adata_comp.X.dtype) pca_ = IncrementalPCA(n_components=n_comps) for chunk, _, _ in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk pca_.partial_fit(chunk) for chunk, start, end in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk X_pca[start:end] = pca_.transform(chunk) else: if zero_center is None: zero_center = not issparse(adata_comp.X) if zero_center: from sklearn.decomposition import PCA if issparse(adata_comp.X): logg.msg( ' as `zero_center=True`, ' 'sparse input is densified and may ' 'lead to huge memory consumption', v=4) X = adata_comp.X.toarray( ) # Copying the whole adata_comp.X here, could cause memory problems else: X = adata_comp.X pca_ = PCA(n_components=n_comps, svd_solver=svd_solver, random_state=random_state) else: from sklearn.decomposition import TruncatedSVD logg.msg( ' without zero-centering: \n' ' the explained variance does not correspond to the exact statistical defintion\n' ' the first component, e.g., might be heavily influenced by different means\n' ' the following components often resemble the exact PCA very closely', v=4) pca_ = TruncatedSVD(n_components=n_comps, random_state=random_state) X = adata_comp.X X_pca = pca_.fit_transform(X) if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype) if data_is_AnnData: adata.obsm['X_pca'] = X_pca if use_highly_variable: adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) adata.varm['PCs'][ adata.var['highly_variable']] = pca_.components_.T else: adata.varm['PCs'] = pca_.components_.T adata.uns['pca'] = {} adata.uns['pca']['variance'] = pca_.explained_variance_ adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ logg.msg(' finished', t=True, end=' ', v=4) logg.msg( 'and added\n' ' \'X_pca\', the PCA coordinates (adata.obs)\n' ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n' ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n' ' \'pca_variance_ratio\', the variance ratio (adata.uns)', v=4) return adata if copy else None else: if return_info: return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_ else: return X_pca
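# --- Usage sketch (not from the original source): exercising the chunked branch of the scanpy
# --- pca function above (assuming it is exposed as sc.pp.pca); the AnnData holds random toy
# --- data and chunk_size is arbitrary.
import numpy as np
import scanpy as sc
from anndata import AnnData

rng = np.random.RandomState(0)
adata = AnnData(rng.poisson(1.0, size=(500, 200)).astype(np.float32))  # 500 cells x 200 genes

# chunked=True: IncrementalPCA.partial_fit over chunks of 100 cells,
# then each chunk is transformed with the fitted model
sc.pp.pca(adata, n_comps=20, chunked=True, chunk_size=100)

print(adata.obsm['X_pca'].shape)  # (500, 20)
print(adata.varm['PCs'].shape)    # (200, 20)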
parser.add_argument('--npca', default=-1, type=int,
                    help='number of points used to calculate PCA')
args = parser.parse_args()

assert os.path.isfile('train_features.npz')
logging.info('Loading features file')
train_features = np.load('train_features.npz')
img_features = train_features['img_features']
tag_features = train_features['tag_features']

N_PCA = img_features.shape[0] if args.npca == -1 else args.npca

logging.info('Training: PCA of image features, N_PCA = %d', N_PCA)
start = time.time()
pca = IncrementalPCA(n_components=500, batch_size=512)
pca.fit(img_features[:N_PCA, :])
end = time.time()
logging.info('Time: %.4fm', (end - start) / 60)

logging.info('Apply PCA to image features')
start = time.time()
X = pca.transform(img_features)
end = time.time()
logging.info('Time: %.4fm', (end - start) / 60)

logging.info('Training: fit CCA')
start = time.time()
W_img, W_tag = cca.fit(X, tag_features, numCC=args.numCC, useGPU=args.gpu)
end = time.time()
logging.info('Time: %.4fm', (end - start) / 60)
def btnConvert_click(self): msgBox = QMessageBox() # OutFile OutFile = ui.txtOutFile.text() if not len(OutFile): msgBox.setText("Please enter out file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # InFile InFile = ui.txtInFile.text() if not len(InFile): msgBox.setText("Please enter input file!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if not os.path.isfile(InFile): msgBox.setText("Input file not found!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if ui.rbScale.isChecked() == True and ui.rbALScale.isChecked( ) == False: msgBox.setText( "Subject Level Normalization is just available for Subject Level Analysis!" ) msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False InData = io.loadmat(InFile) OutData = dict() OutData["imgShape"] = InData["imgShape"] if not len(ui.txtData.currentText()): msgBox.setText("Please enter Data variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: X = InData[ui.txtData.currentText()] if ui.cbScale.isChecked() and (not ui.rbScale.isChecked()): X = preprocessing.scale(X) print("Whole of data is scaled X~N(0,1).") except: print("Cannot load data") return try: NumFea = np.int32(ui.txtNumFea.text()) except: msgBox.setText("Number of features is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NumFea < 1: msgBox.setText("Number of features must be greater than zero!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if NumFea > np.shape(X)[1]: msgBox.setText("Number of features is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False # Batch try: Batch = np.int32(ui.txtBatch.text()) except: msgBox.setText("Size of batch is wrong!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False if Batch == 0: Batch = None # Subject if not len(ui.txtSubject.currentText()): msgBox.setText("Please enter Subject variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False try: Subject = InData[ui.txtSubject.currentText()] OutData[ui.txtOSubject.text()] = Subject except: print("Cannot load Subject ID") return # Label if not len(ui.txtLabel.currentText()): msgBox.setText("Please enter Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtOLabel.text()] = InData[ui.txtLabel.currentText()] # Task if ui.cbTask.isChecked(): if not len(ui.txtTask.currentText()): msgBox.setText("Please enter Task variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtOTask.text()] = InData[ui.txtTask.currentText()] # Run if ui.cbRun.isChecked(): if not len(ui.txtRun.currentText()): msgBox.setText("Please enter Run variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtORun.text()] = InData[ui.txtRun.currentText()] # Counter if ui.cbCounter.isChecked(): if not len(ui.txtCounter.currentText()): msgBox.setText("Please enter Counter variable 
name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtOCounter.text()] = InData[ ui.txtCounter.currentText()] # Matrix Label if ui.cbmLabel.isChecked(): if not len(ui.txtmLabel.currentText()): msgBox.setText("Please enter Matrix Label variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtOmLabel.text()] = InData[ui.txtmLabel.currentText()] # Design if ui.cbDM.isChecked(): if not len(ui.txtDM.currentText()): msgBox.setText("Please enter Design Matrix variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtODM.text()] = InData[ui.txtDM.currentText()] # Coordinate if ui.cbCol.isChecked(): if not len(ui.txtCol.currentText()): msgBox.setText("Please enter Coordinator variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtOCol.text()] = InData[ui.txtCol.currentText()] # Condition if ui.cbCond.isChecked(): if not len(ui.txtCond.currentText()): msgBox.setText("Please enter Condition variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtOCond.text()] = InData[ui.txtCond.currentText()] # Number of Scan if ui.cbNScan.isChecked(): if not len(ui.txtScan.currentText()): msgBox.setText("Please enter Number of Scan variable name!") msgBox.setIcon(QMessageBox.Critical) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_() return False OutData[ui.txtOScan.text()] = InData[ui.txtScan.currentText()] Models = dict() Models["Name"] = "IPCA" if ui.rbALScale.isChecked(): print("Partition data to subject level ...") SubjectUniq = np.unique(Subject) X_Sub = list() for subj in SubjectUniq: if ui.cbScale.isChecked() and ui.rbScale.isChecked(): X_Sub.append( preprocessing.scale( X[np.where(Subject == subj)[1], :])) print("Data in subject level is scaled, X_" + str(subj) + "~N(0,1).") else: X_Sub.append(X[np.where(Subject == subj)[1], :]) print("Subject ", subj, " is extracted from data.") print("Running IPCA in subject level ...") X_Sub_PCA = list() lenPCA = len(X_Sub) for xsubindx, xsub in enumerate(X_Sub): model = IncrementalPCA(n_components=NumFea, batch_size=Batch) model.fit(xsub) X_Sub_PCA.append(model.transform(xsub)) Models["Model" + str(xsubindx + 1)] = str( model.get_params(deep=True)) print("IPCA: ", xsubindx + 1, " of ", lenPCA, " is done.") print("Data integration ... ") X_new = None for xsubindx, xsub in enumerate(X_Sub_PCA): X_new = np.concatenate( (X_new, xsub)) if X_new is not None else xsub print("Integration: ", xsubindx + 1, " of ", lenPCA, " is done.") OutData[ui.txtOData.text()] = X_new else: print("Running IPCA ...") model = IncrementalPCA(n_components=NumFea, batch_size=Batch) OutData[ui.txtOData.text()] = model.fit_transform(X) Models["Model"] = str(model.get_params(deep=True)) OutData["ModelParameter"] = Models print("Saving ...") io.savemat(ui.txtOutFile.text(), mdict=OutData) print("DONE.") msgBox.setText("Incremental PCA is done.") msgBox.setIcon(QMessageBox.Information) msgBox.setStandardButtons(QMessageBox.Ok) msgBox.exec_()
clf = LDA()
clf.fit(data, label.ravel())
train_predict = clf.predict(data)
train_error = 1 - np.mean(train_predict == label)
print("Training data error: %f" % train_error)
test_predict = clf.predict(test_data)
test_error = 1 - np.mean(test_predict == test_label)
print("Test data error: %f" % test_error)

# ##Question 2

# In[4]:

# IncrementalPCA is unsupervised, so the labels are not needed for fitting.
pca = IncrementalPCA(n_components=49)
pca.fit(data)
U = pca.transform(data)
clf_lda = LDA()
train_pca = U
clf_lda.fit(train_pca, label.ravel())
train_pca_predict = clf_lda.predict(train_pca)
train_pca_error = 1 - np.mean(train_pca_predict == label)
print("Training data error after PCA: %f" % train_pca_error)
U = pca.transform(test_data)
test_pca = U
test_pca_predict = clf_lda.predict(test_pca)
test_pca_error = 1 - np.mean(test_pca_predict == test_label)
print("Test data error after PCA: %f" % test_pca_error)
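# The same PCA-then-LDA flow can be expressed as a single sklearn Pipeline, which keeps the
# transform applied consistently to training and test data. This is an illustrative sketch
# on synthetic data; the 49-component setting mirrors the snippet above, but the data shapes
# and class count are assumptions.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.decomposition import IncrementalPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

rng = np.random.RandomState(0)
data, label = rng.rand(200, 100), rng.randint(0, 5, 200)
test_data, test_label = rng.rand(50, 100), rng.randint(0, 5, 50)

pipe = Pipeline([
    ("pca", IncrementalPCA(n_components=49)),
    ("lda", LinearDiscriminantAnalysis()),
])
pipe.fit(data, label)
print("Test data error after PCA: %f" % (1 - pipe.score(test_data, test_label)))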
def pca( data: Union[AnnData, np.ndarray, spmatrix], n_comps: Optional[int] = None, zero_center: Optional[bool] = True, svd_solver: str = 'arpack', random_state: AnyRandom = 0, return_info: bool = False, use_highly_variable: Optional[bool] = None, dtype: str = 'float32', copy: bool = False, chunked: bool = False, chunk_size: Optional[int] = None, ) -> Union[AnnData, np.ndarray, spmatrix]: """\ Principal component analysis [Pedregosa11]_. Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of *scikit-learn* [Pedregosa11]_. .. versionchanged:: 1.5.0 In previous versions, computing a PCA on a sparse matrix would make a dense copy of the array for mean centering. As of scanpy 1.5.0, mean centering is implicit. While results are extremely similar, they are not exactly the same. If you would like to reproduce the old results, pass a dense array. Parameters ---------- data The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. n_comps Number of principal components to compute. Defaults to 50, or 1 - minimum dimension size of selected representation. zero_center If `True`, compute standard PCA from covariance matrix. If `False`, omit zero-centering variables (uses :class:`~sklearn.decomposition.TruncatedSVD`), which allows to handle sparse input efficiently. Passing `None` decides automatically based on sparseness of the data. svd_solver SVD solver to use: `'arpack'` (the default) for the ARPACK wrapper in SciPy (:func:`~scipy.sparse.linalg.svds`) `'randomized'` for the randomized algorithm due to Halko (2009). `'auto'` chooses automatically depending on the size of the problem. `'lobpcg'` An alternative SciPy solver. .. versionchanged:: 1.4.5 Default value changed from `'auto'` to `'arpack'`. Efficient computation of the principal components of a sparse matrix currently only works with the `'arpack`' or `'lobpcg'` solvers. random_state Change to use different initial states for the optimization. return_info Only relevant when not passing an :class:`~anndata.AnnData`: see “**Returns**”. use_highly_variable Whether to use highly variable genes only, stored in `.var['highly_variable']`. By default uses them if they have been determined beforehand. dtype Numpy data type string to which to convert the result. copy If an :class:`~anndata.AnnData` is passed, determines whether a copy is returned. Is ignored otherwise. chunked If `True`, perform an incremental PCA on segments of `chunk_size`. The incremental PCA automatically zero centers and ignores settings of `random_seed` and `svd_solver`. If `False`, perform a full PCA. chunk_size Number of observations to include in each chunk. Required if `chunked=True` was passed. Returns ------- X_pca : :class:`~scipy.sparse.spmatrix`, :class:`~numpy.ndarray` If `data` is array-like and `return_info=False` was passed, this function only returns `X_pca`… adata : anndata.AnnData …otherwise if `copy=True` it returns or else adds fields to `adata`: `.obsm['X_pca']` PCA representation of data. `.varm['PCs']` The principal components containing the loadings. `.uns['pca']['variance_ratio']` Ratio of explained variance. `.uns['pca']['variance']` Explained variance, equivalent to the eigenvalues of the covariance matrix. 
""" logg_start = logg.info('computing PCA') # chunked calculation is not randomized, anyways if svd_solver in {'auto', 'randomized'} and not chunked: logg.info( 'Note that scikit-learn\'s randomized PCA might not be exactly ' 'reproducible across different computational platforms. For exact ' 'reproducibility, choose `svd_solver=\'arpack\'.`' ) data_is_AnnData = isinstance(data, AnnData) if data_is_AnnData: adata = data.copy() if copy else data else: adata = AnnData(data, dtype=data.dtype) if use_highly_variable is True and 'highly_variable' not in adata.var.keys(): raise ValueError( 'Did not find adata.var[\'highly_variable\']. ' 'Either your data already only consists of highly-variable genes ' 'or consider running `pp.highly_variable_genes` first.' ) if use_highly_variable is None: use_highly_variable = True if 'highly_variable' in adata.var.keys() else False if use_highly_variable: logg.info(' on highly variable genes') adata_comp = ( adata[:, adata.var['highly_variable']] if use_highly_variable else adata ) if n_comps is None: min_dim = min(adata_comp.n_vars, adata_comp.n_obs) if settings.N_PCS >= min_dim: n_comps = min_dim - 1 else: n_comps = settings.N_PCS logg.info(f' with n_comps={n_comps}') random_state = check_random_state(random_state) X = adata_comp.X if chunked: if not zero_center or random_state or svd_solver != 'arpack': logg.debug('Ignoring zero_center, random_state, svd_solver') from sklearn.decomposition import IncrementalPCA X_pca = np.zeros((X.shape[0], n_comps), X.dtype) pca_ = IncrementalPCA(n_components=n_comps) for chunk, _, _ in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk pca_.partial_fit(chunk) for chunk, start, end in adata_comp.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk X_pca[start:end] = pca_.transform(chunk) elif (not issparse(X) or svd_solver == "randomized") and zero_center: from sklearn.decomposition import PCA if issparse(X) and svd_solver == "randomized": # This is for backwards compat. Better behaviour would be to either error or use arpack. logg.warning( "svd_solver 'randomized' does not work with sparse input. Densifying the array. " "This may take a very large amount of memory." ) X = X.toarray() pca_ = PCA( n_components=n_comps, svd_solver=svd_solver, random_state=random_state ) X_pca = pca_.fit_transform(X) elif issparse(X) and zero_center: from sklearn.decomposition import PCA if svd_solver == "auto": svd_solver = "arpack" if svd_solver not in {'lobpcg', 'arpack'}: raise ValueError( 'svd_solver: {svd_solver} can not be used with sparse input.\n' 'Use "arpack" (the default) or "lobpcg" instead.' ) output = _pca_with_sparse( X, n_comps, solver=svd_solver, random_state=random_state ) # this is just a wrapper for the results X_pca = output['X_pca'] pca_ = PCA(n_components=n_comps, svd_solver=svd_solver) pca_.components_ = output['components'] pca_.explained_variance_ = output['variance'] pca_.explained_variance_ratio_ = output['variance_ratio'] elif not zero_center: from sklearn.decomposition import TruncatedSVD logg.debug( ' without zero-centering: \n' ' the explained variance does not correspond to the exact statistical defintion\n' ' the first component, e.g., might be heavily influenced by different means\n' ' the following components often resemble the exact PCA very closely' ) pca_ = TruncatedSVD( n_components=n_comps, random_state=random_state, algorithm=svd_solver ) X_pca = pca_.fit_transform(X) else: raise Exception("This shouldn't happen. 
Please open a bug report.") if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype) if data_is_AnnData: adata.obsm['X_pca'] = X_pca adata.uns['pca'] = {} adata.uns['pca']['params'] = { 'zero_center': zero_center, 'use_highly_variable': use_highly_variable, } if use_highly_variable: adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, n_comps)) adata.varm['PCs'][adata.var['highly_variable']] = pca_.components_.T else: adata.varm['PCs'] = pca_.components_.T adata.uns['pca']['variance'] = pca_.explained_variance_ adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ logg.info(' finished', time=logg_start) logg.debug( 'and added\n' ' \'X_pca\', the PCA coordinates (adata.obs)\n' ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n' ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n' ' \'pca_variance_ratio\', the variance ratio (adata.uns)' ) return adata if copy else None else: logg.info(' finished', time=logg_start) if return_info: return ( X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_, ) else: return X_pca
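# The chunked branch above follows a two-pass pattern: one pass of partial_fit over the
# chunks, then a second pass of transform into a preallocated output array. A stripped-down
# sketch of that pattern with plain numpy chunks (chunk size and shapes are assumptions):
import numpy as np
from sklearn.decomposition import IncrementalPCA

X = np.random.rand(10_000, 200).astype(np.float32)
n_comps, chunk_size = 50, 1_000

pca_ = IncrementalPCA(n_components=n_comps)
for start in range(0, X.shape[0], chunk_size):          # pass 1: fit incrementally
    pca_.partial_fit(X[start:start + chunk_size])

X_pca = np.zeros((X.shape[0], n_comps), dtype=X.dtype)
for start in range(0, X.shape[0], chunk_size):          # pass 2: project each chunk
    end = min(start + chunk_size, X.shape[0])
    X_pca[start:end] = pca_.transform(X[start:end])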
class IPCA(object):
    def __init__(self, n_components=None, whiten=False, copy=True, batch_size=None):
        """
        :param n_components: default None; int or None. Number of components to keep;
                             if None, min(n_samples, n_features) is used.
        :param whiten: bool, optional, default False. When True, the components_ vectors
                       are divided by n_samples * components_ to ensure uncorrelated
                       outputs with unit component-wise variances.
        :param copy: default True. If False, x is overwritten, which saves memory but is
                     not safe.
        :param batch_size: default None. Number of samples per batch, only used in fit.
                           If None, it is automatically set to 5 * n_features to balance
                           accuracy against memory overhead.
        """
        self.model = IncrementalPCA(n_components=n_components, whiten=whiten, copy=copy,
                                    batch_size=batch_size)

    def fit(self, x, y=None):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        # Get the parameters of the estimator
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        # Set the parameters of the estimator
        self.model.set_params(**params)

    def inverse_transform(self, x):
        # The inverse operation of fit_transform
        return self.model.inverse_transform(X=x)

    def get_precision(self):
        # Compute the precision matrix from the generative model
        return self.model.get_precision()

    def get_covariance(self):
        # Compute the covariance from the generative model
        return self.model.get_covariance()

    def partial_fit(self, x, y=None, check_input=True):
        # Incremental training
        self.model.partial_fit(X=x, y=y, check_input=check_input)

    def get_attributes(self):
        component = self.model.components_
        explained_variance = self.model.explained_variance_
        explained_variance_ratio = self.model.explained_variance_ratio_
        singular_values = self.model.singular_values_
        means = self.model.mean_  # mean of each feature
        var = self.model.var_  # variance of each feature
        noise_variance = self.model.noise_variance_  # estimated noise covariance
        n_component = self.model.n_components_
        n_samples_seen = self.model.n_samples_seen_
        return component, explained_variance, explained_variance_ratio, singular_values, means, var, noise_variance, \
               n_component, n_samples_seen
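# Possible usage of the IPCA wrapper defined above (the data shape and batch size are
# illustrative assumptions). partial_fit lets the model be trained batch by batch, and
# get_attributes exposes the fitted statistics of the underlying sklearn estimator.
import numpy as np

x = np.random.rand(1000, 64)
ipca = IPCA(n_components=8, batch_size=200)
for start in range(0, x.shape[0], 200):
    ipca.partial_fit(x[start:start + 200])
x_reduced = ipca.transform(x)
print(x_reduced.shape)                 # (1000, 8)
print(ipca.get_attributes()[2].sum())  # total explained variance ratio of the kept components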
import numpy as np
from scipy import stats
import timeit
from ConfusionMatrix import confusionMatrixAlgo
import joblib  # sklearn.externals.joblib has been removed from scikit-learn
from sklearn.svm import NuSVC
from sklearn.model_selection import GridSearchCV  # replaces the removed sklearn.grid_search module
from sklearn.model_selection import RandomizedSearchCV
from sklearn import preprocessing
from sklearn.decomposition import PCA
from skimage.feature import hog
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
from sklearn.decomposition import IncrementalPCA

pca = IncrementalPCA(n_components=500)


def get_feature_space(images):
    features = []
    count = 1
    for image in images:
        des = hog(image)
        features.extend(des.reshape(-1, 36).tolist())
        if count % 100 == 0:
            print(str(count) + " out of " + str(len(images)))
        count += 1
    return features


def to_BOW_features(features, codebook):
    BOW = [codebook.predict(feature) for feature in features]
    hist = [np.histogram(bag, bins=codebook.n_clusters)[0] for bag in BOW]
    return hist
    img = Image.open(i)
    img = img.resize((int(480 / 6), int(360 / 6)), Image.BICUBIC)
    img = img_to_matrix(img)
    img = flatten_img(img)
    dataset.append(img)

dataset = np.array(dataset)
print(dataset.shape)
print("Dataset creation done.")

n = dataset.shape[0]
batch_size = 180
ipca = IncrementalPCA(n_components=100)
# partial_fit returns the fitted estimator rather than transformed data, so its return
# value is not assigned. Any remainder of fewer than batch_size samples is skipped during
# fitting but is still transformed below.
for i in range(n // batch_size):
    ipca.partial_fit(dataset[i * batch_size:(i + 1) * batch_size])
r_dataset = ipca.transform(dataset)
print(r_dataset.shape)
print("PCA done.")

# K-means clustering
import shutil

n_clusters_10 = 10
kmeans_10 = KMeans(n_clusters=n_clusters_10, random_state=5).fit(r_dataset)
labels_10 = kmeans_10.labels_
print("K-means clustering done.")
def pca(data, n_comps=None, zero_center=True, svd_solver='auto', random_state=0, return_info=False, dtype='float32', copy=False, chunked=False, chunk_size=None): """Principal component analysis [Pedregosa11]_. Computes PCA coordinates, loadings and variance decomposition. Uses the implementation of *scikit-learn* [Pedregosa11]_. Parameters ---------- data : :class:`~scanpy.api.AnnData`, `np.ndarray`, `sp.sparse` The (annotated) data matrix of shape `n_obs` × `n_vars`. Rows correspond to cells and columns to genes. n_comps : `int`, optional (default: 50) Number of principal components to compute. zero_center : `bool` or `None`, optional (default: `True`) If `True`, compute standard PCA from covariance matrix. If `False`, omit zero-centering variables (uses *TruncatedSVD* from scikit-learn), which allows to handle sparse input efficiently. svd_solver : `str`, optional (default: 'auto') SVD solver to use. Either 'arpack' for the ARPACK wrapper in SciPy (scipy.sparse.linalg.svds), or 'randomized' for the randomized algorithm due to Halko (2009). "auto" chooses automatically depending on the size of the problem. random_state : `int`, optional (default: 0) Change to use different intial states for the optimization. return_info : `bool`, optional (default: `False`) Only relevant when not passing an :class:`~scanpy.api.AnnData`: see "Returns". dtype : `str` (default: 'float32') Numpy data type string to which to convert the result. copy : `bool`, optional (default: `False`) If an :class:`~scanpy.api.AnnData` is passed, determines whether a copy is returned. Is ignored otherwise. Returns ------- If `data` is array-like and `return_info == False`, only returns `X_pca`,\ otherwise returns or adds to `adata`: X_pca : `.obsm` PCA representation of data. PCs : `.varm` The principal components containing the loadings. variance_ratio : `.uns['pca']` Ratio of explained variance. variance : `.uns['pca']` Explained variance, equivalent to the eigenvalues of the covariance matrix. 
""" if n_comps is None: n_comps = N_PCS if isinstance(data, AnnData): data_is_AnnData = True adata = data.copy() if copy else data else: data_is_AnnData = False adata = AnnData(data) logg.msg('computing PCA with n_comps =', n_comps, r=True, v=4) if adata.n_vars < n_comps: n_comps = adata.n_vars - 1 logg.msg('reducing number of computed PCs to', n_comps, 'as dim of data is only', adata.n_vars, v=4) if chunked: if not zero_center or random_state or svd_solver != 'auto': logg.msg('Ignoring zero_center, random_state, svd_solver', v=4) from sklearn.decomposition import IncrementalPCA X_pca = np.zeros((adata.X.shape[0], n_comps), adata.X.dtype) pca_ = IncrementalPCA(n_components=n_comps) for chunk, _, _ in adata.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk pca_.partial_fit(chunk) for chunk, start, end in adata.chunked_X(chunk_size): chunk = chunk.toarray() if issparse(chunk) else chunk X_pca[start:end] = pca_.transform(chunk) else: zero_center = zero_center if zero_center is not None else False if issparse( adata.X) else True if zero_center: from sklearn.decomposition import PCA if issparse(adata.X): logg.msg( ' as `zero_center=True`, ' 'sparse input is densified and may ' 'lead to huge memory consumption', v=4) X = adata.X.toarray( ) # Copying the whole adata.X here, could cause memory problems else: X = adata.X pca_ = PCA(n_components=n_comps, svd_solver=svd_solver, random_state=random_state) else: from sklearn.decomposition import TruncatedSVD logg.msg( ' without zero-centering: \n' ' the explained variance does not correspond to the exact statistical defintion\n' ' the first component, e.g., might be heavily influenced by different means\n' ' the following components often resemble the exact PCA very closely', v=4) pca_ = TruncatedSVD(n_components=n_comps, random_state=random_state) X = adata.X X_pca = pca_.fit_transform(X) if X_pca.dtype.descr != np.dtype(dtype).descr: X_pca = X_pca.astype(dtype) if data_is_AnnData: adata.obsm['X_pca'] = X_pca adata.varm['PCs'] = pca_.components_.T adata.uns['pca'] = {} adata.uns['pca']['variance'] = pca_.explained_variance_ adata.uns['pca']['variance_ratio'] = pca_.explained_variance_ratio_ logg.msg(' finished', t=True, end=' ', v=4) logg.msg( 'and added\n' ' \'X_pca\', the PCA coordinates (adata.obs)\n' ' \'PC1\', \'PC2\', ..., the loadings (adata.var)\n' ' \'pca_variance\', the variance / eigenvalues (adata.uns)\n' ' \'pca_variance_ratio\', the variance ratio (adata.uns)', v=4) return adata if copy else None else: if return_info: return X_pca, pca_.components_, pca_.explained_variance_ratio_, pca_.explained_variance_ else: return X_pca
async def do_run_async(self): # Principal Component Analysis (PCA) is by far the most popular dimensionality reduction # algorithm. First it identifies the hyperplane that lies closest to the data, and then # it projects the data onto it. # How PCA works: For the number of dimensions that you want to reduce a dataset to, it identifies the axis # for which the projection of the dataset onto generates the maximum amount of variance (or the axis that minimizes the mean squared distance # between the original dataset and its projection onto that axis based on Pythagoras' theorem) # It starts with a first axis then finds a second axis orthogonal to the first that maximizes the amount of remaining variance # and then a third axis orthogonal to the first two and so on - as many axes as the number of dimensions required to reduce the dataset to. # The vectors that define the axis are called Principal Components # Once you have identified all the principal components, you can reduce the dimensionality # of the dataset down to d dimensions by projecting it onto the hyperplane # defined by the first d principal components. training_set = super().load_train_images() # Training set needs to be reshaped from 3D (60000,28,28) to 2D (60000, 784) for the classifier to be able to # use in training phase training_set_tr = training_set.reshape((60000, 784)) training_labels = super().load_train_labels() X = training_set_tr[:1000, :] # First 1000 instances # There is a standard matrix factorization technique called Singular Value Decomposition (SVD) # that can decompose the training set matrix X into the dot product of three matrices U # · Σ · VT, where VT contains all the principal components that we are looking for. X_centered = X - X.mean(axis=0) U, s, V = np.linalg.svd(X_centered) # The principal components vectors are then the columns of the transpose of V matrix C1 = V.T[:, 0] # Shape (784,) C2 = V.T[:, 1] # Shape (784,) # To project the training set onto the hyperplane, you can simply compute the dot # product of the training set matrix X by the matrix Wd, defined as the matrix containing the first d principal components # (i.e., the matrix composed of the first d columns of VT) W2 = V.T[:, :2] X2D = X_centered.dot(W2) # Same using Scikit-Learn pca = PCA(n_components=2) X2D = pca.fit_transform(X) # X2D should be identical to the one computed above? # Instead of arbitrarily choosing the number of dimensions to reduce down to, it is # generally preferable to choose the number of dimensions that add up to a sufficiently # large portion of the variance (e.g., 95%). Unless, of course, you are reducing dimensionality # for data visualization—in that case you will generally want to reduce the # dimensionality down to 2 or 3. pca = PCA(n_components=0.95) X_reduced = pca.fit_transform(X) # The 1000 instances should now have 129 features instead of the original 784 # One problem with the preceding implementation of PCA is that it requires the whole # training set to fit in memory in order for the SVD algorithm to run. Fortunately, # Incremental PCA (IPCA) algorithms have been developed: you can split the training # set into mini-batches and feed an IPCA algorithm one mini-batch at a time. This is # useful for large training sets, and also to apply PCA online (i.e., on the fly, as new # instances arrive). 
X = training_set_tr n_batches = 100 # 100 batches of 600 instances inc_pca = IncrementalPCA(n_components=129) for X_batch in np.array_split(X, n_batches): inc_pca.partial_fit(X_batch) X_mnist_reduced = inc_pca.transform(X) # Measure the difference in the time required to train a K-Neighbors Classifier (known to be slow) # on the original and reduced MNIST dataset...The difference should be huge! start_time = time.time() clf = KNeighborsClassifier() clf.fit(X, training_labels) elapsed = time.time() - start_time print(f"Training a K-Neighbors Classifier on the original MNIST dataset took {elapsed} seconds.") start_time = time.time() clf.fit(X_mnist_reduced, training_labels) elapsed = time.time() - start_time print(f"Training a K-Neighbors Classifier on the reduced MNIST dataset took {elapsed} seconds.")
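# The walkthrough above picks the number of components via PCA(n_components=0.95). With
# IncrementalPCA the same choice can be made explicitly from the cumulative explained
# variance ratio after fitting. A standalone sketch on synthetic low-rank data; the shapes
# and the 0.95 threshold are assumptions, not the MNIST values.
import numpy as np
from sklearn.decomposition import IncrementalPCA

rng = np.random.RandomState(0)
X_demo = rng.rand(2000, 50) @ rng.rand(50, 784)      # low-rank data, so 95% is reachable
inc_pca_demo = IncrementalPCA(n_components=100, batch_size=500)
inc_pca_demo.fit(X_demo)
cumvar = np.cumsum(inc_pca_demo.explained_variance_ratio_)
d = int(np.argmax(cumvar >= 0.95)) + 1               # smallest d reaching 95% of the variance
print(f"{d} components retain 95% of the variance.")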
class AnnStream: def __init__(self, data, k: int, n_cluster: int, reduction_method: str, dims: int, loadings: np.ndarray, use_for_pca: np.ndarray, mu: np.ndarray, sigma: np.ndarray, ann_metric: str, ann_efc: int, ann_ef: int, ann_m: int, nthreads: int, ann_parallel: bool, rand_state: int, do_kmeans_fit: bool, disable_scaling: bool, ann_idx): self.data = data self.k = k if self.k >= self.data.shape[0]: self.k = self.data.shape[0] - 1 self.nClusters = max(n_cluster, 2) self.dims = dims self.loadings = loadings if self.dims is None and self.loadings is None: raise ValueError( "ERROR: Provide either value for atleast one: 'dims' or 'loadings'" ) self.annMetric = ann_metric self.annEfc = ann_efc self.annEf = ann_ef self.annM = ann_m self.nthreads = nthreads if ann_parallel: self.annThreads = self.nthreads else: self.annThreads = 1 self.randState = rand_state self.batchSize = self._handle_batch_size() self.method = reduction_method self.nCells, self.nFeats = self.data.shape self.clusterLabels: np.ndarray = np.repeat(-1, self.nCells) disable_reduction = False if self.dims < 1: disable_reduction = True with threadpool_limits(limits=self.nthreads): if self.method == 'pca': self.mu, self.sigma = mu, sigma if self.loadings is None or len(self.loadings) == 0: if len(use_for_pca) != self.nCells: raise ValueError( "ERROR: `use_for_pca` does not have sample length as nCells" ) if disable_reduction is False: self._fit_pca(disable_scaling, use_for_pca) else: # Even though the dims might have been already adjusted according to loadings before calling # AnnStream, it could still be overwritten by _handle_batch_size. Hence need to hard set it here. self.dims = self.loadings.shape[1] # it is okay for dimensions to be larger than batch size here because we will not fit the PCA if disable_scaling: if disable_reduction: self.reducer = lambda x: x else: self.reducer = lambda x: x.dot(self.loadings) else: if disable_reduction: self.reducer = lambda x: self.transform_z(x) else: self.reducer = lambda x: self.transform_z(x).dot( self.loadings) elif self.method == 'lsi': if self.loadings is None or len(self.loadings) == 0: if disable_reduction is False: self._fit_lsi() else: self.dims = self.loadings.shape[1] if disable_reduction: self.reducer = lambda x: x else: self.reducer = lambda x: x.dot(self.loadings) elif self.method == 'custom': if self.loadings is None or len(self.loadings) == 0: logger.warning( "No loadings provided for manual dimension reduction") else: self.dims = self.loadings.shape[1] if disable_reduction: self.reducer = lambda x: x else: self.reducer = lambda x: x.dot(self.loadings) else: raise ValueError( f"ERROR: Unknown reduction method: {self.method}") if ann_idx is None: self.annIdx = self._fit_ann() else: self.annIdx = ann_idx self.annIdx.set_ef(self.annEf) self.annIdx.set_num_threads(1) self.kmeans = self._fit_kmeans(do_kmeans_fit) def _handle_batch_size(self): if self.dims > self.data.shape[0]: self.dims = self.data.shape[0] batch_size = self.data.chunksize[ 0] # Assuming all chunks are same size if self.dims >= batch_size: self.dims = batch_size - 1 # -1 because we will do PCA +1 logger.info( f"Number of PCA/LSI components reduced to batch size of {batch_size}" ) if self.nClusters > batch_size: self.nClusters = batch_size logger.info( f"Cluster number reduced to batch size of {batch_size}") return batch_size def iter_blocks(self, msg: str = '') -> np.ndarray: for i in tqdm(self.data.blocks, desc=msg, total=self.data.numblocks[0]): yield controlled_compute(i, self.nthreads) def transform_z(self, 
a: np.ndarray) -> np.ndarray: return (a - self.mu) / self.sigma def transform_ann(self, a: np.ndarray, k: int = None, self_indices: np.ndarray = None) -> tuple: if k is None: k = self.k # Adding +1 to k because first neighbour will be the query itself if self_indices is None: i, d = self.annIdx.knn_query(a, k=k) return i, d else: i, d = self.annIdx.knn_query(a, k=k + 1) return fix_knn_query(i, d, self_indices) def _fit_pca(self, disable_scaling, use_for_pca) -> None: from sklearn.decomposition import IncrementalPCA # We fit 1 extra PC dim than specified and then ignore the last PC. self._pca = IncrementalPCA(n_components=self.dims + 1, batch_size=self.batchSize) do_sample_subset = False if use_for_pca.sum() == self.nCells else True s, e = 0, 0 # We store the first block of values here. if such a case arises that we are left with less dims+1 cells to fit # then those cells can be added to end_reservoir for fitting. if there are no such cells then end reservoir is # just by itself after fitting rest of the cells. If may be the case that the first batch itself has less than # dims+1 cells. in that we keep adding cells to carry_over pile until it is big enough. end_reservoir = [] # carry_over store cells that can yet not be added to end_reservoir ot be used for fitting pca directly. carry_over = [] for i in self.iter_blocks(msg='Fitting PCA'): if do_sample_subset: e = s + i.shape[0] i = i[use_for_pca[s:e]] s = e if disable_scaling is False: i = self.transform_z(i) if len(carry_over) > 0: i = np.vstack((carry_over, i)) carry_over = [] if len(i) < (self.dims + 1): carry_over = i continue if len(end_reservoir) == 0: end_reservoir = i continue try: self._pca.partial_fit(i, check_input=False) except LinAlgError: # Add retry counter to make memory consumption doesn't escalate carry_over = i if len(carry_over) > 0: i = np.vstack((end_reservoir, carry_over)) else: i = end_reservoir try: self._pca.partial_fit(i, check_input=False) except LinAlgError: logger.warning( "{i.shape[0]} samples were not used in PCA fitting due to LinAlgError", flush=True) self.loadings = self._pca.components_[:-1, :].T def _fit_lsi(self) -> None: from gensim.models import LsiModel from gensim.matutils import Dense2Corpus self._lsiModel = LsiModel( Dense2Corpus( controlled_compute(self.data.blocks[0], self.nthreads).T), num_topics=self.dims, chunksize=self.data.chunksize[0], id2word={x: x for x in range(self.data.shape[1])}, extra_samples=0) for n, i in enumerate(self.iter_blocks(msg="Fitting LSI model")): if n == 0: continue self._lsiModel.add_documents(Dense2Corpus(i.T)) self.loadings = self._lsiModel.get_topics().T def _fit_ann(self): import hnswlib dims = self.dims if dims < 1: dims = self.data.shape[1] ann_idx = hnswlib.Index(space=self.annMetric, dim=dims) ann_idx.init_index(max_elements=self.nCells, ef_construction=self.annEfc, M=self.annM, random_seed=self.randState) ann_idx.set_ef(self.annEf) ann_idx.set_num_threads(self.annThreads) for i in self.iter_blocks(msg='Fitting ANN'): ann_idx.add_items(self.reducer(i)) return ann_idx def _fit_kmeans(self, do_ann_fit): from sklearn.cluster import MiniBatchKMeans if do_ann_fit is False: return None kmeans = MiniBatchKMeans(n_clusters=self.nClusters, random_state=self.randState, batch_size=self.batchSize) with threadpool_limits(limits=self.nthreads): for i in self.iter_blocks(msg='Fitting kmeans'): kmeans.partial_fit(self.reducer(i)) temp = [] for i in self.iter_blocks(msg='Estimating seed partitions'): temp.extend(kmeans.predict(self.reducer(i))) self.clusterLabels = 
np.array(temp) return kmeans
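# The _fit_pca method above buffers rows ("carry_over"/"end_reservoir") so that every
# partial_fit call sees at least as many samples as fitted components, which IncrementalPCA
# requires. A much-simplified sketch of that guard outside the class; block sizes, feature
# count and dims are assumptions.
import numpy as np
from sklearn.decomposition import IncrementalPCA

dims, batch = 10, 64
blocks = [np.random.rand(np.random.randint(4, 80), 32) for _ in range(20)]

pca = IncrementalPCA(n_components=dims + 1, batch_size=batch)
carry = np.empty((0, 32))
for block in blocks:
    block = np.vstack((carry, block))
    if block.shape[0] < dims + 1:        # too small to fit on its own: keep for later
        carry = block
        continue
    pca.partial_fit(block)
    carry = np.empty((0, 32))
if carry.shape[0] >= dims + 1:           # flush any remainder that is large enough
    pca.partial_fit(carry)

loadings = pca.components_[:-1, :].T     # fit one extra component and drop it, as above
print(loadings.shape)                    # (32, 10)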
# Avoid "Mean of empty slice." in sklearn: batch_size = max(n_components + 1, args.batch_size) print("batch_size = {}".format(batch_size)) # Create data loader minibatchlist = DataLoader.createTestMinibatchList(len(images_path), batch_size) # Training = False -> outputs only the current observation, not a tuple data_loader = DataLoader(minibatchlist, images_path, n_workers=4, is_training=False) print("Fitting PCA with n_components={}".format(n_components)) ipca = IncrementalPCA(n_components=n_components) pbar = tqdm(total=len(data_loader)) for obs_var in data_loader: ipca.partial_fit(toNumpyMatrix(obs_var)) pbar.update(1) pbar.close() # Save PCA transformation with open(log_folder + "/pca.pkl", "wb") as f: pkl.dump(ipca, f) print("Transforming observations to states") predictions = [] for obs_var in data_loader: predictions.append(ipca.transform(toNumpyMatrix(obs_var))) predictions = np.concatenate(predictions, axis=0)
#Making the screeplot - plotting the cumulative variance against the number of components
# fig = plt.figure(figsize = (12,9))
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components', fontsize=10)
plt.ylabel('cumulative explained variance', fontsize=10)
plt.title('PCA Cumulative Explained Variance', fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
img_file = results_path.joinpath('PCA_Cumulative_Explained_Variance.png')
plt.savefig(img_file)
plt.show()

# Looks like approx. 50 components are enough to describe 90% of the variance in the dataset
# Here we proceed with 16 components; raise n_components towards 50 to retain ~90% of the variance

#Using incremental PCA for efficiency - saves a lot of time on larger datasets
pca_final = IncrementalPCA(n_components=16)
df_train_pca = pca_final.fit_transform(X_train_rus)
print("df_train_pca.shape")
print(df_train_pca.shape)

#Creating correlation matrix for the principal components - I expect little to no correlation
corrmat = np.corrcoef(df_train_pca.transpose())
plt.figure(figsize=(16, 16))
sns.set(font_scale=.8)
sns.heatmap(corrmat,
            vmin=df_corr.values.min(),
            vmax=1,
            fmt='.1f',
            square=True,
            cmap="Blues",
            linewidths=0.1,
y="Second Vector", hue="Label", data=X_train_scatter, fit_reg=False) ax = plt.gca() ax.set_title("Separation of Observations Using Original Feature Set") # In[ ]: # Incremental PCA from sklearn.decomposition import IncrementalPCA n_components = 784 batch_size = None incrementalPCA = IncrementalPCA(n_components=n_components, batch_size=batch_size) X_train_incrementalPCA = incrementalPCA.fit_transform(X_train) X_train_incrementalPCA = pd.DataFrame(data=X_train_incrementalPCA, index=train_index) X_validation_incrementalPCA = incrementalPCA.transform(X_validation) X_validation_incrementalPCA = pd.DataFrame(data=X_validation_incrementalPCA, index=validation_index) scatterPlot(X_train_incrementalPCA, y_train, "Incremental PCA") # In[ ]: # Sparse PCA from sklearn.decomposition import SparsePCA
class PCANet(object): def __init__(self, image_shape, filter_shape_l1, step_shape_l1, n_l1_output, filter_shape_l2, step_shape_l2, n_l2_output, filter_shape_pooling, step_shape_pooling): """ Parameters ---------- image_shape: int or sequence of ints Input image shape. filter_shape_l1: int or sequence of ints The shape of the kernel in the first convolution layer. If the value is int, a filter of the square shape is applied. If you want to apply a filter of a different aspect ratio, just pass a tuple of shape (height, width). step_shape_l1: int or sequence of ints The shape of kernel step in the first convolution layer. If the value is int, a step of the square shape is applied. If you want to apply a step of a different aspect ratio, just pass a tuple of shape (height, width). n_l1_output: L1 in the original paper. The number of outputs obtained from a set of input images. filter_shape_l2: int or sequence of ints The shape of the kernel in the second convolution layer. If the value is int, a filter of the square shape is applied. If you want to apply a filter of a different aspect ratio, just pass a tuple of shape (height, width). step_shape_l2: int or sequence of ints The shape of kernel step in the second convolution layer. If the value is int, a step of the square shape is applied. If you want to apply a step of a different aspect ratio, just pass a tuple of shape (height, width). n_l2_output: L2 in the original paper. The number of outputs obtained from each L1 output. filter_shape_pooling: int or sequence of ints The shape of the filter in the pooling layer. step_shape_pooling: int or sequence of ints The shape of the filter step in the pooling layer. """ self.image_shape = to_tuple_if_int(image_shape) self.filter_shape_l1 = to_tuple_if_int(filter_shape_l1) self.step_shape_l1 = to_tuple_if_int(step_shape_l1) self.n_l1_output = n_l1_output self.conv1 = Conv2d(1, 6, 3) self.filter_shape_l2 = to_tuple_if_int(filter_shape_l2) self.step_shape_l2 = to_tuple_if_int(step_shape_l2) self.n_l2_output = n_l2_output self.filter_shape_pooling = to_tuple_if_int(filter_shape_pooling) self.step_shape_pooling = to_tuple_if_int(step_shape_pooling) self.n_bins = None # TODO make n_bins specifiable self.pca_l1 = IncrementalPCA(n_l1_output) self.pca_l2 = IncrementalPCA(n_l2_output) def histogram(self, binary_images): """ Separate a given image into blocks and calculate a histogram in each block. Supporse data in a block is in range [0, 3] and the acutual values are :: [0 0 1] [2 2 2] [2 3 3] | If default bins ``[-0.5 0.5 1.5 2.5 3.5]`` applied, the histogram will be ``[2 1 4 2]``. | If ``n_bins`` is specified, the range of data divided equally. | For example, if the data is in range ``[0, 3]`` and ``n_bins = 2``, | bins will be ``[-0.5 1.5 3.5]`` and the histogram will be ``[3 6]``. 
""" k = pow(2, self.n_l2_output) if self.n_bins is None: self.n_bins = k + 1 bins = xp.linspace(-0.5, k - 0.5, self.n_bins) def bhist(image): # calculate Bhist(T) in the original paper ps = Patches(image, self.filter_shape_pooling, self.step_shape_pooling).patches H = [xp.histogram(p.flatten(), bins)[0] for p in ps] return xp.concatenate(H) return xp.vstack([bhist(image) for image in binary_images]) def process_input(self, images): assert (np.ndim(images) >= 3) assert (images.shape[1:3] == self.image_shape) if np.ndim(images) == 3: # forcibly convert to multi-channel images images = atleast_4d(images) images = to_channels_first(images) return images def fit(self, images): """ Train PCANet Parameters ---------- images: np.ndarray | Color / grayscale images of shape | (n_images, height, width, n_channels) or | (n_images, height, width) """ images = self.process_input(images) # images.shape == (n_images, n_channels, y, x) for image in images: X = [] for channel in image: patches = image_to_patch_vectors(channel, self.filter_shape_l1, self.step_shape_l1) X.append(patches) patches = np.hstack(X) # patches.shape = (n_patches, n_patches * vector length) self.pca_l1.partial_fit(patches) filters_l1 = components_to_filters( self.pca_l1.components_, n_channels=images.shape[1], filter_shape=self.filter_shape_l1, ) images = torch.Tensor(images) images = F.relu(self.conv1(images)) #images.shape(n_images, L1, y, x) images.reshape(-1, *images.shape[2:4]) for image in images: patches = image_to_patch_vectors(image, self.filter_shape_l2, self.step_shape_l2) self.pca_l2.partial_fit(patches) return self def transform(self, images): """ Parameters ---------- images: np.ndarray | Color / grayscale images of shape | (n_images, height, width, n_channels) or | (n_images, height, width) Returns ------- X: np.ndarray A set of feature vectors of shape (n_images, n_features) where :code:`n_features` is determined by the hyperparameters """ images = self.process_input(images) # images.shape == (n_images, n_channels, y, x) filters_l1 = components_to_filters( self.pca_l1.components_, n_channels=images.shape[1], filter_shape=self.filter_shape_l1, ) filters_l2 = components_to_filters(self.pca_l2.components_, n_channels=1, filter_shape=self.filter_shape_l2) images = Conv2d(images, filters_l1, stride=self.step_shape_l1).data images = xp.swapaxes(images, 0, 1) # L1.shape == (L1, n_images, y, x) # iterate over each L1 output X = [] for maps in images: n_images, h, w = maps.shape maps = Conv2d( maps.reshape(n_images, 1, h, w), # 1 channel images filters_l2, stride=self.step_shape_l2).data # maps.shape == (n_images, L2, y, x) right here maps = binarize(maps) maps = binary_to_decimal(maps) # maps.shape == (n_images, y, x) x = self.histogram(maps) # x is a set of feature vectors. # The shape of x is (n_images, vector length) X.append(x) # concatenate over L1 X = xp.hstack(X) if gpu_enabled(): X = X.to('cpu') X = X.astype(np.float64) # The shape of X is (n_images, L1 * vector length) return X def validate_structure(self): """ Check that the filter visits all pixels of input images without dropping any information. Raises ------ ValueError: if the network structure does not satisfy the above constraint. 
""" def is_valid_(input_shape, filter_shape, step_shape): ys, xs = steps(input_shape, filter_shape, step_shape) fh, fw = filter_shape h, w = input_shape if ys[-1] + fh != h or xs[-1] + fw != w: raise ValueError("Invalid network structure.") return output_shape(ys, xs) output_shape_l1 = is_valid_(self.image_shape, self.filter_shape_l1, self.step_shape_l1) output_shape_l2 = is_valid_(output_shape_l1, self.filter_shape_l2, self.step_shape_l2) is_valid_(output_shape_l2, self.filter_shape_pooling, self.filter_shape_pooling)
def fit_embedding(dataset, embed_dir, standardize_features=True, pca_n_components=None, umap_n_components=2, umap_init='random', umap_n_neighbors=100, umap_min_dist=0.0, umap_metric='euclidean', low_memory=False, save_transform=True, seed=None, verbose=True): """ train_set: a feature matrix e.g. of (N, F) dimensions, used to define the embedding After some experimentation with chemical features of 1024 of 50k-.5M compounds, a reasonable embedding first reduces by PCA to 20 dimensions and then UMAP to 2 dimensions. UMAP parameters of 100 neighbors and min_dist of 0.0 seem to work well too. init='random' can help UMAP from getting stuck. return: saves embedding data to ../intermediate_data/embeddings/tag/embedding_info.tsv ../intermediate_data/embeddings/tag/pca_reducer.joblib ../intermediate_data/embeddings/tag/umap_reducer.joblib """ if not os.path.exists(embed_dir): os.mkdir(embed_dir) else: print("WARNING: embed_dir already exists: {}".format(embed_dir)) random_state = np.random.RandomState(seed=seed) begin_time = time.time() if standardize_features: if verbose: print( "Standardizing dataset so each feature has zero-mean and unit variance." ) standardizer = StandardScaler(copy=False) standardizer.fit(dataset) dataset = standardizer.transform(dataset) if pca_n_components is None: pca_n_components = dataset.shape[1] if verbose: print("Setting PCA n_componets to full rank of dataset: {}".format( pca_n_components)) if verbose: print("Reducing the dimension by PCA from {} to {} dimensions".format( dataset.shape[1], pca_n_components)) pca_reducer = IncrementalPCA(n_components=pca_n_components, batch_size=1000, copy=False) pca_reducer.fit(dataset) pca_embedding = pca_reducer.transform(dataset) if verbose: print("Reducing the dimension by UMAP to {} dimensions".format( umap_n_components)) umap_reducer = umap.UMAP( n_components=umap_n_components, metric=umap_metric, n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, init=umap_init, low_memory=low_memory, #random_state=random_state, verbose=True) umap_embedding = umap_reducer.fit_transform(pca_embedding) umap_embedding = pd.DataFrame( data=umap_embedding, columns=["UMAP_" + str(i + 1) for i in range(umap_n_components)]) end_time = time.time() if verbose: print("created embedding {0} runtime: {1:.2f}s".format( embed_dir, end_time - begin_time)) print("saving embedding to {}".format(embed_dir)) with open("{}/model_info.tsv".format(embed_dir), 'w') as f: f.write("key\tvalue\n") f.write("seed\t{}\n".format(seed)) f.write("input_dim\t{}\n".format(dataset.shape)) f.write("standardize_features\t{}\n".format(standardize_features)) f.write("pca_n_component\t{}\n".format(pca_n_components)) f.write("umap_n_component\t{}\n".format(umap_n_components)) f.write("umap_metric\t{}\n".format(umap_metric)) f.write("umap_n_neighbors\t{}\n".format(umap_n_neighbors)) f.write("umap_min_dist\t{}\n".format(umap_min_dist)) f.write("umap_init\t{}\n".format(umap_init)) if save_transform: if verbose: print("Saving transform to {}.".format(embed_dir)) if pca_n_components is not None: joblib.dump(value=pca_reducer, filename="{}/pca_reducer.joblib".format(embed_dir)) if standardize_features: joblib.dump(value=standardizer, filename="{}/standardizer.joblib".format(embed_dir)) joblib.dump(value=umap_reducer, filename="{}/umap_reducer.joblib".format(embed_dir)) pa.parquet.write_table(table=pa.Table.from_pandas(umap_embedding), where="{}/umap_embedding.parquet".format(embed_dir)) return umap_embedding
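# Possible usage of fit_embedding above, following the guidance in its docstring (reduce by
# PCA to ~20 dimensions, then UMAP to 2 with 100 neighbours and min_dist 0.0). The feature
# matrix, its shape and the output directory are illustrative assumptions.
import numpy as np

features = np.random.rand(5000, 1024).astype(np.float32)
embedding = fit_embedding(
    features,
    embed_dir="example_embedding",
    pca_n_components=20,
    umap_n_components=2,
    umap_n_neighbors=100,
    umap_min_dist=0.0,
    umap_init='random',
    seed=42)
print(embedding.shape)                       # (5000, 2) UMAP coordinates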
app = Flask(__name__) @app.route("/") def index(): return '<img src="static/grafico.png"/>' if __name__ == '__main__': iris = load_iris() X = iris.data y = iris.target n_components = 2 ipca = IncrementalPCA(n_components=n_components, batch_size=10) X_ipca = ipca.fit_transform(X) pca = PCA(n_components=n_components) X_pca = pca.fit_transform(X) colors = ['navy', 'turquoise', 'darkorange'] for X_transformed, title in [(X_ipca, "Incremental PCA"), (X_pca, "PCA")]: plt.figure(figsize=(8, 8)) for color, i, target_name in zip(colors, [0, 1, 2], iris.target_names): plt.scatter(X_transformed[y == i, 0], X_transformed[y == i, 1], color=color, lw=2, label=target_name)
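# A quick numerical comparison of the two projections computed above. Individual components
# may come back with flipped signs, so absolute values are compared; this is an illustrative
# addition that assumes X_ipca and X_pca from the block above.
import numpy as np

err = np.abs(np.abs(X_ipca) - np.abs(X_pca)).max()
print("Maximum absolute difference between IPCA and PCA projections: %.4f" % err)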
    checkpoint_base = os.path.join(base, str(i))
    checkpoint_filenames = sorted(os.listdir(checkpoint_base))
    for epoch in range(2, epoch_max, epoch_step):
        filename = checkpoint_filenames[epoch]
        checkpoint = torch.load(os.path.join(checkpoint_base, filename))
        model = eval(checkpoint['arch'])()
        model.load_state_dict(checkpoint['state_dict'])
        params = np.zeros((0, ))
        for p in model.parameters():
            params = np.append(params, p.cpu().data.numpy().flatten())
        all_params.append(params)
        all_accs.append(checkpoint['logger'].entries[epoch - 1]['accuracy'])

all_params = np.array(all_params)
# Project the flattened parameter vectors onto their first two principal components for plotting.
all_params = IncrementalPCA(2, batch_size=6).fit_transform(all_params)
x = [p[0] for p in all_params]
y = [p[1] for p in all_params]
seg = len(range(1, epoch_max, epoch_step))
plt.subplot(121)
plt.title('All layers (DeepMnistCNN)')
for i in range(1, 9):
    x_ = x[(i - 1) * seg:i * seg]
    y_ = y[(i - 1) * seg:i * seg]
    a_ = all_accs[(i - 1) * seg:i * seg]
    plt.plot(x_, y_, 'o:', color=cmap((i - 1) / 7))
    for xi, yi, ai in zip(x_, y_, a_):
        plt.annotate(str('{:.1f}'.format(ai * 100)),
                     xy=(xi, yi),
                     xytext=(xi + 0.004, yi + 0.004),
def pca_compress_channel(X, k): incr_pca = IncrementalPCA(n_components=k) X_reduced = incr_pca.fit_transform(X).astype(np.float16) return (X_reduced, incr_pca)
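# Possible usage of pca_compress_channel above: compress a single channel, then use the
# returned estimator's inverse_transform to approximately reconstruct it. The channel shape
# and k are illustrative assumptions.
import numpy as np

channel = np.random.rand(480, 640)                    # one image channel
X_reduced, incr_pca = pca_compress_channel(channel, k=50)
channel_restored = incr_pca.inverse_transform(X_reduced.astype(np.float64))
print(X_reduced.shape)                                # (480, 50)
print(np.mean((channel - channel_restored) ** 2))     # mean squared reconstruction error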
import os
import sys
import glob
import shutil

import numpy as np
import joblib  # sklearn.externals.joblib has been removed from scikit-learn
from sklearn.decomposition import IncrementalPCA
from sklearn import svm

datadir = sys.argv[1]
pcadir = sys.argv[2]
n_comp = 300
step = 1000

fnames = [
    os.path.basename(filename)
    for filename in glob.glob("%s/*.txt" % datadir)
]
used = len(fnames)

ipca = IncrementalPCA(n_components=n_comp)
for batch_start in range(0, used, step):
    batch_end = min(batch_start + step, used)
    print("Loading from %d to %d" % (batch_start, batch_end))
    data = []
    labels = []
    for fname in fnames[batch_start:batch_end]:
        print("Loading image", fname)
        data.append(np.loadtxt("%s/%s" % (datadir, fname)))
    print(np.array(data).shape)
    # Note: partial_fit requires each batch to contain at least n_comp samples, so a final
    # batch of fewer than 300 images would raise an error.
    ipca.partial_fit(data)

if os.path.isdir(pcadir):
    shutil.rmtree(pcadir)
os.mkdir(pcadir)
from __future__ import division
import numpy as np
from sklearn.decomposition import PCA, IncrementalPCA
#import tensorflow as tf

X = np.random.random((3, 4))
pca = PCA(n_components=1)
inc_pca = IncrementalPCA(n_components=1)

pca.fit(X.T)
phi = pca.components_
print(phi)

inc_pca.fit(X.T)
print(inc_pca.components_)


def EMPCA(X, n_var, n_obs, n_epochs):
    X = X - np.mean(X)
    assert X.shape == (n_var, n_obs), "shape error in dataset"
    phi = np.random.rand(n_var)
    c = np.zeros(n_obs)
    for i in range(n_epochs):  # repeat until convergence
        # E-step: project each observation onto the current direction
        for j, x_j in enumerate(X.T):
            c[j] = np.dot(x_j, phi)
        # M-step: re-estimate the direction and renormalize it
        phi = np.sum(c * X, axis=1) / np.sum(c**2)
        phi = phi / np.linalg.norm(phi)
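# A self-contained check of the EM update used in EMPCA: alternating the E-step (scores) and
# M-step (direction) converges to the same first principal component as sklearn, up to sign.
# This is not part of the script above; the data shape and iteration count are assumptions.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
X = rng.rand(4, 200)                       # n_var x n_obs, as in EMPCA
Xc = X - X.mean(axis=1, keepdims=True)     # centre each variable

phi = rng.rand(4)
for _ in range(200):
    c = Xc.T @ phi                         # E-step: score for each observation
    phi = (Xc @ c) / (c @ c)               # M-step: new direction estimate
    phi /= np.linalg.norm(phi)

sk_phi = PCA(n_components=1).fit(Xc.T).components_[0]
print(np.abs(phi @ sk_phi))                # ~1.0: same direction up to sign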
PARSER.add_argument( '-c', '--chunksize', type=int, default=50000, help='the number of lines to be read from the INPUT file ' + 'at a time and stored in memory, the default value is 50000') ARGS = PARSER.parse_args() READER = pd.read_csv(ARGS.input, sep=ARGS.delimiter, chunksize=ARGS.chunksize, header=None, error_bad_lines=False) PCA = IncrementalPCA(n_components=ARGS.dimensions) total_read = 0 total_written = 0 if ARGS.model and os.path.isfile(ARGS.model): PCA = joblib.load(ARGS.model) else: for chunk in READER: PCA.partial_fit(chunk) total_read += ARGS.chunksize #print(str(total_read) + ' vectors read ...\n') if ARGS.model: if not ARGS.model.endswith('.pkl'): ARGS.model += '.pkl' joblib.dump(PCA, ARGS.model)
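# Once the model has been fitted (or loaded) above, a second pass over the input is needed to
# actually project the vectors. A hedged sketch of that follow-up step; the re-created reader
# and the '.reduced.txt' output path are assumptions, not part of the original script.
READER = pd.read_csv(ARGS.input, sep=ARGS.delimiter, chunksize=ARGS.chunksize,
                     header=None, error_bad_lines=False)
with open(ARGS.input + '.reduced.txt', 'w') as out:
    for chunk in READER:
        reduced = pd.DataFrame(PCA.transform(chunk))   # PCA holds the fitted IncrementalPCA
        reduced.to_csv(out, sep=ARGS.delimiter, header=False, index=False)
        total_written += reduced.shape[0]
print(str(total_written) + ' vectors written')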