def compute_pca(data_path=os.path.join(BASE_DIR, 'data/memmap/'), out_path=os.path.join(BASE_DIR, 'data/'), batch_size=500, image_size=3*300*300):
    """Fit an incremental 3-component PCA over per-pixel RGB triples of the
    training and validation memmaps, then dump the eigendecomposition of the
    fitted covariance matrix as float32 pickles.

    :param data_path: directory containing the 'tn_x.dat' and 'v_x.dat' memmaps
    :param out_path: directory where 'eigenvalues.dat'/'eigenvectors.dat' are written
    :param batch_size: number of images streamed per partial_fit call
    :param image_size: flattened image length (channels * height * width)
    """
    ipca = IncrementalPCA(n_components=3, batch_size=batch_size)
    n_pixels = image_size // 3  # pixels per image (3 channels)

    def _partial_fit_file(file_name, n_rows):
        # Stream one memmap of flattened images through the IPCA, batch by batch.
        path = os.path.join(data_path, file_name)
        # mode 'r' (not 'r+'): the data is only read, never written back.
        data = np.memmap(path, dtype=theano.config.floatX, mode='r',
                         shape=(n_rows, image_size))
        n_samples, _ = data.shape
        for batch in gen_batches(n_samples, batch_size):
            X = data[batch, :]
            # (b, 3*H*W) -> (b, 3, H*W) -> (b, H*W, 3) -> (b*H*W, 3)
            X = X.reshape(X.shape[0], 3, n_pixels).transpose(0, 2, 1)
            # reshape(-1, 3) replaces the original bare `reduce(np.multiply, ...)`
            # row count, which is a NameError on Python 3 (functools.reduce
            # was never imported).
            ipca.partial_fit(X.reshape(-1, 3))

    # Original row counts are fixed by the dataset layout — TODO confirm they
    # match the actual memmap files on disk.
    _partial_fit_file('tn_x.dat', 4044)
    _partial_fit_file('v_x.dat', 500)

    eigenvalues, eigenvectors = np.linalg.eig(ipca.get_covariance())
    eigenvalues.astype('float32').dump(os.path.join(out_path, 'eigenvalues.dat'))
    eigenvectors.astype('float32').dump(os.path.join(out_path, 'eigenvectors.dat'))
def test_incremental_pca_sparse(matrix_class):
    """IncrementalPCA accepts sparse input in fit/fit_transform but rejects
    it in partial_fit with a helpful TypeError."""
    dense = iris.data
    sparse = matrix_class(dense)
    batch_size = sparse.shape[0] // 3

    reference = PCA(n_components=2)
    reference.fit_transform(dense)

    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    projected = ipca.fit_transform(sparse)
    assert projected.shape == (sparse.shape[0], 2)
    np.testing.assert_allclose(
        ipca.explained_variance_ratio_.sum(),
        reference.explained_variance_ratio_.sum(),
        rtol=1e-3,
    )

    # Covariance and precision must be (numerical) inverses at every rank.
    for n_components in (1, 2, dense.shape[1]):
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(sparse)
        product = np.dot(ipca.get_covariance(), ipca.get_precision())
        np.testing.assert_allclose(product, np.eye(sparse.shape[1]), atol=1e-13)

    expected_msg = (
        "IncrementalPCA.partial_fit does not support "
        "sparse input. Either convert data to dense "
        "or use IncrementalPCA.fit to do so in batches."
    )
    with pytest.raises(TypeError, match=expected_msg):
        ipca.partial_fit(sparse)
def get_ipca(paths, batch_size):
    """Compute an incremental PCA over the BGR pixel values of the given images.

    :param paths: sequence of image file paths
    :param batch_size: number of images accumulated before each partial_fit
    :return: (eigen_vectors, eigen_values, covariance matrix), order is BGR
    """
    ipca = IncrementalPCA(n_components=3)
    all_pixels = None
    count = len(paths)
    for i, path in enumerate(helper.show_progress(paths, 100)):
        # Load image and convert to a vector of BGR values
        pixels = load_image_pixels(path)
        # Accumulate pixels until a full batch is collected
        if all_pixels is None:
            all_pixels = pixels
        else:
            all_pixels = np.concatenate((all_pixels, pixels), axis=0)
        # Flush once per full batch and after the last image.
        # BUG FIX: the original `i % batch_size-1 == 0` parsed as
        # `(i % batch_size) - 1 == 0`, flushing at i == 1, batch_size+1, ...
        # instead of after every `batch_size` images.
        if (i + 1) % batch_size == 0 or i == count - 1:
            ipca.partial_fit(all_pixels)
            all_pixels = None
    cov = ipca.get_covariance()
    eigen_vals, eigen_vecs = linalg.eig(cov)
    return eigen_vecs, eigen_vals, cov
def test_incremental_pca():
    """Incremental PCA on dense arrays matches batch PCA."""
    data = iris.data
    chunk = data.shape[0] // 3

    batch_pca = PCA(n_components=2)
    batch_pca.fit_transform(data)

    incremental = IncrementalPCA(n_components=2, batch_size=chunk)
    reduced = incremental.fit_transform(data)
    assert reduced.shape == (data.shape[0], 2)
    np.testing.assert_allclose(
        incremental.explained_variance_ratio_.sum(),
        batch_pca.explained_variance_ratio_.sum(),
        rtol=1e-3,
    )

    # get_covariance() and get_precision() must invert one another.
    for rank in (1, 2, data.shape[1]):
        model = IncrementalPCA(rank, batch_size=chunk)
        model.fit(data)
        identity = np.dot(model.get_covariance(), model.get_precision())
        np.testing.assert_allclose(identity, np.eye(data.shape[1]), atol=1e-13)
class IPCA(object):
    """Thin wrapper around sklearn's IncrementalPCA, delegating every call
    to the underlying estimator stored in ``self.model``."""

    def __init__(self, n_components=None, whiten=False, copy=True, batch_size=None):
        """
        :param n_components: int or None (default None); number of components
            to keep. When None, min(n_samples, n_features) is used.
        :param whiten: bool, optional, default False. When True, the
            components_ vectors are divided by n_samples * components_ to
            ensure uncorrelated outputs with unit component-wise variances.
        :param copy: default True. When False, x is overwritten, which saves
            memory but is unsafe for the caller.
        :param batch_size: default None; number of samples per batch, only
            used in fit. When None, the library sets it to 5 * n_features to
            balance accuracy against memory overhead.
        """
        self.model = IncrementalPCA(n_components=n_components, whiten=whiten, copy=copy, batch_size=batch_size)

    def fit(self, x, y=None):
        # Fit the model on x; returns None (the estimator's `self` is dropped).
        self.model.fit(X=x, y=y)

    def transform(self, x):
        # Project x onto the fitted components.
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        # Fit the model on x and return the projected data.
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        # Get the estimator's parameters.
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        # Set the estimator's parameters; returns None.
        self.model.set_params(**params)

    def inverse_transform(self, x):
        # The exact inverse operation of fit_transform.
        return self.model.inverse_transform(X=x)

    def get_precision(self):
        # Compute the precision matrix from the generative model.
        return self.model.get_precision()

    def get_covariance(self):
        # Compute the covariance from the generative model.
        return self.model.get_covariance()

    def partial_fit(self, x, y=None, check_input=True):
        # Incremental training on one batch; returns None.
        self.model.partial_fit(X=x, y=y, check_input=check_input)

    def get_attributes(self):
        # Bundle the fitted estimator's attributes into a single tuple.
        # Raises AttributeError if the model has not been fitted yet.
        component = self.model.components_
        explained_variance = self.model.explained_variance_
        explained_variance_ratio = self.model.explained_variance_ratio_
        singular_values = self.model.singular_values_
        means = self.model.mean_  # per-feature empirical mean
        var = self.model.var_  # per-feature empirical variance
        noise_variance = self.model.noise_variance_  # estimated noise covariance
        n_component = self.model.n_components_
        n_samples_seen = self.model.n_samples_seen_
        return component, explained_variance, explained_variance_ratio, singular_values, means, var, noise_variance, \
            n_component, n_samples_seen
def compute_pca_of_image_set(self):
    """Fit a 3-component IncrementalPCA over the centered, [0, 1]-scaled RGB
    pixels of every .png under self.data_path (one partial_fit per recording
    directory), pickle the transformer to "pca.p", and print its covariance.
    """
    print("COMPUTING PCA OF IMAGE SET \n \n \n \n \n \n \n \n \n \n \n \n")
    recordings = os.listdir(self.data_path)

    # Per-channel means precomputed once over the full image set (the original
    # carried the commented-out scan that produced these constants).
    mean_r = 134.09352525641472
    mean_g = 131.9404211385675
    mean_b = 129.67342747136797
    m = [mean_r, mean_g, mean_b]

    transformer = IncrementalPCA(n_components=3)
    for c, r in enumerate(recordings, start=1):
        images = os.listdir(self.data_path + "/" + r + "/")
        # Collect per-image pixel arrays and stack once: the original
        # np.concatenate-per-image accumulation copied the growing buffer on
        # every append (O(n^2) bytes) and needed a dummy-row/np.delete hack.
        # Images are assumed 260x210 RGB — TODO confirm against read_image.
        chunks = [
            self.read_image(self.data_path + "/" + r + "/" + i).reshape((260 * 210), 3)
            for i in images
            if i.endswith(".png")
        ]
        if not chunks:
            continue  # recording without PNGs: nothing to fit
        res = (np.vstack(chunks) - m) / 255  # center, then scale to [0, 1]
        transformer.partial_fit(res)
        if not c % 100:
            print(str(c) + '/' + str(len(recordings)))

    # `with` closes the file handle; the original leaked an open file object.
    with open("pca.p", "wb") as fh:
        pickle.dump(transformer, fh)
    print(transformer.get_covariance())
def test_incremental_pca():
    """Incremental PCA on dense arrays."""
    data = iris.data
    chunk_size = data.shape[0] // 3

    full = PCA(n_components=2)
    full.fit_transform(data)

    incremental = IncrementalPCA(n_components=2, batch_size=chunk_size)
    projected = incremental.fit_transform(data)
    np.testing.assert_equal(projected.shape, (data.shape[0], 2))
    # Explained variance should agree with batch PCA to one decimal place.
    assert_almost_equal(
        incremental.explained_variance_ratio_.sum(),
        full.explained_variance_ratio_.sum(),
        1,
    )

    # Covariance and precision from the generative model invert one another.
    for rank in (1, 2, data.shape[1]):
        model = IncrementalPCA(rank, batch_size=chunk_size)
        model.fit(data)
        assert_array_almost_equal(
            np.dot(model.get_covariance(), model.get_precision()),
            np.eye(data.shape[1]),
        )