def indiv_space_for_sparse(X, joint_scores, joint_rank, signal_rank,
                           sv_threshold):
    """
    Estimates the individual rank of a (sparse) block and returns the
    corresponding truncated SVD.

    A low rank SVD of ``col_proj_orthog(X, joint_scores)`` is computed first.
    If every computed singular value exceeds ``sv_threshold`` the SVD rank was
    too low to bracket the individual rank, so the rank is increased by
    ``signal_rank`` (capped at ``min(X.shape) - joint_rank``) and the SVD is
    recomputed, at most three times.

    Parameters
    ----------
    X:
        The data block; only ``X.shape`` is read here, the rest is handled by
        ``col_proj_orthog``. NOTE(review): presumably a scipy sparse matrix,
        since the caller only uses this function when ``issparse(X)``.

    joint_scores:
        Joint space basis passed through to ``col_proj_orthog``.

    joint_rank: int
        Rank of the joint space; subtracted from ``min(X.shape)`` to bound the
        individual rank.

    signal_rank: int
        Initial signal rank of the block. Sets the size of the first SVD
        (``int(1.2 * signal_rank)``) and of each rank increase.

    sv_threshold: float
        Singular values strictly above this value count towards the
        individual rank.

    Output
    ------
    U, D, V, indiv_rank
        Output of ``svd_wrapper`` truncated to the first ``indiv_rank``
        components, followed by the estimated individual rank.

    Warns
    -----
    Issues a warning ('individual rank estimate probably too low') if the
    rank is still not bracketed after three increases.
    """
    # TODO: this could use lots of optimizing

    X_orthog = col_proj_orthog(X, joint_scores)

    # start with a low rank SVD
    max_rank = min(X.shape) - joint_rank  # saves computation
    # 1.2 is somewhat arbitrary
    current_rank = min(int(1.2 * signal_rank), max_rank)
    U, D, V = svd_wrapper(X_orthog, current_rank)
    indiv_rank = sum(D > sv_threshold)

    # Only grow the SVD when (a) every computed singular value passed the
    # threshold, i.e. the SVD rank is still too low, and (b) there is room to
    # grow. If we are already at max_rank a larger SVD does not exist and
    # recomputing the same one would be wasted work.
    if indiv_rank == current_rank and current_rank < max_rank:
        found_indiv_rank = False

        for _ in range(3):
            # current guess at an upper bound for the individual rank
            current_rank = min(current_rank + signal_rank, max_rank)

            # TODO: possibly use svds_additional to only compute the
            # additional components instead of the whole SVD again
            U, D, V = svd_wrapper(X_orthog, current_rank)
            indiv_rank = sum(D > sv_threshold)

            # we are done if the individual rank estimate is less than the
            # current_rank or if the current_rank is the maximal rank
            if indiv_rank < current_rank or current_rank == max_rank:
                found_indiv_rank = True
                break

        if not found_indiv_rank:
            warnings.warn('individual rank estimate probably too low')

    return U[:, 0:indiv_rank], D[0:indiv_rank], V[:, 0:indiv_rank], indiv_rank
def _get_rand_sample(num_obs, signal_ranks):
    """
    Draws one sample of the squared largest singular value of a random
    joint matrix.

    For each entry ``r`` of ``signal_ranks`` a ``(num_obs, r)`` standard
    normal matrix is sampled and orthonormalized with a QR decomposition.
    The resulting bases are concatenated column-wise and the leading
    singular value of the concatenation is computed with ``svd_wrapper``.

    Parameters
    ----------
    num_obs: int
        Number of rows of each random basis.

    signal_ranks: iterable of int
        Number of columns of each random basis, one entry per block.

    Output
    ------
    float
        The square of the largest singular value.
    """
    # Q factor of a Gaussian matrix gives a random orthonormal basis
    bases = [np.linalg.qr(np.random.normal(size=(num_obs, rank)))[0]
             for rank in signal_ranks]

    # np.hstack performs the same column-wise concatenation np.bmat did for a
    # flat list of 2-d arrays, but yields a plain ndarray instead of the
    # no-longer-recommended np.matrix
    M = np.hstack(bases)

    # compute largest sing val of random joint matrix
    _, svs, __ = svd_wrapper(M, rank=1)

    return svs.item() ** 2
def fit(self, X):
    """
    Computes the PCA decomposition of X.

    Parameters
    ----------
    X: {array-like, sparse matrix}, shape (n_samples, n_features)
        Fit PCA with data matrix X. If X is a pd.DataFrame, the observation
        and feature names will be extracted from its index/columns.
        Note X can be either dense or sparse.

    Output
    ------
    self
        The fitted object. Sets ``shape_``, ``m_``, ``frob_norm_``,
        ``var_expl_prop_``, ``var_expl_cum_``, ``scores_``, ``svals_`` and
        ``loadings_``; ``n_components`` is replaced by the value returned
        from ``_arg_checker``.
    """
    self.shape_, obs_names, var_names, self.n_components = \
        _arg_checker(X, self.n_components)

    # possibly mean center X
    X, self.m_ = centering(X, self.center)

    # compute SVD
    U, D, V = svd_wrapper(X, self.n_components)

    # compute variance explained
    if self.n_components == min(X.shape):
        # with a full SVD the singular values give the Frobenius norm
        # directly, no extra pass over X needed
        self.frob_norm_ = np.sqrt(sum(D ** 2))
    else:
        self.frob_norm_ = _safe_frob_norm(X)
    self.var_expl_prop_ = D ** 2 / self.frob_norm_ ** 2
    self.var_expl_cum_ = np.cumsum(self.var_expl_prop_)

    self.scores_, self.svals_, self.loadings_ = \
        svd2pd(U, D, V, obs_names=obs_names, var_names=var_names)

    # Must come after scores_ is assigned above: reading self.scores_ before
    # that raises AttributeError on a first fit, or silently picks up the
    # scores of a previous fit.
    if self.n_components is None:
        self.n_components = self.scores_.shape[1]

    return self
def fit(self, blocks, precomp_init_svd=None):
    """
    Fits the AJIVE decomposition.

    The fit proceeds in three steps: (1) a low rank SVD of each block
    extracts the initial signal space and the singular value threshold
    used for the individual rank, (2) the SVD of the concatenated signal
    scores gives the joint space, whose rank is estimated from the Wedin
    and random direction samples unless ``self.joint_rank`` is already
    set, (3) each block is split into joint, individual and noise parts.

    Parameters
    ----------
    blocks: list, dict
        The data matrices. If dict, will name blocks by keys, otherwise
        blocks are named by 0, 1, ...K. Data matrices must have
        observations on the rows and have the same number of observations
        i.e. the kth data matrix is shape (n_samples, n_features[k]).

    precomp_init_svd: {list, dict, None}, optional
        Precomputed initial SVD. Must have one entry for each data block.
        The SVD should be a 3 tuple (scores, svals, loadings), see output
        of jive.utils.svd_wrapper for formatting details.
        NOTE(review): after ``arg_checker`` each non-None entry is read
        here by the keys 'scores', 'svals', 'loadings', and must contain
        at least ``init_signal_ranks[bn] + 1`` singular values because the
        threshold reads ``svals[init_signal_ranks[bn]]``.

    Output
    ------
    self
        The fitted object. Sets ``centers_``, ``sv_threshold_``,
        ``all_joint_svals_``, ``common`` and ``blocks``; when the joint
        rank is estimated it also sets the Wedin/random direction samples,
        the cutoffs, ``joint_rank_wedin_est_`` and ``joint_rank``.
    """
    blocks, self.init_signal_ranks, self.indiv_ranks, precomp_init_svd, \
        self.center, obs_names, var_names, self.shapes_ = \
        arg_checker(blocks,
                    self.init_signal_ranks,
                    self.joint_rank,
                    self.indiv_ranks,
                    precomp_init_svd,
                    self.center)

    block_names = list(blocks.keys())
    num_obs = list(blocks.values())[0].shape[0]

    # center blocks
    self.centers_ = {}
    for bn in block_names:
        blocks[bn], self.centers_[bn] = centering(blocks[bn],
                                                  method=self.center[bn])

    ################################################################
    # step 1: initial signal space extraction by SVD on each block #
    ################################################################

    init_signal_svd = {}
    self.sv_threshold_ = {}
    for bn in block_names:
        signal_rank = self.init_signal_ranks[bn]

        # compute rank init_signal_ranks[bn] + 1 SVD of the data block
        if precomp_init_svd[bn] is None:
            # signal rank + 1 to get individual rank sv threshold
            U, D, V = svd_wrapper(blocks[bn], signal_rank + 1)
        else:
            U = precomp_init_svd[bn]['scores']
            D = precomp_init_svd[bn]['svals']
            V = precomp_init_svd[bn]['loadings']

        # The SV threshold is halfway between the init_signal_ranks[bn]th
        # and init_signal_ranks[bn] + 1 st singular value. Recall that
        # python is zero indexed.
        self.sv_threshold_[bn] = (D[signal_rank - 1] + D[signal_rank]) / 2

        init_signal_svd[bn] = {'scores': U[:, 0:signal_rank],
                               'svals': D[0:signal_rank],
                               'loadings': V[:, 0:signal_rank]}

    ##################################
    # step 2: joint space estimation #
    ##################################
    # this step estimates the joint rank and computes the common
    # joint space basis

    # SVD of joint signal matrix
    joint_scores_matrix = np.bmat(
        [init_signal_svd[bn]['scores'] for bn in block_names])
    joint_scores, joint_svals, joint_loadings = svd_wrapper(
        joint_scores_matrix)
    self.all_joint_svals_ = deepcopy(joint_svals)

    # estimate joint rank using wedin bound and random direction if a
    # joint rank estimate has not already been provided
    # TODO: maybe make this into an object or function
    if self.joint_rank is None:

        # if the random sv samples are not already provided compute them
        if self.random_sv_samples_ is None:
            self.random_sv_samples_ = \
                sample_randdir(
                    num_obs,
                    signal_ranks=list(self.init_signal_ranks.values()),
                    R=self.n_randdir_samples,
                    n_jobs=self.n_jobs)

        # if the wedin samples are not already provided compute them
        if self.wedin_samples_ is None:
            self.wedin_samples_ = {}
            for bn in block_names:
                self.wedin_samples_[bn] = \
                    get_wedin_samples(X=blocks[bn],
                                      U=init_signal_svd[bn]['scores'],
                                      D=init_signal_svd[bn]['svals'],
                                      V=init_signal_svd[bn]['loadings'],
                                      rank=self.init_signal_ranks[bn],
                                      R=self.n_wedin_samples,
                                      n_jobs=self.n_jobs)

        # Derived from wedin_samples_ on every rank estimation so that it
        # is also available when the wedin samples were supplied up front.
        self.wedin_sv_samples_ = len(blocks) - \
            np.array([sum(self.wedin_samples_[bn][i] ** 2
                          for bn in block_names)
                      for i in range(self.n_wedin_samples)])

        # given the wedin and random bound samples, compute the joint rank
        # SV cutoff
        self.wedin_cutoff_ = np.percentile(self.wedin_sv_samples_,
                                           self.wedin_percentile)
        self.rand_cutoff_ = np.percentile(self.random_sv_samples_,
                                          self.randdir_percentile)
        self.svalsq_cutoff_ = max(self.wedin_cutoff_, self.rand_cutoff_)
        self.joint_rank_wedin_est_ = sum(
            joint_svals ** 2 > self.svalsq_cutoff_)
        self.joint_rank = deepcopy(self.joint_rank_wedin_est_)

    # check identifiability constraint and possibly remove some
    # joint components
    if self.reconsider_joint_components:
        joint_scores, joint_svals, joint_loadings, self.joint_rank = \
            reconsider_joint_components(blocks, self.sv_threshold_,
                                        joint_scores, joint_svals,
                                        joint_loadings, self.joint_rank)

    # TODO: include center?
    # TODO: comp_names, var_names
    # The common joint space has now been estimated
    self.common = PCA.from_precomputed(
        scores=joint_scores[:, 0:self.joint_rank],
        svals=joint_svals[0:self.joint_rank],
        loadings=joint_loadings[:, 0:self.joint_rank],
        obs_names=obs_names)

    self.common.set_comp_names(base='common',
                               zero_index=self.zero_index_names)

    #######################################
    # step 3: compute final decomposition #
    #######################################
    # this step computes the block specific estimates

    block_specific = {bn: {} for bn in block_names}
    for bn in block_names:
        X = blocks[bn]

        ########################################
        # step 3.1: block specific joint space #
        ########################################
        # project X onto the joint space then compute SVD
        if self.joint_rank != 0:
            if issparse(X):  # lazy evaluation for sparse matrices
                J = col_proj(X, joint_scores)
                U, D, V = svd_wrapper(J, self.joint_rank)
                J = None  # kill J matrix to save memory

            else:
                J = np.array(
                    np.dot(joint_scores, np.dot(joint_scores.T, X)))
                U, D, V = svd_wrapper(J, self.joint_rank)
                if not self.store_full:
                    J = None  # kill J matrix to save memory

        else:
            U, D, V = None, None, None
            if self.store_full:
                J = np.zeros(shape=blocks[bn].shape)
            else:
                J = None

        block_specific[bn]['joint'] = {'full': J,
                                       'scores': U,
                                       'svals': D,
                                       'loadings': V,
                                       'rank': self.joint_rank}

        #############################################
        # step 3.2: block specific individual space #
        #############################################
        # project X onto the orthogonal complement of the joint space,
        # estimate the individual rank, then compute SVD

        if issparse(X):  # lazy evaluation for sparse matrices
            # Bind the estimated rank to `rank`: it is what the results
            # dict below stores. Binding it to another name left `rank`
            # unbound (first block) or stale (after a dense block).
            U, D, V, rank = indiv_space_for_sparse(
                X, joint_scores, self.joint_rank,
                self.init_signal_ranks[bn], self.sv_threshold_[bn])
            I = None  # the full matrix is never formed for sparse blocks

        else:
            # project X columns onto orthogonal complement of joint_scores
            if self.joint_rank == 0:
                X_orthog = X
            else:
                X_orthog = X - np.dot(joint_scores,
                                      np.dot(joint_scores.T, X))

            # estimate individual rank using sv threshold, then compute SVD
            if self.indiv_ranks[bn] is None:
                max_rank = min(X.shape) - self.joint_rank  # saves computation
                U, D, V = svd_wrapper(X_orthog, max_rank)
                rank = sum(D > self.sv_threshold_[bn])

                if rank == 0:
                    U, D, V = None, None, None
                else:
                    U = U[:, 0:rank]
                    D = D[0:rank]
                    V = V[:, 0:rank]

                self.indiv_ranks[bn] = rank

            else:  # indiv_rank has been provided by the user
                rank = self.indiv_ranks[bn]
                if rank == 0:
                    U, D, V = None, None, None
                else:
                    U, D, V = svd_wrapper(X_orthog, rank)

            if self.store_full:
                if rank == 0:
                    I = np.zeros(shape=blocks[bn].shape)
                else:
                    I = np.array(np.dot(U, np.dot(np.diag(D), V.T)))
            else:
                I = None  # Kill I matrix to save memory

        block_specific[bn]['individual'] = {'full': I,
                                            'scores': U,
                                            'svals': D,
                                            'loadings': V,
                                            'rank': rank}

        ###################################
        # step 3.3: estimate noise matrix #
        ###################################

        # J and I are only both available as full matrices for dense
        # blocks fitted with store_full
        if self.store_full and not issparse(X):
            E = X - (J + I)
        else:
            E = None
        block_specific[bn]['noise'] = E

    # save block specific estimates
    self.blocks = {}
    for bn in block_specific.keys():
        self.blocks[bn] = \
            BlockSpecificResults(joint=block_specific[bn]['joint'],
                                 individual=block_specific[bn]['individual'],
                                 noise=block_specific[bn]['noise'],
                                 CNS=joint_scores,
                                 block_name=bn,
                                 obs_names=obs_names,
                                 var_names=var_names[bn],
                                 m=self.centers_[bn],
                                 shape=blocks[bn].shape,
                                 zero_index_names=self.zero_index_names,
                                 init_signal_svd=init_signal_svd[bn],
                                 X=blocks[bn])

    return self