def _compute_covariance(self):
    """Derive the kernel covariance and its inverse from the dataset.

    Stores the value returned by ``covariance_factor()`` in ``self.factor``,
    scales the covariance of ``self.dataset`` by that factor twice, and
    caches both the resulting matrix (``self.covariance``) and its inverse
    (``self.inv_cov``).
    """
    self.factor = self.covariance_factor()

    # Covariance of the raw data, computed with rowvar=1.
    data_cov = stats.cov(self.dataset, rowvar=1)

    # Multiply by the factor twice (factor squared), then force a 2-D array
    # so the matrix inverse below is well defined.
    scaled_cov = data_cov * self.factor * self.factor
    self.covariance = atleast_2d(scaled_cov)
    self.inv_cov = linalg.inv(self.covariance)
def _compute_covariance(self):
    """Derive the kernel covariance, its inverse and the norm factor.

    Stores the value returned by ``covariance_factor()`` in ``self.factor``,
    scales the covariance of ``self.dataset`` by that factor twice, and
    caches the resulting matrix (``self.covariance``), its inverse
    (``self.inv_cov``) and ``self._norm_factor``.
    """
    self.factor = self.covariance_factor()

    # Covariance of the raw data, computed with rowvar=1, scaled by the
    # factor twice (factor squared) and forced to be 2-D.
    data_cov = stats.cov(self.dataset, rowvar=1)
    self.covariance = atleast_2d(data_cov * self.factor * self.factor)
    self.inv_cov = linalg.inv(self.covariance)

    # sqrt(det(2*pi*covariance)) multiplied by self.n.
    two_pi_cov = 2 * pi * self.covariance
    self._norm_factor = sqrt(linalg.det(two_pi_cov)) * self.n
def scree_plot(arr : np.ndarray, figsize=(18, 8)):
    """Generates a scree plot for a given data array.

    The eigenvalues of ``stats.cov(arr)`` are drawn as a bar chart, ordered
    from largest to smallest.

    Parameters
    ----------
    arr : np.ndarray
        matrix in (samples x features) form
    figsize : tuple, optional
        by default (18, 8)
    """
    # NumPy has no ``np.linalg.eigs``; ``eigvals`` returns just the
    # eigenvalues. Keep the real part so the bar heights are plain floats.
    spectrum = np.real(np.linalg.eigvals(stats.cov(arr)))

    # Scree plots conventionally show eigenvalues in decreasing order.
    eigvals = np.sort(spectrum)[::-1]

    plt.figure(figsize=figsize)
    plt.title('Eigenvalue versus magnitude (Scree plot)')
    sns.barplot(x=np.arange(len(eigvals)), y=eigvals, color='blue', saturation=.3)
    plt.ylabel('Magnitude')
    plt.xlabel('Eigenvalue index')
    plt.show()
def __init__(self, dataset, bw_method=None):
    """Store the dataset, pick a bandwidth rule and precompute covariances.

    Parameters
    ----------
    dataset : array_like
        Input data; promoted to at least 2-D. Its shape is unpacked as
        ``(self.dim, self.num_dp)``.
    bw_method : None, str, scalar or callable, optional
        ``None`` leaves ``covariance_factor`` untouched; ``'scott'`` and
        ``'silverman'`` select the matching method on ``self``; a scalar is
        returned as-is by ``covariance_factor``; a callable is invoked with
        ``self``.

    Raises
    ------
    ValueError
        If the dataset has at most one element, or ``bw_method`` is none of
        the accepted kinds.
    """
    self.dataset = np.atleast_2d(dataset)
    data = np.array(self.dataset)
    if data.size <= 1:
        raise ValueError("`dataset` input should have multiple elements.")
    self.dim, self.num_dp = data.shape

    # Select how the bandwidth factor is produced.
    bw_is_str = isinstance(bw_method, str)
    if bw_method is not None:
        if bw_is_str and bw_method == 'scott':
            self.covariance_factor = self.scotts_factor
        elif bw_is_str and bw_method == 'silverman':
            self.covariance_factor = self.silverman_factor
        elif not bw_is_str and np.isscalar(bw_method):
            self._bw_method = 'use constant'
            self.covariance_factor = lambda: bw_method
        elif callable(bw_method):
            self._bw_method = bw_method
            self.covariance_factor = lambda: self._bw_method(self)
        else:
            raise ValueError("`bw_method` should be 'scott', 'silverman', a "
                             "scalar or a callable")

    # Bandwidth factor used to scale the data covariance for each kernel.
    self.factor = self.covariance_factor()
    factor_sq = self.factor**2

    # Covariance of the data and its inverse.
    # NOTE(review): the guard looks for '_data_inv_cov', but the attributes
    # assigned below have no leading underscore -- confirm the intended name.
    if not hasattr(self, '_data_inv_cov'):
        raw_cov = stats.cov(self.dataset, rowvar=1, bias=False)
        self.data_covariance = np.atleast_2d(raw_cov)
        self.data_inv_cov = linalg.inv(self.data_covariance)

    self.covariance = self.data_covariance * factor_sq
    self.inv_cov = self.data_inv_cov / factor_sq

    # sqrt(det(2*pi*covariance)) multiplied by the number of data points.
    two_pi_cov = 2 * np.pi * self.covariance
    self.norm_factor = np.sqrt(linalg.det(two_pi_cov)) * self.num_dp
def pca(data : np.ndarray, dim : int = 2, verbose : bool = False, class_identity=None, return_reconstruction=False) -> np.ndarray:
    """Principal components analysis to form a (dim) dimensional approximation
    of a given dataset.

    Parameters
    ----------
    data : np.ndarray
        matrix in (samples x features) form
    dim : int, optional
        approximating dimension, must be less than the data dimension,
        by default 2
    verbose : bool, optional
        whether to display reconstruction diagnostics and a scree plot
    class_identity : array_like, optional
        per-sample labels used to colour the 2D scatter plot when ``verbose``
        is True; when None the points are drawn without a hue
    return_reconstruction : bool, optional
        when True, return ``(Y, Y @ E.T)`` instead of just ``Y``

    Returns
    -------
    np.ndarray
        dim-dimensional representation of the input data; a tuple
        ``(representation, reconstruction)`` when ``return_reconstruction``
        is True

    Raises
    ------
    NotImplementedError
        if ``data`` contains NaN values
    AssertionError
        if ``dim`` is not strictly between 0 and the number of features
    """
    if np.isnan(data).any():
        print("missing data detected. consider using pca_missing_data")
        # NotImplemented is a comparison sentinel, not an exception; raising
        # it produces a TypeError. NotImplementedError is what was meant.
        raise NotImplementedError("pca does not support missing (NaN) values")

    reconstruction = None
    n, p = data.shape
    assert 0 < dim < p, "projection dimension must be a positive integer less than the number of data dimensions {}".format(p)

    # center the data
    de_meaned = stats.center(data)

    # optimize for high dimensionality data
    if p > n:
        print('High dimensional data detected, optimizing...')
        # X @ X.T is (n x n), which is assumed to be smaller than (p x p) here
        X = de_meaned
        eigvals, X_eigvecs = np.linalg.eig(X @ X.T)
        # put the eigenvalues into a diagonal matrix lambda
        lam = np.diag(np.real(eigvals))
        # map the eigenvectors of X @ X.T back to feature space
        # NOTE(review): the columns are scaled by 1/lambda, so they are not
        # unit length, and a (near-)zero eigenvalue makes the inverse
        # ill-conditioned -- confirm this is intended.
        lam_inv = np.linalg.inv(lam)
        eigvecs = (X.T @ X_eigvecs @ lam_inv)
    else:
        # generate the data covariance matrix
        sample_covariance = stats.cov(de_meaned)
        # extract spectrum of the covariance matrix (eigenvalues/eigenvectors)
        eigvals, eigvecs = np.linalg.eig(sample_covariance)
        # Define lam on this path as well: the verbose diagnostics read it,
        # and it was previously only bound in the high-dimensional branch.
        lam = np.diag(np.real(eigvals))

    # order the eigenvectors by the magnitude (large->small) of their
    # corresponding eigenvalue
    idx = eigvals.argsort()[::-1]
    eigvals = np.real(np.array(eigvals[idx]))
    eigvecs = np.real(np.array(eigvecs[:, idx]))
    E = eigvecs[:, :dim]

    # compute low dimensional representation.
    # NOTE(review): this projects the original data, not the centered copy.
    Y = np.array(data @ E)

    if verbose is True:
        # scree plot
        plt.figure(1, figsize=(18, 8))
        plt.title('Eigenvalue versus magnitude (Scree plot)')
        sns.barplot(x=np.arange(len(eigvals)), y=eigvals, color='blue', saturation=.3)
        plt.ylabel('Magnitude')
        plt.xlabel('Eigenvalue index')
        plt.show()

        # reconstruction
        reconstruction = Y @ E.T
        assert reconstruction.shape == data.shape

        # side-by-side heatmaps of the input and its reconstruction
        panels = [data, reconstruction]
        labels = ['Original data', 'Reconstruction']
        fig, axes = plt.subplots(ncols=2, nrows=1, figsize=(18, 8))
        for title, panel, ax in zip(labels, panels, axes.flat):
            ax.set_title(title)
            sns.heatmap(panel, cmap='Blues_r', alpha=0.65, annot=False, cbar=False,
                        xticklabels=False, yticklabels=False, ax=ax)
        fig.tight_layout()
        plt.show()

        # plot 2D projection; a 2D projection only exists when dim is already
        # 2 or the data has more than 2 features (pca requires dim < p)
        if dim == 2:
            to_plot = Y
        elif p > 2:
            to_plot = pca(data, dim=2, verbose=False)
        else:
            to_plot = None

        if to_plot is not None:
            plt.figure(figsize=(10, 8))
            plt.title("2D Data Representation")
            # Colour by class only when labels were supplied. The previous
            # test (`if labels:`) checked the always-truthy title list.
            if class_identity is not None:
                palette = sns.color_palette('bright', n_colors=len(np.unique(class_identity)))
                sns.scatterplot(x=to_plot[:, 0], y=to_plot[:, 1], hue=class_identity,
                                legend='full', palette=palette)
            else:
                sns.scatterplot(x=to_plot[:, 0], y=to_plot[:, 1])
            plt.show()

        # Hinton diagram for the eigenvalue matrix (slow, so only drawn for
        # small matrices)
        if lam.shape[0] < 20:
            hinton(lam)
            plt.show()

    if return_reconstruction is True:
        if reconstruction is None:
            reconstruction = Y @ E.T
        return Y, reconstruction
    return Y
plt.savefig(fout + 'sstbudget_anom_ts.png') # In[ ]: T_var = T_anom.var(dim='time') get_ipython().run_line_magic('time', 'T_var.load()') #%time T_var.persist() # In[ ]: tendH_anom = tendH_anom / c_o # In[ ]: #tendH_anom = tendH_anom.transpose('time','face', 'k', 'j', 'i') cov_adv = st.cov(tendH_anom, C_adv_anom) cov_dif = st.cov(tendH_anom, C_dif_anom) cov_forc = st.cov(tendH_anom, C_forc_anom) # In[ ]: cov_adv.nbytes / 1e9 # In[ ]: get_ipython().run_line_magic('time', 'cov_adv.load()') get_ipython().run_line_magic('time', 'cov_dif.load()') get_ipython().run_line_magic('time', 'cov_forc.load()') # In[ ]:
def pca(data,num_components=2):
    """Return the leading right-singular vectors of ``cov(data)``.

    The covariance of ``data`` is decomposed with an SVD and the first
    ``num_components`` right-singular vectors are returned as columns.
    """
    # Only the third SVD factor (Vh) is needed; its rows are the vectors.
    _, _, vh = np.linalg.svd(cov(data))
    components = vh.T
    return components[:, :num_components]
def pca(data, num_components=2):
    """Compute principal directions of ``data`` from its covariance matrix.

    Runs an SVD on ``cov(data)`` and returns a matrix whose columns are the
    first ``num_components`` right-singular vectors.
    """
    # svd returns (U, s, Vh); index 2 selects Vh, whose rows are the vectors.
    right_vectors = np.linalg.svd(cov(data))[2]
    return right_vectors.T[:, :num_components]