def wpca_decomposition(data):
    weights = 0. + np.isfinite(data)
    kwds = {'weights': weights}
    pca = WPCA(n_components=1).fit(data, **kwds)
    eigen_samples = pca.transform(data)[:, 0]
    eigen_genes = pca.components_[0, :]
    return eigen_genes, eigen_samples
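# Usage sketch (illustrative, not from the original source): a small samples-x-genes
# matrix with a few NaNs. Assumes numpy is imported as np and WPCA comes from the
# wpca package, which treats zero-weight entries as missing during the fit. Note that
# transform above is called without weights, so samples containing NaNs may come back
# as NaN in eigen_samples.
rng = np.random.RandomState(0)
toy = rng.rand(20, 8)                        # 20 samples x 8 genes
toy[rng.rand(*toy.shape) < 0.05] = np.nan    # sprinkle in some missing values
eigen_genes, eigen_samples = wpca_decomposition(toy)
print(eigen_genes.shape, eigen_samples.shape)   # (8,), (20,)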
def wpca_subspace(elements, embedding_matrix, weight_array, vector_dim,
                  mean_centering, numComponents, debugInfo):
    ferr = open("errors_wpca_representation", "a+")
    flog = open("logs_pca_representation", "a+")

    weight_matrix = np.tile(weight_array.reshape(-1, 1), vector_dim)

    if embedding_matrix.ndim == 1:
        # Only one word in the sentence: no PCA, the vector space of the word itself is the subspace.
        ferr.write("[No WPCA]: Only a single element from " + " ".join(elements) +
                   " found in supplied embeddings for the document" + "_".join(debugInfo) + "\n")
        subspace = embedding_matrix
        singularValues = np.array([1.0])
        energyRetained = 1.0
    else:
        flog.write("Original NumComponents: " + str(numComponents) +
                   " NumElements: " + str(embedding_matrix.shape[0]) + "\t")
        numComponents = min(embedding_matrix.shape[0], embedding_matrix.shape[1], numComponents)
        flog.write("New NumComponents: " + str(numComponents) + "\n")

        pca = WPCA(n_components=numComponents, mean_centering=mean_centering)  # WPCA centers the matrix automatically
        try:
            kwds = {'weights': weight_matrix}
            pca.fit(embedding_matrix, **kwds)
            subspace = pca.components_
            if numComponents == 1:
                # convert matrix to vector when numComponents == 1
                subspace = subspace.T.reshape(-1)
            energyRetained = np.sum(pca.explained_variance_ratio_)
            if np.any(pca.explained_variance_ < 0):
                # Hack around small negative variances caused by numerical precision
                explained_variance = np.abs(pca.explained_variance_)
                ferr.write("[Numerical Precision Error]: Negative variance " + str(pca.explained_variance_) +
                           " in subspace constructed for " + " ".join(elements) +
                           " in the document: " + "_".join(debugInfo) + "\n")
            else:
                explained_variance = pca.explained_variance_
            # singularValues = np.sqrt(explained_variance * (embedding_matrix.shape[0] - 1))
            singularValues = np.sqrt(explained_variance)
        except (np.linalg.LinAlgError, ZeroDivisionError):
            # SVD fails to converge for some inputs; fall back to the word-vector average.
            ferr.write("[WPCA Error]: No subspace constructed for " + " ".join(elements) +
                       " in the document: " + "_".join(debugInfo) + "\n")
            subspace = np.mean(embedding_matrix, axis=0)
            singularValues = np.array([1.0])
            energyRetained = 1.0

    ferr.close()
    flog.close()
    return subspace, singularValues, energyRetained
def get_pca(input_: Array,
            learn_input: Array,
            learn_weight_vec: Opt[Array],
            n_comp_list: Iterable[int],
            err_printer: Callable[[Array, Array, str], None] = None,
            normalize_x: bool = True,
            normalize_z: bool = False) -> LinearAnalyzer:
    """ The analyzer built for the last entry of ``n_comp_list`` is returned. """
    def expl(pca_):
        return np.round(np.sum(pca_.explained_variance_ratio_), 2)

    n_comp_list = list(n_comp_list)

    x = x_normalized = learn_input  # (~6000, ~162)
    weight_vec = learn_weight_vec
    μ_x: Union[Array, int] = 0
    σ_x: Union[Array, int] = 1
    if normalize_x:
        x_normalized, μ_x, σ_x = get_x_normalized_μ_σ(x, weight_vec)
    weight_vec_as_mat = weights_matrix(weight_vec, x) if (weight_vec is not None) else None

    for j, i in enumerate(n_comp_list):
        pca = ClassWPCA(i)
        pca.fit(x_normalized, weights=weight_vec_as_mat)
        z: Array = pca.transform(x_normalized)
        inverse_transform_matrix, μ_z, σ_z = get__inverse_transform_matrix__μ_z__σ_z(
            z, weight_vec, normalize_z, x_normalized)
        an = LinearAnalyzer(n=pca.n_components, analyzer=pca, x=input_,
                            μ_x=μ_x, σ_x=σ_x, μ_z=μ_z, σ_z=σ_z,
                            inverse_transform_matrix=inverse_transform_matrix,
                            normalize_x=normalize_x, normalize_z=normalize_z)
        if err_printer is not None:
            pref = f"Expl = {expl(pca)}, PC N = {pca.n_components}, "
            err_printer(input_, an.x_rec, pref)
        if (j + 1) == len(n_comp_list):
            break
    else:
        raise ValueError('Empty n_comp_list')
    return an
class CleanSpectra(object):
    def __init__(self, min_wavelength=3500, max_wavelength=8300,
                 max_masked_fraction=1.0):
        self.min_wavelength = min_wavelength
        self.max_wavelength = max_wavelength
        self.max_masked_fraction = max_masked_fraction

    def load_data(self, h5file, selection=None):
        if not isinstance(selection, slice):
            selection = slice(selection)
        datafile = h5py.File(h5file, 'r')
        wavelengths = 10 ** datafile['log_wavelengths'][:]
        mask = ((wavelengths >= self.min_wavelength) &
                (wavelengths <= self.max_wavelength))
        self.wavelengths = wavelengths[mask]
        self.spectra = datafile['spectra'][selection, mask]
        self.weights = datafile['ivars'][selection, mask]
        datafile.close()

        # remove rows with excessive missing data
        good_rows = (self.weights == 0).mean(1) < self.max_masked_fraction
        self.spectra = self.spectra[good_rows]
        self.weights = self.weights[good_rows]
        self.weights **= 0.5
        return self

    def fit_wpca(self, n_components=200, regularization=False):
        self.wpca = WPCA(n_components=n_components,
                         regularization=regularization)
        self.wpca.fit(self.spectra, weights=self.weights)
        return self

    def reconstruct(self, spectra=None, weights=None, p=2):
        if spectra is None:
            spectra = self.spectra
        if weights is None:
            weights = self.weights
        new_spectra = self.wpca.reconstruct(spectra, weights=weights)
        SN = abs(spectra * weights) ** (1. / p)
        SN /= SN.max(1, keepdims=True)
        return SN * spectra + (1 - SN) * new_spectra
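# Usage sketch (illustrative, not from the original source): build a tiny synthetic
# HDF5 file with the dataset names load_data() expects ('log_wavelengths', 'spectra',
# 'ivars'), then run the fit/reconstruct chain. Assumes numpy, h5py, and the wpca
# package are available; the file name and array sizes are arbitrary.
rng = np.random.RandomState(0)
n_spec, n_pix = 40, 300
log_wl = np.linspace(np.log10(3600.), np.log10(8000.), n_pix)
ivars = rng.rand(n_spec, n_pix)
ivars[rng.rand(n_spec, n_pix) < 0.1] = 0.0        # simulate masked pixels
with h5py.File('toy_spectra.h5', 'w') as f:
    f['log_wavelengths'] = log_wl
    f['spectra'] = rng.rand(n_spec, n_pix)
    f['ivars'] = ivars

cleaner = CleanSpectra().load_data('toy_spectra.h5').fit_wpca(n_components=5)
denoised = cleaner.reconstruct()                  # same shape as cleaner.spectra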
def component_removal(data, n_comp):
    mean = data.mean(axis=1)
    data = data.sub(mean, axis=0)
    dataT = data.T.values
    weights = 0 + np.isfinite(dataT)
    kwds = {'weights': weights}
    pca = WPCA(n_components=30).fit(dataT, **kwds)  # Fit data to model
    # Reconstruct from all fitted components except the first n_comp
    reconstruction = np.dot(pca.transform(dataT)[:, n_comp:],
                            pca.components_[n_comp:, :])
    reconst_df = pd.DataFrame(data=reconstruction.T,
                              columns=data.columns,
                              index=data.index)
    reconst_df = reconst_df.add(mean, axis=0)
    return reconst_df
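# Usage sketch (illustrative, not from the original source): remove the first two
# principal components from a genes-x-samples dataframe. Assumes numpy as np,
# pandas as pd, and WPCA from the wpca package. Because the fit above uses 30
# components on the transposed matrix, the dataframe needs at least 30 rows (genes).
rng = np.random.RandomState(0)
expr = pd.DataFrame(rng.rand(40, 10),
                    index=[f'gene{i}' for i in range(40)],
                    columns=[f'sample{j}' for j in range(10)])
cleaned = component_removal(expr, n_comp=2)   # same shape as expr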
def weighted_PCA(df, n_pc=1, standardize=True):
    '''
    Perform weighted PCA using a scikit-learn-style interface.

    df - Dataframe with expression values
    '''
    x = df.values.T  # Transpose so that rows are samples and columns are features
    if standardize:
        standardizer = StandardScaler()
        x2 = standardizer.fit_transform(x)  # Standardize the data (center to mean and scale to unit variance)
    else:
        x2 = x
    x2 = np.nan_to_num(x2)  # Replace NaN values with 0 so the array is accepted by the PCA function
    weights = 0 + np.isfinite(x)  # Zero weight for missing (non-finite) entries
    kwds = {'weights': weights}
    n_pcs = min(df.shape[0], n_pc)
    pca = WPCA(n_components=n_pcs).fit(x2, **kwds)  # Fit data to model
    expl = pca.explained_variance_ratio_
    x3 = pca.transform(x2, **kwds)  # Transform the data (apply dimensionality reduction); x3 holds the principal components
    out_df = pd.DataFrame(x3.T, index=list(range(1, n_pcs + 1)),
                          columns=df.columns).T  # Dataframe with PCA values, column index set to the PC number
    cont = pd.DataFrame(index=df.index)
    for i in range(n_pcs):
        cont.loc[:, f'PC{i+1} contribution'] = pca.components_[i] ** 2
    cont.sort_values(by='PC1 contribution', ascending=False, inplace=True)
    # Pad the output with NaN columns if fewer PCs were computed than requested
    while n_pcs < n_pc:
        expl = np.append(expl, float('NaN'))
        n_pcs += 1
        out_df.loc[:, str(n_pcs)] = float('NaN')
    return out_df, expl, cont
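# Usage sketch (illustrative, not from the original source): genes as rows, samples
# as columns, with a few missing values. Assumes numpy as np, pandas as pd,
# StandardScaler from sklearn.preprocessing, and WPCA from the wpca package.
rng = np.random.RandomState(0)
expr = pd.DataFrame(rng.rand(6, 12),
                    index=[f'gene{i}' for i in range(6)],
                    columns=[f'sample{j}' for j in range(12)])
expr.iloc[0, 3] = np.nan
expr.iloc[4, 7] = np.nan
pcs, explained, contributions = weighted_PCA(expr, n_pc=2)
print(explained)               # variance ratio per PC
print(contributions.head())    # per-gene contribution to each PC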
def test_copy_data():
    rand = np.random.RandomState(0)
    X = rand.multivariate_normal([0, 0], [[12, 6], [6, 5]], size=100)
    W = rand.rand(*X.shape)
    X_orig = X.copy()

    # with copy_data=True, X should not change
    pca1 = WPCA(copy_data=True)
    pca1.fit(X, weights=W)
    assert np.all(X == X_orig)

    # with copy_data=False, X should be overwritten
    pca2 = WPCA(copy_data=False)
    pca2.fit(X, weights=W)
    assert not np.allclose(X, X_orig)

    # all results should match
    assert_allclose(pca1.mean_, pca2.mean_)
    assert_allclose(pca1.components_, pca2.components_)
    assert_allclose(pca1.explained_variance_, pca2.explained_variance_)
####
# Half axes
axes = [10.0, 1.0]
# Ellipse rotation [rad]
angles = [np.pi * 0.1]
# Origin shift
orig = [5.0, -3.0]
x, w = elipsoid(axes=axes, angles=angles, orig=orig, n=400)
ax[0, 1].plot(x[:, 0], x[:, 1], 'o')

# PCA
kwds = {}
ncomp = 2
pca = WPCA(n_components=ncomp).fit(x, **kwds)
Y = WPCA(n_components=ncomp).fit_reconstruct(x, **kwds)
means_ = pca.mean_
sigmas_ = np.sqrt(pca.explained_variance_)
vectors_ = pca.components_[:ncomp]
print("Components \n", vectors_)
print("Sigmas", sigmas_)
print("Means", means_)

# Not used here
# ax[1, 1].plot(np.arange(1, ncomp+1), pca.explained_variance_ratio_)
plotPCA(ax[1, 1], pca, x)
fig.suptitle("Test PCA, WPCA", fontsize=16)
def main():
    # requires n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    V = final_params - start_params

    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''
    result = do_pca(cma_args.n_components, cma_args.n_comp_to_use,
                    traj_params_dir_name, intermediate_data_dir,
                    proj=False, origin="mean_param",
                    use_IPCA=cma_args.use_IPCA,
                    chunk_size=cma_args.chunk_size, reuse=True)
    logger.debug("after pca")

    final_plane = result["first_n_pcs"]

    count_file = get_full_param_traj_file_path(traj_params_dir_name, "total_num_dumped")
    total_num = pd.read_csv(count_file, header=None).values[0]

    all_param_iterator = get_allinone_concat_df(dir_name=traj_params_dir_name,
                                                use_IPCA=True,
                                                chunk_size=cma_args.pc1_chunk_size)
    unduped_angles_along_the_way = []
    duped_angles_along_the_way = []
    diff_along = []

    unweighted_pc1_vs_V_angles = []
    duped_pc1_vs_V_angles = []
    pc1_vs_V_diffs = []

    unweighted_ipca = IncrementalPCA(n_components=cma_args.n_comp_to_use)  # for sparse PCA to speed up

    all_matrix_buffer = []
    try:
        i = -1
        for chunk in all_param_iterator:
            i += 1
            if i >= 2:
                break
            chunk = chunk.values

            unweighted_ipca.partial_fit(chunk)
            unweighted_angle = cal_angle_between_nd_planes(
                final_plane, unweighted_ipca.components_[:cma_args.n_comp_to_use])
            unweighted_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V, unweighted_ipca.components_[0]))

            unweighted_pc1_vs_V_angles.append(unweighted_pc1_vs_V_angle)

            # TODO: ignore 90 or 180 for now
            if unweighted_angle > 90:
                unweighted_angle = 180 - unweighted_angle
            unduped_angles_along_the_way.append(unweighted_angle)

            np.testing.assert_almost_equal(
                cal_angle_between_nd_planes(
                    unweighted_ipca.components_[:cma_args.n_comp_to_use][0], final_plane[0]),
                cal_angle(
                    unweighted_ipca.components_[:cma_args.n_comp_to_use][0], final_plane[0]))

            all_matrix_buffer.extend(chunk)

            weights = gen_weights(all_matrix_buffer, Funcs[cma_args.func_index_to_use])
            logger.log(f"currently at {all_param_iterator._currow}")
            # ipca = PCA(n_components=1)  # for sparse PCA to speed up
            # ipca.fit(duped_in_so_far)
            wpca = WPCA(n_components=cma_args.n_comp_to_use)  # for sparse PCA to speed up
            tic = time.time()
            wpca.fit(all_matrix_buffer, weights=weights)
            toc = time.time()

            logger.debug(f"WPCA of {len(all_matrix_buffer)} data took {toc - tic} secs")
            duped_angle = cal_angle_between_nd_planes(
                final_plane, wpca.components_[:cma_args.n_comp_to_use])

            duped_pc1_vs_V_angle = postize_angle(
                cal_angle_between_nd_planes(V, wpca.components_[0]))
            duped_pc1_vs_V_angles.append(duped_pc1_vs_V_angle)
            pc1_vs_V_diffs.append(duped_pc1_vs_V_angle - unweighted_pc1_vs_V_angle)

            # TODO: ignore 90 or 180 for now
            if duped_angle > 90:
                duped_angle = 180 - duped_angle
            duped_angles_along_the_way.append(duped_angle)
            diff_along.append(unweighted_angle - duped_angle)
    finally:
        plot_dir = get_plot_dir(cma_args)
        if not os.path.exists(plot_dir):
            os.makedirs(plot_dir)

        angles_plot_name = f"WPCA cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(duped_angles_along_the_way)),
                duped_angles_along_the_way,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = f"Not WPCA exponential 2 cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(unduped_angles_along_the_way)),
                unduped_angles_along_the_way,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = f"Not WPCA - WPCA diff_along exponential 2, cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(diff_along)), diff_along,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = f"WPCA PC1 VS V cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(duped_pc1_vs_V_angles)),
                duped_pc1_vs_V_angles,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = f"Not WPCA PC1 VS V cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(unweighted_pc1_vs_V_angles)),
                unweighted_pc1_vs_V_angles,
                "num of chunks", "angle with diff in degrees", False)

        angles_plot_name = f"Not WPCA - WPCA diff PC1 VS V cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
        plot_2d(plot_dir, angles_plot_name,
                np.arange(len(pc1_vs_V_diffs)), pc1_vs_V_diffs,
                "num of chunks", "angle with diff in degrees", False)

        del all_matrix_buffer
        import gc
        gc.collect()
def sketch(self, matrix, epochs=5, dim=80, verbose=False):
    """
    Estimate the word embeddings.

    Parameters:
    - scipy.sparse.coo_matrix matrix: cooccurrence matrix
    - int epochs: number of training epochs
    - int dim: sketch dimension
    - bool verbose: print progress messages if True
    """
    shape = matrix.shape
    if len(shape) != 2 or shape[0] != shape[1]:
        raise Exception('Cooccurrence matrix must be square')
    if not sp.isspmatrix_coo(matrix):
        raise Exception('Cooccurrence matrix must be in the COO format')

    use_svd = True
    for epoch in range(epochs):
        shape = matrix.shape
        if use_svd:
            # random test matrix for the range sketch
            gamma = np.random.random((shape[1], dim))
            # range sketch
            Y = matrix.dot(gamma)        # (N, dim)
            Q, R = np.linalg.qr(Y)       # (N, dim), A ~ Q @ Q.T @ A
            C = matrix.dot(Q).T          # (dim, N)
            # truncated SVD of the small matrix
            Uc, sc, Vhc = scipy.linalg.svd(C, full_matrices=False)
            U_matrix = Q.dot(Uc)

            # sketch_matrix = matrix.dot(sketch)
            # sketch_matrix[np.isclose(sketch_matrix, 0)] = -1e8
            # U, s, Vh = scipy.linalg.svd(sketch_matrix, full_matrices=False)

            # scale by the square root of the singular values
            self.word_vectors = U_matrix.dot(np.sqrt(np.diag(sc)))

            # # Normalized version
            # norms = np.sqrt(np.sum(np.square(U_matrix), axis=1, keepdims=True))
            # U_matrix /= np.maximum(norms, 1e-7)
            # self.word_vectors = U_matrix
        else:
            log_matrix = sp.coo_matrix(matrix)
            log_matrix.data = np.log(log_matrix.data)
            sketch = np.random.random((shape[1], dim))
            compressed = log_matrix.dot(sketch)
            compressed[np.isclose(compressed, 0)] = -1e8
            weights = matrix.dot(sketch)
            weights += np.random.random(weights.shape) * 0.01
            t_start = time.time()
            Y = WPCA(n_components=dim).fit_reconstruct(compressed, weights=weights)
            print(f"PCA time: {time.time() - t_start:03f}")
            self.word_vectors = Y

    if not np.isfinite(self.word_vectors).all():
        raise Exception('Non-finite values in word vectors. '
                        'Try reducing the learning rate or the '
                        'max_loss parameter.')
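# Standalone sketch (illustrative, not part of the class above) of the randomized
# range-finder plus truncated SVD used in the use_svd branch, applied to a small
# symmetric toy co-occurrence matrix. Assumes numpy as np, scipy.sparse as sp, and
# scipy.linalg; the variable names below are illustrative only.
rng = np.random.RandomState(0)
N, dim = 50, 8
A = sp.random(N, N, density=0.2, random_state=0)
A = sp.coo_matrix(A + A.T)                      # symmetric, like a co-occurrence matrix

gamma = rng.random((N, dim))                    # random test matrix
Y = A.dot(gamma)                                # range sketch, (N, dim)
Q, _ = np.linalg.qr(Y)                          # orthonormal basis, A ~ Q @ Q.T @ A
C = A.dot(Q).T                                  # (dim, N); equals Q.T @ A since A is symmetric
Uc, sc, Vhc = scipy.linalg.svd(C, full_matrices=False)
word_vectors = Q.dot(Uc).dot(np.sqrt(np.diag(sc)))   # (N, dim) embedding
print(word_vectors.shape)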
def fit_wpca(self, n_components=200, regularization=False):
    self.wpca = WPCA(n_components=n_components,
                     regularization=regularization)
    self.wpca.fit(self.spectra, weights=self.weights)
    return self
def main():
    # requires n_comp_to_use, pc1_chunk_size
    import sys
    logger.log(sys.argv)
    common_arg_parser = get_common_parser()
    cma_args, cma_unknown_args = common_arg_parser.parse_known_args()

    this_run_dir = get_dir_path_for_this_run(cma_args)

    traj_params_dir_name = get_full_params_dir(this_run_dir)
    intermediate_data_dir = get_intermediate_data_dir(this_run_dir)

    if not os.path.exists(intermediate_data_dir):
        os.makedirs(intermediate_data_dir)

    '''
    ==========================================================================================
    get the pc vectors
    ==========================================================================================
    '''
    logger.log("grab final params")
    final_file = get_full_param_traj_file_path(traj_params_dir_name, "final")
    final_params = pd.read_csv(final_file, header=None).values[0]

    logger.log("grab start params")
    start_file = get_full_param_traj_file_path(traj_params_dir_name, "start")
    start_params = pd.read_csv(start_file, header=None).values[0]

    count_file = get_full_param_traj_file_path(traj_params_dir_name, "total_num_dumped")
    total_num = pd.read_csv(count_file, header=None).values[0]

    V = final_params - start_params

    all_thetas_downsampled = get_allinone_concat_df(dir_name=traj_params_dir_name).values[::2]

    unduped_angles_along_the_way = []
    duped_angles_along_the_way = []
    diff_along = []

    num = 2  # TODO: hardcoded!

    undup_ipca = PCA(n_components=1)  # for sparse PCA to speed up

    all_matrix_buffer = []
    for chunk in all_param_iterator:
        chunk = chunk.values
        undup_ipca.partial_fit(chunk)
        unduped_angle = cal_angle(V, undup_ipca.components_[0])

        # TODO: ignore 90 or 180 for now
        if unduped_angle > 90:
            unduped_angle = 180 - unduped_angle
        unduped_angles_along_the_way.append(unduped_angle)

        all_matrix_buffer.extend(chunk)

        weights = gen_weights(all_param_iterator._currow, total_num)
        duped_in_so_far = dup_so_far_buffer(all_matrix_buffer, last_percentage, num)

        logger.log(f"currently at {all_param_iterator._currow}, last_percentage: {last_percentage}")
        # ipca = PCA(n_components=1)  # for sparse PCA to speed up
        # ipca.fit(duped_in_so_far)
        ipca = WPCA(n_components=cma_args.n_comp_to_use)  # for sparse PCA to speed up
        for i in range(0, len(duped_in_so_far), cma_args.chunk_size):
            logger.log(f"partial fitting: i : {i} len(duped_in_so_far): {len(duped_in_so_far)}")
            if i + cma_args.chunk_size > len(duped_in_so_far):
                ipca.partial_fit(duped_in_so_far[i:])
            else:
                ipca.partial_fit(duped_in_so_far[i:i + cma_args.chunk_size])

        duped_angle = cal_angle(V, ipca.components_[0])

        # TODO: ignore 90 or 180 for now
        if duped_angle > 90:
            duped_angle = 180 - duped_angle
        duped_angles_along_the_way.append(duped_angle)
        diff_along.append(unduped_angle - duped_angle)

    plot_dir = get_plot_dir(cma_args)
    if not os.path.exists(plot_dir):
        os.makedirs(plot_dir)

    angles_plot_name = f"duped exponential 2, num dup: {num} " \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
    plot_2d(plot_dir, angles_plot_name,
            np.arange(len(duped_angles_along_the_way)),
            duped_angles_along_the_way,
            "num of chunks", "angle with diff in degrees", False)

    angles_plot_name = f"unduped exponential 2, num dup: {num} " \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
    plot_2d(plot_dir, angles_plot_name,
            np.arange(len(unduped_angles_along_the_way)),
            unduped_angles_along_the_way,
            "num of chunks", "angle with diff in degrees", False)

    angles_plot_name = f"undup - dup diff_along exponential 2, num dup: {num} " \
                       f"cma_args.pc1_chunk_size: {cma_args.pc1_chunk_size}"
    plot_2d(plot_dir, angles_plot_name,
            np.arange(len(diff_along)), diff_along,
            "num of chunks", "angle with diff in degrees", False)

    del all_matrix_buffer
    import gc
    gc.collect()
def getellipse(histo, ratioECut=0.95, factorSigma=2.0):
    """
    ratioECut   : max energy ratio to select
    factorSigma : fraction of the Gaussian integral
                  1 sigma ~ 63 %
                  2 sigma ~ 95 %
                  3 sigma ~ 99.7 %
    """
    # Warning 0: transpose to have i -> X, j -> Y
    # histo = histo.T

    # Total energy cut of the cluster
    ecut = np.sum(histo) * (1. - ratioECut)

    # Find pixel ecut 'pcut'
    ind = np.where(histo > 0.)
    a = histo[ind]
    a = np.sort(a)

    # Find pixel ecut 'pcut'
    s = 0.0
    i = 0
    while s < ecut:
        pcut = s
        s = s + a[i]
        i = i + 1
    pcut = s
    # print("getellipse ecut, pcut", ecut, pcut)

    # Remove pixels < pcut
    ind = np.where(histo > pcut)
    # ??? x = np.where(a >= pcut, a, 0)
    x = np.array(ind, dtype=np.float32)
    # Debug
    # print(x.T)
    w = np.sqrt(histo[ind])
    w = [w, w]
    w = np.transpose(w)
    # Debug
    # print(w)

    # Debug lin. regression
    """
    slope, intercept, r_value, p_value, std_err = stats.linregress(x[0], x[1])
    print("slope", slope, intercept)
    xmin = np.min(x[0])
    xmax = np.max(x[0])
    xs = np.array([xmin, xmax])
    ys = xs*slope + intercept
    plt.scatter(x[0], x[1])
    plt.plot(xs, ys)
    plt.xlim(0, 256)
    plt.ylim(0, 256)
    plt.show()
    """

    # PCA
    kwds = {'weights': w}
    ncomp = 2
    # Warning 0: transpose
    pca = WPCA(n_components=ncomp).fit(np.transpose(x), **kwds)

    # Debug: compute covariance
    """
    print("Shape x, : ", x.shape, w.shape)
    cov = np.cov(x)  # , aweights=w[:,0])
    print("cov: ", cov)
    eigVal, eigVec = np.linalg.eig(cov)
    print("eig: ", eigVal)
    print("eig. vect: ", eigVec)
    """

    orig_ = pca.mean_
    axes_ = factorSigma * np.sqrt(pca.explained_variance_)
    vectors_ = pca.components_[:ncomp]
    # Ellipse rotation
    # Debug
    # print("sin=", vectors_[0][1], "cos=", vectors_[0][0])
    angles_ = np.array([np.arctan2(vectors_[0][1], vectors_[0][0])])
    """ DEBUG
    print("PCA Components \n", vectors_)
    print("PCA Sigmas, half axes", axes_)
    print("PCA Means/origin", orig_)
    print("PCA Angles, ellipse rotation", angles_ * 180.0 / np.pi)
    """

    # BBox
    # vectors_[1] is the smallest
    if np.abs(vectors_[0][0]) <= 10e-7:
        xmin = -axes_[0] + orig_[0]
        xmax = +axes_[0] + orig_[0]
        ymin = -axes_[1] + orig_[1]
        ymax = +axes_[1] + orig_[1]
    else:
        tgR = vectors_[0][1] / vectors_[0][0]
        # X
        # ---
        # Derivative dx/dt = 0
        theta = np.array([np.arctan2(-axes_[1] * tgR, axes_[0])])
        xmin = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta)[0][0]
        xmax = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta + np.pi)[0][0]
        if xmin > xmax:
            xmin, xmax = xmax, xmin
        # Debug
        # print("PCA theta dX/dTheta, xmin, xmax = 0", theta * 180.0 / np.pi, xmin, xmax)

        # Y
        # ---
        # Derivative dy/dt = 0
        theta = np.array([np.arctan2(axes_[1], axes_[0] * tgR)])
        ymin = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta)[0][1]
        ymax = ellipse(axes=axes_, angles=angles_, orig=orig_, t=theta + np.pi)[0][1]
        if ymin > ymax:
            ymin, ymax = ymax, ymin
        # Debug
        # print("PCA theta dY/dTheta, ymin, ymax = 0", theta * 180.0 / np.pi, ymin, ymax)

    # Warning 0: inverse transpose to get pixel/matrix indices
    xmin = max(0, xmin)
    ymin = max(0, ymin)
    xmax = min(255, xmax)
    ymax = min(255, ymax)
    bbox = np.array([xmin, xmax, ymin, ymax], dtype=np.float32)
    # angles_ = angles_ - np.pi/2
    axes_ = np.array([axes_[0], axes_[1]], dtype=np.float32)
    orig_ = np.array([orig_[0], orig_[1]], dtype=np.float32)
    # print("Angle:", angles_*180/np.pi)

    return bbox, axes_, angles_, orig_
def benchmark_complete(data, ending_density=.02, step=.01):
    '''
    Input: data array to benchmark on, the ending density at which to return
           results, and the step between density levels
    Output: Dataframe of output density and RMSE for each method with respect
            to each input density
    '''
    # At each iteration, removes the smallest value greater than zero from a
    # randomly chosen row, checking the density as it goes.

    # density range to run
    nonzeroscount = np.count_nonzero(data)
    sizel = data.shape
    totalentr = sizel[0] * sizel[1]
    end = 0.02  # final density to test
    begin = (nonzeroscount / totalentr)  # beginning density of the given matrix
    # step = .01  # density step

    # initialize lists to store results
    density_in = []
    RMSE_empca_scores = []
    RMSE_wpca_scores = []
    RMSE_sfi_scores = []
    RMSE_siv_scores = []
    RMSE_sni_scores = []
    RMSE_smi_scores = []
    RMSE_szi_scores = []
    RMSE_wmiC_scores = []
    RMSE_wmiP_scores = []
    Density_empca = []
    Density_wpca = []
    Density_sfi = []
    Density_siv = []
    Density_sni = []
    Density_smi = []
    Density_szi = []
    Density_wmiC = []
    Density_wmiP = []

    # randomly remove values from the known matrix and try to impute them
    for d in reversed(np.arange(end, begin, step)):
        otum = data.T.copy()

        # begin density check
        nonzeroscount = np.count_nonzero(otum)
        sizel = otum.shape
        totalentr = sizel[0] * sizel[1]

        while np.float64((nonzeroscount / totalentr)) > d:
            # remove a minimum-frequency OTU and then check density
            j = np.random.randint(0, len(otum[:][:]) - 1)
            # make sure the row is not all zero (an all-zero row causes a singular matrix)
            if sum(list(otum[j][:])) < 1:
                continue
            m = min(i for i in list(otum[j][:]) if i > 0)
            # make sure removing the value will not result in a zero row
            if sum(list(otum[j][:])) == m:
                continue
            otum[j][list(otum[j][:]).index(m)] = 0
            # check density to break
            nonzeroscount = float(np.count_nonzero(otum))
            sizel = otum.shape
            totalentr = float(sizel[0]) * float(sizel[1])

        # coerce to float and print the new density
        print("Data table of %f generated" % d)
        otum = otum.T.astype(np.float64)

        # make zeros unknown for fancyimpute; avoid a singular matrix by taking the transpose
        otum2 = otum.T.copy()
        otum2 = otum2.astype(np.float64)
        otum2[otum2 == 0] = np.nan  # make unknowns NaN

        # WPCA and EMPCA
        # build weight matrix: low weight for missing entries, high weight otherwise
        weight = otum.copy()
        for i in range(len(otum2.T)):
            for j in range(len(otum2.T[i])):
                if np.isnan(otum2.T[i][j]):
                    weight[i][j] = 1
                else:
                    weight[i][j] = 1000

        print("Running EMPCA")
        EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
        print("Running WPCA")
        WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

        # fancyimpute and zeros
        print("Nuclear Norm")
        sni = NuclearNormMinimization(min_value=(np.amin(otum2)),
                                      max_value=(np.amax(otum2))).complete(otum2.copy())
        print("Running Soft Impute")
        sfi = SoftImpute(shrinkage_value=None,
                         convergence_threshold=0.00001,
                         max_iters=1000,
                         max_rank=min(otum2.shape),
                         n_power_iterations=1,
                         init_fill_method="zero",
                         min_value=(np.amin(otum2)),
                         max_value=(np.amax(otum2)),
                         normalizer=None,
                         verbose=False).complete(otum2.copy())
        print("Running Iterative SVD")
        siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                           convergence_threshold=0.00001,
                           max_iters=1000,
                           gradual_rank_increase=True,
                           svd_algorithm="arpack",
                           init_fill_method="zero",
                           min_value=(np.amin(otum2)),
                           max_value=(np.amax(otum2)),
                           verbose=False).complete(otum2.copy())
        print("Running Matrix Factorization")
        smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                  initializer=np.random.randn,
                                  learning_rate=0.01,
                                  patience=5,
                                  l1_penalty=0.05,
                                  l2_penalty=0.05,
                                  min_improvement=0.01,
                                  max_gradient_norm=5,
                                  optimization_algorithm="adam",
                                  min_value=(np.amin(otum2)),
                                  max_value=(np.amax(otum2)),
                                  verbose=False).complete(otum2.copy())
        print("Imputing by filling with zeros for base comparison")
        szi = base.zeros(otum2.copy())
        print("Weighted Mean Interpolation without phylo-distance")
        wmiC = base.wmi_wrapper(X=otum2.copy())
        print("Weighted Mean Interpolation with phylo-distance")
        phylo = pd.read_csv(
            'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv')
        wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

        # save the results
        # density in (after removing values)
        density_in.append(error.get_density(otum))
        # imputed density
        Density_empca.append(error.get_density(EMPCAi))
        Density_wpca.append(error.get_density(WPCAi))
        Density_sfi.append(error.get_density(sfi))
        Density_siv.append(error.get_density(siv))
        Density_sni.append(error.get_density(sni))
        Density_smi.append(error.get_density(smi))
        Density_szi.append(error.get_density(szi))
        Density_wmiC.append(error.get_density(wmiC))
        Density_wmiP.append(error.get_density(wmiP))

        # RMSE of imputed values
        # mask so RMSE is only checked between imputed values and removed values
        missing_mask = np.isnan(otum2.T)
        RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
        RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
        RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
        RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
        RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
        RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
        RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
        RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
        RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

    RMSEmapping = pd.DataFrame({
        'Density': list(map(int, density_in)),
        'EMPCA': RMSE_empca_scores,
        'Matrix Factorization': RMSE_smi_scores,
        'WPCA': RMSE_wpca_scores,
        'Soft Impute': RMSE_sfi_scores,
        'Iterative SVD': RMSE_siv_scores,
        'Nuclear Norm Minimization': RMSE_sni_scores,
        'Zeros Replace Unknown': RMSE_szi_scores,
        'Weighted-Mean Interpolation Correlation': RMSE_wmiC_scores,
        'Weighted-Mean Interpolation Phylo': RMSE_wmiP_scores
    })
    RMSEmapping.set_index(['Density'], inplace=True)
    Out_density = pd.DataFrame({
        'density': list(map(int, density_in)),
        'EMPCA': Density_empca,
        'Matrix Factorization': Density_smi,
        'WPCA': Density_wpca,
        'Soft Impute': Density_sfi,
        'Iterative SVD': Density_siv,
        'Nuclear Norm Minimization': Density_sni,
        'Zeros Replace Unknown': Density_szi,
        'Weighted-Mean Interpolation Correlation': Density_wmiC,
        'Weighted-Mean Interpolation Phylo': Density_wmiP
    })
    Out_density.set_index(['density'], inplace=True)

    return Out_density, RMSEmapping
def eigensampleFromWPCA(matrix):
    '''Find a representation of each sample using wPCA to exclude NaNs.'''
    weights = 1.0 - np.isnan(matrix.T)
    pc = WPCA(n_components=1).fit_reconstruct(matrix.T, weights=weights)
    return pc.T
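# Usage sketch (illustrative, not from the original source): a one-component WPCA
# reconstruction of a samples-x-features matrix with missing values. Assumes numpy
# as np and WPCA from the wpca package; fit_reconstruct fits on the transposed
# matrix and reconstructs it, so NaN cells come back filled in.
rng = np.random.RandomState(0)
mat = rng.rand(15, 6)
mat[2, 3] = np.nan
mat[7, 1] = np.nan
approx = eigensampleFromWPCA(mat)
print(approx.shape)          # (15, 6), with no NaNs remaining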