def initialize_variables(self):
    self.N = self.allelic_counts.shape[0]
    self.T = self.allelic_counts.shape[1]
    self.num_cov = self.cov.shape[1]
    # Randomly initialize (only U matters)
    self.U = np.random.randn(self.N, self.K)
    self.V = np.random.randn(self.K, self.T)
    mom_conc_init = 1.0 / np.nanvar(self.allelic_counts / self.total_counts)
    self.conc = np.ones(self.T) * mom_conc_init
    self.C = np.random.randn(self.num_cov, self.T)
    ppca_init = False
    if ppca_init:
        rat = self.allelic_counts / self.total_counts
        nans = np.isnan(rat)
        scaled_rat = scale_allelic_ratios(rat)
        scaled_residual_rat = regress_out_cell_line(scaled_rat, self.cov[:, 1:])
        rescaled_residual_rat = scale_allelic_ratios(scaled_residual_rat)
        ppca = PPCA()
        ppca.fit(data=np.transpose(rescaled_residual_rat), d=self.K, verbose=True, tol=1e-6)
        self.U = ppca.C / np.std(ppca.C)
def run_factorization(self, N, S, X, Z, I, K, k, n):
    # Smart initialization
    # rat = k/n
    rat = filter_lowly_expressed_sites(k, n, 3)
    nans = np.isnan(rat)
    scaled_rat = scale_allelic_ratios(rat)
    scaled_residual_rat = regress_out_cell_line(scaled_rat, Z)
    rescaled_residual_rat = scale_allelic_ratios(scaled_residual_rat)
    ppca = PPCA()
    ppca.fit(data=np.transpose(rescaled_residual_rat), d=K, verbose=True, tol=1e-6)
    U = ppca.C
    V = ppca.transform()
    pickle.dump(ppca, open(self.output_root + '_model', 'wb'))
    np.savetxt(self.output_root + '_temper_U.txt', U, fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', V.T, fmt="%s", delimiter='\t')
class PCAImputer:
    def __init__(self, n_dimension):
        self._q = n_dimension

    def fit_transform(self, data, method='eig', probabilistic=False, n_iteration=100):
        """Fit a PCA to the original data by iteratively filling the missing
        entries with values generated from the PCA. Each missing entry is
        initialized with its row mean."""
        self._data = data.copy()
        self._missing = np.isnan(data)
        self._observed = ~self._missing
        self._pca = PPCA(n_dimension=self._q)
        row_default = np.zeros(self._data.shape[0])
        row_means = np.repeat(
            np.nanmean(self._data, axis=1, out=row_default).reshape(-1, 1),
            self._data.shape[1], axis=1)
        self._data[self._missing] = row_means[self._missing]
        for i in range(n_iteration):
            self._pca.fit(self._data, method=method)
            self._data[self._missing] = self._pca.inverse_transform(
                self._pca.transform(self._data, probabilistic), probabilistic)[self._missing]
        return self._data
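# Hedged usage sketch (not from the original source): assumes the PPCA class used by
# PCAImputer (constructor taking n_dimension, with fit/transform/inverse_transform
# methods) is importable; the toy data below are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
toy = rng.normal(size=(50, 8))
toy[rng.random(toy.shape) < 0.1] = np.nan      # knock out roughly 10% of the entries
imputer = PCAImputer(n_dimension=3)
completed = imputer.fit_transform(toy, method='eig', n_iteration=50)
print(np.isnan(completed).any())               # expect False: missing entries are filled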
def probPCA(self):
    """
    Do probabilistic PCA.
    :return: 2-D array of the PPCA-transformed data
    """
    # Normalize X
    sc = StandardScaler()
    X_normalized = sc.fit_transform(self.X)
    ppca = PPCA()
    ppca.fit(data=X_normalized, d=2)
    result = ppca.transform()
    return result
def compute_pca(data, predictors, how='pca', what='factors', n_components=1, use_corr=True):
    data[predictors] = data[predictors].astype('float64')
    X = data[predictors].values
    if use_corr:
        # If the PCs are computed from the correlation matrix --> standardize the data (zero mean, unit std.)
        scaler = preprocessing.StandardScaler()
        X_std = scaler.fit_transform(X)
    else:
        # If the PCs are computed from the covariance matrix --> center the data only (zero mean)
        X_mean = np.mean(X, axis=0)
        X_std = X - X_mean
    if how == 'pca':
        pca = PCA(n_components)
        pca.fit(X_std)
        factors = pca.transform(X_std)
        explained_variance = pca.explained_variance_ratio_
        Xhat_std = pca.inverse_transform(factors)
        if use_corr:
            Xhat = scaler.inverse_transform(Xhat_std)
        else:
            Xhat = Xhat_std + X_mean
    if how == 'ppca':
        ppca = PPCA()
        ppca.fit(X_std, n_components)
        factors = ppca.transform()
        explained_variance = ppca.var_exp
        Xhat_std = ppca.reconstruct()
        if use_corr:
            Xhat = scaler.inverse_transform(Xhat_std)
        else:
            Xhat = Xhat_std + X_mean
    if what != 'recon':
        pca_columns = []
        for i in range(factors.shape[1]):
            pca_columns.append('pca_{}'.format(i))
            data['pca_{}'.format(i)] = factors[:, i]
        return list([data, explained_variance])
    else:
        rec_data = pd.DataFrame(Xhat)
        return rec_data
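# Hedged usage sketch (not from the original source): assumes compute_pca's own imports
# (numpy, pandas, sklearn PCA/preprocessing, and the PPCA class) are available; the
# column names below are purely illustrative.
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.randn(200, 3), columns=['x1', 'x2', 'x3'])
demo_scored, explained = compute_pca(demo, ['x1', 'x2', 'x3'],
                                     how='pca', what='factors',
                                     n_components=2, use_corr=True)
print(explained)                              # explained variance ratio per component
print(demo_scored[['pca_0', 'pca_1']].head())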
def run_ppca(data, features, ncomponents=None, min_obs=0.1, use_corr=False):
    X = data[features]
    if ncomponents is None:
        ncomponents = len(features)
    ppca = PPCA(X, d=ncomponents, min_obs=min_obs)
    ppca.standardize()
    ppca.fit()
    scores, loadings = ppca.transform()
    explained_variance = ppca.var_exp
    pca_columns = []
    for i in range(scores.shape[1]):
        pca_columns.append('pc_{}'.format(i))
        data['pc_{}'.format(i)] = scores[:, i]
    return list([data, explained_variance])
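# Hedged usage sketch (not from the original source): assumes the PPCA variant used above
# (constructor taking data, d and min_obs, with standardize/fit/transform methods); the
# feature names are illustrative only.
import numpy as np
import pandas as pd

frame = pd.DataFrame(np.random.randn(300, 4), columns=['f1', 'f2', 'f3', 'f4'])
frame.iloc[5, 2] = np.nan                     # sparse missingness is tolerated via min_obs
frame_scored, var_explained = run_ppca(frame, ['f1', 'f2', 'f3', 'f4'], ncomponents=2)
print(var_explained)
print(frame_scored[['pc_0', 'pc_1']].head())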
def run_factorization(self, N, S, X, Z, I, K, k, n):
    # Smart initialization
    rat = k / n
    nans = np.isnan(rat)
    scaled_rat = scale_allelic_ratios(rat)
    ppca = PPCA()
    ppca.fit(data=np.transpose(scaled_rat), d=K, verbose=True)
    U = ppca.C
    V = ppca.transform()
    pickle.dump(ppca, open(self.output_root + '_model', 'wb'))
    np.savetxt(self.output_root + '_temper_U.txt', U, fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', V.T, fmt="%s", delimiter='\t')
def remove_nan_test(self):
    N = 101
    k = 23
    p_nan = 0.02
    n_components = 3
    data = np.random.random((N, k))
    for n in range(N):
        for _k in range(k):
            if random.random() < p_nan:
                data[n, _k] = np.nan
    pca = PPCA()
    pca.fit(data, n_components)
    self.assertEqual(pca.data[np.isnan(pca.data)].shape, (0,))
    self.assertEqual(pca.C.shape, (k, n_components))
    self.assertEqual(pca.transform().shape, (N, n_components))
def run_ppca_initialization(self):
    print('Starting PPCA initialization')
    rat = self.allelic_counts / self.total_counts
    nans = np.isnan(rat)
    scaled_rat = scale_allelic_ratios(rat)
    scaled_residual_rat = regress_out_cell_line(scaled_rat, self.Z)
    rescaled_residual_rat = scale_allelic_ratios(scaled_residual_rat)
    ppca = PPCA()
    ppca.fit(data=np.transpose(rescaled_residual_rat), d=self.K, verbose=True, tol=1e-6)
    self.U_init = ppca.C / np.std(ppca.C)
    # Run bb-mf
    with pm.Model() as bb_glm_init:
        CONC = pm.HalfCauchy('CONC', beta=5, shape=(1, self.S), testval=self.conc_init)
        BETA = pm.Normal('BETA', mu=0, tau=(1 / 1000000.0), shape=(self.S, self.num_cov), testval=self.beta_init)
        # U = pm.Normal('U', mu=0, tau=(1.0/1.0), shape=(N, K), testval=self.U_init)
        V = pm.Normal('V', mu=0, tau=(1.0 / 1.0), shape=(self.S, self.K), testval=np.zeros(self.V_init.shape))
        MU_A = pm.Normal("MU_A", mu=0., sd=100**2, shape=(1, self.S), testval=self.mu_a_init)
        SIGMA_A = pm.HalfCauchy("SIGMA_A", beta=5.0, shape=(1, self.S), testval=self.sigma_a_init)
        mu_a_mat = pm.math.dot(np.ones((self.I, 1)), MU_A)
        sigma_a_mat = pm.math.dot(np.ones((self.I, 1)), SIGMA_A)
        A = pm.Normal('A', mu=mu_a_mat, sigma=sigma_a_mat, shape=(self.I, self.S), testval=self.A_init)
        p = pm.math.invlogit(pm.math.dot(self.cov, BETA.T) + pm.math.dot(self.U_init, V.T) + A[self.Z, :])
        conc_mat = pm.math.dot(np.ones((self.N, 1)), CONC)
        R = pm.BetaBinomial('like', alpha=(p * conc_mat)[~nans], beta=((1.0 - p) * conc_mat)[~nans],
                            n=self.total_counts[~nans], observed=self.allelic_counts[~nans])
        approx_init = pm.fit(method='advi', n=2000)
    pickle.dump(approx_init, open(self.output_root + '_model_init', 'wb'))
    init_dict = approx_init.bij.rmap(approx_init.params[0].eval())
    self.beta_init = init_dict['BETA']
    self.A_init = init_dict['A']
    self.sigma_a_init = np.exp(init_dict['SIGMA_A_log__'])
    self.mu_a_init = init_dict['MU_A']
    self.conc_init = np.exp(init_dict['CONC_log__'])
    self.V_init = init_dict['V']
    print('Smart PPCA complete')
def reducePCA(x, ndim):
    # if there are any nans in any of the lists, use ppca
    if np.isnan(np.vstack(x)).any():
        warnings.warn(
            'Missing data: Inexact solution computed with PPCA '
            '(see https://github.com/allentran/pca-magic for details)')
        # ppca if missing data
        m = PPCA(np.vstack(x))
        m.fit(d=ndim)
        x_pca = m.transform()
        # if the whole row is missing, return nans
        all_missing = [idx for idx, a in enumerate(np.vstack(x)) if np.isnan(a).all()]
        if len(all_missing) > 0:
            for i in all_missing:
                x_pca[i, :] = np.nan
        # get the original lists back
        if len(x) > 1:
            x_split = np.cumsum([i.shape[0] for i in x][:-1])
            return list(np.split(x_pca, x_split, axis=0))
        else:
            return [x_pca]
    else:
        m = PCA(n_components=ndim, whiten=True)
        m.fit(np.vstack(x))
        if len(x) > 1:
            return [m.transform(i) for i in x]
        else:
            return [m.transform(x[0])]
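# Hedged usage sketch (not from the original source): reducePCA expects a list of 2-D
# arrays with a shared number of columns; any NaN routes the call through the PPCA branch.
import numpy as np

a = np.random.randn(40, 10)
b = np.random.randn(30, 10)
b[3, 2] = np.nan                              # a single missing value triggers the PPCA path
reduced = reducePCA([a, b], ndim=3)
print([r.shape for r in reduced])             # expect [(40, 3), (30, 3)]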
def do_probabilistic_PCA(df, var, output):
    """
    Perform probabilistic PCA (PPCA) on scaled values for the whole screen
    Args:
        df: Existing combined dictionary
        var: Minimum explained variance required
        output: Output filenames
    Return:
        df: Updated with added 'DataPCA' values
        exp_var: List of explained variance with each added PC
        num_PCs: Number of PCs needed to reach var
        PPCA_loadings: Principal axes in feature space (n_components, n_features)
    """
    print('Feature selection using probabilistic PCA...')
    log_write(output['log'], 'Feature selection using probabilistic PCA...\n')

    # Initialize parameters
    exp_var = [0]
    exp_var_ratio = []
    num_PCs = 0
    ppca = PPCA()
    ppca.fit(df['DataScaled'], d=2)
    exp_var.append(ppca.var_exp[0])
    exp_var_ratio.append(ppca.var_exp[0])

    # Do PPCA with an increasing number of components (max is the number of features, min is 2)
    for i in range(2, df['DataScaled'].shape[1]):
        num_PCs = i
        ppca = PPCA()
        ppca.fit(df['DataScaled'], d=i)
        total_var = ppca.var_exp[i - 1]
        exp_var.append(total_var)
        exp_var_ratio.append(ppca.var_exp[i - 1] - ppca.var_exp[i - 2])
        # End PCA if the total variance passes the minimum variance required
        if total_var > var:
            num_PCs = i
            np.savetxt(output['PCAExplainedVariance'], exp_var_ratio, fmt='%0.4f')
            break

    # Do the final PCA with num_PCs
    ppca = PPCA()
    ppca.fit(df['Data'], d=num_PCs)
    df['DataPCA'] = ppca.transform()
    PPCA_loadings = np.transpose(ppca.C)

    return df, exp_var, num_PCs, PPCA_loadings
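# Hedged usage sketch (not from the original source): assumes the module's log_write helper
# and the PPCA class are importable; the dictionary keys, filenames, and the 0.9 variance
# threshold below are illustrative only.
import numpy as np

screen = {'Data': np.random.randn(500, 20)}
screen['DataScaled'] = (screen['Data'] - screen['Data'].mean(0)) / screen['Data'].std(0)
outputs = {'log': 'ppca.log', 'PCAExplainedVariance': 'ppca_var.txt'}
screen, exp_var, num_PCs, loadings = do_probabilistic_PCA(screen, var=0.9, output=outputs)
print(num_PCs, loadings.shape)                # PCs kept and loadings of shape (num_PCs, n_features)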
E = []
for i in range(len(X)):
    Y = np.fromstring(X[i], dtype=int, sep=' ')
    Y = np.reshape(Y, (48, 48))
    E.append(Y)
X_inp = np.array(E)
# X_train = X_inp.reshape(-1, X_inp.shape[1], X_inp.shape[2], 1)
X_train = X_inp.astype('float32')
print(X_inp)

# Fit PPCA on the first image to initialize the output array
inp_img = X_train[0, :, :]
ppca = PPCA(inp_img)
ppca.fit(d=20, verbose=False)
component_mat = ppca.transform()
E_y = component_mat.reshape(1, component_mat.shape[0], component_mat.shape[1])

for i in range(1, len(X_train)):
    print(i)
    inp_img = X_train[i, :, :]
    ppca = PPCA(inp_img)
    try:
        ppca.fit(d=20, verbose=False)
        component_mat = ppca.transform()
        shape = component_mat.shape
        component_mat = component_mat.reshape(1, component_mat.shape[0], component_mat.shape[1])
        if shape[1] == 20:
            E_y = np.concatenate((E_y, component_mat))
    except np.linalg.LinAlgError:
        # Skip images on which the PPCA fit fails
        pass
# Select an image randomly from the MNIST dataset
SelectedImage = showImagesRandomImages(3)
missingPercentage = 0.2  # missing rate percentage
# Insert missing values into the original image
missingImage = generateMissingFig(SelectedImage, missingPercentage)

imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = mean_squared_error(SelectedImage, imputed_by_KNN)
# plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1)
# plt.show()

imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = mean_squared_error(SelectedImage, MissForest_imputed)
# plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1)
# plt.show()

imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = mean_squared_error(SelectedImage, MICE_imputed)
# plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
# plt.show()

ppca = PPCA()
ppca.fit(data=SelectedImage, d=100, verbose=True)
PPCA_imputed = ppca.transform(missingImage)
PPCA_RMSE = mean_squared_error(SelectedImage, PPCA_imputed)
# plt.imshow(PPCA_imputed, cmap='gray', vmin=0, vmax=1)
# plt.show()
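# Hedged follow-up (not from the original source): print the four error scores side by side.
# Note that sklearn's mean_squared_error returns MSE by default, despite the *_RMSE names above.
for name, err in [('KNN', KNNImputed_RMSE), ('MissForest', MissForest_RMSE),
                  ('MICE', MICE_RMSE), ('PPCA', PPCA_RMSE)]:
    print('{} imputation error: {:.4f}'.format(name, err))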
# @Date: 2020-07-07 11:22:28
# @Last Modified by:   ashayaan
# @Last Modified time: 2020-07-07 12:44:33

import torch
import pandas as pd
import numpy as np
import json
from ppca import PPCA
import pickle

if __name__ == '__main__':
    roberta_features = pd.read_csv('../data/roberta.csv')
    roberta_features = roberta_features.set_index('idx')
    mca_features = pd.read_csv('../data/mca.csv')
    mca_features = mca_features.set_index('idx')
    pca_features = pd.read_csv('../data/pca.csv')
    pca_features = pca_features.set_index('idx')
    links = pd.read_csv('../data/ml-20m/links.csv')

    df = pd.concat([roberta_features, mca_features, pca_features], axis=1)

    ppca = PPCA()
    ppca.fit(data=df.values.astype(float), d=128, verbose=True)
    print(ppca.var_exp)

    transformed = ppca.transform()
    films_dict = dict([(k, torch.tensor(transformed[i]).float())
                       for k, i in zip(df.index, range(transformed.shape[0]))])
    pickle.dump(films_dict, open('../data/ml20_pca128.pkl', 'wb'))
import numpy as np
import matplotlib.pyplot as plt
from numpy.random import multivariate_normal
from ppca import PPCA

if __name__ == '__main__':
    cov = np.diag([10, 9, 8, 7] + [1] * 28 + [6, 5, 4, 3] + [1] * 28) ** 2
    data = multivariate_normal(np.zeros(64), cov, 256)

    ppca1 = PPCA(n_dimension=4)
    ppca1.fit(data, method='EM')
    ppca2 = PPCA(n_dimension=4)
    ppca2.fit(data, method='eig')

    print('\n\n\n\n**TEST FITTING THE COVARIANCE MATRIX**')
    plt.matshow(cov)
    print('\n\noriginal covariance matrix')
    plt.show()
    plt.matshow(ppca1._C)
    print('\n\nfitted covariance matrix (fitted by EM)')
    plt.show()
    plt.matshow(ppca2._C)
    print('\n\nfitted covariance matrix (fitted by eigen)')
    plt.show()

    print('\n\n\n\n**TEST GENERATING DATA**')
    plt.scatter(data[:, 0], data[:, 1], alpha=0.2)
    print('\n\noriginal data (first 2 dimensions)')
    plt.show()
    gene = ppca1.generate(256)
if __name__ == '__main__':
    omdb = json.load(open('../data/parsed/omdb.json'))
    tmdb = json.load(open('../data/parsed/tmdb.json'))

    numerical_features = {'omdb': ['Year', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes'],
                          'tmdb': ['budget', 'popularity', 'revenue', 'runtime',
                                   'vote_average', 'vote_count']}

    omdb_numerical = extractFeatures(omdb, 'omdb', numerical_features['omdb'])
    tmdb_numerical = extractFeatures(tmdb, 'tmdb', numerical_features['tmdb'])
    data = dict([(i, {**omdb_numerical[i], **tmdb_numerical[i]}) for i in omdb_numerical.keys()])
    data = extractRatings(data)
    # data = dict([(i, {**omdb_numerical[i], **tmdb_numerical[i]}) for i in omdb_numerical.keys()])

    df = pd.DataFrame.from_dict(data).T
    df.replace('N/A', np.nan, inplace=True)
    df.to_pickle('temp.pkl')
    df = fixData(df)
    df = df.head(100)

    ppca = PPCA()
    print(time.ctime())
    ppca.fit(df.values.astype(float), d=16, verbose=True)
    print(time.ctime())

    transformed = ppca.transform()
    transformed = pd.DataFrame(transformed)
    transformed['idx'] = pd.Series(list(omdb.keys()))
    transformed = transformed.set_index('idx')
    transformed.head()
    transformed.to_csv('../data/pca.csv', index=True, index_label='idx')