def run_factorization(self, N, S, X, Z, I, K, k, n):
    # Smart initialization: seed the factorization with PPCA run on the
    # scaled allelic ratios (k successes out of n trials per site/sample).
    rat = k / n
    nans = np.isnan(rat)
    scaled_rat = scale_allelic_ratios(rat)
    ppca = PPCA()
    ppca.fit(data=np.transpose(scaled_rat), d=K, verbose=True)
    U = ppca.C            # loadings (features x K)
    V = ppca.transform()  # latent factors (samples x K)
    # Persist the fitted model and both factor matrices.
    with open(self.output_root + '_model', 'wb') as f:
        pickle.dump(ppca, f)
    np.savetxt(self.output_root + '_temper_U.txt', U, fmt="%s", delimiter='\t')
    np.savetxt(self.output_root + '_temper_V.txt', V.T, fmt="%s", delimiter='\t')
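# A hedged round-trip sketch (not part of the original pipeline): reload the
# pickled PPCA model and the saved factor matrices, then rebuild the low-rank
# approximation of the scaled ratios. The reconstruction V @ U.T is the usual
# PPCA convention, an assumption here, not something the code above confirms.
import pickle
import numpy as np

def reload_factorization(output_root):
    with open(output_root + '_model', 'rb') as f:
        ppca = pickle.load(f)
    U = np.loadtxt(output_root + '_temper_U.txt', delimiter='\t')    # features x K
    V = np.loadtxt(output_root + '_temper_V.txt', delimiter='\t').T  # samples x K
    approx = V @ U.T  # low-rank reconstruction (samples x features)
    return ppca, U, V, approx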
def test_remove_nan(self):
    # PPCA should impute NaNs internally: after fitting, the stored data
    # must contain no NaN, and the shapes of C and the transformed data
    # must be (features x components) and (samples x components).
    N = 101
    k = 23
    p_nan = 0.02
    n_components = 3
    data = np.random.random((N, k))
    # Randomly knock out ~2% of entries.
    for n in range(N):
        for _k in range(k):
            if random.random() < p_nan:
                data[n, _k] = np.nan
    pca = PPCA()
    pca.fit(data, n_components)
    self.assertEqual(pca.data[np.isnan(pca.data)].shape, (0,))
    self.assertEqual(pca.C.shape, (k, n_components))
    self.assertEqual(pca.transform().shape, (N, n_components))
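# A hedged standalone check of the same behaviour, runnable outside the test
# suite. It assumes only the PPCA API exercised above: fit(data, d),
# transform(), and the NaN-imputed copy stored on pca.data.
import numpy as np
from ppca import PPCA

rng = np.random.default_rng(0)
data = rng.random((101, 23))
mask = rng.random(data.shape) < 0.02
data[mask] = np.nan

pca = PPCA()
pca.fit(data, 3)
assert not np.isnan(pca.data).any()        # NaNs were imputed away
print(pca.C.shape, pca.transform().shape)  # (23, 3) (101, 3)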
# Parse each space-separated pixel string into a 48x48 image.
for i in range(len(X)):
    Y = np.fromstring(X[i], dtype=int, sep=' ')
    Y = np.reshape(Y, (48, 48))
    E.append(Y)
X_inp = np.array(E)
# X_train = X_inp.reshape(-1, X_inp.shape[1], X_inp.shape[2], 1)
X_train = X_inp.astype('float32')
print(X_inp)

# Fit PPCA on the first image to initialize the stack of component matrices.
inp_img = X_train[0, :, :]
ppca = PPCA(inp_img)
ppca.fit(d=20, verbose=False)
component_mat = ppca.transform()
E_y = component_mat.reshape(1, component_mat.shape[0], component_mat.shape[1])

for i in range(1, len(X_train)):
    print(i)
    inp_img = X_train[i, :, :]
    ppca = PPCA(inp_img)
    try:
        ppca.fit(d=20, verbose=False)
        component_mat = ppca.transform()
        shape = component_mat.shape
        component_mat = component_mat.reshape(1, component_mat.shape[0], component_mat.shape[1])
        # Only keep images whose fit yielded all 20 components.
        if shape[1] == 20:
            E_y = np.concatenate((E_y, component_mat))
    except np.linalg.LinAlgError:
        print("NumPy LinAlgError during PPCA fit; skipping image")
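# A hedged refactor sketch: the per-image fit/guard logic above wrapped in a
# helper that returns None on failure, so the main loop stays flat. It reuses
# only the PPCA calls already shown (constructor takes the image, fit(d=...),
# transform()); nothing new is assumed about the library.
def ppca_components(img, d=20):
    ppca = PPCA(img)
    try:
        ppca.fit(d=d, verbose=False)
        mat = ppca.transform()
        return mat if mat.shape[1] == d else None
    except np.linalg.LinAlgError:
        return None

reduced = [m for m in (ppca_components(img) for img in X_train) if m is not None]
E_y = np.stack(reduced)  # (n_kept, 48, 20)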
# Select an image at random from the MNIST dataset.
SelectedImage = showImagesRandomImages(3)
missingPercentage = 0.2  # fraction of pixels to remove
# Insert missing values into the original image.
missingImage = generateMissingFig(SelectedImage, missingPercentage)

imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputed_by_KNN = imputer.fit_transform(missingImage)
KNNImputed_RMSE = np.sqrt(mean_squared_error(SelectedImage, imputed_by_KNN))
# plt.imshow(imputed_by_KNN, cmap='gray', vmin=0, vmax=1)
# plt.show()

imputer = MissForest()
MissForest_imputed = imputer.fit_transform(missingImage)
MissForest_RMSE = np.sqrt(mean_squared_error(SelectedImage, MissForest_imputed))
# plt.imshow(MissForest_imputed, cmap='gray', vmin=0, vmax=1)
# plt.show()

# Note: IterativeImputer also requires
# `from sklearn.experimental import enable_iterative_imputer` at import time.
imputer = IterativeImputer()
MICE_imputed = imputer.fit_transform(missingImage)
MICE_RMSE = np.sqrt(mean_squared_error(SelectedImage, MICE_imputed))
# plt.imshow(MICE_imputed, cmap='gray', vmin=0, vmax=1)
# plt.show()

ppca = PPCA()
ppca.fit(data=SelectedImage, d=100, verbose=True)
PPCA_imputed = ppca.transform(missingImage)
PPCA_RMSE = np.sqrt(mean_squared_error(SelectedImage, PPCA_imputed))
# plt.imshow(PPCA_imputed, cmap='gray', vmin=0, vmax=1)
# plt.show()
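# A small follow-up step (not in the original): collect the four RMSE scores
# computed above and print them side by side, lowest (best) first.
results = {
    'KNN': KNNImputed_RMSE,
    'MissForest': MissForest_RMSE,
    'MICE': MICE_RMSE,
    'PPCA': PPCA_RMSE,
}
for method, rmse in sorted(results.items(), key=lambda kv: kv[1]):
    print(f'{method:12s} RMSE = {rmse:.4f}')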
# @Date:   2020-07-07 11:22:28
# @Last Modified by:   ashayaan
# @Last Modified time: 2020-07-07 12:44:33

import torch
import pandas as pd
import numpy as np
import json
import pickle

from ppca import PPCA

if __name__ == '__main__':
    # Load the per-film feature tables and align them on the film index.
    roberta_features = pd.read_csv('../data/roberta.csv').set_index('idx')
    mca_features = pd.read_csv('../data/mca.csv').set_index('idx')
    pca_features = pd.read_csv('../data/pca.csv').set_index('idx')
    links = pd.read_csv('../data/ml-20m/links.csv')

    df = pd.concat([roberta_features, mca_features, pca_features], axis=1)

    # Reduce the concatenated features to 128 dimensions with PPCA.
    ppca = PPCA()
    ppca.fit(data=df.values.astype(float), d=128, verbose=True)
    print(ppca.var_exp)
    transformed = ppca.transform()

    # Map film index -> 128-d torch embedding and persist it.
    films_dict = dict([(k, torch.tensor(transformed[i]).float())
                       for k, i in zip(df.index, range(transformed.shape[0]))])
    pickle.dump(films_dict, open('../data/ml20_pca128.pkl', 'wb'))
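# A hedged consumer-side sketch: load the pickled embedding dict back and
# fetch one film's 128-d vector. The key type (the 'idx' values above) is an
# assumption; adjust the lookup to however idx is encoded in the CSVs.
import pickle
import torch

films_dict = pickle.load(open('../data/ml20_pca128.pkl', 'rb'))
some_idx = next(iter(films_dict))
emb = films_dict[some_idx]
print(some_idx, emb.shape)  # e.g. torch.Size([128])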
plt.show()

print('\n\n\n\n**TEST CALCULATING LIKELIHOOD**')
ppca1 = PPCA(n_dimension=2)
loglikelihoods = ppca1.fit(data, method='EM', keep_loglikes=True)
plt.plot(loglikelihoods)
plt.show()

print('\n\n\n\n**TEST DIMENSION REDUCTION AND RECOVERING**')
plt.matshow(data)
print('\n\noriginal data')
plt.show()

ppca3 = PPCA(n_dimension=2)
ppca3.fit(data, method='EM')
plt.matshow(ppca3.inverse_transform(ppca3.transform(data)))
print('\n\nrecovered data: 2-component')
plt.show()

ppca4 = PPCA(n_dimension=2)
ppca4.fit(data, batchsize=16, n_iteration=2000, method='EM')
plt.matshow(ppca4.inverse_transform(ppca4.transform(data)))
print('\n\nrecovered data: 2-component (mini-batch)')
plt.show()

ppca5 = PPCA(n_dimension=63)
ppca5.fit(data, method='EM')
plt.matshow(ppca5.inverse_transform(ppca5.transform(data)))
print('\n\nrecovered data: 63-component')
plt.show()
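# A hedged follow-up sketch: pick n_dimension by comparing the final EM
# log-likelihood across candidate values, reusing only the constructor and
# the fit(..., keep_loglikes=True) call demonstrated above. Whether the final
# log-likelihood is the right selection criterion here is an assumption.
candidates = [2, 4, 8, 16, 32]
final_loglikes = {}
for d in candidates:
    model = PPCA(n_dimension=d)
    lls = model.fit(data, method='EM', keep_loglikes=True)
    final_loglikes[d] = lls[-1]
plt.plot(candidates, [final_loglikes[d] for d in candidates], marker='o')
print('\n\nfinal log-likelihood vs n_dimension')
plt.show()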