class PCAImputer:
    """Fill missing (NaN) entries of a 2-D array by iterated PCA reconstruction.

    Missing entries are first set to the mean of the observed values in
    their row, then repeatedly overwritten with the value a fitted ``PPCA``
    model reconstructs at that position.
    """

    def __init__(self, n_dimension):
        """Remember the ``n_dimension`` that is passed to ``PPCA``."""
        self._q = n_dimension

    def fit_transform(self, data, method='eig', probabilistic=False, n_iteration=100):
        """Return a copy of ``data`` with its NaN entries imputed.

        Parameters
        ----------
        data : 2-D array with NaN marking missing entries
            Not modified; the method works on ``data.copy()``.
            (Assumed to be a NumPy float array -- confirm against callers.)
        method : str
            Forwarded as ``method=`` to ``PPCA.fit``.
        probabilistic : bool
            Forwarded positionally to ``PPCA.transform`` and
            ``PPCA.inverse_transform``.
        n_iteration : int
            Number of fit / reconstruct / refill rounds.  With 0 the
            row-mean initialisation is returned unchanged.

        Returns
        -------
        The filled array, which is also kept on the instance as
        ``self._data``.  Observed entries are never overwritten.
        """
        self._data = data.copy()
        self._missing = np.isnan(data)
        self._observed = ~self._missing
        self._pca = PPCA(n_dimension=self._q)

        # Row mean over observed entries only.  A row with no observed entry
        # has count 0; ``where=`` skips the division there so the zero
        # pre-filled in ``out`` survives.  (np.nanmean would write NaN for
        # such rows even when given a zero-filled ``out`` buffer, leaving
        # NaN in the matrix passed to the PCA fit.)
        observed_count = self._observed.sum(axis=1)
        observed_total = np.where(self._observed, self._data, 0).sum(axis=1)
        row_means = np.divide(
            observed_total,
            observed_count,
            out=np.zeros(self._data.shape[0]),
            where=observed_count > 0,
        )

        # Broadcast each row mean across its row (a view, no copy) and use
        # it to seed the missing positions.
        row_fill = np.broadcast_to(row_means.reshape(-1, 1), self._data.shape)
        self._data[self._missing] = row_fill[self._missing]

        for _ in range(n_iteration):
            self._pca.fit(self._data, method=method)
            latent = self._pca.transform(self._data, probabilistic)
            rebuilt = self._pca.inverse_transform(latent, probabilistic)
            # Only the originally-missing cells take the reconstruction.
            self._data[self._missing] = rebuilt[self._missing]

        return self._data
def _show_reconstruction(model, matrix, caption):
    """Project ``matrix`` through ``model`` and back, then plot the result.

    Order is matshow -> print caption -> show.
    """
    latent = model.transform(matrix)
    plt.matshow(model.inverse_transform(latent))
    print(caption)
    plt.show()


# Display the figure left pending by the statements that precede this section
# (not visible from here).
plt.show()

# ---- Likelihood trace -------------------------------------------------------
print('\n\n\n\n**TEST CALCULATING LIKELIHOOD**')
ppca1 = PPCA(n_dimension=2)
# With keep_loglikes=True the return value of fit() is plotted directly;
# presumably one log-likelihood per EM iteration -- confirm against PPCA.fit.
loglikelihoods = ppca1.fit(data, method='EM', keep_loglikes=True)
plt.plot(loglikelihoods)
plt.show()

# ---- Dimension reduction and recovery ---------------------------------------
print('\n\n\n\n**TEST DIMENSION REDUCTION AND RECOVERING**')
plt.matshow(data)
print('\n\noriginal data')
plt.show()

# 2 components, EM fit with default settings.
ppca3 = PPCA(n_dimension=2)
ppca3.fit(data, method='EM')
_show_reconstruction(ppca3, data, '\n\nrecovered data: 2-component')

# 2 components, EM fit with batchsize=16 and n_iteration=2000.
ppca4 = PPCA(n_dimension=2)
ppca4.fit(data, batchsize=16, n_iteration=2000, method='EM')
_show_reconstruction(ppca4, data, '\n\nrecovered data: 2-component (mini-batch)')

# 63 components, EM fit with default settings.
ppca5 = PPCA(n_dimension=63)
ppca5.fit(data, method='EM')
_show_reconstruction(ppca5, data, '\n\nrecovered data: 63-component')