def decompose_features(features, decomposer, n_components=None, lag_time=1):
    """Reduce the dimensionality of featurized trajectories with PCA or tICA.

    Each sample (a vector of ``n_features`` values) is projected onto the
    components found by the decomposer, producing a vector of
    ``n_components`` values.

    Parameters
    ----------
    features : array-like, length n_trajs
        Each entry has shape (n_samples, n_features).
    decomposer : {'PCA', 'tICA'}
        'PCA'  -- principal axes in feature space (directions of maximum
        variance in the data).
        'tICA' -- components with maximum autocorrelation.
    n_components : int or None
        Forwarded unchanged to the decomposer.
    lag_time : int
        Forwarded to tICA only; not used for PCA.

    Returns
    -------
    features_new : array-like, length n_trajs
        Transformed trajectories, each of shape (n_samples, n_components).
    components : array
        ``dcmp.components_`` of the fitted decomposer, shape
        (n_components, n_features).

    Raises
    ------
    ValueError
        If ``decomposer`` is not one of the supported names.
    """
    # msmbuilder is imported lazily so it is only needed for the branch in use.
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        # Without this branch ``dcmp`` would be unbound below and the caller
        # would only see an opaque UnboundLocalError.
        raise ValueError(
            "Unknown decomposer %r; expected 'PCA' or 'tICA'" % (decomposer,)
        )

    features_new = dcmp.fit_transform(features)
    return features_new, dcmp.components_
def fit_protein_pca(yaml_file):
    """Incrementally fit one PCA model over every protein's featurized data.

    Parameters
    ----------
    yaml_file : dict-like
        Configuration mapping. The keys read here are ``"mdl_dir"``,
        ``"mdl_params"``, ``"protein_list"`` and ``"feature_dir"``.
        Entries of ``mdl_params`` whose key starts with ``"pca__"`` are
        passed to ``PCA`` with that prefix removed.

    Returns
    -------
    None
        The fitted model is written to ``<mdl_dir>/pca_mdl.pkl`` via
        ``verbosedump``.
    """
    prefix = "pca__"
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Slice the prefix off instead of splitting on it, so a key that happens
    # to contain "pca__" a second time is not truncated.
    pca_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }
    protein_pca_mdl = PCA(**pca_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        # NOTE(review): presumably switches into the protein's data directory
        # so the relative glob below resolves there -- confirm.
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]),
                key=keynat,
            )
            for path in featurized_traj:
                featurized = verboseload(path)
                try:
                    protein_pca_mdl.partial_fit(featurized)
                except Exception as exc:
                    # Fitting stays best-effort per file, but the failure is
                    # now reported, and KeyboardInterrupt/SystemExit are no
                    # longer swallowed as they were by the bare ``except``.
                    print("Skipping %s: partial_fit failed (%s)" % (path, exc))
            print("Done partial fitting to protein %s" % protein)

    # Persist the model fitted across all proteins.
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    """Project featurized trajectories onto PCA or tICA components.

    Parameters
    ----------
    features : list of arrays, length n_trajs
        Each entry has shape (n_samples, n_features).
    decomposer : {'PCA', 'tICA'}
        Which msmbuilder decomposition to fit.
    n_components : int or None
        Forwarded unchanged to the decomposer.
    lag_time : int
        Forwarded to tICA only; not used for PCA.

    Returns
    -------
    features_new : list of arrays, length n_trajs
        Each entry has shape (n_samples, n_features_new).

    Raises
    ------
    ValueError
        If ``decomposer`` is not one of the supported names.
    """
    # msmbuilder is imported lazily so it is only needed for the branch in use.
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        # Without this branch ``dcmp`` would be unbound below and the caller
        # would only see an opaque UnboundLocalError.
        raise ValueError(
            "Unknown decomposer %r; expected 'PCA' or 'tICA'" % (decomposer,)
        )

    return dcmp.fit_transform(features)
def test_1():
    """Compare msmbuilder's PCA with the reference PCAr on the same data.

    The reference model is fitted on all trajectories stacked into a single
    array, while the msmbuilder model is fitted on the list of trajectories.
    """
    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    model.fit(trajs)

    # The projection of the first trajectory must agree between the two.
    expected_first = reference.transform(trajs[0])
    actual_first = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes are checked in the same order as before.
    fitted_attrs = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attr in fitted_attrs:
        np.testing.assert_array_almost_equal(
            getattr(model, attr), getattr(reference, attr)
        )
def test_generator():
    """Check that PCA.fit accepts a non-list iterable of trajectories."""
    indexed_trajs = {index: traj for index, traj in enumerate(trajs)}

    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    # On Python 3, dict.values() is a view object rather than a list, so this
    # exercises fit() with an iterable that is not a plain sequence.
    model.fit(indexed_trajs.values())

    # The projection of the first trajectory must agree between the two.
    expected_first = reference.transform(trajs[0])
    actual_first = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes are checked in the same order as before.
    fitted_attrs = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attr in fitted_attrs:
        np.testing.assert_array_almost_equal(
            getattr(model, attr), getattr(reference, attr)
        )
def test_2():
    """Smoke-test that PCA works as a step in a msmbuilder pipeline."""
    steps = [('pca', PCA()), ('cluster', KCenters())]
    pipeline = Pipeline(steps)
    pipeline.fit(trajs)
class SolventShellsAnalysis():
    """Do analysis on solvent shell results.

    The protocol is as follows:
     1. Normalize by shell volume
     2. Flatten to 2d (for compatibility with tICA, et al.)
     3. Remove zero-variance features

    Each intermediate result is computed on first access and then cached
    on the instance.

    :param seqs: Sequences of counts. List of shape
                 (n_frames, n_solute, n_shells) arrays
    :param shell_w: Shell width (nm)
    """

    def __init__(self, seqs, shell_w):
        self._seqs3d_unnormed = seqs
        # Lazily populated caches; ``None`` means "not computed yet".
        self._seqs3d = None
        self._seqs2d_unpruned = None
        self._seqs2d = None
        self._deleted = None
        self.shell_w = shell_w

        # Fitted models and their projections, set by fit_tica / fit_pca.
        self.tica = None
        self.pca = None
        self.ticax = None
        self.pcax = None

    @property
    def seqs3d_unnormed(self):
        """Unnormalized (input) sequences"""
        return self._seqs3d_unnormed

    @property
    def seqs3d(self):
        """Normalized 3d sequences."""
        if self._seqs3d is None:
            self._seqs3d = [
                normalize(fp3d, self.shell_w)
                for fp3d in self.seqs3d_unnormed
            ]
        return self._seqs3d

    @property
    def seqs2d_unpruned(self):
        """Reshaped (2D) sequences."""
        if self._seqs2d_unpruned is None:
            self._seqs2d_unpruned = [reshape(fp3d) for fp3d in self.seqs3d]
        return self._seqs2d_unpruned

    def _prune(self):
        """Run ``prune_all`` once and cache both of its outputs."""
        self._seqs2d, self._deleted = prune_all(self.seqs2d_unpruned)

    @property
    def seqs2d(self):
        """Reshaped with zero-variance features removed.

        Input this to tICA, MSM, etc.
        """
        if self._seqs2d is None:
            self._prune()
        return self._seqs2d

    @property
    def deleted(self):
        """Which features (2d-indexing) we deleted."""
        if self._deleted is None:
            self._prune()
        return self._deleted

    def fit_tica(self, lag_time, n_components=10):
        """Fit tICA on ``seqs2d`` and store the model and projection.

        :param lag_time: Lag time passed to tICA
        :param n_components: Number of components to keep (default 10,
                             the previously hard-coded value)
        """
        self.tica = tICA(n_components=n_components, lag_time=lag_time,
                         weighted_transform=True)
        self.tica.fit(self.seqs2d)
        self.ticax = self.tica.transform(self.seqs2d)

    def fit_pca(self, n_components=10):
        """Fit PCA on ``seqs2d`` and store the model and projection.

        :param n_components: Number of components to keep (default 10,
                             the previously hard-coded value)
        """
        self.pca = PCA(n_components=n_components)
        self.pca.fit(self.seqs2d)
        self.pcax = self.pca.transform(self.seqs2d)
def fit_pca(self, n_components=10):
    """Fit PCA on ``self.seqs2d`` and store the model and its projection.

    The fitted model is stored on ``self.pca`` and the transformed
    sequences on ``self.pcax``.

    :param n_components: Number of components to keep (default 10, the
                         previously hard-coded value)
    """
    self.pca = PCA(n_components=n_components)
    self.pca.fit(self.seqs2d)
    self.pcax = self.pca.transform(self.seqs2d)
class SolventShellsAnalysis():
    """Do analysis on solvent shell results.

    The protocol is as follows:
     1. Normalize by shell volume
     2. Flatten to 2d (for compatibility with tICA, et al.)
     3. Remove zero-variance features

    Each intermediate result is computed on first access and then cached
    on the instance.

    :param seqs: Sequences of counts. List of shape
                 (n_frames, n_solute, n_shells) arrays
    :param shell_w: Shell width (nm)
    """

    def __init__(self, seqs, shell_w):
        self._seqs3d_unnormed = seqs
        # Lazily populated caches; ``None`` means "not computed yet".
        self._seqs3d = None
        self._seqs2d_unpruned = None
        self._seqs2d = None
        self._deleted = None
        self.shell_w = shell_w

        # Fitted models and their projections, set by fit_tica / fit_pca.
        self.tica = None
        self.pca = None
        self.ticax = None
        self.pcax = None

    @property
    def seqs3d_unnormed(self):
        """Unnormalized (input) sequences"""
        return self._seqs3d_unnormed

    @property
    def seqs3d(self):
        """Normalized 3d sequences."""
        if self._seqs3d is None:
            self._seqs3d = [
                normalize(fp3d, self.shell_w)
                for fp3d in self.seqs3d_unnormed
            ]
        return self._seqs3d

    @property
    def seqs2d_unpruned(self):
        """Reshaped (2D) sequences."""
        if self._seqs2d_unpruned is None:
            self._seqs2d_unpruned = [reshape(fp3d) for fp3d in self.seqs3d]
        return self._seqs2d_unpruned

    def _prune(self):
        """Run ``prune_all`` once and cache both of its outputs."""
        self._seqs2d, self._deleted = prune_all(self.seqs2d_unpruned)

    @property
    def seqs2d(self):
        """Reshaped with zero-variance features removed.

        Input this to tICA, MSM, etc.
        """
        if self._seqs2d is None:
            self._prune()
        return self._seqs2d

    @property
    def deleted(self):
        """Which features (2d-indexing) we deleted."""
        if self._deleted is None:
            self._prune()
        return self._deleted

    def fit_tica(self, lag_time, n_components=10):
        """Fit tICA on ``seqs2d`` and store the model and projection.

        :param lag_time: Lag time passed to tICA
        :param n_components: Number of components to keep (default 10,
                             the previously hard-coded value)
        """
        self.tica = tICA(n_components=n_components, lag_time=lag_time,
                         weighted_transform=True)
        self.tica.fit(self.seqs2d)
        self.ticax = self.tica.transform(self.seqs2d)

    def fit_pca(self, n_components=10):
        """Fit PCA on ``seqs2d`` and store the model and projection.

        :param n_components: Number of components to keep (default 10,
                             the previously hard-coded value)
        """
        self.pca = PCA(n_components=n_components)
        self.pca.fit(self.seqs2d)
        self.pcax = self.pca.transform(self.seqs2d)