def fit_protein_pca(yaml_file):
    """Incrementally fit one PCA model over every protein's featurized trajectories.

    The PCA constructor arguments are taken from the entries of
    ``yaml_file["mdl_params"]`` whose key starts with ``"pca__"``; the prefix
    is removed before the value is passed to ``PCA``. For each protein in
    ``yaml_file["protein_list"]`` the ``*.jl`` files found under
    ``./<feature_dir>/`` are loaded in ``keynat`` order and fed to
    ``partial_fit``. The fitted model is finally dumped to
    ``<mdl_dir>/pca_mdl.pkl``.

    Parameters
    ----------
    yaml_file : mapping
        Must provide the keys ``"mdl_dir"``, ``"mdl_params"``,
        ``"protein_list"`` and ``"feature_dir"``.

    Returns
    -------
    None
    """
    prefix = "pca__"
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Strip only the leading prefix: splitting on "pca__" would truncate any
    # parameter name that happens to contain the prefix a second time.
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }
    protein_pca_mdl = PCA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        # NOTE(review): the relative glob below presumably relies on
        # enter_protein_data_dir switching into the protein's data directory
        # -- confirm against its definition.
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(
                glob.glob("./%s/*.jl" % yaml_file["feature_dir"]), key=keynat
            )
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_pca_mdl.partial_fit(featurized_path)
                except Exception as err:
                    # Fitting stays best-effort, but a skipped file is now
                    # reported, and KeyboardInterrupt/SystemExit propagate.
                    print("Skipping %s: partial_fit failed (%r)" % (f, err))
        print("Done partial fitting to protein %s" % protein)

    # dumping the pca_mdl
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Reduce the dimension of the features with PCA or tICA.

    Each component is an eigenvector of the feature space, of dimension
    (n_features,). The old features are transformed to the new feature space:
    one sample, vectorized to (n_features,), is multiplied by the transform
    matrix of shape (n_components, n_features) to give its projection onto the
    new space, of shape (n_components,).

    PCA  : principal axes in feature space, representing the directions of
           maximum variance in the data.
    tICA : components with maximum autocorrelation.

    Input
    features     : array-like, length n_trajs, each of shape (n_samples, n_features)
    decomposer   : 'PCA' or 'tICA'
    n_components : forwarded to the decomposer; when None the decomposer
                   decides how many components to keep
    lag_time     : forwarded to tICA only; ignored for PCA

    Output
    features_new     : array-like, length n_trajs, each of shape (n_samples, n_components)
    dcmp.components_ : shape (n_components, n_features)

    Raises
    ValueError : if decomposer is neither 'PCA' nor 'tICA'
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        # Without this branch an unknown name left `dcmp` unbound and the call
        # failed later with an unhelpful UnboundLocalError.
        raise ValueError(
            "Unknown decomposer %r; expected 'PCA' or 'tICA'" % (decomposer,)
        )

    features_new = dcmp.fit_transform(features)
    return features_new, dcmp.components_
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Fit the requested decomposer on the features and return the transformed features.

    Input
    features     : list of arrays, length n_trajs, each of shape (n_samples, n_features)
    decomposer   : 'PCA' or 'tICA'
    n_components : forwarded to the decomposer
    lag_time     : forwarded to tICA only; ignored for PCA

    Output
    features_new : list of arrays, length n_trajs, each of shape (n_samples, n_features_new)

    Raises
    ValueError : if decomposer is neither 'PCA' nor 'tICA'
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        # An unrecognised name previously fell through and crashed on the
        # unbound `dcmp`; fail early with a message naming the bad value.
        raise ValueError(
            "Unknown decomposer %r; expected 'PCA' or 'tICA'" % (decomposer,)
        )
    return dcmp.fit_transform(features)
def test_1():
    """Compare msmbuilder's PCA with the sklearn.decomposition reference (PCAr)."""
    # The reference estimator is fitted on one stacked array, while the
    # estimator under test is fitted on the list of trajectories directly.
    reference = PCAr()
    reference.fit(np.concatenate(trajs))
    candidate = PCA()
    candidate.fit(trajs)

    # Projection of the first trajectory must agree.
    expected_first = reference.transform(trajs[0])
    actual_first = candidate.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes must agree as well.
    fitted_attrs = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attr in fitted_attrs:
        np.testing.assert_array_almost_equal(
            getattr(candidate, attr), getattr(reference, attr)
        )
def test_generator():
    """Check that PCA.fit accepts a non-list iterable of trajectories."""
    indexed_trajs = dict(enumerate(trajs))

    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    candidate = PCA()
    # On Python 3, dict.values() is a view rather than a list, so this
    # exercises fitting from a generic iterable.
    candidate.fit(indexed_trajs.values())

    # Projection of the first trajectory must agree.
    expected_first = reference.transform(trajs[0])
    actual_first = candidate.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected_first, actual_first)

    # Fitted attributes must agree as well.
    fitted_attrs = (
        "components_",
        "explained_variance_",
        "mean_",
        "n_components_",
        "noise_variance_",
    )
    for attr in fitted_attrs:
        np.testing.assert_array_almost_equal(
            getattr(candidate, attr), getattr(reference, attr)
        )
def test_2():
    """Test that PCA works as a step in an msmbuilder pipeline."""
    # PCA feeds its output into the KCenters clustering step.
    steps = [('pca', PCA()), ('cluster', KCenters())]
    pipeline = Pipeline(steps)
    pipeline.fit(trajs)
def fit_pca(self):
    """Fit a 10-component PCA on ``self.seqs2d`` and store its projection.

    Sets ``self.pca`` to the fitted model and ``self.pcax`` to the result of
    transforming ``self.seqs2d`` with it.
    """
    reducer = PCA(n_components=10)
    # Publish the model on the instance before fitting, as callers may rely
    # on the attribute existing even if fitting fails.
    self.pca = reducer
    reducer.fit(self.seqs2d)
    self.pcax = reducer.transform(self.seqs2d)