def fit_protein_pca(yaml_file):
    """Incrementally fit a PCA model across all proteins listed in the project.

    Parameters
    ----------
    yaml_file : dict
        Parsed project config. Keys read here: "mdl_dir", "mdl_params",
        "protein_list", "feature_dir".

    Side effects
    ------------
    Writes the fitted model to <mdl_dir>/pca_mdl.pkl via verbosedump.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Keep only PCA hyperparameters, stripping the "pca__" namespace prefix.
    # maxsplit=1 so a parameter name containing "pca__" again is not mangled.
    current_mdl_params = {
        key.split("pca__", 1)[1]: value
        for key, value in mdl_params.items()
        if key.startswith("pca__")
    }

    protein_pca_mdl = PCA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_pca_mdl.partial_fit(featurized_path)
                except Exception as e:
                    # Best-effort fitting: skip a bad trajectory but report it
                    # instead of silently swallowing every possible error.
                    print("Could not partial_fit %s: %s" % (f, e))
            print("Done partial fitting to protein %s" % protein)
    # dumping the pca_mdl
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Decomposing features is a way to reduce the dimension of the features.

    Each of the components is an eigenvector of the feature space, dimension: (n_features,)

    The old features are transformed to the new feature space.

    Consider one sample, which is vectorized to (n_features,).T,
    apply the transform matrix, which is in the shape (n_components, n_features),
    we will get its projection onto the new space (n_components,).

    --------------------------------------------------------------------------------------------------------------------------------------
    Input
    features         : array-like, length n_trajs, each of shape (n_samples, n_features)
    decomposer       : str, either 'PCA' or 'tICA'
    n_components     : int or None, number of components to keep
    lag_time         : int, lag time for tICA (ignored for PCA)

    Output
    features_new     : array-like, length n_trajs, each of shape (n_samples, n_components) ((n_samples, n_samples) if n_components = None)

    dcmp.components_ : shape (n_components, n_features), ((n_samples, n_features) if n_components = None)
        PCA  : Principal axes in feature space, representing the directions of maximum variance in the data.
        tICA : Components with maximum autocorrelation.

    Raises
    ------
    ValueError : if decomposer is not 'PCA' or 'tICA' (previously this fell
                 through and crashed with UnboundLocalError on dcmp).
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        raise ValueError("Unknown decomposer %r; expected 'PCA' or 'tICA'"
                         % (decomposer,))
    features_new = dcmp.fit_transform(features)
    return features_new, dcmp.components_
# Exemplo n.º 3
# 0
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Reduce feature dimensionality with PCA or tICA.

    Input
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)
    decomposer : str, either 'PCA' or 'tICA'
    n_components : int or None, number of components to keep
    lag_time : int, lag time for tICA (ignored for PCA)

    Output
    features_new : list of arrays, length n_trajs, each of shape (n_samples, n_features_new)

    Raises
    ------
    ValueError : if decomposer is not 'PCA' or 'tICA' (previously this fell
                 through and crashed with UnboundLocalError on dcmp).
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    else:
        raise ValueError("Unknown decomposer %r; expected 'PCA' or 'tICA'"
                         % (decomposer,))
    return dcmp.fit_transform(features)
# Exemplo n.º 4
# 0
def test_1():
    # msmbuilder's multi-trajectory PCA must agree with the sklearn
    # reference fit on the concatenated data.

    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    model.fit(trajs)

    expected = reference.transform(trajs[0])
    actual = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected, actual)

    # All fitted attributes should match the reference implementation.
    for attr in ('components_', 'explained_variance_', 'mean_',
                 'n_components_', 'noise_variance_'):
        np.testing.assert_array_almost_equal(getattr(model, attr),
                                             getattr(reference, attr))
# Exemplo n.º 5
# 0
def test_generator():
    # PCA.fit must accept a lazy iterable of trajectories, not just a list.

    reference = PCAr()
    reference.fit(np.concatenate(trajs))

    model = PCA()
    # on python 3, dict.values() is a lazy view, exercising the generator path
    model.fit(dict(enumerate(trajs)).values())

    expected = reference.transform(trajs[0])
    actual = model.transform(trajs)[0]
    np.testing.assert_array_almost_equal(expected, actual)

    # All fitted attributes should match the reference implementation.
    for attr in ('components_', 'explained_variance_', 'mean_',
                 'n_components_', 'noise_variance_'):
        np.testing.assert_array_almost_equal(getattr(model, attr),
                                             getattr(reference, attr))
# Exemplo n.º 6
# 0
def test_2():
    # Test that PCA works as a step inside an msmbuilder Pipeline.
    pipeline = Pipeline([('pca', PCA()), ('cluster', KCenters())])
    pipeline.fit(trajs)
# Exemplo n.º 7
# 0
 def fit_pca(self):
     """Fit a 10-component PCA on the sequence matrix and cache the projection.

     Stores the fitted model on self.pca and the transformed data on self.pcax.
     """
     # NOTE(review): assumes self.seqs2d is a 2-D (n_samples, n_features)
     # array prepared elsewhere on this object — confirm with the caller.
     model = PCA(n_components=10)
     model.fit(self.seqs2d)
     self.pca = model
     self.pcax = model.transform(self.seqs2d)