Example #1
def test_score_2():
    X = np.random.randn(100, 5)
    Y = np.random.randn(100, 5)
    model = tICA(shrinkage=0.0, n_components=2).fit([X])
    s1 = model.score([Y])
    s2 = tICA(shrinkage=0.0).fit(model.transform([Y])).eigenvalues_.sum()

    eq(s1, s2)
Example #2
def test_kinetic_mapping():
    np.random.seed(42)
    X = np.random.randn(10, 3)

    tica1 = tICA(n_components=2, lag_time=1)
    tica2 = tICA(n_components=2, lag_time=1, kinetic_mapping=True)

    y1 = tica1.fit_transform([np.copy(X)])[0]
    y2 = tica2.fit_transform([np.copy(X)])[0]

    assert eq(y2, y1*tica1.eigenvalues_)
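What the assertion pins down: kinetic mapping simply rescales each projected coordinate by its eigenvalue, so the mapped output can be reproduced by hand. A minimal sketch of that equivalence (assuming msmbuilder is installed):

import numpy as np
from msmbuilder.decomposition import tICA

np.random.seed(42)
X = np.random.randn(10, 3)

tica = tICA(n_components=2, lag_time=1)
y = tica.fit_transform([X])[0]       # plain tICA projection
y_kinetic = y * tica.eigenvalues_    # manual kinetic mapping
# y_kinetic matches tICA(n_components=2, lag_time=1,
#                        kinetic_mapping=True).fit_transform([X])[0]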
Example #3
def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir, clusters_map_file="",
                            landmarks_dir="", nystroem_components=1000, n_components=10,
                            lag_time=5, nystroem_data_filename="", fit_model_filename="",
                            projected_data_filename="", landmark_subsample=1,
                            sparse=False, wolf=True, rho=0.01, shrinkage=None):
	if not os.path.exists(ktica_dir): os.makedirs(ktica_dir)
	
	if not sparse:
		if shrinkage is None:
			tica_model = tICA(n_components = n_components, lag_time = lag_time)
		else:
			tica_model = tICA(n_components = n_components, lag_time = lag_time, shrinkage = shrinkage)
		
	else:
		if shrinkage is None:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho)
		else:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho, shrinkage = shrinkage)

	if not os.path.exists(nystroem_data_filename):
		clusterer = verboseload(clusterer_dir)
		tica = verboseload(tica_dir)
		features = tica
		clusters = clusterer.cluster_centers_
		landmarks = clusters

		print("here's what goes into the combined class:")
		#print(np.shape(features))
		print(np.shape(landmarks))
		print(type(landmarks))
		nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
		nyx = nys.fit_transform(features)
		del features
		del landmarks
		try:
			save_dataset(nyx, nystroem_data_filename)
		except Exception:  # stale dataset on disk; clear it and retry
			os.system("rm -rf %s" % nystroem_data_filename)
			save_dataset(nyx, nystroem_data_filename)
	else:
		nyx = load_dataset(nystroem_data_filename)

	print(np.shape(nyx))
	print(dir(nyx))

	if not os.path.exists(projected_data_filename):
		fit_model = tica_model.fit(nyx)
		verbosedump(fit_model, fit_model_filename)
		transformed_data = fit_model.transform(nyx)
		del(nyx)
		try:
			save_dataset(transformed_data, projected_data_filename)
		except Exception:  # stale dataset on disk; clear it and retry
			os.system("rm -rf %s" % projected_data_filename)
			save_dataset(transformed_data, projected_data_filename)
	else:
		print("Already performed landmark kernel tICA.")
Example #4
def test_subsampler_tica():
    n_traj, n_samples, n_features = 1, 500, 4
    lag_time = 2
    X_all_0 = [random.normal(size=(n_samples, n_features)) for i in range(n_traj)]
    tica_0 = tICA(lag_time=lag_time)
    tica_0.fit(X_all_0)

    subsampler = Subsampler(lag_time=lag_time)
    tica_1 = tICA()
    pipeline = sklearn.pipeline.Pipeline([("subsampler", subsampler), ('tica', tica_1)])    
    pipeline.fit(X_all_0)

    eq(tica_0.n_features, tica_1.n_features)  # Obviously true
    eq(tica_0.n_observations_, tica_1.n_observations_)
    eq(tica_0.eigenvalues_, tica_1.eigenvalues_)  # The eigenvalues should be the same.  NOT the timescales, as tica_1 has timescales calculated in a different time unit
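The comment on the last assertion is worth making concrete: msmbuilder derives implied timescales from the eigenvalues as timescales_ = -lag_time / log(eigenvalues_), so models that agree on eigenvalues but were fit at different lag times report different timescales. A small sketch of the relation (illustrative eigenvalues, not taken from the test):

import numpy as np

def tica_timescales(eigenvalues, lag_time):
    # implied timescale per eigenvalue, in the same time unit as lag_time
    return -lag_time / np.log(eigenvalues)

lams = np.array([0.95, 0.80])
print(tica_timescales(lams, lag_time=1))  # time unit of the subsampled model
print(tica_timescales(lams, lag_time=2))  # same eigenvalues, doubled timescales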
Example #5
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10, n_components=5):
	if not os.path.exists(model_dir):
		os.makedirs(model_dir)

	projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" %model_dir
	fit_model_filename  = "%s/phi_psi_chi2_allprot_tica_coords.h5" %model_dir
	#active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"

	tica_model = tICA(n_components = n_components, lag_time = lag_time)

	if not os.path.exists(projected_data_filename):
		print("loading feature files")
		feature_files = get_trajectory_files(features_directory, ext = ".h5")
		pool = mp.Pool(mp.cpu_count())
		features = pool.map(load_features, feature_files)
		pool.terminate()
		if not os.path.exists(fit_model_filename):
			print("fitting data to tICA model")
			fit_model = tica_model.fit(features)
			verbosedump(fit_model, fit_model_filename)
			transformed_data = fit_model.transform(features)
			verbosedump(transformed_data, projected_data_filename)
		else:
			print("loading tICA model")
			fit_model = verboseload(fit_model_filename)
			print("transforming")
			transformed_data = fit_model.transform(features)
			verbosedump(transformed_data, projected_data_filename)
	else:
		fit_model = verboseload(fit_model_filename)
		transformed_data = verboseload(projected_data_filename)

	print(fit_model.summarize())
Example #6
def test_MetEnkephalin():
    np.random.seed(0)
    data = build_dataset()
    n_features = data[0].shape[1]

    # check whether this recovers a single 1-sparse eigenpair without error
    kstica = KSparseTICA(n_components=1, k=1)
    _ = kstica.fit_transform(data)
    assert (np.sum(kstica.components_ != 0) == 1)

    ## check whether this recovers >1 eigenpair without error
    #kstica = KSparseTICA(n_components=2)
    #_ = kstica.fit_transform(data)

    ## check whether this recovers all eigenpairs without error
    #kstica = KSparseTICA()
    #_ = kstica.fit_transform(data)

    # check whether we recover the same solution as standard tICA when k = n_features
    n_components = 10
    kstica = KSparseTICA(n_components=n_components, k=n_features)
    tica = tICA(n_components=n_components)
    _ = kstica.fit_transform(data)
    _ = tica.fit_transform(data)
    np.testing.assert_array_almost_equal(kstica.eigenvalues_,
                                         tica.eigenvalues_)
Example #7
def test_MetEnkephalin():
    np.random.seed(0)
    data = build_dataset()
    n_features = data[0].shape[1]

    # check whether this recovers a single 1-sparse eigenpair without error
    kstica = KSparseTICA(n_components=1, k = 1)
    _ = kstica.fit_transform(data)
    assert (np.sum(kstica.components_ != 0) == 1)

    ## check whether this recovers >1 eigenpair without error
    #kstica = KSparseTICA(n_components=2)
    #_ = kstica.fit_transform(data)

    ## check whether this recovers all eigenpairs without error
    #kstica = KSparseTICA()
    #_ = kstica.fit_transform(data)

    # check whether we recover the same solution as standard tICA when k = n_features
    n_components = 10
    kstica = KSparseTICA(n_components=n_components, k=n_features)
    tica = tICA(n_components=n_components)
    _ = kstica.fit_transform(data)
    _ = tica.fit_transform(data)
    np.testing.assert_array_almost_equal(kstica.eigenvalues_, tica.eigenvalues_)
Example #8
def __init__(self, args):
    from msmbuilder.decomposition import tICA
    if args.lag_time <= 0:
        self.error('lag_time must be greater than zero')
    self.args = args
    self.model = tICA(n_components=2, lag_time=self.args.lag_time)
    self.labels = [b'tIC1', b'tIC2']
Example #9
def fit_protein_tica(yaml_file, sparse=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception:  # skip trajectories that fail to fit
                    pass
            print("Done partial fitting to protein %s" % protein)
    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
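For reference, fit_protein_tica touches only a few keys of yaml_file; a hypothetical minimal input (the values here are illustrative, not from the original project) could look like:

yaml_file = {
    "mdl_dir": "./mdl",                       # tica_mdl.pkl is dumped here
    "mdl_params": {"tica__n_components": 5,   # keys prefixed "tica__" are forwarded
                   "tica__lag_time": 10},     # to the tICA constructor
    "protein_list": ["protein_A"],
    "feature_dir": "features",                # globbed for *.jl per protein
}
fit_protein_tica(yaml_file, sparse=False)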
Example #10
def ktica_test(features_dir,
               tica_dir,
               landmark_indices=None,
               nystroem_components=1000,
               tica_components=10,
               lag_time=5,
               nystroem_data_filename="",
               fit_model_filename="",
               projected_data_filename=""):
    nys = Nystroem(n_components=nystroem_components)
    tica_model = tICA(n_components=tica_components, lag_time=lag_time)
    feature_files = get_trajectory_files(features_dir, ext=".h5")[0:3]

    #if os.path.exists(nystroem_data_filename):
    #	nyx = verboseload(nystroem_data_filename)
    #else:
    features = load_file_list(feature_files)
    nyx = nys.fit_transform(features)
    verbosedump(nyx, nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    fit_model = tica_model.fit(nyx)
    verbosedump(fit_model, fit_model_filename)
    transformed_data = fit_model.transform(nyx)
    verbosedump(transformed_data, projected_data_filename)

    return
Example #11
def make_tica_opt(trajectories_t, timeskip_t):
#
	frameskip = 0
	
	shortest_length = len(trajectories_t[0])
	
	for i in range(len(trajectories_t)):
	#
		if shortest_length > len(trajectories_t[i]):
		#
			shortest_length = len(trajectories_t[i])
		#
	#
	
	frameskip = int(shortest_length/2)
		
	print("Frameskip {:f} ns ({:d} frames) :".format(float(frameskip)*timeskip_t/1000.0, frameskip))
	
	tica_tot_t = tICA(n_components = len(trajectories_t[0][0]), lag_time = frameskip)
	
	tica_tot_t.fit(trajectories_t)
	
	usable_comps_t = get_smallest_tscale(tica_tot_t)
	
	equil_t, equil_dists_t = in_equil(tica_tot_t, usable_comps_t)
	
	n_comp_t = find_components(tica_tot_t, usable_comps_t)
	
	return tica_tot_t, equil_t, equil_dists_t, n_comp_t, usable_comps_t, frameskip
Example #12
    def test_plot_decomp_grid(self):
        from msmbuilder.decomposition import tICA

        tica = tICA(n_components=2).fit([data])
        ax = plot_decomp_grid(tica, xlim=(0., 1.), ylim=(0., 1.))

        assert isinstance(ax, SubplotBase)
Example #13
def stepwise_analysis(counts_fns, out_fn_prefix, lag_time):
    # Load data from tables
    seqs_3d_unorm = []
    shell_w = -1
    for counts_fn in counts_fns:
        h = tables.open_file(counts_fn)
        seq = h.root.shell_counts[:]
        shell_w, = h.root.shell_width[:]
        seqs_3d_unorm.append(seq)
        h.close()
    pickle_save(seqs_3d_unorm, "{}.3d.unnorm.pickl".format(out_fn_prefix))

    # Normalize
    seqs_3d_norm = [normalize(fp3d, shell_w) for fp3d in seqs_3d_unorm]
    del seqs_3d_unorm
    pickle_save(seqs_3d_norm, '{}.3d.norm.pickl'.format(out_fn_prefix))

    # Flatten
    seqs_2d_uprune = [reshape(fp3d) for fp3d in seqs_3d_norm]
    del seqs_3d_norm
    pickle_save(seqs_2d_uprune, '{}.2d.uprune.pickl'.format(out_fn_prefix))

    # Prune low variance
    seqs_2d_prune, deleted = prune_all(seqs_2d_uprune)
    del seqs_2d_uprune
    pickle_save(seqs_2d_prune, '{}.2d.prune.pickl'.format(out_fn_prefix))
    pickle_save(deleted, '{}.deleted.pickl'.format(out_fn_prefix))

    # Fit tICA
    tica = tICA(n_components=10, lag_time=lag_time, weighted_transform=True)
    ticax = tica.fit_transform(seqs_2d_prune)
    del seqs_2d_prune
    pickle_save(tica, '{}.tica.pickl'.format(out_fn_prefix))
    pickle_save(ticax, '{}.ticax.pickl'.format(out_fn_prefix))
Example #14
def __init__(self, args):
    from msmbuilder.decomposition import tICA
    if args.lag_time <= 0:
        self.error('lag_time must be greater than zero')
    self.args = args
    self.model = tICA(n_components=2, lag_time=self.args.lag_time)
    self.labels = [b'tIC1', b'tIC2']
Example #15
def stepwise_analysis(counts_fns, out_fn_prefix, lag_time):
    # Load data from tables
    seqs_3d_unorm = []
    shell_w = -1
    for counts_fn in counts_fns:
        h = tables.open_file(counts_fn)
        seq = h.root.shell_counts[:]
        shell_w, = h.root.shell_width[:]
        seqs_3d_unorm.append(seq)
        h.close()
    pickle_save(seqs_3d_unorm, "{}.3d.unnorm.pickl".format(out_fn_prefix))

    # Normalize
    seqs_3d_norm = [normalize(fp3d, shell_w) for fp3d in seqs_3d_unorm]
    del seqs_3d_unorm
    pickle_save(seqs_3d_norm, '{}.3d.norm.pickl'.format(out_fn_prefix))

    # Flatten
    seqs_2d_uprune = [reshape(fp3d) for fp3d in seqs_3d_norm]
    del seqs_3d_norm
    pickle_save(seqs_2d_uprune, '{}.2d.uprune.pickl'.format(out_fn_prefix))

    # Prune low variance
    seqs_2d_prune, deleted = prune_all(seqs_2d_uprune)
    del seqs_2d_uprune
    pickle_save(seqs_2d_prune, '{}.2d.prune.pickl'.format(out_fn_prefix))
    pickle_save(deleted, '{}.deleted.pickl'.format(out_fn_prefix))

    # Fit tICA
    tica = tICA(n_components=10, lag_time=lag_time, weighted_transform=True)
    ticax = tica.fit_transform(seqs_2d_prune)
    del seqs_2d_prune
    pickle_save(tica, '{}.tica.pickl'.format(out_fn_prefix))
    pickle_save(ticax, '{}.ticax.pickl'.format(out_fn_prefix))
Example #16
def make_tica_opt(trajectories_t, timeskip_t):
    #
    frameskip = 0

    shortest_length = len(trajectories_t[0])

    for i in range(len(trajectories_t)):
        #
        if shortest_length > len(trajectories_t[i]):
            #
            shortest_length = len(trajectories_t[i])
        #
    #

    frameskip = int(shortest_length / 2)

    print("Frameskip {:f} ns ({:d} frames) :".format(
        float(frameskip) * timeskip_t / 1000.0, frameskip))

    tica_tot_t = tICA(n_components=len(trajectories_t[0][0]),
                      lag_time=frameskip)

    tica_tot_t.fit(trajectories_t)

    usable_comps_t = get_smallest_tscale(tica_tot_t)

    equil_t, equil_dists_t = in_equil(tica_tot_t, usable_comps_t)

    n_comp_t = find_components(tica_tot_t, usable_comps_t)

    return tica_tot_t, equil_t, equil_dists_t, n_comp_t, usable_comps_t, frameskip
Example #17
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Decomposing features is a way to reduce the dimension of the features.

    Each component is an eigenvector of the feature space, of shape (n_features,).

    The old features are transformed to the new feature space: take one sample,
    vectorized as (n_features,), apply the transform matrix, of shape
    (n_components, n_features), and the result is the sample's projection onto
    the new space, of shape (n_components,).

    ------------------------------------------------------------------------
    Input
    features         : array-like, length n_trajs, each of shape (n_samples, n_features)

    Output
    features_new     : array-like, length n_trajs, each of shape (n_samples, n_components)
                       ((n_samples, n_samples) if n_components=None)

    dcmp.components_ : shape (n_components, n_features) ((n_samples, n_features) if n_components=None)
        PCA  : principal axes in feature space, representing the directions of maximum variance in the data.
        tICA : components with maximum autocorrelation.
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    features_new = dcmp.fit_transform(features)
    return features_new, dcmp.components_
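A quick usage sketch matching the shapes described in the docstring (random data, purely illustrative):

import numpy as np

trajs = [np.random.randn(100, 8) for _ in range(3)]  # 3 trajectories, 8 features
trajs_new, components = decompose_features(trajs, 'tICA', n_components=2, lag_time=1)
print(trajs_new[0].shape)  # (100, 2): each sample projected onto 2 components
print(components.shape)    # (2, 8): one eigenvector per row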
Example #18
def ktica(features, landmarks, projected_data_filename, nystroem_data_filename, fit_model_filename,
          n_components=10, lag_time=5, sparse=False, shrinkage=0.05, wolf=True, rho=0.01):
	if not sparse:
		if shrinkage is None:
			tica_model = tICA(n_components = n_components, lag_time = lag_time)
		else:
			if wolf:
				tica_model = tICA(n_components = n_components, lag_time = lag_time, shrinkage = shrinkage)
			else:
				tica_model = tICA(n_components = n_components, lag_time = lag_time, gamma = shrinkage)

		
	else:
		if shrinkage is None:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho)
		else:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho, shrinkage = shrinkage)


	if not os.path.exists(nystroem_data_filename):
		nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
		nyx = nys.fit_transform(features)
		print("Computed Nystroem.")
		del features
		del landmarks
		try:
			save_dataset(nyx, nystroem_data_filename)
		except Exception:  # stale dataset on disk; clear it and retry
			os.system("rm -rf %s" % nystroem_data_filename)
			save_dataset(nyx, nystroem_data_filename)
	else:
		nyx = load_dataset(nystroem_data_filename)
		print("Loaded Nystroem")

	if not os.path.exists(projected_data_filename):
		fit_model = tica_model.fit(nyx)
		verbosedump(fit_model, fit_model_filename)
		transformed_data = fit_model.transform(nyx)
		del(nyx)
		try:
			save_dataset(transformed_data, projected_data_filename)
		except Exception:  # stale dataset on disk; clear it and retry
			os.system("rm -rf %s" % projected_data_filename)
			save_dataset(transformed_data, projected_data_filename)
	else:
		print("Already performed landmark kernel tICA.")
Example #19
def test_score_1():
    X = np.random.randn(100, 5)
    for n in range(1, 5):
        tica = tICA(n_components=n, gamma=0)
        tica.fit([X])
        assert_approx_equal(tica.score([X]), tica.eigenvalues_.sum())
        X2 = np.random.randn(100, 5)
        assert tica.score([X2]) < tica.score([X])
        assert_approx_equal(tica.score([X]), tica.score_)
Example #20
def test_1():
    data = build_dataset()
    tica = tICA(n_components=1).fit(data)
    tic0 = tica.components_[0]
    print('tICA\n', tic0)

    stica = SparseTICA(n_components=1, verbose=True).fit(data)
    stic0 = stica.components_[0]
    print('Sparse tICA\n', stic0)
    assert np.allclose(stic0, [1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Example #21
def test_doublewell():
    data = build_dataset()
    tica = tICA(n_components=1).fit(data)
    tic0 = tica.components_[0]

    stica = SparseTICA(n_components=1, verbose=False).fit(data)
    stic0 = stica.components_[0]

    np.testing.assert_array_almost_equal(stic0[1:], np.zeros(9))
    np.testing.assert_almost_equal(stic0[0], 0.58, decimal=1)
Example #22
def test_singular_1():
    tica = tICA(n_components=1)

    # make some data that has one column repeated twice
    X = np.random.randn(100, 2)
    X = np.hstack((X, X[:, 0, np.newaxis]))

    tica.fit([X])
    assert tica.components_.dtype == np.float64
    assert tica.eigenvalues_.dtype == np.float64
Example #23
def test_singular_2():
    tica = tICA(n_components=1)

    # make some data that has one column of all zeros
    X = np.random.randn(100, 2)
    X = np.hstack((X, np.zeros((100, 1))))

    tica.fit([X])
    assert tica.components_.dtype == np.float64
    assert tica.eigenvalues_.dtype == np.float64
Example #24
def test_singular_2():
    tica = tICA(n_components=1)

    # make some data that has one column of all zeros
    X = np.random.randn(100, 2)
    X = np.hstack((X, np.zeros((100, 1))))

    tica.fit([X])
    assert tica.components_.dtype == np.float64
    assert tica.eigenvalues_.dtype == np.float64
Example #25
def test_singular_1():
    tica = tICA(n_components=1)

    # make some data that has one column repeated twice
    X = np.random.randn(100, 2)
    X = np.hstack((X, X[:, 0, np.newaxis]))

    tica.fit([X])
    assert tica.components_.dtype == np.float64
    assert tica.eigenvalues_.dtype == np.float64
Example #26
def test_subsampler_tica():
    n_traj, n_samples, n_features = 1, 500, 4
    lag_time = 2
    X_all_0 = [
        random.normal(size=(n_samples, n_features)) for i in range(n_traj)
    ]
    tica_0 = tICA(lag_time=lag_time)
    tica_0.fit(X_all_0)

    subsampler = Subsampler(lag_time=lag_time)
    tica_1 = tICA()
    pipeline = sklearn.pipeline.Pipeline([("subsampler", subsampler),
                                          ('tica', tica_1)])
    pipeline.fit(X_all_0)

    eq(tica_0.n_features, tica_1.n_features)  # Obviously true
    eq(tica_0.n_observations_, tica_1.n_observations_)
    eq(
        tica_0.eigenvalues_, tica_1.eigenvalues_
    )  # The eigenvalues should be the same.  NOT the timescales, as tica_1 has timescales calculated in a different time unit
Example #27
def test_sample_dimension():
    np.random.seed(42)
    X = np.random.randn(500, 5)
    data = [X, X, X]

    tica = tICA(n_components=2, lag_time=1).fit(data)
    tica_trajs = {k: tica.partial_transform(v) for k, v in enumerate(data)}
    res = sample_dimension(tica_trajs, 0, 10, scheme="linear")
    res2 = sample_dimension(tica_trajs, 1, 10, scheme="linear")

    assert len(res) == len(res2) == 10
Example #28
def test_score_1():
    X = np.random.randn(100, 5)
    for n in range(1, 5):
        tica = tICA(n_components=n, gamma=0)
        tica.fit([X])
        assert_approx_equal(
            tica.score([X]),
            tica.eigenvalues_.sum())
        X2 = np.random.randn(100, 5)
        assert tica.score([X2]) < tica.score([X])
        assert_approx_equal(tica.score([X]), tica.score_)
Example #29
def get_pipeline(parameters):
    """
    Wrapper so that new instance of a pipeline can be instantiated for every fold. 
    :return: sklean.pipeline.Pipeline object
    """
    pipe = Pipeline([('variance_cut', VarianceThreshold()),
                     ('tica', tICA(kinetic_mapping=True)),
                     ('cluster', MiniBatchKMeans()),
                     ('msm', MarkovStateModel(use_gap='timescales', lag_time=50, verbose=True))])
    pipe.set_params(**parameters)

    return pipe
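Since each fold gets a fresh pipeline, hyperparameters are injected through set_params with scikit-learn's step__param naming; a hedged usage sketch (the parameter values are placeholders):

params = {
    'tica__lag_time': 10,
    'tica__n_components': 5,
    'cluster__n_clusters': 100,
}
pipe = get_pipeline(params)
# pipe.fit(ftrajs)  # ftrajs: list of (n_samples, n_features) arrays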
Example #30
def tica_wrapper(proj_folder, feature_dict, lag_time=10):
    #100ps*100==10ns and 10 features
    if os.path.exists(proj_folder + "/tica_features.pkl"):
        return verboseload(proj_folder + "/tica_features.pkl")

    tica_mdl = tICA(lag_time=lag_time, n_components=10)
    tica_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    tica_features = {}
    for i in feature_dict.keys():
        tica_features[i] = tica_mdl.transform([feature_dict[i]])[0]
    verbosedump(tica_features, proj_folder + "/tica_features.pkl")
    return tica_features
Example #31
def tica_wrapper(proj_folder, feature_dict, lag_time=10):
    # 100ps*100==10ns and 10 features
    if os.path.exists(proj_folder + "/tica_features.pkl"):
        return verboseload(proj_folder + "/tica_features.pkl")

    tica_mdl = tICA(lag_time=lag_time, n_components=10)
    tica_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    tica_features = {}
    for i in feature_dict.keys():
        tica_features[i] = tica_mdl.transform([feature_dict[i]])[0]
    verbosedump(tica_features, proj_folder + "/tica_features.pkl")
    return tica_features
Example #32
def decompose_features(features, decomposer, n_components=None, lag_time=1):
    '''
    Input
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)
	
    Output
    features_new : list of arrays, length n_trajs, each of shape (n_samples, n_features_new) 
    '''
    if decomposer == 'PCA':
        from msmbuilder.decomposition import PCA
        dcmp = PCA(n_components=n_components)
    elif decomposer == 'tICA':
        from msmbuilder.decomposition import tICA
        dcmp = tICA(n_components=n_components, lag_time=lag_time)
    return dcmp.fit_transform(features)
Example #33
    def build_model(self, user_defined_model):
        """
        Load or build a model (Pipeline from scikit-learn) to do all the transforming and fitting
        :param user_defined_model: Either a string (to load from disk) or a Pipeline object to use as model
        :return model: Return the model back
        """
        if user_defined_model is None:
            if os.path.exists(self.model_pkl_fname):
                logger.info('Loading model pkl file {}'.format(
                    self.model_pkl_fname))
                model = load_generic(self.model_pkl_fname)
            else:
                logger.info('Building default model based on dihedrals')

                # build a lag time of 1 ns for tICA and msm
                # if the stride is too big and we can't do that
                # use 1 frame and report how much that is in ns
                if self.app.meta is not None:
                    lag_time = max(1, int(1 / self.timestep))
                    logger.info(
                        'Using a lag time of {} ns for the tICA and MSM'.
                        format(lag_time * self.timestep))
                else:
                    self.timestep = None
                    lag_time = 1
                    logger.warning(
                        'Cannot determine timestep. Defaulting to a lag '
                        'time of {} frame.'.format(lag_time))
                model = Pipeline([('feat', DihedralFeaturizer()),
                                  ('scaler', RobustScaler()),
                                  ('tICA',
                                   tICA(lag_time=lag_time,
                                        commute_mapping=True,
                                        n_components=10)),
                                  ('clusterer',
                                   MiniBatchKMeans(n_clusters=200)),
                                  ('msm',
                                   MarkovStateModel(lag_time=lag_time,
                                                    ergodic_cutoff='off',
                                                    reversible_type=None))])
        else:
            if not isinstance(user_defined_model, Pipeline):
                raise ValueError(
                    'model is not an sklearn.pipeline.Pipeline object')
            else:
                logger.info('Using user defined model')
                model = user_defined_model
        return model
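The lag-time arithmetic in the default branch targets 1 ns: with self.timestep in ns per frame, lag_time = max(1, int(1 / timestep)). A worked instance (hypothetical timestep):

timestep = 0.2                        # ns per frame
lag_time = max(1, int(1 / timestep))  # -> 5 frames, i.e. exactly 1.0 ns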
Example #34
    def generate_tics(self, featurized):
        """
        Now tracks tica object and partially fits on it
        to speed up this step a lot by only adding new data rather than re-fitting
        each time.
        reduced dataset

        Returns: tica'd dataset
        """

        if os.path.isfile(
                os.path.join(self.dir, "tICA_%d.h5" % self.generation)):
            ticr = utils.load_tica_h5(
                os.path.join(self.dir, "tICA_%d.h5" % self.generation))

        elif os.path.isfile(os.path.join(self.dir, "tICA.pkl")):  # legacy
            ticr = utils.load(os.path.join(self.dir, "tICA.pkl"))

        else:
            ticr = tICA(n_components=self.config.getint("model", "num_tics"),
                        lag_time=self.config.getint("model", "tica_lag"))

        for newfeat in featurized:
            ticr.partial_fit(newfeat)

        utils.save_tica_h5(
            ticr, os.path.join(self.dir, "tICA_%d.h5" % self.generation))

        # Now apply tica to the whole feature set.
        # We need to do this to all featurized data again since the tics
        # have changed since we just updated them with new data
        # Do one at a time to save memory.

        ticad = []
        for gen in range(1, self.generation):
            if os.path.isfile("%s.h5" % self.featurized % gen):
                feated = utils.load_features_h5("%s.h5" % self.featurized %
                                                gen)
            else:
                feated = utils.load("%s.pkl" % self.featurized % gen)

            ticad.extend(ticr.transform(feated))

        # Add the features we have in memory now
        ticad.extend(ticr.transform(featurized))
        utils.save_features_h5(ticad, "ticad_%d.h5" % self.generation)

        return ticad
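Stripped of the generation bookkeeping, the incremental strategy the docstring describes is just msmbuilder's partial_fit loop; a minimal sketch with stand-in feature trajectories:

import numpy as np
from msmbuilder.decomposition import tICA

ticr = tICA(n_components=5, lag_time=10)
for ftraj in [np.random.randn(200, 20) for _ in range(4)]:
    ticr.partial_fit(ftraj)              # update covariance estimates in place
ticad = ticr.transform([np.random.randn(200, 20)])  # then project any trajectory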
Example #35
def setUp(self):
    numpy.random.seed(12)
    self.top = 'data_app/runs/structure.prmtop'
    self.traj_1 = 'data_app/runs/run-000.nc'
    self.traj_2 = 'data_app/runs/run-001.nc'
    self.feat = DihedralFeaturizer()
    self.traj_dict = {
        0: load(self.traj_1, top=self.top),
        1: load(self.traj_2, top=self.top)
    }
    self.scaler = RobustScaler()
    self.tica = tICA(n_components=2)
    self.ftrajs = {
        0: numpy.random.rand(100, 50),
        1: numpy.random.rand(100, 50),
    }
Example #36
def featurizeData(xyz, tica_dim):
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    if os.path.exists('diheds'):
        os.system('rm -rf diheds')
    diheds = xyz.fit_transform_with(featurizer, 'diheds/', fmt='dir-npy')

    scaler = RobustScaler()
    if os.path.exists('scaled_diheds'):
        os.system('rm -rf scaled_diheds')
    scaled_diheds = diheds.fit_transform_with(scaler, 'scaled_diheds/', fmt='dir-npy')

    tica_model = tICA(lag_time=1, n_components=tica_dim)
    tica_model = scaled_diheds.fit_with(tica_model)
    if os.path.exists('ticas'):
        os.system('rm -rf ticas')
    tica_trajs = scaled_diheds.transform_with(tica_model, 'ticas/', fmt='dir-npy')
    return tica_trajs
Example #37
def fit_predict_tica_embeddings(traj_folder, reference_tica_information,
                                outputdir, names, n_components, tica_lagtime):
    traj_list_array, pairwise_distance = readtrajs_from_folder(traj_folder)
    #calculate the tica means for the current system
    test_tica = tICA(lag_time=tica_lagtime, n_components=n_components)
    test_tica.fit(pairwise_distance)
    numpy.savetxt('%s/%s_pairwise_means' % (outputdir, names),
                  test_tica.means_)

    for line in range(len(traj_list_array)):
        temp = numpy.load("%s/%s.npy" % (traj_folder, traj_list_array[line]))
        #we begin to project
        results_to_store = numpy.dot(
            (temp - test_tica.means_.T),
            reference_tica_information.eigenvectors_[:, :])
        numpy.savetxt("%s/%s_ticproj.txt" % (outputdir, traj_list_array[line]),
                      results_to_store[:, 0:n_components])
Example #38
def fit_and_transform(directory, stride=5):

    projected_data_filename = "/scratch/users/enf/b2ar_analysis/phi_psi_chi_stride%d_projected.h5" % stride
    fit_model_filename = "/scratch/users/enf/b2ar_analysis/phi_psi_chi2_stride%s_tica_coords.h5" % stride
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/3P0G_pymol_prepped.pdb"
    active_pdb_file = "/scratch/users/enf/b2ar_analysis/system_B.pdb"

    tica_model = tICA(n_components=4)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(directory)
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()
        if not os.path.exists(fit_model_filename):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
        else:
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)

    active_pdb = md.load(active_pdb_file)
    top = active_pdb.topology
    atom_indices = [
        a.index for a in top.atoms
        if a.residue.is_protein and a.residue.resSeq != 341
        and a.residue.name[0:2] != "HI" and a.residue.resSeq != 79
        and a.residue.resSeq != 296 and a.residue.resSeq != 269 and a.residue.
        resSeq != 178 and a.residue.resSeq != 93 and a.residue.name != "NMA"
        and a.residue.name != "NME" and a.residue.name != "ACE"
    ]
    active_pdb = md.load(active_pdb_file, atom_indices=atom_indices)
    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    active_pdb_features = featurizer.transform(active_pdb)
    active_pdb_projected = fit_model.transform(active_pdb_features)
    print((active_pdb_projected[0:4]))
Example #39
def test_multiple_components():
    X = np.random.randn(100, 5)
    tica = tICA(n_components=1, gamma=0)
    tica.fit([X])

    Y1 = tica.transform([X])[0]

    tica.n_components = 4
    Y4 = tica.transform([X])[0]

    tica.n_components = 3
    Y3 = tica.transform([X])[0]

    assert Y1.shape == (100, 1)
    assert Y4.shape == (100, 4)
    assert Y3.shape == (100, 3)

    eq(Y1.flatten(), Y3[:, 0])
    eq(Y3, Y4[:, :3])
Example #40
def test_multiple_components():
    X = np.random.randn(100, 5)
    tica = tICA(n_components=1, gamma=0)
    tica.fit([X])

    Y1 = tica.transform([X])[0]

    tica.n_components = 4
    Y4 = tica.transform([X])[0]

    tica.n_components = 3
    Y3 = tica.transform([X])[0]

    assert Y1.shape == (100, 1)
    assert Y4.shape == (100, 4)
    assert Y3.shape == (100, 3)

    eq(Y1.flatten(), Y3[:, 0])
    eq(Y3, Y4[:, :3])
Example #41
def ktica_test(features_dir, tica_dir, landmark_indices=None, nystroem_components=1000,
               tica_components=10, lag_time=5, nystroem_data_filename="",
               fit_model_filename="", projected_data_filename=""):
	nys = Nystroem(n_components=nystroem_components)
	tica_model = tICA(n_components = tica_components, lag_time = lag_time)
	feature_files = get_trajectory_files(features_dir, ext = ".h5")[0:3]

	#if os.path.exists(nystroem_data_filename):
	#	nyx = verboseload(nystroem_data_filename)
	#else:
	features = load_file_list(feature_files)
	nyx = nys.fit_transform(features)
	verbosedump(nyx, nystroem_data_filename)

	print(np.shape(nyx))
	print(dir(nyx))

	fit_model = tica_model.fit(nyx)
	verbosedump(fit_model, fit_model_filename)
	transformed_data = fit_model.transform(nyx)
	verbosedump(transformed_data, projected_data_filename)

	return
Example #42
def train_model(args):
    features, feature_type = extract_features(args)

    print "Fitting %s model" % args.model
    
    if args.model == "PCA":
        model = PCA(n_components = args.n_components)
        model_type = PCA_MODEL
        projected = model.fit_transform(features)

    elif args.model == "SVD":
        model = TruncatedSVD(n_components = args.n_components)
        model_type = SVD_MODEL
        projected = model.fit_transform(features)

    elif args.model == "ICA":
        model = FastICA(n_components = args.n_components)
        model_type = ICA_MODEL
        projected = model.fit_transform(features)

    elif args.model == "tICA":
        model = tICA(n_components = args.n_components,
                     kinetic_mapping=True,
                     lag_time = args.lag_time)
        model_type = TICA_MODEL
        projected = model.fit_transform([features])[0]

    else:
        raise Exception, "Unknown model type '%s'", args.model
    

    print "Writing model"
    model = { LAG_TIME_KEY : args.lag_time,
              MODEL_TYPE_KEY : model_type,
              MODEL_KEY : model,
              PROJECTION_KEY : projected,
              FEATURE_TYPE_KEY : feature_type }
    
    joblib.dump(model, args.model_file)
Example #43
def fit_and_transform(directory):
	print("fitting data to tICA model")

	tica_model = tICA(n_components=4)

	features = generateData(get_trajectory_files(directory))
	for data in features:
		print(np.shape(data[0]))
		tica_model.partial_fit(data[0])
		print("Fitting: ")
		print(data)

	transformed_data = []
	for data in features:
		print("Transforming: ")
		print(data)
		transformed_data.append(tica_model.partial_transform(data))
		
	verbosedump(transformed_data, "/home/enf/b2ar_analysis/phi_psi_chi_stride10_projected.h5")
	trajs = np.concatenate(transformed_data)
	plt.hexbin(trajs[:,0], trajs[:,1], bins='log', mincnt=1)
	plt.show()
Example #44
def fit_and_transform(directory):
	print("fitting data to tICA model")

	tica_model = tICA(n_components=4)

	features = generateData(get_trajectory_files(directory))
	for data in features:
		print((np.shape(data[0])))
		tica_model.partial_fit(data[0])
		print("Fitting: ")
		print(data)

	transformed_data = []
	for data in features:
		print("Transforming: ")
		print(data)
		transformed_data.append(tica_model.partial_transform(data))
		
	verbosedump(transformed_data, "/scratch/users/enf/b2ar_analysis/phi_psi_chi_stride10_projected.h5")
	trajs = np.concatenate(transformed_data)
	plt.hexbin(trajs[:,0], trajs[:,1], bins='log', mincnt=1)
	plt.show()
Example #45
def fit_protein_tica(yaml_file, sparse=False, ksparse=None):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params={}
    for i in mdl_params.keys():
        if i.startswith("tica__"):
            current_mdl_params[i.split("tica__")[1]] = mdl_params[i]

    if sparse:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    elif isinstance(ksparse, int):
        current_mdl_params["k"] = ksparse
        protein_tica_mdl = KSparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            if os.path.exists("./normalized_features"):
                featurized_traj = sorted(glob.glob("./normalized_features/*.jl"), key=keynat)
            else:
                print('Warning: features have not been scaled')
                featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
           
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception:
                    print('Error fitting %s' % f)
            print("Done partial fitting to protein %s" % protein)
    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
Example #46
def fit_and_transform(directory, stride=5):
	
	projected_data_filename = "/scratch/users/enf/b2ar_analysis/phi_psi_chi_stride%d_projected.h5" %stride
	fit_model_filename  = "/scratch/users/enf/b2ar_analysis/phi_psi_chi2_stride%s_tica_coords.h5" %stride
	#active_pdb_file = "/scratch/users/enf/b2ar_analysis/3P0G_pymol_prepped.pdb"
	active_pdb_file = "/scratch/users/enf/b2ar_analysis/system_B.pdb"

	tica_model = tICA(n_components=4)

	if not os.path.exists(projected_data_filename):
		print("loading feature files")
		feature_files = get_trajectory_files(directory)
		pool = mp.Pool(mp.cpu_count())
		features = pool.map(load_features, feature_files)
		pool.terminate()
		if not os.path.exists(fit_model_filename):
			print("fitting data to tICA model")
			fit_model = tica_model.fit(features)
			verbosedump(fit_model, fit_model_filename)
			transformed_data = fit_model.transform(features)
			verbosedump(transformed_data, projected_data_filename)
		else:
			print("loading tICA model")
			fit_model = verboseload(fit_model_filename)
			transformed_data = fit_model.transform(features)
			verbosedump(transformed_data, projected_data_filename)
	else:
		fit_model = verboseload(fit_model_filename)
		transformed_data = verboseload(projected_data_filename)

	active_pdb = md.load(active_pdb_file)
	top = active_pdb.topology
	atom_indices = [a.index for a in top.atoms if a.residue.is_protein and a.residue.resSeq != 341 and a.residue.name[0:2] != "HI" and a.residue.resSeq != 79 and a.residue.resSeq != 296 and a.residue.resSeq != 269 and a.residue.resSeq != 178 and a.residue.resSeq != 93 and a.residue.name != "NMA" and a.residue.name != "NME" and a.residue.name != "ACE"]
	active_pdb = md.load(active_pdb_file, atom_indices=atom_indices)
	featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
	active_pdb_features = featurizer.transform(active_pdb)
	active_pdb_projected = fit_model.transform(active_pdb_features)
	print(active_pdb_projected[0:4])
Example #47
def fit_and_transform(features_directory,
                      model_dir,
                      stride=5,
                      lag_time=10,
                      n_components=5):
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir
    #active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"

    tica_model = tICA(n_components=n_components, lag_time=lag_time)

    if not os.path.exists(projected_data_filename):
        print("loading feature files")
        feature_files = get_trajectory_files(features_directory, ext=".h5")
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()
        if not os.path.exists(fit_model_filename):
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
        else:
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            print("transforming")
            transformed_data = fit_model.transform(features)
            verbosedump(transformed_data, projected_data_filename)
    else:
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)

    print(fit_model.summarize())
Example #48
import numpy as np

import msmexplorer as msme
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Backbone Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions
a = np.concatenate(assignments, axis=0)
d = np.concatenate(diheds, axis=0)

# Plot Stacked Distributions of the sine of each Chi1 angle
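The listing is cut off after this comment. As a stand-in for the missing plotting call (plain matplotlib rather than whatever msmexplorer helper the original used), the stacked distributions could be sketched as:

import matplotlib.pyplot as plt

fig, axes = plt.subplots(msm.n_states_, 1, sharex=True)
for state, ax in enumerate(axes):
    ax.hist(np.sin(d[a == state, 0]), bins=50, density=True)  # sine of first chi1 feature
    ax.set_ylabel('state %d' % state)
axes[-1].set_xlabel('sin(chi1)')
plt.show()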
Example #49
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10, n_components=5,
                      wolf=True, shrinkage=None, rho=0.05, parallel=True, sparse=True,
                      traj_ext=".h5"):
	if not os.path.exists(model_dir):
		os.makedirs(model_dir)

	projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" %model_dir
	fit_model_filename  = "%s/phi_psi_chi2_allprot_tica_coords.h5" %model_dir
	#active_pdb_file = "/scratch/users/enf/b2ar_analysis/renamed_topologies/A-00.pdb"

	if not sparse:
		if shrinkage is None:
			tica_model = tICA(n_components = n_components, lag_time = lag_time)
		else:
			tica_model = tICA(n_components = n_components, lag_time = lag_time, shrinkage = shrinkage)
		
	else:
		if shrinkage is None:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho)
		else:
			tica_model = SparseTICA(n_components = n_components, lag_time = lag_time, rho = rho, shrinkage = shrinkage)

	if not os.path.exists(projected_data_filename):
		print("loading feature files")
		feature_files = get_trajectory_files(features_directory, ext = traj_ext)
		if len(feature_files) == 0: feature_files = get_trajectory_files(features_directory, ext = ".dataset")

		if not parallel:
			features = []
			for feature_file in feature_files:
				print("loading %s" % feature_file)
				features.append(load_features(feature_file))
		else:
			pool = mp.Pool(mp.cpu_count())
			features = pool.map(load_features, feature_files)
			pool.terminate()
		transpose = False
		for i in range(0, len(features)):
			if np.shape(features[0])[1] != np.shape(features[i])[1]:
				transpose = True
				break
		if transpose: 
			for i in range(0, len(features)):
				features[i] = np.transpose(features[i])
		print(np.shape(features[0]))
		print(features[0][0][0:10])
		print(np.shape(features))

		print("fitting data to tICA model")
		fit_model = tica_model.fit(features)
		print(fit_model.summarize())
		transformed_data = fit_model.transform(features)
		print("transformed data with tICA model")
		verbosedump(fit_model, fit_model_filename)
		print("saved tICA model")
		verbosedump(transformed_data, projected_data_filename)
		print("saved data projected onto tICA coords")

	else:
		print("already computed tICA model")
Example #50
# Load trajectories
################################################################################

print('loading trajectories...')
filenames = glob(os.path.join(source_directory, '*0.h5'))
trajectories = [md.load(filename) for filename in filenames]
print("We are analyzing %s trajectories." % len(trajectories))

################################################################################
# initialize dihedral and tICA features
################################################################################

print('initializing dihedral and tICA features...')
dihedrals = featurizer.DihedralFeaturizer(types=["chi1"]).transform(trajectories)
print("We are using %s chi1 dihedral features." % dihedrals[0].shape[1])
tica = decomposition.tICA(n_components=4, lag_time=1600)
X = tica.fit_transform(dihedrals)

################################################################################
# Make eigenvalues plot
################################################################################

plt.clf()
eigenvalues = (tica.eigenvalues_)**2

sum_eigenvalues = np.sum(eigenvalues[0:2])

print "This is the sum of the first two eigenvalues: %s." % sum_eigenvalues

plt.plot(eigenvalues)
plt.xlim(0, 4)
Example #51
def calculate_tica_components():
    print("Calculating tICA components...")
    in_files = glob.glob("out*npy")
    loaded_files = [ np.load(filename) for filename in in_files ]
    tica = tICA(lag_time=tica_lagtime,
        n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' %(tica_lagtime, tica_components), tica)
    tica_data = 'data_lag_%d_comp_%d' %(tica_lagtime, tica_components)
    joblib.dump(tica, tica_data)
    data = np.load('lag_%d_comp_%d.npy' %(tica_lagtime, tica_components))

    for i in range(len(glob.glob('out*npy'))): # extract the four tICA components
        for j in range(len(data[i])):
            tica_1.append(data[i][j][0])
            tica_2.append(data[i][j][1])
            tica_3.append(data[i][j][2])
            tica_4.append(data[i][j][3])

# Clustering via KCenters
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")
    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' %(tica_lagtime, n_clusters), sequences)
    np.save('lag_%d_clusters_%d_center.npy' %(tica_lagtime, n_clusters),
        clusters.cluster_centers_)
    cluster_data = 'lag_%d_clusters_%d.pkl' %(tica_lagtime, n_clusters)
    joblib.dump(sequences, cluster_data)

 # Determining cluster populations
    print("Determining cluster populations...")
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)]) # how many frames are in each cluster
    normalized_counts =  counts/float(counts.sum())
    percentages = [ i*100 for i in normalized_counts ]

# Plotting the tICA components
    print("Plotting tICA components with cluster centers...")
    plt.figure(0) # plotting tica_1, tica_2
    plt.hexbin(tica_1, tica_2, bins='log') #, cmap=cmaps.viridis
    x_centers = [clusters.cluster_centers_[i][0] for i in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[i][1] for i in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip(["%.4f"%i for i in percentages], x_centers, y_centers): # adds percentage contribution for each cluster
        plt.annotate(
          label,
          xy = (x, y), xytext = (-20, 20),
          textcoords = 'offset points', ha = 'right', va = 'bottom',
          bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
          arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
    plt.savefig('tica_1_2.png')
    plt.figure(1) # plotting tica_1, tica_3
    plt.hexbin(tica_1, tica_3, bins='log')
    x_centers = [clusters.cluster_centers_[i][0] for i in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[i][2] for i in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip([ "%.4f"%i for i in percentages], x_centers, y_centers):
        plt.annotate(
          label,
          xy = (x, y), xytext = (-20, 20),
          textcoords = 'offset points', ha = 'right', va = 'bottom',
          bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
          arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
    plt.savefig('tica_1_3.png')
    plt.figure(2) # plotting tica_2, tica_3
    plt.hexbin(tica_2, tica_3, bins='log')
    x_centers = [clusters.cluster_centers_[j][1] for j in range(len(clusters.cluster_centers_))]
    y_centers = [clusters.cluster_centers_[j][2] for j in range(len(clusters.cluster_centers_))]
    plt.plot(x_centers, y_centers, 'wo')
    for label, x, y in zip(["%.4f"%i for i in percentages], x_centers, y_centers):
        plt.annotate(
          label,
          xy = (x, y), xytext = (-20, 20),
          textcoords = 'offset points', ha = 'right', va = 'bottom',
          bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
          arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
    plt.savefig('tica_2_3.png')


   # Determining cluster entropy ( this yields errors for me )
    # print("Determining cluster entropy")
    # cluster_entropy = (-1.0*normalized_counts*np.log(normalized_counts)).sum()
    # np.savetxt('cluster_entropy.dat', cluster_entropy)

  
 # Determining the cluster populations and writing out PDBs for cluster centers
    print("Determining cluster populations...")
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)]) # how many frames are in each cluster
    normalized_counts =  counts/float(counts.sum())
    np.savetxt('populations.dat', normalized_counts)
    print("Performing cluster analytics and saving center PDBs...\n")
    for i in range(len(glob.glob("traj*xtc"))):
        n_snapshots = len(clusters.distances_[i])
        cluster_indices = np.arange(n_snapshots)[ (clusters.distances_[i] < 1e-6) ] # frames that have centers
        cluster_labels = sequences[i][cluster_indices] # number of cluster
	if cluster_indices.size != 0: # print only the trajectories that have cluster centers
            for j in range(len(cluster_labels)): # for each cluster center found in this trajectory
                print('Cluster center', cluster_labels[j], 'was found in trajectory', str(i) + '.')
                print('It is found on frame', cluster_indices[j], 'and has a relative population of',
                  "%.4f"%percentages[cluster_labels[j]], '%.')

        xtcfile = sorted(glob.glob("traj*xtc"))[i]
        for j in range(len(cluster_indices)): # actually saving the snapshots
            cluster_traj = md.load_frame(xtcfile, cluster_indices[j], top='structure.gro')
            cluster_traj.save_pdb('state_%d.pdb' %cluster_labels[j]+1)


   # Calculating IPTs
    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes, n_timescales=n_timescales,
        msm=MarkovStateModel(verbose=False))
    
    implied_timescale_data = 'ipt_lag_%d_clusters_%d.pkl' %(tica_lagtime, n_clusters)
    joblib.dump(timescales, implied_timescale_data)
    numpy_timescale_data = 'lag_%d_clusters_%d_timescales.npy' %(tica_lagtime, n_clusters)
    np.savetxt('lagtimes.txt', lagtimes)
    np.save(numpy_timescale_data, timescales)
   
# Plotting IPTs (lagtimes and timescales)
    print("Plotting Implied Timescales...")
    for i in range(n_timescales):
        plt.figure(42)
        plt.plot(lagtimes * time_step, timescales[:, i] * time_step, 'o-')
        plt.yscale('log')
        plt.xlabel('lagtime (ns)')
        plt.ylabel('Implied timescales (ns)')
        plt.savefig('lag_%d_clusters_%d_.png' % (tica_lagtime, n_clusters))
Example #52
def test_shape():
    model = tICA(n_components=3).fit([np.random.randn(100, 10)])
    eq(model.eigenvalues_.shape, (3,))
    eq(model.eigenvectors_.shape, (10, 3))
    eq(model.components_.shape, (3, 10))
Example #53
def fit_tica(self, lag_time):
    self.tica = tICA(n_components=10, lag_time=lag_time,
                     weighted_transform=True)
    self.tica.fit(self.seqs2d)
    self.ticax = self.tica.transform(self.seqs2d)
Example #54
"""Reduce dimensionality with tICA

msmbuilder autogenerated template version 2
created 2017-05-23T16:38:49.125259
please cite msmbuilder in any publications

"""

from msmbuilder.io import load_trajs, save_trajs, save_generic
from msmbuilder.decomposition import tICA

## Load
tica = tICA(n_components=5, lag_time=10, kinetic_mapping=True)
meta, ftrajs = load_trajs("ftrajs")

## Fit
tica.fit(ftrajs.values())

## Transform
ttrajs = {}
for k, v in ftrajs.items():
    ttrajs[k] = tica.partial_transform(v)

## Save
save_trajs(ttrajs, 'ttrajs', meta)
save_generic(tica, 'tica.pickl')
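The pickled model round-trips with the matching msmbuilder.io helpers; a short follow-up sketch for inspecting the fit:

from msmbuilder.io import load_generic, load_trajs

tica = load_generic('tica.pickl')
meta, ttrajs = load_trajs('ttrajs')
print(tica.timescales_)  # implied timescales of the fitted model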
Example #55
def test_1():
    np.random.seed(42)
    X = np.random.randn(10, 3)

    tica = tICA(n_components=2, lag_time=1)
    y2 = tica.fit_transform([np.copy(X)])[0]
Example #56
if not (os.path.isfile("phi_psi_chi2_features_vd_stride10.h5")):
	print("featurizing")
	phi_psi_chi2 = DihedralFeaturizer(types=['phi','psi','chi2'])
	features = phi_psi_chi2.transform(traj_list = traj)
	print("finished featurizing")
	verbosedump(features, "phi_psi_chi2_features_vd_stride10.h5")
else:
	print("loading existing features")
	features = verboseload("phi_psi_chi2_features_vd_stride10.h5")
	features = [np.concatenate(features)]

if not (os.path.isfile("reduced_phi_psi_chi_stride10.h5")):
	print("Fitting tICA model")
	tica_model = tICA(n_components=4)
	fitted_model = tica_model.fit(features)
	reduced_data = fitted_model.transform(features)
	verbosedump(reduced_data, "reduced_phi_psi_chi_stride10.h5")
	print(tica_model.summarize())
else:
	reduced_data = verboseload("reduced_phi_psi_chi_stride10.h5")

clusterer = KMedoids(n_clusters=9)

clusters = clusterer.fit_transform(reduced_data)[0]

center_locations = []

for i in range(0, len(clusters)):
	print(i)