예제 #1
0
def individual_traj_featurize(data_to_process):
    """Featurize one work item with the featurizer it requests.

    Parameters
    ----------
    data_to_process : sequence
        Indexable work item laid out as ``[featurizer_type, key, data]``:
        index 0 selects the featurizer (only ``'Dihedral'`` is supported),
        index 1 is passed through unchanged as the first item of the result,
        and index 2 is handed to the featurizer's ``fit_transform``.

    Returns
    -------
    list
        ``[data_to_process[1], featurized_data]``.

    Raises
    ------
    ValueError
        If ``data_to_process[0]`` is not a supported featurizer type.
    """
    featurizer_type = data_to_process[0]

    # Fail loudly on unknown types instead of hitting an unbound local below.
    if featurizer_type != 'Dihedral':
        raise ValueError(
            "Unsupported featurizer type: {!r} (expected 'Dihedral')".format(
                featurizer_type))

    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    featurized_data = featurizer.fit_transform(data_to_process[2])

    return [data_to_process[1], featurized_data]
예제 #2
0
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel

import numpy as np

import msmexplorer as msme

# Seeded random state, passed to the clusterer below.
rs = np.random.RandomState(42)

# Load Fs Peptide Data
# NOTE(review): FsPeptide and DihedralFeaturizer are not imported in the
# visible part of this script -- confirm they are imported elsewhere.
trajs = FsPeptide().get().trajectories

# Extract dihedral features
# NOTE(review): this comment used to say "Backbone Dihedrals", but the
# featurizer is configured with types=['chi1'] -- confirm which is intended.
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions
# Only the data preparation is visible here: the per-trajectory assignments
# are joined into one array along axis 0; the plotting call itself is not
# part of this snippet.
a = np.concatenate(assignments, axis=0)
def _count_clusters(labels):
    """Return the number of distinct values in *labels*, not counting -1."""
    unique_labels = set(labels)
    unique_labels.discard(-1)
    return len(unique_labels)


def main():
    """Run the iterative MR-DBSCAN example on the AlanineDipeptide dataset.

    Steps, in order:

    1. Parse the command-line options (``--eps``, ``--min_samples``,
       ``--nlist``, ``--nprobe``).
    2. Fetch the AlanineDipeptide example dataset, featurize it with phi/psi
       dihedrals and project it with a 2-component tICA model.
    3. Derive a list of ``eps`` values from the sorted pairwise distances of
       a random sample of frames of the first tICA trajectory.
    4. Run ``Faiss_DBSCAN`` once per ``eps`` value, merge each later run with
       the first run through ``merge_assignments`` and plot every result
       with ``plot_cluster``.
    5. Write ``eps_list.txt`` and ``min_samples_list.txt``.

    ``Faiss_DBSCAN``, ``merge_assignments`` and ``plot_cluster`` are expected
    to be defined elsewhere in this module.

    Raises
    ------
    ValueError
        If the first tICA trajectory has fewer than two frames, so no
        pairwise distance can be sampled.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m',
                     '--min_samples',
                     help='min_samples',
                     default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)
    # Parse before any heavy work so --help and invalid arguments exit at
    # once instead of after the dataset download and featurization.
    args = cli.parse_args()
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())

    print("{} trajectories".format(len(xyz)))
    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)

    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    # NOTE(review): the scaled features are only used for the shape print
    # below; tICA is fitted on the unscaled `diheds`. Confirm this is wanted.
    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)

    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # fit and transform can be done in separate steps:
    tica_model.fit(diheds)
    tica_trajs = tica_model.transform(diheds)

    # Featurize again with sincos=False; columns 0 and 1 of the result are
    # converted to degrees below and handed to plot_cluster.
    angle_featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    angle_trajs = angle_featurizer.fit_transform(xyz)
    print(angle_trajs[0].shape)
    print(tica_trajs[0].shape)

    phi_angles = np.degrees(angle_trajs[0][:, 0])
    psi_angles = np.degrees(angle_trajs[0][:, 1])
    print(phi_angles)

    # Only the first tICA trajectory is clustered.
    X = tica_trajs[0].astype(np.float32)
    n_size = X.shape[0]
    dimension = X.shape[1]

    # The command-line eps/min_samples are only reported here; the values
    # actually used for clustering are derived from the data below.
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, args.eps, args.min_samples))

    # ------------------------------------------------------------------
    # Estimate eps candidates from sampled pairwise distances.
    # Never ask for more samples than there are frames.
    n_samples = min(1000, n_size)
    if n_samples < 2:
        raise ValueError(
            "Need at least 2 frames to sample pairwise distances, got %d" %
            n_size)
    import random
    whole_samples = random.sample(list(X), n_samples)
    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)
    # Each unordered pair once: strict upper triangle of the distance matrix.
    upper_triangle = np.triu_indices(n_samples, k=1)
    sorted_sample_dist = np.sort(np.asarray(sample_dist_metric)[upper_triangle])
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:
        # Position of the requested fraction in the sorted distances,
        # clamped to the last valid index.
        index = min(int(round(len_samples * percent)), len_samples - 1)
        eps_list.append(sorted_sample_dist[index])
    print(eps_list)

    # ------------------------------------------------------------------
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    remove_outliers = False
    potential = False

    def new_dbscan(eps, min_samples):
        """Build an unfitted Faiss_DBSCAN with the shared index settings."""
        return Faiss_DBSCAN(eps=eps,
                            min_samples=min_samples,
                            nlist=nlist,
                            nprobe=nprobe,
                            metric="l2",
                            GPU=False,
                            IVFFlat=IVFFlat)

    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = new_dbscan(eps, min_samples)
    db.fit(X)
    old_assignments = db.labels_
    n_microstates = _count_clusters(old_assignments)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(
        old_assignments[old_assignments >= 0])  # remove outliers
    frame_freq_index_sorted = np.argsort(
        frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])

    # str() is used on purpose: it keeps the short text form of the numpy
    # scalars in the output name.
    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]

    # Iteration 0 is the run above; the remaining eps values are each paired
    # with the min_samples value at the same index.
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = new_dbscan(eps, min_samples).fit(X)
        new_assignments = db.labels_

        # remove_outliers is switched on for the last iteration only.
        if i == n_iterations - 1:
            remove_outliers = True
        # Every iteration is merged against the first run; old_assignments
        # is deliberately not updated inside the loop.
        assignments = merge_assignments(new_assignments,
                                        old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = _count_clusters(assignments)

        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)

    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
예제 #4
0
    md.load_dcd(
        "/home/sbhakat/Aurora/DESRES/Conv_trajectory/DCD_files/Conv-BPTI-all-0001.dcd",
        top="prot_maeconv.pdb",
        atom_indices=a),
    md.load_dcd(
        "/home/sbhakat/Aurora/DESRES/Conv_trajectory/DCD_files/Conv-BPTI-all-4068.dcd",
        top="prot_maeconv.pdb",
        atom_indices=a),
    md.load_dcd(
        "/home/sbhakat/Aurora/DESRES/Conv_trajectory/DCD_files/Conv-BPTI-all-4069.dcd",
        top="prot_maeconv.pdb",
        atom_indices=a)
]

# Persist the list of loaded trajectories.
dump(trj_list, "traj_list.pkl")

# Phi/psi dihedral featurizer; it is dumped here before fit_transform runs.
f = DihedralFeaturizer(types=['phi', 'psi'])
dump(f, "raw_featurizer.pkl")

# Featurize every trajectory in the list and persist the result.
feat = f.fit_transform(trj_list)
dump(feat, "raw_features.pkl")

# NOTE(review): this second featurizer is built with exactly the same
# arguments as the one above, so "featurizer.pkl" / "features.pkl" look like
# duplicates of the "raw_*" files -- confirm whether other settings were meant.
f = DihedralFeaturizer(types=['phi', 'psi'])
dump(f, "featurizer.pkl")

# Tabulate the feature descriptions obtained from the first trajectory.
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")

# Featurize all trajectories again with the second featurizer and persist.
feat = f.fit_transform(trj_list)
dump(feat, "features.pkl")
예제 #5
0
#prettier plots

#Loading the trajectory
# Atom indices 1119 through 1276 (np.arange excludes the stop value).
a = np.arange(1119, 1277)
ds = dataset("../../trajfit.xtc", topology="../prot.pdb", atom_indices=a)

#Featurization
# This featurizer (default sincos setting) is the one used for the features
# and the feature descriptor below.
featurizer = DihedralFeaturizer(types=['chi1', 'chi2'])
#dump(featurizer,"raw_featurizer.pkl")

#from msmbuilder.utils import load,dump
# NOTE(review): `f` (sincos=False) is only dumped to "raw_featurizer.pkl" and
# never used to compute features, so the saved featurizer does not match the
# one that produces "features.pkl" -- confirm this is intended.
f = DihedralFeaturizer(types=['chi1', 'chi2'], sincos=False)
dump(f, "raw_featurizer.pkl")

#featurizer = DihedralFeaturizer(types=['chi1', 'chi2'], resids= 73,74,75,76,77,78,79,80,81,82,83)
diheds = featurizer.fit_transform(ds)
dump(diheds, "features.pkl")

#print(ds[0].shape)
print(diheds[0].shape)

# this basically maps every feature to atom indices.
# NOTE(review): the whole dataset `ds` is passed here, whereas example #4 in
# this file passes a single trajectory (trj_list[0]) to describe_features --
# confirm which argument the method expects.
df1 = pd.DataFrame(featurizer.describe_features(ds))
dump(df1, "feature_descriptor.pkl")

#Robust scaling
from msmbuilder.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_diheds = scaler.fit_transform(diheds)

# Prints the shape of the unscaled features for the first item again.
print(diheds[0].shape)