def individual_traj_featurize(data_to_process):
    """Featurize one trajectory work item (worker for parallel featurization).

    Parameters
    ----------
    data_to_process : sequence
        ``[featurizer_type, traj_id, trajectory]`` where ``featurizer_type``
        is currently only ``'Dihedral'``, ``traj_id`` is an opaque identifier
        that is passed back unchanged, and ``trajectory`` is handed directly
        to the featurizer's ``fit_transform``.

    Returns
    -------
    list or None
        ``[traj_id, featurized_data]`` when ``featurizer_type`` is
        ``'Dihedral'``; ``None`` for any other type (previously this was an
        implicit fall-through — made explicit here).
    """
    featurizer_type = data_to_process[0]
    if featurizer_type == 'Dihedral':
        # Backbone phi/psi dihedral featurization of the supplied trajectory.
        featurizer_data = DihedralFeaturizer(types=['phi', 'psi'])
        featurized_data = featurizer_data.fit_transform(data_to_process[2])
        return [data_to_process[1], featurized_data]
    # Unsupported featurizer types are not handled; signal that explicitly.
    return None
# Module-level pipeline: featurize the Fs-peptide dataset, reduce with tICA,
# cluster, and build a Markov state model.  NOTE(review): `FsPeptide`,
# `DihedralFeaturizer`, and the `msme` plotting import are referenced below
# but imported elsewhere in the file — confirm they are in scope at runtime.
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.msm import MarkovStateModel
import numpy as np
import msmexplorer as msme

# Fixed seed so the MiniBatchKMeans clustering below is reproducible.
rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Backbone Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Plot Stacked Distributions
# Flatten per-trajectory assignment arrays into one 1-D array for plotting.
a = np.concatenate(assignments, axis=0)
def main():
    """Run iterative MR-DBSCAN clustering on tICA-projected alanine dipeptide data.

    Pipeline: download the AlanineDipeptide example set, featurize with
    phi/psi dihedrals, project with tICA, estimate a ladder of ``eps`` values
    from sampled pairwise distances, then run Faiss_DBSCAN repeatedly with
    decreasing ``eps`` / ``min_samples``, merging assignments between rounds
    and plotting each result.  Writes ``eps_list.txt`` and
    ``min_samples_list.txt`` as side effects.

    Command-line arguments: ``--eps``, ``--min_samples``, ``--nlist``,
    ``--nprobe`` (Faiss index tuning).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m', '--min_samples', help='min_samples', default=5, type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())
    print("{} trajectories".format(len(xyz)))
    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    # Sin/cos-transformed phi/psi dihedrals for the tICA projection.
    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)
    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)
    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # fit and transform can be done in seperate steps:
    tica_model.fit(diheds)
    tica_trajs = tica_model.transform(diheds)

    # Re-featurize WITHOUT sin/cos so columns are raw angles (radians) that
    # can be converted to degrees for the Ramachandran-style plots below.
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)

    # Cluster in the 2-D tICA space of the first trajectory only.
    X = tica_trajs[0].astype(np.float32)
    n_size = X.shape[0]
    dimension = X.shape[1]

    args = cli.parse_args()
    eps = args.eps  # eps
    min_samples = args.min_samples  # min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d'
          % (n_size, dimension, eps, min_samples))

    # Estimate eps candidates from the distance distribution of a random
    # subsample (distance at the 30th/20th/10th percentile).
    n_samples = 1000
    import random
    whole_samples = random.sample(list(X), n_samples)
    # NOTE(review): this import looks like it may be meant to come from
    # sklearn.metrics.pairwise — confirm the local `metrics` package exists.
    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples, whole_samples, metric='l2')
    print(sample_dist_metric.shape)

    # Collect the strict upper triangle (each pair once, no self-distances).
    sample_dist = []
    for i in range(0, n_samples):
        for j in range(i + 1, n_samples):
            sample_dist.append(sample_dist_metric[i, j])
    sorted_sample_dist = np.sort(sample_dist)
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:
        index = int(round(len_samples * percent))
        if index == len_samples:
            # Guard against rounding to one-past-the-end.
            index -= 1
        dc = sorted_sample_dist[index]
        eps_list.append(dc)
    print(eps_list)

    # ===========================================================================
    # do Clustering using MR-DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    remove_outliers = False
    potential = False

    # First (coarsest) round: largest eps, min_samples forced to 1.
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                      nprobe=nprobe, metric="l2", GPU=False, IVFFlat=IVFFlat)
    db.fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    old_assignments = db.labels_
    # Noise points are labelled -1 and do not count as a cluster.
    n_microstates = len(set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states
    frame_bincount = np.bincount(old_assignments[old_assignments >= 0])  # remove outliers
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending arg sort
    frame_freq_percent_sorted = frame_bincount[frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])
    old_frame_freq_percent_sorted = frame_freq_percent_sorted
    old_frame_freq_index_sorted = frame_freq_index_sorted

    n_microstates = len(set(old_assignments)) - (1 if -1 in old_assignments else 0)
    print('Estimated number of clusters: %d' % n_microstates)
    iter_name = (clustering_name + '0' + '_eps_' + str(eps)
                 + '_min_samples_' + str(min_samples)
                 + '_n_states_' + str(n_microstates))
    plot_cluster(labels=old_assignments, phi_angles=phi_angles,
                 psi_angles=psi_angles, name=iter_name, potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]
    n_min_samples = len(min_samples_list)
    results = np.zeros((n_min_samples, n_iterations, len_frames), dtype=np.int32)

    # Refinement rounds: tighter eps with the matching min_samples; each
    # round's labels are merged into the previous assignment.
    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps, min_samples=min_samples, nlist=nlist,
                          nprobe=nprobe, metric="l2", GPU=False,
                          IVFFlat=IVFFlat).fit(X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        new_assignments = db.labels_
        # BUGFIX: was `i is n_iterations - 1` — identity comparison on a
        # computed int only works by CPython small-int interning; use `==`.
        if i == n_iterations - 1:
            # Only drop outliers on the final, tightest round.
            remove_outliers = True
        assignments = merge_assignments(new_assignments, old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = len(set(assignments)) - (1 if -1 in assignments else 0)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        iter_name = (clustering_name + str(i) + '_eps_' + str(eps)
                     + '_min_samples_' + str(min_samples)
                     + '_n_states_' + str(n_microstates))
        plot_cluster(labels=assignments, phi_angles=phi_angles,
                     psi_angles=psi_angles, name=iter_name, potential=potential)
        #old_assignments = assignments

    # Persist the parameter ladder used for this run.
    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt", min_samples_list, fmt="%d", delimiter=",")
# NOTE(review): this fragment begins mid-list — the opening `trj_list = [`
# (and the mdtraj import `md`, the `a` atom-index array, and `dump`) are
# defined outside this view; confirm against the full file.
    md.load_dcd(
        "/home/sbhakat/Aurora/DESRES/Conv_trajectory/DCD_files/Conv-BPTI-all-0001.dcd",
        top="prot_maeconv.pdb",
        atom_indices=a),
    md.load_dcd(
        "/home/sbhakat/Aurora/DESRES/Conv_trajectory/DCD_files/Conv-BPTI-all-4068.dcd",
        top="prot_maeconv.pdb",
        atom_indices=a),
    md.load_dcd(
        "/home/sbhakat/Aurora/DESRES/Conv_trajectory/DCD_files/Conv-BPTI-all-4069.dcd",
        top="prot_maeconv.pdb",
        atom_indices=a)
]
# Persist the loaded trajectory list for later reuse.
dump(trj_list, "traj_list.pkl")

# First pass: phi/psi featurizer and its raw features.
f = DihedralFeaturizer(types=['phi', 'psi'])
dump(f, "raw_featurizer.pkl")
feat = f.fit_transform(trj_list)
dump(feat, "raw_features.pkl")

# Second pass: identical featurizer, saved under different names, plus a
# per-feature descriptor table (maps each feature to atom indices).
f = DihedralFeaturizer(types=['phi', 'psi'])
dump(f, "featurizer.pkl")
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")
feat = f.fit_transform(trj_list)
dump(feat, "features.pkl")
#prettier plots #Loading the trajectory a = np.arange(1119, 1277) ds = dataset("../../trajfit.xtc", topology="../prot.pdb", atom_indices=a) #Featurization featurizer = DihedralFeaturizer(types=['chi1', 'chi2']) #dump(featurizer,"raw_featurizer.pkl") #from msmbuilder.utils import load,dump f = DihedralFeaturizer(types=['chi1', 'chi2'], sincos=False) dump(f, "raw_featurizer.pkl") #featurizer = DihedralFeaturizer(types=['chi1', 'chi2'], resids= 73,74,75,76,77,78,79,80,81,82,83) diheds = featurizer.fit_transform(ds) dump(diheds, "features.pkl") #print(ds[0].shape) print(diheds[0].shape) # this basically maps every feature to atom indices. df1 = pd.DataFrame(featurizer.describe_features(ds)) dump(df1, "feature_descriptor.pkl") #Robust scaling from msmbuilder.preprocessing import RobustScaler scaler = RobustScaler() scaled_diheds = scaler.fit_transform(diheds) print(diheds[0].shape)