def transform_protein_tica(yaml_file):
    """Project each protein's feature files through the saved tICA model.

    Loads ``tica_mdl.pkl`` from ``yaml_file["mdl_dir"]``. For every protein in
    ``yaml_file["protein_list"]`` the ``*.jl`` feature files are read from
    ``./normalized_features`` (when that directory exists inside the protein's
    data directory) or else from ``yaml_file["feature_dir"]``, transformed with
    ``partial_transform`` and written, keyed by file basename, to
    ``tica_data.pkl`` in the protein's model directory.

    :param yaml_file: already-loaded project configuration, indexed like a dict
    :return: None
    """
    mdl_dir = yaml_file["mdl_dir"]
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_mdl_path)

    for protein in yaml_file["protein_list"]:
        print("Transforming protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            if os.path.exists("./normalized_features"):
                pattern = "./normalized_features/*.jl"
            else:
                print('Warning: features have not been scaled')
                pattern = "./%s/*.jl" % yaml_file["feature_dir"]
            featurized_traj = sorted(glob.glob(pattern), key=keynat)

            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except Exception as exc:
                    # Still best effort (one bad file must not abort the run),
                    # but report what was skipped instead of hiding it.
                    print("Warning: could not transform %s (%s); skipping"
                          % (f, exc))
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
                print("Done transforming protein %s" % protein)

    # dumping the tica_mdl again since the eigenspectrum might have been calculated
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
예제 #2
0
def featurize_project(proj_folder,top_folder,featurizer_object,stride,view):
    """Featurize every ``*.dcd`` trajectory of a project, caching the result.

    If ``<proj_folder>/featurized_traj.pkl`` already exists it is loaded and
    returned without recomputation.

    :param proj_folder: project directory containing ``trajectories/``
    :param top_folder: forwarded unchanged to ``featurize_traj``
    :param featurizer_object: path to a saved featurizer, or None to use a
        ``DihedralFeaturizer`` over phi/psi/chi1
    :param stride: forwarded unchanged to ``featurize_traj``
    :param view: object exposing ``map_sync(func, jobs)`` used to run the jobs
    :return: dict built from the ``(key, value)`` pairs returned by the jobs
    """
    cache_path = proj_folder + "/featurized_traj.pkl"

    # if already featurized dont bother (should add a warning about this)
    if os.path.exists(cache_path):
        return verboseload(cache_path)

    if featurizer_object is None:
        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    else:
        try:
            featurizer = verboseload(featurizer_object)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit are not
            # reported as a featurizer loading failure.
            sys.exit("Cant Load Featurizer using msmbuilder verboseload")

    traj_list = glob.glob(proj_folder + "/trajectories/*.dcd")

    jobs = [(proj_folder, top_folder, featurizer, traj, stride)
            for traj in traj_list]
    results = view.map_sync(featurize_traj, jobs)

    feature_dict = {result[0]: result[1] for result in results}

    verbosedump(feature_dict, cache_path)

    return feature_dict
예제 #3
0
def load_current_protein_model(yaml_file, protein, sanity=True):
    """
    Load the project-level and protein-level models stored under mdl_dir.

    :param yaml_file: handed to load_yaml_file; the result must provide
                "base_dir" and "mdl_dir"
    :param protein: Protein for which to load (sub-directory of mdl_dir)
    :param sanity: Whether or not to run _sanity_test on what was loaded
    :return: base_dir, mdl_dir,
                msm_mdl, tica_mdl,
                tica_data, kmeans_mdl,
                fixed_assignments for the model currently stored in
                mdl_dir and mdl_dir/protein
    """
    config = load_yaml_file(yaml_file)
    base_dir = config["base_dir"]
    mdl_dir = config["mdl_dir"]
    prot_mdl_dir = os.path.join(mdl_dir, protein)

    def _load(folder, filename):
        # Small wrapper so each artifact below is a single readable line.
        return verboseload(os.path.join(folder, filename))

    # project-wide artifacts
    kmeans_mdl = _load(mdl_dir, "kmeans_mdl.pkl")
    tica_mdl = _load(mdl_dir, "tica_mdl.pkl")

    # per-protein artifacts; the fixed assignments are the ones that are used
    tica_data = _load(prot_mdl_dir, "tica_data.pkl")
    assignments = _load(prot_mdl_dir, "fixed_assignments.pkl")
    msm_mdl = _load(prot_mdl_dir, "msm_mdl.pkl")

    if sanity:
        _sanity_test(base_dir, protein, msm_mdl,
                     tica_data, kmeans_mdl, assignments)

    loaded = (base_dir, mdl_dir, msm_mdl, tica_mdl,
              tica_data, kmeans_mdl, assignments)
    return loaded
def transform_protein_tica(yaml_file):
    """Project each protein's feature files through the saved tICA model.

    Loads ``tica_mdl.pkl`` from ``yaml_file["mdl_dir"]``; for every protein in
    ``yaml_file["protein_list"]`` the ``*.jl`` files under
    ``yaml_file["feature_dir"]`` are transformed with ``partial_transform``
    and written, keyed by file basename, to ``tica_data.pkl`` in the
    protein's model directory.

    :param yaml_file: already-loaded project configuration, indexed like a dict
    :return: None
    """
    mdl_dir = yaml_file["mdl_dir"]
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    protein_tica_mdl = verboseload(tica_mdl_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            tica_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    tica_data[os.path.basename(f)] = \
                        protein_tica_mdl.partial_transform(featurized_path)
                except Exception as exc:
                    # Keep going on a bad file, but say which one was dropped
                    # rather than silently producing an incomplete dataset.
                    print("Warning: could not transform %s (%s); skipping"
                          % (f, exc))
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(tica_data, 'tica_data.pkl')
                print("Done transforming protein %s" % protein)

    # dumping the tica_mdl again since the eigenspectrum might have been calculated.
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
예제 #5
0
def featurize_project(proj_folder, top_folder, featurizer_object, stride,
                      view):
    """Featurize every ``*.dcd`` trajectory of a project, caching the result.

    If ``<proj_folder>/featurized_traj.pkl`` already exists it is loaded and
    returned without recomputation.

    :param proj_folder: project directory containing ``trajectories/``
    :param top_folder: forwarded unchanged to ``featurize_traj``
    :param featurizer_object: path to a saved featurizer, or None to use a
        ``DihedralFeaturizer`` over phi/psi/chi1
    :param stride: forwarded unchanged to ``featurize_traj``
    :param view: object exposing ``map_sync(func, jobs)`` used to run the jobs
    :return: dict built from the ``(key, value)`` pairs returned by the jobs
    """
    cache_path = proj_folder + "/featurized_traj.pkl"

    # if already featurized dont bother (should add a warning about this)
    if os.path.exists(cache_path):
        return verboseload(cache_path)

    if featurizer_object is None:
        featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi1'])
    else:
        try:
            featurizer = verboseload(featurizer_object)
        except Exception:
            # Narrowed from a bare except so an interrupt is not misreported
            # as a featurizer loading problem.
            sys.exit("Cant Load Featurizer using msmbuilder verboseload")

    traj_list = glob.glob(proj_folder + "/trajectories/*.dcd")

    jobs = [(proj_folder, top_folder, featurizer, traj, stride)
            for traj in traj_list]
    results = view.map_sync(featurize_traj, jobs)

    feature_dict = {result[0]: result[1] for result in results}

    verbosedump(feature_dict, cache_path)

    return feature_dict
def transform_protein_pca(yaml_file):
    """Project each protein's feature files through the saved PCA model.

    Loads ``pca_mdl.pkl`` from ``yaml_file["mdl_dir"]``; for every protein in
    ``yaml_file["protein_list"]`` the ``*.jl`` files under
    ``yaml_file["feature_dir"]`` are transformed with ``partial_transform``
    and written, keyed by file basename, to ``pca_data.pkl`` in the protein's
    model directory.

    :param yaml_file: already-loaded project configuration, indexed like a dict
    :return: None
    """
    mdl_dir = yaml_file["mdl_dir"]
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    protein_pca_mdl = verboseload(pca_mdl_path)
    for protein in yaml_file["protein_list"]:
        with enter_protein_data_dir(yaml_file, protein):
            print("Transforming protein %s" % protein)
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            pca_data = {}
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    pca_data[os.path.basename(f)] = \
                        protein_pca_mdl.partial_transform(featurized_path)
                except Exception as exc:
                    # A bare 'Error' gave no clue which file failed or why.
                    print('Error transforming %s: %s' % (f, exc))
            with enter_protein_mdl_dir(yaml_file, protein):
                verbosedump(pca_data, 'pca_data.pkl')
                print("Done transforming protein %s" % protein)

    # dumping the pca_mdl again since the eigenspectrum might have been calculated
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def fit_protein_kmeans(yaml_file,mini=True,pca=False):
    """Fit one k-means model on the reduced data of every protein.

    Keyword arguments for the estimator are the entries of
    ``yaml_file["mdl_params"]`` whose key starts with ``cluster__`` (prefix
    stripped). The fitted model is dumped to ``<mdl_dir>/kmeans_mdl.pkl``.

    :param yaml_file: already-loaded project configuration, indexed like a dict
    :param mini: use MiniBatchKMeans with ``batch_size = 100 * n_clusters``
        instead of KMeans
    :param pca: read ``pca_data.pkl`` instead of ``tica_data.pkl``
    :return: None
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    prefix = "cluster__"
    cluster_kwargs = {
        name.split(prefix)[1]: mdl_params[name]
        for name in mdl_params.keys()
        if name.startswith(prefix)
    }

    if mini:
        cluster_kwargs["batch_size"] = 100 * cluster_kwargs["n_clusters"]
        estimator_cls = MiniBatchKMeans
    else:
        estimator_cls = KMeans
    kmeans_mdl = estimator_cls(**cluster_kwargs)

    reduced_file = "pca_data.pkl" if pca else "tica_data.pkl"
    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            reduced = verboseload(reduced_file)
            # append trajectories in natural-sorted key order
            for traj_name in sorted(reduced.keys(), key=keynat):
                data.append(reduced[traj_name])

    kmeans_mdl.fit(data)
    verbosedump(kmeans_mdl, os.path.join(mdl_dir, "kmeans_mdl.pkl"))
    return
예제 #8
0
def cos_to_means(clusterer_dir, features_dir):
    """Rank each cluster's members by the score ``find_cos`` gives them.

    For every cluster id ``0..n-1`` in the map built by ``make_clusters_map``,
    ``find_cos`` is called per member with that cluster's center and the
    loaded features; the results are sorted by their third element, largest
    first.

    :param clusterer_dir: path of the saved clusterer
    :param features_dir: path of the saved features
    :return: dict mapping cluster id to its sorted list of results
    """
    clusterer = verboseload(clusterer_dir)
    clusters_map = make_clusters_map(clusterer)
    features = verboseload(features_dir)

    feature_distances = {}
    for cluster_id in range(len(clusters_map.keys())):
        members = clusters_map[cluster_id]
        centroid = clusterer.cluster_centers_[cluster_id]
        print(centroid)
        feature_distances[cluster_id] = [
            find_cos(member, k_mean=centroid, features=features)
            for member in members
        ]

    # progress / debugging output
    print(feature_distances[0][0:10])
    print(list(feature_distances.keys()))
    print(len(feature_distances.keys()))

    sorted_map = {
        cluster_id: sorted(feature_distances[cluster_id],
                           key=lambda entry: entry[2],
                           reverse=True)
        for cluster_id in range(len(feature_distances.keys()))
    }

    print(sorted_map[0][0:10])
    return sorted_map
예제 #9
0
def cluster_minikmeans(tica_dir,
                       data_dir,
                       traj_dir,
                       n_clusters,
                       clusterer_dir=None,
                       tICs=None):
    """Fit (or refresh) a MiniBatchKMeans clusterer and save it.

    If ``clusterer_dir`` already exists, the saved clusterer is loaded, its
    ``labels_`` are recomputed with ``transform`` on the data loaded from
    ``data_dir``, and it is saved back. Otherwise a new
    ``MiniBatchKMeans(n_clusters, n_init=10)`` is fitted and saved.

    :param tica_dir: unused here; kept for interface compatibility
    :param data_dir: path of the reduced data
    :param traj_dir: unused here; kept for interface compatibility
    :param n_clusters: number of clusters for a newly fitted model
    :param clusterer_dir: path the clusterer is loaded from / saved to;
        required despite the ``None`` default
    :param tICs: optional column selection applied to every trajectory before
        fitting a new model
    :raises TypeError: if ``clusterer_dir`` is None
    :return: None
    """
    if clusterer_dir is None:
        # Previously this surfaced as an opaque TypeError from os.path.exists.
        raise TypeError("clusterer_dir must be a path, not None")

    if os.path.exists(clusterer_dir):
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
        return

    print("Clustering by KMeans")
    try:
        reduced_data = verboseload(data_dir)
    except Exception:
        # Fall back to the dataset loader when verboseload cannot read it.
        reduced_data = load_dataset(data_dir)

    if tICs is not None:
        X = [traj[:, tICs] for traj in reduced_data]
    else:
        X = reduced_data

    clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
    clusterer.fit_transform(X)
    verbosedump(clusterer, clusterer_dir)
예제 #10
0
def plot_tica_and_clusters(tica_dir,
                           transformed_data_dir,
                           clusterer_dir,
                           lag_time,
                           component_i=0,
                           component_j=1):
    """Hexbin two components of the transformed data and label cluster centers.

    The figure is written to
    ``<tica_dir>/c<component_i>_c<component_j>_clusters<n>.pdf``.

    :param tica_dir: output directory for the PDF
    :param transformed_data_dir: path of the saved transformed trajectories
    :param clusterer_dir: path of the saved clusterer
    :param lag_time: unused here; kept for interface compatibility
    :param component_i: column plotted on the x axis
    :param component_j: column plotted on the y axis
    :return: None
    """
    transformed_data = verboseload(transformed_data_dir)
    clusterer = verboseload(clusterer_dir)

    trajs = np.concatenate(transformed_data)
    plt.hexbin(trajs[:, component_i],
               trajs[:, component_j],
               bins='log',
               mincnt=1)

    centers = clusterer.cluster_centers_
    n_centers = np.shape(centers)[0]
    for i in range(n_centers):
        center = centers[i, :]
        # Label at the plotted components; columns 0/1 were used before, which
        # is only right for the default component pair.
        # NOTE(review): assumes centers share the column layout of the data.
        point = (center[component_i], center[component_j])
        plt.annotate('%d' % i, xy=point, xytext=point, size=6)

    pp = PdfPages("%s/c%d_c%d_clusters%d.pdf" %
                  (tica_dir, component_i, component_j, n_centers))
    try:
        pp.savefig()
    finally:
        # always release the file handle, even if saving fails
        pp.close()
예제 #11
0
def fit_and_transform(features_directory, model_dir, stride=5, lag_time=10, n_components = 5):
    """Fit a tICA model on the feature files (if needed) and project them.

    Results are cached in ``model_dir``: when the projected data file already
    exists both model and projection are loaded; when only the model exists it
    is loaded and used to transform; otherwise a new model is fitted first.
    Finally ``fit_model.summarize()`` is printed.

    :param features_directory: directory searched for ``.h5`` feature files
    :param model_dir: directory for the model/projection files (created if
        missing)
    :param stride: unused here; kept for interface compatibility
    :param lag_time: lag time for a newly created tICA model
    :param n_components: number of components for a newly created tICA model
    :return: None
    """
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    projected_data_filename = "%s/phi_psi_chi2_allprot_projected.h5" % model_dir
    fit_model_filename = "%s/phi_psi_chi2_allprot_tica_coords.h5" % model_dir

    if os.path.exists(projected_data_filename):
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)
    else:
        print("loading feature files")
        feature_files = get_trajectory_files(features_directory, ext=".h5")
        pool = mp.Pool(mp.cpu_count())
        try:
            features = pool.map(load_features, feature_files)
        finally:
            # do not leak worker processes if loading fails
            pool.terminate()

        if os.path.exists(fit_model_filename):
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
            print("transforming")
        else:
            print("fitting data to tICA model")
            tica_model = tICA(n_components=n_components, lag_time=lag_time)
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)

        transformed_data = fit_model.transform(features)
        verbosedump(transformed_data, projected_data_filename)

    # print() call form works on both Python 2 and Python 3
    print(fit_model.summarize())
예제 #12
0
def cos_to_means(clusterer_dir, features_dir):
    """Rank each cluster's members by the score ``find_cos`` gives them.

    For every cluster id ``0..n-1`` in the map built by ``make_clusters_map``,
    ``find_cos`` is called per member with that cluster's center and the
    loaded features; results are sorted by their third element, largest first.

    :param clusterer_dir: path of the saved clusterer
    :param features_dir: path of the saved features
    :return: dict mapping cluster id to its sorted list of results
    """
    clusterer = verboseload(clusterer_dir)
    clusters_map = make_clusters_map(clusterer)

    features = verboseload(features_dir)
    feature_distances = {}

    for i in range(0, len(clusters_map.keys())):
        indices = clusters_map[i]
        k_mean = clusterer.cluster_centers_[i]
        print(k_mean)
        find_cos_partial = partial(find_cos, k_mean=k_mean, features=features)
        # list(): on Python 3 map() is lazy and cannot be sliced below
        feature_distances[i] = list(map(find_cos_partial, indices))

    print(feature_distances[0][0:10])
    sorted_map = {}

    print(list(feature_distances.keys()))
    print(len(feature_distances.keys()))

    for i in range(0, len(feature_distances.keys())):
        sorted_map[i] = sorted(feature_distances[i],
                               key=lambda x: x[2],
                               reverse=True)

    print(sorted_map[0][0:10])
    return sorted_map
예제 #13
0
def dist_to_means(clusterer_dir, features_dir):
    """Rank each cluster's members by cosine similarity to the cluster center.

    Despite the name, the score computed per member is
    ``dot(a, b) / (|a| * |b|)`` between the member's feature vector and the
    center; members are sorted with the largest score first.

    :param clusterer_dir: path of the saved clusterer
    :param features_dir: path of the saved features, indexed as
        ``features[traj][frame]``
    :return: dict mapping cluster id to a sorted list of
        ``(traj, frame, score)`` tuples
    """
    clusterer = verboseload(clusterer_dir)
    clusters_map = make_clusters_map(clusterer)

    features = verboseload(features_dir)
    feature_distances = {}

    def find_cos(index, k_mean):
        # index is a (traj, frame) pair identifying one conformation
        traj = index[0]
        frame = index[1]
        a = features[traj][frame]
        b = k_mean
        return (traj, frame, np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

    for i in range(0, len(clusters_map.keys())):
        indices = clusters_map[i]
        k_mean = clusterer.cluster_centers_[i]
        print(k_mean)
        find_cos_partial = partial(find_cos, k_mean=k_mean)
        # list(): on Python 3 map() is lazy and cannot be sliced below
        feature_distances[i] = list(map(find_cos_partial, indices))

    print(feature_distances[0][0:10])
    sorted_map = {}

    print(list(feature_distances.keys()))
    print(len(feature_distances.keys()))

    for i in range(0, len(feature_distances.keys())):
        sorted_map[i] = sorted(feature_distances[i],
                               key=lambda x: x[2],
                               reverse=True)

    print(sorted_map[0][0:10])
    return sorted_map
예제 #14
0
def load_current_protein_model(yaml_file, protein, sanity=True):
    """
    Read back every saved model artifact for one protein.

    :param yaml_file: given to load_yaml_file; the loaded config must contain
                "base_dir" and "mdl_dir"
    :param protein: Protein for which to load
    :param sanity: Whether or not to run sanity tests
    :return: base_dir, mdl_dir,
                msm_mdl, tica_mdl,
                tica_data, kmeans_mdl,
                fixed_assignments for the model currently stored in
                mdl_dir and mdl_dir/protein
    """
    settings = load_yaml_file(yaml_file)
    base_dir = settings["base_dir"]
    mdl_dir = settings["mdl_dir"]
    prot_mdl_dir = os.path.join(mdl_dir, protein)

    # shared across proteins
    kmeans_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    kmeans_mdl = verboseload(kmeans_path)
    tica_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    tica_mdl = verboseload(tica_path)

    # specific to this protein (fixed assignments are required downstream)
    tica_data_path = os.path.join(prot_mdl_dir, "tica_data.pkl")
    tica_data = verboseload(tica_data_path)
    assignments_path = os.path.join(prot_mdl_dir, "fixed_assignments.pkl")
    assignments = verboseload(assignments_path)
    msm_path = os.path.join(prot_mdl_dir, "msm_mdl.pkl")
    msm_mdl = verboseload(msm_path)

    if sanity:
        _sanity_test(base_dir, protein, msm_mdl, tica_data, kmeans_mdl,
                     assignments)
    return (base_dir, mdl_dir, msm_mdl, tica_mdl,
            tica_data, kmeans_mdl, assignments)
예제 #15
0
def landmark_ktica(features_dir, combined_features_file=None, feature_ext=".dataset",
                   use_clusters_as_landmarks=True, clusters_map_file="",
                   landmarks_dir="", nystroem_components=1000, n_components=10,
                   lag_time=5, nystroem_data_filename="",
                   fit_model_filename="", projected_data_filename="",
                   landmark_subsample=10,
                   sparse=False, shrinkage=0.05, wolf=False, rho=0.01):
    """Choose landmark points and run ``ktica`` on the features.

    Nothing is done when ``nystroem_data_filename`` already exists.

    features_dir: directory where the featurized trajectories are kept; used
        when ``combined_features_file`` is None.
    combined_features_file: optional single file containing all featurized
        trajectories (e.g. a list of np arrays).
    feature_ext: extension of the per-trajectory feature files.
    use_clusters_as_landmarks: if true (and no landmarks file exists yet),
        landmarks are taken from ``clusters_map_file``, a JSON dict mapping
        cluster id to a list of ``(trajectory_id, frame_number)`` pairs;
        otherwise ``nystroem_components`` frames are drawn at random.
    landmarks_dir: file the landmarks are loaded from if it exists, and saved
        to otherwise.
    nystroem_components: number of random landmarks to draw.
    n_components, lag_time: accepted for interface compatibility; not
        forwarded to ``ktica`` by this function.
    nystroem_data_filename, fit_model_filename, projected_data_filename:
        output paths forwarded to ``ktica``.
    landmark_subsample: keep every n-th cluster-derived landmark.
    sparse, shrinkage, wolf, rho: forwarded unchanged to ``ktica``.
    """
    if os.path.exists(nystroem_data_filename):
        return

    # was: `combined_features_dir`, a name that does not exist (NameError)
    if combined_features_file is not None:
        features = verboseload(combined_features_file)
    else:
        features = load_file_list(get_trajectory_files(features_dir, ext=feature_ext))

    if os.path.exists(landmarks_dir):
        landmarks = verboseload(landmarks_dir)
        print(np.shape(landmarks))
    elif use_clusters_as_landmarks:
        with open(clusters_map_file) as f:
            clusters_map = json.load(f)
        clusters_map = {int(k): v for k, v in clusters_map.items()}

        landmarks = []
        for cluster_id, sample_list in clusters_map.items():
            for sample in sample_list:
                traj, frame = sample[0], sample[1]
                landmarks.append(features[traj][frame])
        # Subsample once, after collecting everything. Doing it inside the
        # loop (as before) repeatedly re-thinned the list down to one entry.
        landmarks = landmarks[::landmark_subsample]

        verbosedump(landmarks, landmarks_dir)
    else:
        features_concatenated = np.concatenate(features)
        # Draw from all frames; the trajectory count was used before, which
        # restricted the choice to the first few frames.
        n = np.shape(features_concatenated)[0]
        indices = np.random.choice(n, nystroem_components)
        landmarks = features_concatenated[indices, :]
        verbosedump(landmarks, landmarks_dir)

    ktica(features, landmarks, projected_data_filename, nystroem_data_filename,
          fit_model_filename, sparse, shrinkage, wolf, rho)
예제 #16
0
def dist_to_means(clusterer_dir, features_dir, n_samples = False, n_components = False, tica_coords_csv = False, kmeans_csv = False):
    """Rank each cluster's members by ``find_dist`` to the cluster center.

    Members are sorted by the third element of each ``find_dist`` result,
    smallest first. Optionally writes two CSV files via ``write_map_to_csv``.

    :param clusterer_dir: path of the saved clusterer
    :param features_dir: path of the saved features (``verboseload`` is tried
        first, then ``load_dataset``)
    :param n_samples: number of top-ranked members per cluster to export
    :param n_components: number of ``component_<k>`` column titles to emit
    :param tica_coords_csv: output path for the exported member coordinates;
        written only when ``n_samples``, ``n_components`` and this path are
        all given
    :param kmeans_csv: output path for the cluster centers
    :return: dict mapping cluster id to its sorted list of results
    """
    clusterer = verboseload(clusterer_dir)
    clusters_map = make_clusters_map(clusterer)

    try:
        features = verboseload(features_dir)
    except Exception:
        features = load_dataset(features_dir)
    feature_distances = {}

    for i in range(0, len(clusters_map.keys())):
        indices = clusters_map[i]
        k_mean = clusterer.cluster_centers_[i]
        print(k_mean)
        find_dist_partial = partial(find_dist, k_mean=k_mean, features=features)
        # list(): on Python 3 map() is lazy and cannot be sliced or re-read
        feature_distances[i] = list(map(find_dist_partial, indices))

    print(feature_distances[0][0:10])
    sorted_map = {}

    print(list(feature_distances.keys()))
    print(len(feature_distances.keys()))

    for i in range(0, len(feature_distances.keys())):
        sorted_map[i] = sorted(feature_distances[i],
                               key=lambda x: x[2],
                               reverse=False)

    if n_samples is not False and n_components is not False and tica_coords_csv is not False:
        tica_coords_map = {}
        for cluster_id in sorted_map.keys():
            # Slice instead of indexing so a cluster with fewer than
            # n_samples members does not raise IndexError.
            for j, entry in enumerate(sorted_map[cluster_id][:n_samples]):
                sample = "cluster%d_sample%d" % (cluster_id, j)
                traj, frame = entry[0:2]
                tica_coords_map[sample] = features[traj][frame]
        titles = ["sample"] + ["component_%d" % k for k in range(0, n_components)]
        if tica_coords_map:
            # dict views are not indexable on Python 3
            first_sample = next(iter(tica_coords_map))
            print(first_sample)
            print(tica_coords_map[first_sample])
        write_map_to_csv(tica_coords_csv, tica_coords_map, titles)

    if kmeans_csv is not False:
        kmeans_map = {}
        for cluster in range(0, clusterer.n_clusters):
            kmeans_map["cluster%d" % cluster] = clusterer.cluster_centers_[cluster]
        titles = ["cluster"] + ["component_%d" % k for k in range(0, n_components)]
        write_map_to_csv(kmeans_csv, kmeans_map, titles)

    print(sorted_map[0][0:10])
    return sorted_map
예제 #17
0
def plot_all_tics_and_clusters(tica_dir,
                               transformed_data_dir,
                               clusterer_dir,
                               lag_time,
                               tic_range=None,
                               main="",
                               label="dot",
                               active_cluster_ids=[],
                               intermediate_cluster_ids=[],
                               inactive_cluster_ids=[],
                               inactive_subsample=5,
                               intermediate_subsample=5,
                               custom_cluster_centers=None,
                               concatenate=True,
                               axes=None):
    """Plot every pair ``(i, j)`` with ``j > i`` of the requested components.

    For each ``i`` in ``tic_range`` a process pool maps
    ``plot_tica_and_clusters`` over all larger ``j`` in ``tic_range``; the
    remaining arguments are forwarded as keywords.

    :param tica_dir: forwarded to the plotting function
    :param transformed_data_dir: path of the transformed data
        (``verboseload`` is tried first, then ``load_dataset``)
    :param clusterer_dir: path of the saved clusterer; only read when
        ``custom_cluster_centers`` is None
    :param lag_time: forwarded to the plotting function
    :param tic_range: components to plot; defaults to all columns
    :param custom_cluster_centers: centers to use instead of the clusterer's
    :param concatenate: when true the column count is read from the first
        element of the data, otherwise from the data itself
    :param main, label, active_cluster_ids, intermediate_cluster_ids,
        inactive_cluster_ids, inactive_subsample, intermediate_subsample,
        axes: forwarded unchanged (the list defaults are never mutated here)
    :return: None
    """
    try:
        transformed_data = verboseload(transformed_data_dir)
    except Exception:
        transformed_data = load_dataset(transformed_data_dir)

    if custom_cluster_centers is None:
        clusterer = verboseload(clusterer_dir)
        centers = clusterer.cluster_centers_
    else:
        # Previously `centers` stayed undefined here and the call below
        # failed with a NameError.
        centers = custom_cluster_centers

    if not concatenate:
        num_tics = np.shape(transformed_data)[1]
    else:
        num_tics = np.shape(transformed_data[0])[1]
    if tic_range is None:
        tic_range = range(0, num_tics)

    for i in tic_range:
        js = [j for j in tic_range if j > i]
        plot_partial = partial(
            plot_tica_and_clusters,
            n_clusters=len(centers),
            tica_dir=tica_dir,
            main=main,
            transformed_data=transformed_data,
            lag_time=lag_time,
            label=label,
            active_cluster_ids=active_cluster_ids,
            intermediate_cluster_ids=intermediate_cluster_ids,
            inactive_cluster_ids=inactive_cluster_ids,
            inactive_subsample=inactive_subsample,
            intermediate_subsample=intermediate_subsample,
            component_i=i,
            centers=centers,
            concatenate=concatenate,
            axes=axes)
        pool = mp.Pool(mp.cpu_count())
        try:
            pool.map(plot_partial, js)
        finally:
            # do not leak worker processes if a plot fails
            pool.terminate()
    print("Printed all tICA coords and all requested clusters")
예제 #18
0
def plot_col(transformed_data_file, figure_directory, colors_file):
    """Scatter the first two columns of the data, colored per point, to a PDF.

    :param transformed_data_file: path of the saved transformed trajectories
    :param figure_directory: path of the PDF file to write (passed straight
        to ``PdfPages``)
    :param colors_file: path of the saved per-point color values
    :return: None
    """
    transformed_data = verboseload(transformed_data_file)
    trajs = np.concatenate(transformed_data)
    colors = np.concatenate(verboseload(colors_file))
    sc = plt.scatter(trajs[:, 0], trajs[:, 1], c=colors, s=50, cmap=mpl.cm.RdYlBu_r)
    plt.colorbar(sc)

    # Save before show(): with an interactive backend the figure can be
    # destroyed when the window is closed, leaving an empty page to save.
    pp = PdfPages(figure_directory)
    try:
        pp.savefig()
    finally:
        pp.close()
    plt.show()
    return
예제 #19
0
def landmark_ktica_ticaTraj(tica_dir, clusterer_dir, ktica_dir, clusters_map_file = "", landmarks_dir = "", nystroem_components=1000, n_components=10, lag_time=5, nystroem_data_filename = "", fit_model_filename = "", projected_data_filename = "", landmark_subsample=1, sparse = False, wolf = True, rho = 0.01, shrinkage = None):
    """Run Nystroem with cluster centers as basis, then tICA on the result.

    The Nystroem output is cached at ``nystroem_data_filename`` and the final
    projection at ``projected_data_filename``; existing files are reused.

    :param tica_dir: path of the saved data fed to Nystroem
    :param clusterer_dir: path of the saved clusterer whose
        ``cluster_centers_`` become the Nystroem basis
    :param ktica_dir: directory created if missing
    :param n_components, lag_time: parameters of the tICA model
    :param sparse: use ``SparseTICA`` (with ``rho``) instead of ``tICA``
    :param shrinkage: passed to the model only when not None
    :param nystroem_data_filename, fit_model_filename,
        projected_data_filename: output paths
    :param clusters_map_file, landmarks_dir, nystroem_components,
        landmark_subsample, wolf: unused here; kept for interface
        compatibility
    :return: None
    """
    import shutil  # local import: only needed by the overwrite helper

    def _save_replacing(data, path):
        # Save; if that fails, remove whatever is at `path` and retry once.
        try:
            save_dataset(data, path)
        except Exception:
            # Replaces os.system("rm -rf %s"), which broke on (or was
            # exploitable through) paths with spaces or shell metacharacters.
            if os.path.isdir(path) and not os.path.islink(path):
                shutil.rmtree(path)
            elif os.path.lexists(path):
                os.remove(path)
            save_dataset(data, path)

    if not os.path.exists(ktica_dir):
        os.makedirs(ktica_dir)

    model_kwargs = {"n_components": n_components, "lag_time": lag_time}
    if shrinkage is not None:
        model_kwargs["shrinkage"] = shrinkage
    if sparse:
        tica_model = SparseTICA(rho=rho, **model_kwargs)
    else:
        tica_model = tICA(**model_kwargs)

    if not os.path.exists(nystroem_data_filename):
        clusterer = verboseload(clusterer_dir)
        features = verboseload(tica_dir)
        landmarks = clusterer.cluster_centers_

        print("here's what goes into the combined class:")
        print(np.shape(landmarks))
        print(type(landmarks))
        nys = Nystroem(n_components=np.shape(landmarks)[0], basis=landmarks)
        nyx = nys.fit_transform(features)
        del features
        del landmarks
        _save_replacing(nyx, nystroem_data_filename)
    else:
        nyx = load_dataset(nystroem_data_filename)

    print(np.shape(nyx))
    print(dir(nyx))

    if not os.path.exists(projected_data_filename):
        fit_model = tica_model.fit(nyx)
        verbosedump(fit_model, fit_model_filename)
        transformed_data = fit_model.transform(nyx)
        del nyx
        _save_replacing(transformed_data, projected_data_filename)
    else:
        print("Already performed landmark kernel tICA.")
예제 #20
0
 def __init__(self, yaml_file, relative_loc=None):
     """Load the project config plus its saved k-means and tICA models.

     :param yaml_file: handed to load_yaml_file; the result must provide
         "base_dir" and "mdl_dir"
     :param relative_loc: optional directory; when given, the models are read
         from ``<relative_loc>/<last component of mdl_dir>`` instead of
         ``mdl_dir``
     """
     config = load_yaml_file(yaml_file)
     self.yaml_file = config
     self.base_dir = config["base_dir"]
     self.mdl_dir = config["mdl_dir"]

     if relative_loc is not None:
         model_root = os.path.join(relative_loc,
                                   os.path.basename(self.mdl_dir))
     else:
         model_root = self.mdl_dir
     self.relative_loc = model_root

     kmeans_path = os.path.join(self.relative_loc, "kmeans_mdl.pkl")
     self.kmeans_mdl = verboseload(kmeans_path)
     tica_path = os.path.join(self.relative_loc, "tica_mdl.pkl")
     self.tica_mdl = verboseload(tica_path)
예제 #21
0
 def __init__(self, yaml_file, relative_loc=None):
     """Read the project configuration and its saved k-means / tICA models.

     :param yaml_file: passed to load_yaml_file; the loaded config must
         contain "base_dir" and "mdl_dir"
     :param relative_loc: if not None, models are looked up under
         ``<relative_loc>/<last component of mdl_dir>`` rather than ``mdl_dir``
     """
     self.yaml_file = load_yaml_file(yaml_file)
     self.base_dir = self.yaml_file["base_dir"]
     self.mdl_dir = self.yaml_file["mdl_dir"]
     self.relative_loc = (
         self.mdl_dir
         if relative_loc is None
         else os.path.join(relative_loc, os.path.split(self.mdl_dir)[1])
     )

     def _load_model(filename):
         # both models live side by side in relative_loc
         return verboseload(os.path.join(self.relative_loc, filename))

     self.kmeans_mdl = _load_model("kmeans_mdl.pkl")
     self.tica_mdl = _load_model("tica_mdl.pkl")
예제 #22
0
def _test_protein_with_project(prj):
    """Check that Protein objects expose the MSM stored on disk.

    Builds ``Protein`` instances for "kinase_1" and "kinase_2" and asserts
    that the left eigenvectors of both ``msm`` and ``bootrap_msm.mle_`` equal
    those of the ``msm_mdl.pkl`` saved under ``prj.mdl_dir/<protein>``.

    :param prj: project object providing ``mdl_dir``
    :return: True when every assertion passes
    """
    p1 = Protein(prj, "kinase_1")
    p2 = Protein(prj, "kinase_2")
    assert isinstance(p1, Protein)
    assert isinstance(p1.msm, MarkovStateModel)

    def _stored_eigenvectors(name):
        # reload from disk for every comparison
        msm_path = os.path.join(prj.mdl_dir, name, "msm_mdl.pkl")
        return verboseload(msm_path).left_eigenvectors_

    for name, prot in (("kinase_1", p1), ("kinase_2", p2)):
        assert (prot.msm.left_eigenvectors_ ==
                _stored_eigenvectors(name)).all()
        assert (prot.bootrap_msm.mle_.left_eigenvectors_ ==
                _stored_eigenvectors(name)).all()
    return True
예제 #23
0
def plot_col(transformed_data_file, figure_directory, colors_file):
    """Scatter the first two columns of the data, colored per point, to a PDF.

    :param transformed_data_file: path of the saved transformed trajectories
    :param figure_directory: path of the PDF file to write (passed straight
        to ``PdfPages``)
    :param colors_file: path of the saved per-point color values
    :return: None
    """
    transformed_data = verboseload(transformed_data_file)
    trajs = np.concatenate(transformed_data)
    colors = np.concatenate(verboseload(colors_file))
    sc = plt.scatter(trajs[:, 0],
                     trajs[:, 1],
                     c=colors,
                     s=50,
                     cmap=mpl.cm.RdYlBu_r)
    plt.colorbar(sc)

    # Write the PDF before show(): an interactive backend may discard the
    # figure once its window is closed, so saving afterwards can be empty.
    pp = PdfPages(figure_directory)
    try:
        pp.savefig()
    finally:
        pp.close()
    plt.show()
    return
예제 #24
0
def plot_tica_and_clusters(tica_dir, transformed_data_dir, clusterer_dir, lag_time, component_i = 0, component_j = 1):
    """Hexbin two components of the transformed data and label cluster centers.

    The figure is written to
    ``<tica_dir>/c<component_i>_c<component_j>_clusters<n>.pdf``.

    :param tica_dir: output directory for the PDF
    :param transformed_data_dir: path of the saved transformed trajectories
    :param clusterer_dir: path of the saved clusterer
    :param lag_time: unused here; kept for interface compatibility
    :param component_i: column plotted on the x axis
    :param component_j: column plotted on the y axis
    :return: None
    """
    transformed_data = verboseload(transformed_data_dir)
    clusterer = verboseload(clusterer_dir)

    trajs = np.concatenate(transformed_data)
    plt.hexbin(trajs[:, component_i], trajs[:, component_j], bins='log', mincnt=1)

    centers = clusterer.cluster_centers_
    n_centers = np.shape(centers)[0]
    for i in range(n_centers):
        center = centers[i, :]
        # Use the plotted components for the label position; columns 0/1 were
        # hard-coded, which is only right for the default pair.
        # NOTE(review): assumes centers share the column layout of the data.
        point = (center[component_i], center[component_j])
        plt.annotate('%d' % i, xy=point, xytext=point, size=6)

    pp = PdfPages("%s/c%d_c%d_clusters%d.pdf" % (tica_dir, component_i, component_j, n_centers))
    try:
        pp.savefig()
    finally:
        # always release the file handle, even if saving fails
        pp.close()
예제 #25
0
def _slice_file(job_tuple):
    """Keep only selected columns of one feature file and save the result.

    :param job_tuple: ``(inp_file, feature_ind, output_folder)``: the file
        to load, the column selection, and the directory that receives a
        file with the same basename
    :return: None
    """
    source_path, columns, destination_dir = job_tuple
    selected = verboseload(source_path)[:, columns]
    target_path = os.path.join(destination_dir, os.path.basename(source_path))
    verbosedump(selected, target_path)
예제 #26
0
def build_msm(clusterer_dir, lag_time, output_dir="/scratch/users/enf/b2ar_analysis"):
    """Fit a MarkovStateModel on the clusterer's labels and export it.

    Writes the fitted model to ``msm_model_<n>_clusters_t<lag>`` and a
    ``;``-separated transition table to
    ``msm_<n>_clusters_t<lag>_edgelist.csv``, both inside ``output_dir``.
    The table has a header row and a leading column taken from
    ``mapping_[k]``; probabilities not above 1e-6 are written as ``0``.

    :param clusterer_dir: path of the saved clusterer (its ``labels_`` and
        ``cluster_centers_`` are used)
    :param lag_time: lag time of the MarkovStateModel
    :param output_dir: directory for both output files; defaults to the
        location that was hard-coded before
    :return: None
    """
    clusterer = verboseload(clusterer_dir)
    n_clusters = np.shape(clusterer.cluster_centers_)[0]
    labels = clusterer.labels_
    msm_modeler = MarkovStateModel(lag_time=lag_time)
    print("fitting msm to trajectories with %d clusters and lag_time %d" %(n_clusters, lag_time))
    msm_modeler.fit_transform(labels)
    verbosedump(msm_modeler,
                os.path.join(output_dir, "msm_model_%d_clusters_t%d" % (n_clusters, lag_time)))
    print("fitted msm to trajectories with %d states" %(msm_modeler.n_states_))

    transmat = msm_modeler.transmat_
    mapping = msm_modeler.mapping_
    n_states = msm_modeler.n_states_

    edges_path = os.path.join(
        output_dir, "msm_%d_clusters_t%d_edgelist.csv" % (n_clusters, lag_time))
    # Text mode ("w"): writing str to a "wb" handle is a TypeError on
    # Python 3. The with-block closes the file even if a row fails.
    with open(edges_path, "w") as edges:
        header = "".join(";%d" % mapping[j] for j in range(n_states))
        edges.write(header + "\n")
        for i in range(n_states):
            cells = []
            for j in range(n_states):
                prob = transmat[i][j]
                cells.append("%f" % prob if prob > 0.000001 else "0")
            edges.write("%d" % mapping[i])
            edges.write("".join(";" + cell for cell in cells))
            edges.write("\n")
예제 #27
0
def _slice_file(job_tuple):
    """Write a column-subset copy of a feature file into another folder.

    :param job_tuple: 3-tuple of input file path, column selection and output
        folder; the output keeps the input file's basename
    :return: None
    """
    in_path, keep_columns, out_dir = job_tuple
    full_features = verboseload(in_path)
    reduced_features = full_features[:, keep_columns]
    out_name = os.path.basename(in_path)
    verbosedump(reduced_features, os.path.join(out_dir, out_name))
    return None
예제 #28
0
def plot_tics_gmm_R(save_dir, data_file, gmm_dir, titles = None, tICA = False, scale = 1.0, refcoords_file = None):
    """Plot every data column via ``plot_column_pair`` using a process pool.

    The concatenated data has its first column multiplied by ``scale``. For
    each column ``j`` the ``means_`` of the pickled model in
    ``<gmm_dir>/tIC<j>_gmm.pkl.gz`` are collected and forwarded to the
    plotting function.

    :param save_dir: forwarded to ``plot_column_pair``
    :param data_file: path of the saved list of trajectories
    :param gmm_dir: directory holding the gzipped per-column pickles
    :param titles: forwarded to ``plot_column_pair``
    :param tICA: unused here; kept for interface compatibility
    :param scale: factor applied in place to column 0
    :param refcoords_file: optional file read with ``load_file``
    :return: None
    """
    data = np.concatenate(verboseload(data_file))
    data[:, 0] *= scale

    refcoords = None
    if refcoords_file is not None:
        refcoords = load_file(refcoords_file)
    print(np.shape(refcoords))
    print(refcoords)

    def _read_means(column):
        # one gzipped pickle per column; only its means_ are needed
        with gzip.open("%s/tIC%d_gmm.pkl.gz" % (gmm_dir, column)) as handle:
            model = pickle.load(handle)
        return model.means_

    num_columns = np.shape(data)[1]
    gmm_means = [_read_means(column) for column in range(num_columns)]

    plot_one = partial(plot_column_pair,
                       num_columns=num_columns,
                       save_dir=save_dir,
                       titles=titles,
                       data=data,
                       gmm_means=gmm_means,
                       refcoords=refcoords)
    workers = mp.Pool(mp.cpu_count())
    workers.map(plot_one, range(num_columns))
    workers.terminate()

    print("Done plotting columns")
    return
예제 #29
0
def macrostate_pcca(msm_file, clusterer_file, n_macrostates, macrostate_dir):
    """Fit a PCCA lumping model on the clusterer's labels and save it.

    :param msm_file: path of a saved MSM. It is loaded (so a missing file
        still fails here) but the model is not consumed below.
    :param clusterer_file: path of a saved clusterer exposing ``labels_``.
    :param n_macrostates: number of macrostates passed to ``lumping.PCCA``.
    :param macrostate_dir: destination path handed to ``verbosedump``.
    :return: None
    """
    msm = verboseload(msm_file)
    clusterer = verboseload(clusterer_file)

    # The macrostate count used to be hard-coded to 10, which silently
    # ignored the value supplied by the caller.
    pcca_object = lumping.PCCA(n_macrostates=n_macrostates)
    pcca_object.fit(sequences=clusterer.labels_)
    print(pcca_object)
    print(pcca_object.microstate_mapping_)
    verbosedump(pcca_object, macrostate_dir)
예제 #30
0
def test_dihedral_feat():
    """Check stored dihedral features against freshly computed ones.

    For each test protein the project is featurized through
    ``featurize_project_wrapper``; three randomly chosen trajectories are then
    re-featurized directly and compared with the ``.jl`` files on disk.

    :return: True when every comparison passes.
    """
    print(base_dir)
    pool = Pool(6)
    yaml_file = load_yaml_file(os.path.join(base_dir,"mdl_dir","project.yaml"))

    for prt in ["kinase_1", "kinase_2"]:
        print(prt)
        # The value is not used afterwards; the lookup itself is retained.
        prj = yaml_file["project_dict"][prt][0]
        featurize_project_wrapper(yaml_file, prt, feat=None, stride=1, view=pool)

        reference_featurizer = DihedralFeaturizer(types=['phi', 'psi','chi1'])
        traj_pattern = os.path.join(base_dir, prt , yaml_file["protein_dir"],"*.hdf5")
        for traj_path in np.random.choice(glob.glob(traj_pattern), 3):
            recomputed = reference_featurizer.partial_transform(mdt.load(traj_path))
            stem = os.path.splitext(os.path.basename(traj_path))[0]
            stored_path = os.path.join(base_dir, prt,
                                       yaml_file["feature_dir"],
                                       stem+".jl")
            stored = verboseload(stored_path)

            assert np.allclose(recomputed, stored)

    return True
def fit_protein_pca(yaml_file):
    """Incrementally fit one PCA model on every protein's features and save it.

    :param yaml_file: loaded project settings; the keys ``mdl_dir``,
        ``mdl_params``, ``protein_list`` and ``feature_dir`` are read.
        Entries of ``mdl_params`` whose key starts with ``pca__`` are passed
        to ``PCA`` with that prefix removed.
    :return: None. The model is dumped to ``<mdl_dir>/pca_mdl.pkl``.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Strip the prefix by slicing: splitting on it would truncate any
    # parameter name that happens to contain the prefix text again.
    prefix = "pca__"
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }

    protein_pca_mdl = PCA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_pca_mdl.partial_fit(featurized_path)
                except Exception as err:
                    # Still best effort (the file is skipped), but the
                    # failure is reported instead of vanishing silently.
                    print("Warning: skipping %s during PCA fit (%s)" % (f, err))
            print("Done partial fitting to protein %s" % protein)
    # dumping the pca_mdl
    pca_mdl_path = os.path.join(mdl_dir, "pca_mdl.pkl")
    verbosedump(protein_pca_mdl, pca_mdl_path)
    return
def fit_protein_tica(yaml_file,sparse=False):
    """Incrementally fit one tICA model on every protein's features and save it.

    :param yaml_file: loaded project settings; the keys ``mdl_dir``,
        ``mdl_params``, ``protein_list`` and ``feature_dir`` are read.
        Entries of ``mdl_params`` whose key starts with ``tica__`` are passed
        to the model with that prefix removed.
    :param sparse: when equal to True a ``SparseTICA`` model is built,
        otherwise a ``tICA`` model.
    :return: None. The model is dumped to ``<mdl_dir>/tica_mdl.pkl``.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    # Strip the prefix by slicing: splitting on it would truncate any
    # parameter name that happens to contain the prefix text again.
    prefix = "tica__"
    current_mdl_params = {
        key[len(prefix):]: value
        for key, value in mdl_params.items()
        if key.startswith(prefix)
    }

    # Equality test kept as-is so the model choice is unchanged for callers
    # that pass non-boolean values.
    if sparse==True:
        protein_tica_mdl = SparseTICA(**current_mdl_params)
    else:
        protein_tica_mdl = tICA(**current_mdl_params)

    for protein in yaml_file["protein_list"]:
        print("Fitting to protein %s" % protein)
        with enter_protein_data_dir(yaml_file, protein):
            featurized_traj = sorted(glob.glob("./%s/*.jl" %
                                               yaml_file["feature_dir"]), key=keynat)
            for f in featurized_traj:
                featurized_path = verboseload(f)
                try:
                    protein_tica_mdl.partial_fit(featurized_path)
                except Exception as err:
                    # Still best effort (the file is skipped), but the
                    # failure is reported instead of vanishing silently.
                    print("Warning: skipping %s during tICA fit (%s)" % (f, err))
            print("Done partial fitting to protein %s" % protein)
    # dumping the tica_mdl
    tica_mdl_path = os.path.join(mdl_dir, "tica_mdl.pkl")
    verbosedump(protein_tica_mdl, tica_mdl_path)
    return
def transform_protein_kmeans(yaml_file):
    """Assign every protein's tICA data to clusters with the saved k-means model.

    :param yaml_file: loaded project settings; ``mdl_dir`` and
        ``protein_list`` are read here.
    :return: None. ``assignments.pkl`` is written inside each protein's
        model directory.
    """
    kmeans_mdl = verboseload(os.path.join(yaml_file["mdl_dir"], "kmeans_mdl.pkl"))
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            # predict() is given a one-element list, so its first (only)
            # result is the entry kept for this key.
            assignments = {
                traj_name: kmeans_mdl.predict([traj_tics])[0]
                for traj_name, traj_tics in tica_data.items()
            }
            verbosedump(assignments, 'assignments.pkl')

            print("Done assigning %s" % protein)
예제 #34
0
def cluster(data_dir, traj_dir, n_clusters):
    """Cluster reduced data with MiniBatchKMedoids and export each center.

    The first two columns of the concatenated data are drawn as a hexbin
    plot, every cluster center is annotated on it, and the frame referenced
    by ``cluster_ids_`` for each center is saved as a PDB file under
    ``/scratch/users/enf/b2ar_analysis``.

    :param data_dir: path loaded with ``verboseload``.
    :param traj_dir: passed to ``get_trajectory_files`` to list trajectories.
    :param n_clusters: number of clusters for ``MiniBatchKMedoids``.
    :return: None
    """
    reduced_data = verboseload(data_dir)
    stacked = np.concatenate(reduced_data)
    plt.hexbin(stacked[:, 0], stacked[:, 1], bins='log', mincnt=1)

    clusterer = MiniBatchKMedoids(n_clusters=n_clusters)
    clusterer.fit_transform(reduced_data)

    for idx, center in enumerate(clusterer.cluster_centers_):
        x, y = center[0], center[1]
        plt.scatter(x, y)
        plt.annotate('C%d' % idx,
                     xy=(x, y),
                     xytext=(x + 0.1, y + 0.1),
                     arrowprops=dict(facecolor='black', shrink=0.05))

        # location[0] indexes the trajectory list, location[1] the frame.
        location = clusterer.cluster_ids_[idx, :]
        print(location)
        traj = get_trajectory_files(traj_dir)[location[0]]
        print(("traj = %s" % traj))
        print(("frame = %d" % location[1]))
        md.load_frame(traj, location[1]).save_pdb(
            "/scratch/users/enf/b2ar_analysis/cluster_%d.pdb" % idx)

    plt.show()
예제 #35
0
def plot_timescales(clusterer_dir,
                    n_clusters,
                    tica_dir,
                    main="",
                    lag_times=list(range(1, 50))):
    """Plot implied timescales of the clusterer's labels and save them as PDF.

    :param clusterer_dir: path of the saved clusterer (needs ``labels_``).
    :param n_clusters: only used in the output file name.
    :param tica_dir: folder receiving the PDF.
    :param main: plot title, also embedded in the output file name.
    :param lag_times: lag times handed to ``implied_timescales`` and used as
        the x axis.
    :return: None
    """
    clusterer = verboseload(clusterer_dir)
    print(clusterer)
    n_timescales = 5

    reference_msm = MarkovStateModel(verbose=True,
                                     prior_counts=1e-5,
                                     ergodic_cutoff='off')
    msm_timescales = implied_timescales(clusterer.labels_,
                                        lag_times,
                                        n_timescales=n_timescales,
                                        msm=reference_msm)
    print(msm_timescales)

    # One curve per timescale column.
    for column in range(n_timescales):
        plt.plot(lag_times, msm_timescales[:, column])
    plt.xlabel("Lag time (ns)")
    plt.ylabel("Implied Timescales (ns)")
    plt.title(main)
    plt.semilogy()

    pdf_path = "%s/%s_n_clusters%d_implied_timescales.pdf" % (
        tica_dir, main, n_clusters)
    pp = PdfPages(pdf_path)
    pp.savefig()
    pp.close()
    plt.clf()
예제 #36
0
def test_map_tic_component():
    """Check the per-atom and per-residue importances of the first tIC.

    The pipeline is fitted, the feature descriptor and one trajectory of
    ``kinase_1`` are loaded, and ``_map_tic_component`` is verified for
    output sizes and, as a spot check, for the importance of residue 0.
    """
    yaml_file = load_yaml_file(os.path.join(base_dir,"mdl_dir","project.yaml"))
    fit_pipeline(yaml_file["base_dir"])

    with enter_protein_data_dir(yaml_file, "kinase_1"):
        descriptor_path = os.path.join(yaml_file["feature_dir"],
                                       "feature_descriptor.h5")
        df = pd.DataFrame(verboseload(descriptor_path))
        trj = mdt.load(os.path.join(yaml_file["protein_dir"], "fake_proj1_0_0.hdf5"))

    prt = Protein(ProteinSeries(yaml_file,base_dir), "kinase_1")
    tic_index = 0
    t_c = prt.tica_mdl.components_[tic_index, :]

    a_i, r_i = _map_tic_component(t_c, df, trj)

    assert len(a_i[0]) == trj.n_atoms
    assert len(r_i[0]) == trj.n_residues

    # spot check residue 0: sum the absolute coefficients of every feature
    # whose "resids" entry contains residue 0
    rows_with_res0 = [row for _, row in df.iterrows() if 0 in row["resids"]]
    df2 = pd.DataFrame(rows_with_res0)
    r0_imp = np.sum(abs(t_c[df2.index]))
    assert r0_imp==r_i[0,0]
def fit_protein_kmeans(yaml_file,mini=True):
    """Fit one k-means model on the tICA data of all proteins and save it.

    :param yaml_file: loaded project settings; ``mdl_dir``, ``mdl_params``
        and ``protein_list`` are read. ``mdl_params`` entries starting with
        ``cluster__`` become model keyword arguments.
    :param mini: when truthy a ``MiniBatchKMeans`` is used with
        ``batch_size`` set to 100 times ``n_clusters``; otherwise ``KMeans``.
    :return: None. The model is dumped to ``<mdl_dir>/kmeans_mdl.pkl``.
    """
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {
        key.split("cluster__")[1]: mdl_params[key]
        for key in mdl_params.keys()
        if key.startswith("cluster__")
    }

    if mini:
        current_mdl_params["batch_size"] = 100*current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)

    # Gather the trajectories of every protein, naturally sorted by name.
    data = []
    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            tica_data = verboseload("tica_data.pkl")
            for traj_name in sorted(tica_data.keys(), key=keynat):
                data.append(tica_data[traj_name])

    kmeans_mdl.fit(data)
    verbosedump(kmeans_mdl, os.path.join(mdl_dir, "kmeans_mdl.pkl"))
예제 #38
0
def macrostate_pcca(msm_file, clusterer_file, n_macrostates, macrostate_dir):
    """Fit a PCCA lumping model on the clusterer's labels and save it.

    :param msm_file: path of a saved MSM. It is loaded (so a missing file
        still fails here) but the model is not consumed below.
    :param clusterer_file: path of a saved clusterer exposing ``labels_``.
    :param n_macrostates: number of macrostates passed to ``lumping.PCCA``.
    :param macrostate_dir: destination path handed to ``verbosedump``.
    :return: None
    """
    msm = verboseload(msm_file)
    clusterer = verboseload(clusterer_file)

    # The macrostate count used to be hard-coded to 10, which silently
    # ignored the value supplied by the caller.
    pcca_object = lumping.PCCA(n_macrostates=n_macrostates)
    pcca_object.fit(sequences=clusterer.labels_)
    print(pcca_object)
    print((pcca_object.microstate_mapping_))
    verbosedump(pcca_object, macrostate_dir)
def fit_bootstrap(yaml_file,pool=None):
    """Fit a bootstrapped MSM for every protein and save models and assignments.

    :param yaml_file: loaded project settings; ``mdl_params`` and
        ``protein_list`` are read. ``msm__*`` entries become ``msm_args``;
        ``bootstrap__n_samples`` sets the sample count (100 when absent).
    :param pool: forwarded to ``BootStrapMarkovStateModel.fit``.
    :return: None. ``bootstrap_msm_mdl.pkl``, ``msm_mdl.pkl`` and
        ``fixed_assignments.pkl`` are written in each protein's model dir.
    """
    mdl_params = yaml_file["mdl_params"]
    current_mdl_params = {
        key.split("msm__")[1]: mdl_params[key]
        for key in mdl_params.keys()
        if key.startswith("msm__")
    }

    bootstrap__n_samples = (mdl_params["bootstrap__n_samples"]
                            if "bootstrap__n_samples" in mdl_params.keys()
                            else 100)

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BootStrapMarkovStateModel(n_samples=bootstrap__n_samples,
                                                n_procs=2,
                                                msm_args=current_mdl_params)
            msm_mdl.fit([assignments[name] for name in assignments], pool=pool)
            verbosedump(msm_mdl, "bootstrap_msm_mdl.pkl")
            verbosedump(msm_mdl.mle_, "msm_mdl.pkl")
            # Re-express each trajectory through the MLE model's transform
            # in 'fill' mode and keep the first returned element.
            fixed_assignments = {
                name: msm_mdl.mle_.transform(assignments[name], mode='fill')[0]
                for name in assignments
            }
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
def fit_bayes_msms(yaml_file):
    """Fit a Bayesian MSM for every protein and save model and assignments.

    :param yaml_file: loaded project settings; ``mdl_params`` and
        ``protein_list`` are read. ``msm__lag_time`` is required;
        ``bayesmsm__n_samples`` (default 800) and ``bayesmsm__n_steps``
        (default 1000000) are optional.
    :return: None. ``bayesmsm_mdl.pkl`` and ``fixed_assignments.pkl`` are
        written in each protein's model dir.
    """
    mdl_params = yaml_file["mdl_params"]
    msm__lag_time = mdl_params["msm__lag_time"]
    bayesmsm__n_samples = (mdl_params["bayesmsm__n_samples"]
                           if "bayesmsm__n_samples" in mdl_params.keys()
                           else 800)
    bayesmsm__n_steps = (mdl_params["bayesmsm__n_steps"]
                         if "bayesmsm__n_steps" in mdl_params.keys()
                         else 1000000)

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            sequences = [assignments[name] for name in assignments]
            msm_mdl = BayesianMarkovStateModel(
                n_samples=bayesmsm__n_samples,
                n_steps=bayesmsm__n_steps,
                lag_time=msm__lag_time,
                ergodic_cutoff=1.0/msm__lag_time,
                verbose=True,
            ).fit(sequences)
            # The attribute is read before dumping and its value discarded.
            _ = msm_mdl.all_eigenvalues_
            verbosedump(msm_mdl, "bayesmsm_mdl.pkl")
            fixed_assignments = {
                name: msm_mdl.transform(assignments[name], mode='fill')[0]
                for name in assignments
            }
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
예제 #41
0
def find_most_important_residues_in_tIC(traj_file, tica_object, tic_features_csv, contact_residues,tic_residue_csv, feature_coefs_csv, duplicated_feature_coefs_csv, cutoff):
    """Rank the residue-pair features of every tIC by absolute coefficient.

    :param traj_file: trajectory path; its first frame is loaded but the
        result is not used further.
    :param tica_object: path of the saved tICA model, read with
        ``verboseload`` and, if that fails, with ``load_dataset``.
    :param tic_features_csv: passed to ``generate_features`` to obtain one
        residue pair per feature.
    :param contact_residues: not used by this function.
    :param tic_residue_csv: output path; per tIC, the residues of all
        features ordered by decreasing absolute coefficient.
    :param feature_coefs_csv: output path; per tIC the absolute coefficients,
        plus ``residues_0`` / ``residues_1`` columns.
    :param duplicated_feature_coefs_csv: output path; the same coefficients
        with every value repeated twice, plus a ``residues`` column.
    :param cutoff: not used by this function.
    :return: None
    """
    try:
        tica = verboseload(tica_object)
    except Exception:
        # Fall back to the alternative loader; a bare except would also
        # have swallowed KeyboardInterrupt/SystemExit.
        tica = load_dataset(tica_object)
    print(traj_file)
    # Result unused below; the call is retained so that an unreadable
    # trajectory file fails exactly where it used to.
    md.load_frame(traj_file, 0)

    # Each feature is a pair of residue descriptors; render both members as
    # "<field 2><field 1>.<field 0>" labels.  The labels must be stored as
    # ONE tuple: list.append() takes a single argument, so the previous
    # two-argument call raised TypeError.
    residue_pairs = [
        ("%s%d.%d" % (pair[0][2], pair[0][1], pair[0][0]),
         "%s%d.%d" % (pair[1][2], pair[1][1], pair[1][0]))
        for pair in generate_features(tic_features_csv)
    ]

    feature_coefs_per_tIC = {}
    duplicated_feature_coefs_per_tIC = {}
    top_residues_per_tIC = {}

    components = tica.components_
    n_tics = np.shape(components)[0]
    n_features = np.shape(components)[1]

    for i in range(n_tics):
        print(i)
        abs_coefs = [abs(components[i][j]) for j in range(n_features)]
        feature_coefs_per_tIC[i] = abs_coefs
        # Every coefficient appears twice so it lines up with the flattened
        # (residue, residue) list written alongside it.
        duplicated_feature_coefs_per_tIC[i] = [
            value for coef in abs_coefs for value in (coef, coef)
        ]
        ranked = sorted(enumerate(abs_coefs), key=lambda x: x[1], reverse=True)
        print(ranked[0:10])
        # Flatten the residue pairs in ranking order.
        top_residues_per_tIC[i] = [
            residue
            for feature_index, _ in ranked
            for residue in residue_pairs[feature_index]
        ]

    feature_coefs_per_tIC["residues_0"] = [pair[0] for pair in residue_pairs]
    feature_coefs_per_tIC["residues_1"] = [pair[1] for pair in residue_pairs]
    duplicated_feature_coefs_per_tIC["residues"] = [
        residue for pair in residue_pairs for residue in pair
    ]

    write_map_to_csv(tic_residue_csv, top_residues_per_tIC, [])
    write_map_to_csv(feature_coefs_csv, feature_coefs_per_tIC, [])
    write_map_to_csv(duplicated_feature_coefs_csv, duplicated_feature_coefs_per_tIC, [])
    return
예제 #42
0
def plot_all_tics_and_clusters(tica_dir, transformed_data_dir, clusterer_dir, lag_time, label = "dot", active_cluster_ids = [], intermediate_cluster_ids = [], inactive_cluster_ids = []):
    """Plot every pair of tICs together with the clusters, in parallel.

    :param tica_dir: forwarded to ``plot_tica_and_clusters``.
    :param transformed_data_dir: path of the projected data, read with
        ``verboseload`` and, if that fails, with ``load_dataset``.
    :param clusterer_dir: path of the saved clusterer.
    :param lag_time: forwarded to ``plot_tica_and_clusters``.
    :param label: forwarded to ``plot_tica_and_clusters``.
    :param active_cluster_ids: forwarded unchanged (never mutated here).
    :param intermediate_cluster_ids: forwarded unchanged (never mutated here).
    :param inactive_cluster_ids: forwarded unchanged (never mutated here).
    :return: None
    """
    try:
        transformed_data = verboseload(transformed_data_dir)
    except Exception:
        # Fall back to the alternative loader without swallowing
        # KeyboardInterrupt/SystemExit as a bare except would.
        transformed_data = load_dataset(transformed_data_dir)
    clusterer = verboseload(clusterer_dir)
    num_tics = np.shape(transformed_data[0])[1]
    print("Looking at %d tICS" % num_tics)
    for i in range(0, num_tics):
        js = list(range(i + 1, num_tics))
        # `label` is now passed through; it used to be overridden with the
        # literal "dot" regardless of what the caller asked for.
        plot_partial = partial(plot_tica_and_clusters,
                               tica_dir = tica_dir,
                               transformed_data = transformed_data,
                               clusterer = clusterer,
                               lag_time = lag_time,
                               label = label,
                               active_cluster_ids = active_cluster_ids,
                               intermediate_cluster_ids = intermediate_cluster_ids,
                               inactive_cluster_ids = inactive_cluster_ids,
                               component_i = i)
        pool = mp.Pool(mp.cpu_count())
        try:
            pool.map(plot_partial, js)
        finally:
            # Release the worker processes even when a plot fails.
            pool.terminate()
    print("Printed all tICA coords and all requested clusters")
def fit_bootstrap(yaml_file,pool=None):
    """Fit a bootstrapped MSM for every protein and save models and assignments.

    :param yaml_file: loaded project settings; ``mdl_params`` and
        ``protein_list`` are read. ``msm__*`` entries become ``msm_args`` and
        ``bootstrap__*`` entries become keyword arguments of
        ``BootStrapMarkovStateModel`` (``n_samples`` defaults to 100).
    :param pool: forwarded to ``BootStrapMarkovStateModel.fit``.
    :return: None. ``bootstrap_msm_mdl.pkl``, ``msm_mdl.pkl`` and
        ``fixed_assignments.pkl`` are written in each protein's model dir.
    """
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params = {
        key.split("msm__")[1]: mdl_params[key]
        for key in mdl_params.keys()
        if key.startswith("msm__")
    }
    bootstrap_mdl_params = {
        key.split("bootstrap__")[1]: mdl_params[key]
        for key in mdl_params.keys()
        if key.startswith("bootstrap__")
    }
    bootstrap_mdl_params.setdefault("n_samples", 100)

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            print(protein)
            assignments = verboseload("assignments.pkl")
            msm_mdl = BootStrapMarkovStateModel(n_procs=2,
                                                msm_args=current_mdl_params,
                                                **bootstrap_mdl_params)
            msm_mdl.fit([assignments[name] for name in assignments], pool=pool)
            verbosedump(msm_mdl, "bootstrap_msm_mdl.pkl")
            verbosedump(msm_mdl.mle_, "msm_mdl.pkl")
            # Re-express each trajectory through the MLE model's transform
            # in 'fill' mode and keep the first returned element.
            fixed_assignments = {
                name: msm_mdl.mle_.transform(assignments[name], mode='fill')[0]
                for name in assignments
            }
            verbosedump(fixed_assignments, 'fixed_assignments.pkl')
예제 #44
0
def plot_tica(transformed_data_dir, lag_time):
    """Save a hexbin plot of the first two columns of the projected data.

    :param transformed_data_dir: path loaded with ``verboseload``; the loaded
        sequence is concatenated before plotting.
    :param lag_time: integer embedded in the hard-coded output PDF name.
    :return: None
    """
    stacked = np.concatenate(verboseload(transformed_data_dir))
    first_column, second_column = stacked[:,0], stacked[:,1]
    plt.hexbin(first_column, second_column, bins='log', mincnt=1)
    pdf_path = "/scratch/users/enf/b2ar_analysis/tica_phi_psi_chi2_t%d.pdf" %lag_time
    pp = PdfPages(pdf_path)
    pp.savefig()
    pp.close()
예제 #45
0
def plot_pnas_vs_tics(pnas_dir, tic_dir, pnas_names, directory, scale = 7.14, refcoords_file = None):
    """Save one hexbin PDF for every (tIC, pnas coordinate) combination.

    :param pnas_dir: path read with ``load_file``; the result is concatenated.
    :param tic_dir: path of the projected data; read with ``load_dataset``
        when the path contains "ktICA", otherwise with ``verboseload``.
    :param pnas_names: one label per pnas column; a length mismatch prints a
        message and aborts.
    :param directory: folder receiving the PDFs.
    :param scale: factor applied in place to the first pnas column.
    :param refcoords_file: not used by this function.
    :return: None
    """
    pnas = np.concatenate(load_file(pnas_dir))
    pnas[:,0] *= scale
    print(np.shape(pnas))
    print(len(pnas_names))

    tics = load_dataset(tic_dir) if "ktICA" in tic_dir else verboseload(tic_dir)
    print(np.shape(tics))
    tics = np.concatenate(tics)
    print(np.shape(tics))

    n_coords = np.shape(pnas)[1]
    if len(pnas_names) != n_coords:
        print("Invalid pnas names")
        return

    n_tics = np.shape(tics)[1]
    for i in range(n_coords):
        coord_name = pnas_names[i]
        pnas_coord = pnas[:,i]
        for j in range(n_tics):
            # tICs are labelled starting from 1 in the output.
            tic_name = "tIC.%d" %(j+1)
            plt.hexbin(tics[:,j], pnas_coord, bins = 'log', mincnt=1)
            plt.xlabel(tic_name)
            plt.ylabel(coord_name)
            pp = PdfPages("%s/%s_%s_hexbin.pdf" %(directory, tic_name, coord_name))
            pp.savefig()
            pp.close()
            plt.clf()

    return
예제 #46
0
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=40,nrm=None):
    """
    routine to take a set of proteins features stored in the feature_dir and
    normalize them by removing the mean and setting variance to 1 using the standard
    scaler. The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in
    :param stride: The initial stride in files to fit the normalizer with.
    This is necessary to prevent memory errors. defaults to every 40th file
    :param nrm: previously fit normalizer. else it uses the standard scaler from
    scikitlearn
    :return: None. Normalized copies of every feature file are written to
    output_folder inside each protein's data dir.
    """
    yaml_file = load_yaml_file(yaml_file)
    #setup normalizer
    if nrm is None:
        nrm = preprocessing.StandardScaler()
        # Rows are collected in a flat list. They used to be stored in a dict
        # keyed by the *relative* file path, so identically named files of
        # different proteins overwrote each other before fitting.
        seq = []
        for prt in yaml_file["protein_list"]:
            with enter_protein_data_dir(yaml_file, prt):
                print(prt)
                # glob order depends on the file system; sort first so the
                # strided subset is reproducible.
                flist = sorted(
                    glob.glob("./%s/*.jl"%(yaml_file["feature_dir"])))[::stride]
                for f in flist:
                    seq.extend(verboseload(f))

        #fit it
        nrm.fit(seq)
        #dump it into the mdl dir.
        verbosedump(nrm,"%s/nrm.h5"%yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)

        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl"%(yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res,"%s/%s"%(output_folder_path, os.path.basename(f)))

    return
예제 #47
0
def normalize_project_series(yaml_file, output_folder="normalized_features",
                             stride=1,nrm=None):
    """
    routine to take a set of proteins features stored in the feature_dir and
    normalize them with scikit-learn's RobustScaler (unless a fitted
    normalizer is supplied). The normalizer is dumped into the mdl dir.
    :param yaml_file: The yaml file to work with.
    :param output_folder: The name of the output folder to dump normalized features in
    :param stride: The initial stride in files to fit the normalizer with.
    A larger stride lowers memory use while fitting. defaults to 1, i.e.
    every file is used
    :param nrm: previously fit normalizer. else it uses the robust scaler from
    scikitlearn
    :return: None. Normalized copies of every feature file are written to
    output_folder inside each protein's data dir.
    """
    yaml_file = load_yaml_file(yaml_file)
    #setup normalizer
    if nrm is None:
        nrm = preprocessing.RobustScaler()
        # Rows are collected in a flat list. They used to be stored in a dict
        # keyed by the *relative* file path, so identically named files of
        # different proteins overwrote each other before fitting.
        seq = []
        for prt in yaml_file["protein_list"]:
            with enter_protein_data_dir(yaml_file, prt):
                print(prt)
                # glob order depends on the file system; sort first so the
                # strided subset is reproducible.
                flist = sorted(
                    glob.glob("./%s/*.jl"%(yaml_file["feature_dir"])))[::stride]
                for f in flist:
                    seq.extend(verboseload(f))

        #fit it
        nrm.fit(seq)
        #dump it into the mdl dir.
        verbosedump(nrm,"%s/nrm.h5"%yaml_file["mdl_dir"])

    for prt in yaml_file["protein_list"]:
        _check_output_folder_exists(yaml_file, prt, output_folder)

        with enter_protein_data_dir(yaml_file, prt):
            output_folder_path = os.path.abspath(output_folder)
            flist = glob.glob("./%s/*.jl"%(yaml_file["feature_dir"]))
            for f in flist:
                res = verboseload(f)
                res = nrm.transform(res)
                verbosedump(res,"%s/%s"%(output_folder_path, os.path.basename(f)))

    return
예제 #48
0
def reseed_from_clusterer(clusterer_file, main, tica_dir,
                          projected_features_dir, traj_files):
    """Export snapshots for the least populated clusters.

    The (up to) 16 clusters with the fewest members are highlighted in the
    tICA plots, and for each of them the frame returned by
    ``find_closest_indices_to_cluster_center`` is saved as ``.rst7`` and
    ``.pdb``, plus a ``.pdb`` restricted to protein atoms and residues whose
    string form contains "LIG".

    :param clusterer_file: path of the saved clusterer.
    :param main: prefix used in plot titles and output file names.
    :param tica_dir: folder receiving plots and snapshots.
    :param projected_features_dir: path of the projected data.
    :param traj_files: trajectory paths, indexed by the trajectory index of
        each (trajectory, frame) pair.
    :return: list of the selected cluster ids, least populated first.
    """
    clusterer = verboseload(clusterer_file)
    n_clusters = len(clusterer.cluster_centers_)
    print(n_clusters)
    # Reuse the clusterer loaded above instead of reading the file again.
    clusters_map = make_clusters_map(clusterer)

    count_tuples = sorted(
        ((i, len(clusters_map[i])) for i in range(n_clusters)),
        key=operator.itemgetter(1))
    # Slicing tolerates models with fewer than 16 clusters; indexing
    # positions 0..15 unconditionally raised IndexError for them.
    min_populated_clusters = [cluster_id for cluster_id, _ in count_tuples[:16]]
    print(min_populated_clusters)
    plot_all_tics_and_clusters(tica_dir,
                               projected_features_dir,
                               clusterer_file,
                               None,
                               tic_range=[0],
                               main=main,
                               label="cluster_id",
                               active_cluster_ids=min_populated_clusters)

    traj_index_frame_pairs = [
        tuple(pair)
        for pair in find_closest_indices_to_cluster_center(
            projected_features_dir, clusterer_file)
    ]

    # The position of a pair in the list is treated as its cluster id.
    for i, traj_index_frame_pair in enumerate(traj_index_frame_pairs):
        if i not in min_populated_clusters:
            continue
        traj_index, frame = traj_index_frame_pair
        print("Looking at cluster %d" % i)
        print("Snapshot in: %s" % str(traj_index_frame_pair))
        snapshot = md.load_frame(traj_files[traj_index], index=frame)
        snapshot.save("%s/%smincount_snapshot_cluster%d.rst7" %
                      (tica_dir, main, i))
        snapshot.save("%s/%smincount_snapshot_cluster%d.pdb" %
                      (tica_dir, main, i))
        protein_indices = [
            a.index for a in snapshot.topology.atoms
            if a.residue.is_protein or "LIG" in str(a.residue)
        ]
        snapshot_protein = snapshot.atom_slice(protein_indices)
        snapshot_protein.save(
            "%s/%smincount_snapshot_cluster%d_protein.pdb" %
            (tica_dir, main, i))

    return (min_populated_clusters)
예제 #49
0
def cluster_project_wrapper(proj_folder,feature_dict,n_states):
    """Return a cluster model and assignments, reusing saved files if present.

    :param proj_folder: folder holding ``cluster_mdl.pkl`` and
        ``assignments.pkl``.
    :param feature_dict: mapping whose values are the feature sets to cluster.
    :param n_states: number of clusters when a new ``KMeans`` model is fitted.
    :return: ``(cluster_mdl, assignments)``
    """
    model_path = proj_folder+"/cluster_mdl.pkl"
    assignments_path = proj_folder+"/assignments.pkl"

    # Everything already on disk: load and return both.
    if os.path.exists(assignments_path):
        return verboseload(model_path),verboseload(assignments_path)

    if os.path.exists(model_path):
        cluster_mdl = verboseload(model_path)
    else:
        cluster_mdl = KMeans(n_clusters = n_states)
        cluster_mdl.fit([feature_dict[name] for name in feature_dict.keys()])

    assignments = {
        name: cluster_mdl.transform([feature_dict[name]])
        for name in feature_dict.keys()
    }

    verbosedump(cluster_mdl,model_path)
    verbosedump(assignments,assignments_path)
    return cluster_mdl,assignments
예제 #50
0
def plot_all_tics(tica_dir, transformed_data_dir, lag_time):
    """Plot every unordered pair of tICA components.

    :param tica_dir: forwarded to ``plot_tica_component_i_j``.
    :param transformed_data_dir: path of the projected data; loaded here only
        to read the number of components, then forwarded as a path.
    :param lag_time: forwarded to ``plot_tica_component_i_j``.
    :return: None
    """
    transformed_data = verboseload(transformed_data_dir)
    num_tics = np.shape(transformed_data[0])[1]
    # print() calls instead of Python 2 print statements, which are a
    # syntax error on Python 3; the output text is unchanged.
    print("Looking at %d tICS" % num_tics)
    for i in range(0, num_tics):
        for j in range(i + 1, num_tics):
            plot_tica_component_i_j(tica_dir, transformed_data_dir, lag_time, component_i = i, component_j = j)
    print("Printed all tICA coords")
예제 #51
0
def fit_and_transform(directory, stride=5):
    """Fit (or load) a 4-component tICA model, project the features, and
    print the projection of a reference structure.

    All input/output locations other than ``directory`` are hard-coded under
    ``/scratch/users/enf/b2ar_analysis``.

    :param directory: passed to ``get_trajectory_files`` to list the feature
        files.
    :param stride: only used to build the cached file names.
    :return: None
    """
    projected_data_filename = "/scratch/users/enf/b2ar_analysis/phi_psi_chi_stride%d_projected.h5" % stride
    fit_model_filename = "/scratch/users/enf/b2ar_analysis/phi_psi_chi2_stride%s_tica_coords.h5" % stride
    active_pdb_file = "/scratch/users/enf/b2ar_analysis/system_B.pdb"

    tica_model = tICA(n_components=4)

    if os.path.exists(projected_data_filename):
        # Both the model and its projection are read back from disk.
        fit_model = verboseload(fit_model_filename)
        transformed_data = verboseload(projected_data_filename)
    else:
        print("loading feature files")
        feature_files = get_trajectory_files(directory)
        pool = mp.Pool(mp.cpu_count())
        features = pool.map(load_features, feature_files)
        pool.terminate()

        if os.path.exists(fit_model_filename):
            print("loading tICA model")
            fit_model = verboseload(fit_model_filename)
        else:
            print("fitting data to tICA model")
            fit_model = tica_model.fit(features)
            verbosedump(fit_model, fit_model_filename)

        transformed_data = fit_model.transform(features)
        verbosedump(transformed_data, projected_data_filename)

    # Select protein atoms, leaving out the listed residue numbers, residue
    # names starting with "HI", and the NMA/NME/ACE residues.
    excluded_resseqs = (341, 79, 296, 269, 178, 93)
    excluded_names = ("NMA", "NME", "ACE")
    top = md.load(active_pdb_file).topology
    atom_indices = [
        atom.index for atom in top.atoms
        if atom.residue.is_protein
        and atom.residue.resSeq not in excluded_resseqs
        and atom.residue.name[0:2] != "HI"
        and atom.residue.name not in excluded_names
    ]
    active_pdb = md.load(active_pdb_file, atom_indices=atom_indices)

    featurizer = DihedralFeaturizer(types=['phi', 'psi', 'chi2'])
    active_pdb_projected = fit_model.transform(featurizer.transform(active_pdb))
    print((active_pdb_projected[0:4]))
예제 #52
0
def plot_tica(transformed_data_dir, lag_time):
    """Save a hexbin plot of the first two columns of the projected data.

    :param transformed_data_dir: path loaded with ``verboseload``; the loaded
        sequence is concatenated before plotting.
    :param lag_time: integer embedded in the hard-coded output PDF name.
    :return: None
    """
    stacked = np.concatenate(verboseload(transformed_data_dir))
    first_column, second_column = stacked[:, 0], stacked[:, 1]
    plt.hexbin(first_column, second_column, bins='log', mincnt=1)
    pdf_path = "/scratch/users/enf/b2ar_analysis/tica_phi_psi_chi2_t%d.pdf" % lag_time
    pp = PdfPages(pdf_path)
    pp.savefig()
    pp.close()
예제 #53
0
def cluster_project_wrapper(proj_folder, feature_dict, n_states):
    """Return a cluster model and assignments, reusing saved files if present.

    :param proj_folder: folder holding ``cluster_mdl.pkl`` and
        ``assignments.pkl``.
    :param feature_dict: mapping whose values are the feature sets to cluster.
    :param n_states: number of clusters when a new ``KMeans`` model is fitted.
    :return: ``(cluster_mdl, assignments)``
    """
    model_path = proj_folder + "/cluster_mdl.pkl"
    assignments_path = proj_folder + "/assignments.pkl"

    # Everything already on disk: load and return both.
    if os.path.exists(assignments_path):
        return verboseload(model_path), verboseload(assignments_path)

    if os.path.exists(model_path):
        cluster_mdl = verboseload(model_path)
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[name] for name in feature_dict.keys()])

    assignments = {
        name: cluster_mdl.transform([feature_dict[name]])
        for name in feature_dict.keys()
    }

    verbosedump(cluster_mdl, model_path)
    verbosedump(assignments, assignments_path)
    return cluster_mdl, assignments
예제 #54
0
def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    """Fit a KMeans clusterer on the reduced data unless one is already saved.

    :param tica_dir: folder where ``clusterer_<n>clusters.h5`` is looked up
        and written.
    :param data_dir: path of the reduced data, loaded with ``verboseload``.
    :param traj_dir: not used by this function.
    :param n_clusters: number of clusters; also part of the file name.
    :param lag_time: not used by this function.
    :return: None
    """
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
        return

    print("Clustering by KMeans")
    reduced_data = verboseload(data_dir)
    # The concatenated array is not used afterwards; the call is retained.
    stacked = np.concatenate(reduced_data)
    clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
    clusterer.fit_transform(reduced_data)
    verbosedump(clusterer, clusterer_dir)
def transform_protein_kmeans(yaml_file,pca=False):
    """Assign every protein's projected data to clusters with the saved model.

    :param yaml_file: loaded project settings; ``mdl_dir`` and
        ``protein_list`` are read here.
    :param pca: when truthy the projections come from ``pca_data.pkl``,
        otherwise from ``tica_data.pkl``.
    :return: None. ``assignments.pkl`` is written inside each protein's
        model directory.
    """
    kmeans_mdl = verboseload(os.path.join(yaml_file["mdl_dir"], "kmeans_mdl.pkl"))
    for protein in yaml_file["protein_list"]:
        print("Assigning protein %s" % protein)
        with enter_protein_mdl_dir(yaml_file, protein):
            projection_file = "pca_data.pkl" if pca else "tica_data.pkl"
            tica_data = verboseload(projection_file)

            # predict() is given a one-element list, so its first (only)
            # result is the entry kept for this key.
            assignments = {
                traj_name: kmeans_mdl.predict([traj_data])[0]
                for traj_name, traj_data in tica_data.items()
            }
            verbosedump(assignments, 'assignments.pkl')

            print("Done assigning %s" % protein)
예제 #56
0
def main():
    """Command-line entry point: export one tIC's importances for VMD.

    Reads the parsed options (``t``, ``p``, ``c``, ``i``, ``o``, ``s``, ``d``,
    ``u``), loads the trajectory, the tICA model and the feature descriptor,
    and hands everything to ``tica_to_vmd`` together with the two output
    file names derived from the ``o`` option.
    """
    args = parse_commandline()
    traj_file, top_file, tica_file = args.t, args.p, args.c
    tic_index, out_file, stride = args.i, args.o, args.s
    describer, cutoff = args.d, args.u

    # load stuff
    trj = mdt.load(traj_file, top=top_file, stride=stride)
    tica_mdl = verboseload(tica_file)
    df = pd.DataFrame(verboseload(describer))

    dat_fn = "importances_{}.txt".format(out_file)
    tcl_fn = "{}.tcl".format(out_file)

    tica_to_vmd(df, tica_mdl, tic_index, traj_file, top_file,
                trj, stride, dat_fn, tcl_fn, cutoff)
예제 #57
0
def plot_all_tics(tica_dir, transformed_data_dir, lag_time):
    """Plot every unordered pair of tICA components.

    The number of components is the second dimension of the first entry of
    the data loaded from ``transformed_data_dir``. For each pair ``(i, j)``
    with ``i < j``, ``plot_tica_component_i_j`` is called with the same
    ``tica_dir``, ``transformed_data_dir`` and ``lag_time``.
    """
    first_entry = verboseload(transformed_data_dir)[0]
    n_components = np.shape(first_entry)[1]
    print("Looking at %d tICS" % n_components)

    # All (i, j) with i < j, in the same order the nested loops would visit.
    component_pairs = [
        (first, second)
        for first in range(n_components)
        for second in range(first + 1, n_components)
    ]
    for first, second in component_pairs:
        plot_tica_component_i_j(
            tica_dir,
            transformed_data_dir,
            lag_time,
            component_i=first,
            component_j=second,
        )
    print("Printed all tICA coords")
예제 #58
0
 def __init__(self, series, name):
     """Load the per-protein models and data that belong to ``series``.

     Parameters
     ----------
     series : ProteinSeries
         Parent series; supplies ``kmeans_mdl``, ``tica_mdl`` and
         ``relative_loc``.
     name : str
         Protein name; joined onto ``series.relative_loc`` to form the
         directory the pickles are read from.

     Raises
     ------
     Exception
         If ``series`` is not a ``ProteinSeries`` instance.
     """
     if not isinstance(series, ProteinSeries):
         raise Exception("We need a project series to be associated "
                         "with this kinase")
     self.name = name
     self.project = series
     # The clustering and tICA models are shared with the parent series.
     self.kmeans_mdl = self.project.kmeans_mdl
     self.tica_mdl = self.project.tica_mdl
     self.protein_mdl_dir = os.path.join(self.project.relative_loc,
                                         self.name)
     # The three MSM variants are optional: each attribute is only set when
     # its pickle exists in the protein's model directory.
     # NOTE(review): the attribute is spelled ``bootrap_msm`` (no "st");
     # looks like a typo, but renaming it would break existing readers.
     if os.path.isfile("%s/bootstrap_msm_mdl.pkl" % self.protein_mdl_dir):
         self.bootrap_msm = verboseload("%s/bootstrap_msm_mdl.pkl" %
                                        self.protein_mdl_dir)
     if os.path.isfile("%s/msm_mdl.pkl" % self.protein_mdl_dir):
         self.msm = verboseload("%s/msm_mdl.pkl" % self.protein_mdl_dir)
     if os.path.isfile("%s/bayesmsm_mdl.pkl" % self.protein_mdl_dir):
         self.bayesmsm = verboseload("%s/bayesmsm_mdl.pkl" %
                                     self.protein_mdl_dir)
     # These three files are loaded unconditionally (no existence check).
     self.tica_data = verboseload("%s/tica_data.pkl" % self.protein_mdl_dir)
     self.assignments = verboseload("%s/assignments.pkl" %
                                    self.protein_mdl_dir)
     self.fixed_assignments = verboseload("%s/fixed_assignments.pkl" %
                                          self.protein_mdl_dir)
     # NOTE(review): ``self.msm`` is only assigned above when msm_mdl.pkl
     # exists, so this line raises AttributeError when that file is missing,
     # unless the class defines ``msm`` elsewhere; confirm.
     self.n_states_ = self.msm.n_states_
     # Second dimension of the cluster-center array.
     self.n_tics_ = self.kmeans_mdl.cluster_centers_.shape[1]
     # Private state, initialised to placeholders here.
     self._computed = False
     self._tic_dict = None
     self._tic_min = None
     self._tic_max = None
     self._mlpt_fct = 0.4