Example #1
import os

from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.utils import verboseload, verbosedump

# load_file and load_dataset are project-specific I/O helpers assumed to be in scope.


def cluster_minikmeans(tica_dir,
                       data_dir,
                       traj_dir,
                       n_clusters,
                       clusterer_dir=None,
                       tICs=None):
    if clusterer_dir is not None and os.path.exists(clusterer_dir):
        # A clusterer already exists: reload it and refresh its labels.
        reduced_data = load_file(data_dir)
        clusterer = verboseload(clusterer_dir)
        clusterer.labels_ = clusterer.transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            reduced_data = load_dataset(data_dir)
        # Optionally restrict clustering to a subset of tICA components.
        if tICs is not None:
            X = [traj[:, tICs] for traj in reduced_data]
        else:
            X = reduced_data

        clusterer = MiniBatchKMeans(n_clusters=n_clusters, n_init=10)
        clusterer.fit_transform(X)
        verbosedump(clusterer, clusterer_dir)
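A minimal usage sketch, assuming the tICA-reduced data was previously saved with verbosedump; the paths and cluster count below are hypothetical:

# Hypothetical paths and cluster count, for illustration only.
cluster_minikmeans(tica_dir="tica",
                   data_dir="tica/reduced_data.h5",
                   traj_dir="trajectories",
                   n_clusters=100,
                   clusterer_dir="tica/clusterer_100clusters.h5")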
Example #2
import os

from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.utils import verboseload, verbosedump

# load_dataset is a project-specific loader assumed to be in scope.


def cluster_minikmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        try:
            reduced_data = verboseload(data_dir)
        except Exception:
            reduced_data = load_dataset(data_dir)
        # MiniBatchKMeans accepts the list of per-trajectory arrays directly,
        # so the data does not need to be concatenated first.
        clusterer = MiniBatchKMeans(n_clusters=n_clusters)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
Example #3
    def generate_clusters(self, ticad):
        """
        Update the cluster data. This needs to be redone each iteration,
        since clusters from previous trajectories may change as more data
        arrives.

        Returns: the clustered dataset
        """
        # Number of microstates is read from the [model] section of the config.
        clustr = MiniBatchKMeans(
            n_clusters=self.config.getint("model", "num_clusters"))
        clustered = clustr.fit_transform(ticad)
        if self.save_extras:
            utils.dump(clustr, "microstater.pkl")
        return clustered
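The num_clusters setting above is read with configparser's getint; a minimal sketch of the config section this method assumes (the section and key names come from the snippet, the value is illustrative):

import configparser

# Illustrative config; only [model] num_clusters is required by the method.
config = configparser.ConfigParser()
config.read_string("""
[model]
num_clusters = 100
""")
assert config.getint("model", "num_clusters") == 100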
Example #4
import numpy as np
import msmexplorer as msme
from msmbuilder.cluster import MiniBatchKMeans
from msmbuilder.decomposition import tICA
from msmbuilder.example_datasets import FsPeptide
from msmbuilder.featurizer import DihedralFeaturizer
from msmbuilder.msm import MarkovStateModel

rs = np.random.RandomState(42)

# Load Fs Peptide Data
trajs = FsPeptide().get().trajectories

# Extract Side-Chain (chi1) Dihedrals
featurizer = DihedralFeaturizer(types=['chi1'])
diheds = featurizer.fit_transform(trajs)

# Perform Dimensionality Reduction
tica_model = tICA(lag_time=2, n_components=2)
tica_trajs = tica_model.fit_transform(diheds)

# Perform Clustering
clusterer = MiniBatchKMeans(n_clusters=12, random_state=rs)
clustered_trajs = clusterer.fit_transform(tica_trajs)

# Construct MSM
msm = MarkovStateModel(lag_time=2)
assignments = msm.fit_transform(clustered_trajs)

# Pool state assignments and dihedral features across trajectories
a = np.concatenate(assignments, axis=0)
d = np.concatenate(diheds, axis=0)

# Plot Stacked Distributions of the sine of each Chi1 angle
# within an arbitrary set of states {2, 5, 0}
path_data = [d[a == i][:, ::2] for i in [2, 5, 0]]
msme.plot_stackdist(path_data)
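msmexplorer draws onto matplotlib axes, so the current figure can be persisted afterwards; a minimal, assumed follow-up (the output filename is hypothetical):

import matplotlib.pyplot as plt
plt.savefig("chi1_stackdist.png", dpi=300)  # hypothetical output path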
Example #5
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    import numpy as np
    import pandas as pd
    from msmbuilder.preprocessing import RobustScaler
    from msmbuilder.decomposition import tICA
    from msmbuilder.cluster import MiniBatchKMeans
    from msmbuilder.msm import MarkovStateModel
    from sklearn.model_selection import KFold

    pop_index = i

    # Keep only the dihedral features selected by this population member.
    new_diheds = []
    for traj in diheds:
        new_diheds.append(traj[:, population_dihedral])

    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(new_diheds)

    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(scaled_diheds)
    tica_trajs = tica_model.transform(scaled_diheds)

    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)
    clustered_trajs = clusterer.fit_transform(tica_trajs)

    msm = MarkovStateModel(lag_time=50, n_timescales=5)

    # Cross-validate the MSM over the clustered trajectories.
    n_states = [4]
    cv = KFold(n_splits=5)
    results = []
    for n in n_states:
        for fold, (train_index, test_index) in enumerate(
                cv.split(clustered_trajs)):
            train_data = [clustered_trajs[j] for j in train_index]
            test_data = [clustered_trajs[j] for j in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            time_test_score = time_score + test_score
            print(time_score)
            print(test_score)
            av_score = time_test_score / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
            print(msm.timescales_)
    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median)
                   .drop('fold', axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_n = avgs['av_score'].idxmax()
    best_score = avgs.loc[best_n, 'av_score']
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)
    # Record this member's fitness in the shared score dictionary.
    with lock:
        score_global.update({pop_index: best_scorent})
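The score_global and lock arguments suggest this function is meant to run under multiprocessing; a minimal driver sketch, assuming a shared Manager dict and hypothetical population/diheds data (one list of selected column indices per member, one feature array per trajectory):

from multiprocessing import Manager, Lock, Process

manager = Manager()
score_global = manager.dict()  # shared fitness scores, keyed by member index
lock = Lock()

# population and diheds are hypothetical inputs, not defined in the snippet.
procs = []
for idx, member in enumerate(population):
    p = Process(target=calculate_fitness,
                args=(member, diheds, score_global, idx, lock))
    p.start()
    procs.append(p)
for p in procs:
    p.join()
print(dict(score_global))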