def test_plot_voronoi():
    """Fitting a 15-cluster KMeans and plotting its Voronoi diagram yields a Subplot."""
    model = KMeans(n_clusters=15)
    model.fit([data])
    axes = plot_voronoi(model)
    assert isinstance(axes, Subplot)
def test_plot_voronoi(self):
    """plot_voronoi with explicit axis labels returns a SubplotBase instance."""
    clusterer = KMeans(n_clusters=15)
    clusterer.fit([data])
    result = plot_voronoi(clusterer, xlabel='x', ylabel='y')
    assert isinstance(result, SubplotBase)
def cluster_msm(sequences, n_states, lag_times):
    """Cluster *sequences* with KMeans and build MSMs over a grid of lag times.

    For each cluster count ``n`` in *n_states*:
      * fit a KMeans model on *sequences* and pickle it to ``<n>n_cl.pkl``;
      * for each lag time in *lag_times*, fit a 5-timescale
        MarkovStateModel on the cluster labels and pickle it to
        ``<n>n_<lag>lt_msm.pkl``;
      * stack the implied timescales (one row per lag time) and pickle
        the resulting array to ``<n>n_timescales.pkl``.

    Parameters
    ----------
    sequences : list of array-like
        Feature trajectories passed straight to ``KMeans.fit``.
    n_states : iterable of int
        Cluster counts to scan.
    lag_times : iterable of int
        MSM lag times to scan (must be non-empty).
    """
    for n in n_states:
        states = KMeans(n_clusters=n)
        states.fit(sequences)
        io.dump(states, str(n) + 'n_cl.pkl')
        # Collect one row of implied timescales per lag time; stacking once at
        # the end replaces the original seed-with-zeros-then-delete-row hack.
        timescale_rows = []
        for lag_time in lag_times:
            msm = MarkovStateModel(lag_time=lag_time, verbose=False, n_timescales=5)
            msm.fit(states.labels_)
            io.dump(msm, str(n) + 'n_' + str(lag_time) + 'lt_msm.pkl')
            timescale_rows.append(msm.timescales_)
        ts = np.vstack(timescale_rows)
        io.dump(ts, str(n) + 'n_timescales.pkl')
def cluster_project_wrapper(proj_folder, feature_dict, n_states):
    """Cluster a project's feature trajectories with KMeans, caching to disk.

    Caching behavior:
      * if ``<proj_folder>/assignments.pkl`` exists, both the model and the
        assignments are loaded and returned as-is;
      * else if ``<proj_folder>/cluster_mdl.pkl`` exists, the model is
        reloaded and only the assignments are recomputed;
      * otherwise a fresh KMeans model is fit on all feature arrays.
    In the latter two cases both the model and assignments are (re)dumped.

    Parameters
    ----------
    proj_folder : str
        Project directory holding the cache files.
    feature_dict : dict
        Maps trajectory key -> feature array.
    n_states : int
        Number of KMeans clusters when fitting a new model.

    Returns
    -------
    (cluster_mdl, assignments) : tuple
        The KMeans model and a dict of per-trajectory cluster assignments.
    """
    if os.path.exists(proj_folder + "/assignments.pkl"):
        # Fully cached: reuse both the model and the assignments.
        return (verboseload(proj_folder + "/cluster_mdl.pkl"),
                verboseload(proj_folder + "/assignments.pkl"))
    elif os.path.exists(proj_folder + "/cluster_mdl.pkl"):
        # Model cached but assignments missing: reload and re-assign below.
        cluster_mdl = verboseload(proj_folder + "/cluster_mdl.pkl")
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        # Iterate the dict directly instead of .keys().
        cluster_mdl.fit([feature_dict[i] for i in feature_dict])
    # One assignment array per trajectory key (dict comprehension replaces
    # the manual build-up loop).
    assignments = {i: cluster_mdl.transform([feature_dict[i]])
                   for i in feature_dict}
    verbosedump(cluster_mdl, proj_folder + "/cluster_mdl.pkl")
    verbosedump(assignments, proj_folder + "/assignments.pkl")
    return cluster_mdl, assignments
def cluster_project_wrapper(proj_folder, feature_dict, n_states):
    """Return (cluster_mdl, assignments) for the project, using on-disk caches.

    If the assignments pickle exists, both cached objects are returned
    untouched; if only the model pickle exists, it is reloaded and the
    assignments recomputed; otherwise a new KMeans model is fit.  The model
    and assignments are dumped before returning in the latter two cases.
    """
    model_path = proj_folder + "/cluster_mdl.pkl"
    assignments_path = proj_folder + "/assignments.pkl"
    if os.path.exists(assignments_path):
        return verboseload(model_path), verboseload(assignments_path)
    if os.path.exists(model_path):
        cluster_mdl = verboseload(model_path)
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[key] for key in feature_dict.keys()])
    assignments = {}
    for key in feature_dict.keys():
        assignments[key] = cluster_mdl.transform([feature_dict[key]])
    verbosedump(cluster_mdl, model_path)
    verbosedump(assignments, assignments_path)
    return cluster_mdl, assignments
def cluster(): ''' This function perfomes K-means clustering on the tICA space and saves assignsment files for each trajectory. Cluster centers are also saved at `microstate_centers.txt` file. ''' cluster = KMeans(n_clusters=n_states,n_jobs=-1,verbose=0, max_iter=100, tol=0.0001,) dataset, ev0, ev1 = [], [], [] print "Loading projected data..." for i in tqdm(range(start_traj, end_traj+1)): a = io.loadh('%s/traj%d_%s.h5' %(proj_path,i,traj_name))['arr_0'] a = a[:,0:2] dataset.append(a) ev0.extend(a[:,0]) ev1.extend(a[:,1]) print "Clustering %d datapoints..." %len(ev0) cluster.fit(dataset) for i in range(start_traj,end_traj+1): np.savetxt('%s/assigns_%d.txt' %(out_path,i),np.array(cluster.labels_[i-start_traj]),fmt='%d') np.savetxt('%s/microstate_centers.txt' %out_path,np.array(cluster.cluster_centers_)) print "Saved microstate assignments and microstate centers at %s" %out_path return cluster.cluster_centers_, np.array(ev0), np.array(ev1)
# Cluster the tICA-projected trajectories and generate cpptraj extraction input.
topFile = 'NarK-strip.pdb'

# Load every projected trajectory (*.npy) in sorted order, remembering the
# load order so assignments can be mapped back to trajectory files.
dataset = []
ls = []
for fname in sorted(glob.glob('*.npy')):
    # np.load already returns an ndarray; the original's extra np.array copy
    # was redundant.
    dataset.append(np.load(fname))
    ls.append(fname)
    print(fname)
np.save('list', ls)
#trajs = [np.load('data.npy')]

# make cluster of the tICs trajectories
cluster = KMeans(n_clusters=myn_clusters)
cluster.fit(dataset)
l = cluster.labels_

# Trajectory files, sorted to match the .npy ordering above
# (sorted() replaces the manual append-then-sort loop).
T = sorted(glob.glob('*strip.mdcrd'))

# Write the output file, which have the information about population of each cluster,
# trajectory name and frame number of corresponding frame
asFunctions.writeOPF(l, T, myn_clusters, n_samples)
# Based on information in output file, build the cpptraj input file
asFunctions.CpptrajInGen_commonTop(topFile)
#pickle.dump( cluster , open( "tICCluster.pkl", "wb"))
# Cross-validated scan over cluster counts at a fixed MSM lag time.
# Relies on names defined elsewhere in the module: n_clusters, nFolds,
# tica_data, results, sub_sampling_data, KMeans, KFold, MarkovStateModel
# -- TODO confirm where these are bound.
lagtime = 50
for n in n_clusters:
    kmeans = KMeans(n_clusters=n, n_jobs=-1)
    print "Clustering data to %d clusters..." % n
    for fold in range(nFolds):
        train_data = []
        test_data = []
        # For every trajectory, split its frames into nFolds folds and keep
        # only the split whose index matches the current fold.  NOTE(review):
        # cv is rebuilt for each fold; this only yields consistent partitions
        # if KFold is deterministic here (no shuffling) -- confirm.
        for i in range(len(tica_data)):
            cv = KFold(len(tica_data[i]), n_folds=nFolds)
            for current_fold, (train_index, test_index) in enumerate(cv):
                if current_fold == fold:
                    train_data.append(tica_data[i][train_index])
                    test_data.append(tica_data[i][test_index])
        # Subsample the training frames (stride=100) before fitting KMeans,
        # then assign both splits with the fitted model.
        reduced_train_data = sub_sampling_data(train_data, stride=100)
        kmeans.fit(reduced_train_data)
        assignments_train = kmeans.predict(train_data)
        assignments_test = kmeans.predict(test_data)
        # Fit an MSM on the training assignments and score it on the
        # held-out assignments; record both scores plus the timescales.
        msm = MarkovStateModel(lag_time=lagtime)
        msm.fit(assignments_train)
        train_score = msm.score_
        test_score = msm.score(assignments_test)
        results.append({
            'train_score': train_score,
            'test_score': test_score,
            'n_states': n,
            'fold': fold,
            'timescales': msm.timescales_
        })