def test_kcenters_4():
    """Check predict() against fit_predict() for a non-Euclidean metric.

    Because of the way the code is structured, a non-Euclidean metric takes
    a different path through predict() than the Euclidean one, so it gets
    its own test. The predicted labels must also match a brute-force
    nearest-center lookup computed with the same metric.
    """
    clusterer = KCenters(n_clusters=10, metric='cityblock')
    points = np.random.randn(100, 2)

    fitted_labels = clusterer.fit_predict([points])
    predicted_labels = clusterer.predict([points])
    eq(fitted_labels[0], predicted_labels[0])

    # Reference answer: index of the closest center under cityblock distance.
    center_distances = scipy.spatial.distance.cdist(
        points, clusterer.cluster_centers_, metric='cityblock')
    nearest_center = center_distances.argmin(axis=1)
    eq(predicted_labels[0], nearest_center)
def test_kcenters_3():
    """Check predict() against fit_predict() for the default metric.

    The predicted labels must equal both the labels returned while fitting
    and a brute-force nearest-center lookup using cdist's default
    (Euclidean) distance.
    """
    clusterer = KCenters(n_clusters=10)
    points = np.random.randn(100, 2)

    fitted_labels = clusterer.fit_predict([points])
    predicted_labels = clusterer.predict([points])
    eq(fitted_labels[0], predicted_labels[0])

    # Reference answer: index of the closest center for every point.
    center_distances = scipy.spatial.distance.cdist(
        points, clusterer.cluster_centers_)
    nearest_center = center_distances.argmin(axis=1)
    eq(predicted_labels[0], nearest_center)
def test_kcenters_1():
    """Make sure the fitted attributes have the expected types and shapes.

    The model is fit on two sequences of 23 and 10 two-dimensional frames
    with three clusters, so ``labels_`` and ``distances_`` must be lists
    holding one 1-D array per sequence and ``cluster_centers_`` must be
    a (3, 2) array.
    """
    frame_counts = (23, 10)
    m = KCenters(n_clusters=3)
    sequences = [np.random.randn(23, 2), np.random.randn(10, 2)]
    m.fit(sequences)

    assert isinstance(m.labels_, list)
    assert isinstance(m.distances_, list)
    assert len(m.labels_) == 2

    eq(m.cluster_centers_.shape, (3, 2))
    for index, n_frames in enumerate(frame_counts):
        eq(m.labels_[index].shape, (n_frames,))
    for index, n_frames in enumerate(frame_counts):
        eq(m.distances_[index].shape, (n_frames,))

    # Refit on a single fresh sequence and check the returned label shape.
    refit_labels = m.fit_predict([np.random.randn(10, 2)])
    eq(refit_labels[0].shape, (10,))

    # No distance may be NaN.
    assert not np.any(np.isnan(m.distances_[0]))
# Sequences of coordinates of the ligand aromatic ring and Asp113.
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds, this_sim)
    sequences_all.extend(this_seq)
# Debug output (disabled):
# print len(sequences_all)
# print sequences_all[-1].shape

# Average position of Asp113 (disabled):
# res_pos_ave = np.mean(res_pos_A_1[0], axis=0)
# time_step = util.calc_time_step(times_path, stride=LOAD_STRIDE)

# Cluster the featurized sequences into 10 states with KCenters and keep
# both the per-sequence assignments and the cluster centers.
clustering = KCenters(n_clusters=10)
assignments = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_
# Debug output (disabled):
# print len(assignments)
# print assignments[1].shape

# Fit a Markov state model on the assignments at a lag time of 180 and
# pull out its count and transition matrices.
msm = MarkovStateModel(lag_time=180, verbose=True).fit(assignments)
countsmat = msm.countsmat_
transmat = msm.transmat_
# print np.sum(countsmat)

# Write the matrices as text; saving the assignments is disabled.
# np.savetxt('/home/shenglan/TryMSMbuilder/output/assignments.out', assignments, fmt='%3.0f')
np.savetxt(
    '/home/shenglan/TryMSMbuilder/output/countsmat.out',
    countsmat,
    fmt='%8.4g',
)
np.savetxt(
    '/home/shenglan/TryMSMbuilder/output/transmat.out',
    transmat,
    fmt='%10.4g',
)
# Persist the distances gathered before this point. Every pickle file below
# is written through a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
dist_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/dist_to_binding'
             + '_s' + str(LOAD_STRIDE) + '.out')
with open(dist_path, 'wb') as out_file:
    pickle.dump(distances, out_file)

# Get N positions: featurize each simulation on its own (wrapped in a
# one-element list) and collect all resulting sequences.
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds_N, [this_sim])
    sequences_all.extend(this_seq)
seq_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/sequences'
            + '_s' + str(LOAD_STRIDE) + '.out')
with open(seq_path, 'wb') as out_file:
    pickle.dump(sequences_all, out_file)

# Geometric clustering of the sequences into N_CLUSTER states.
clustering = KCenters(n_clusters=N_CLUSTER)
geo_assign = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_
geo_assign_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_geoassign_c'
                   + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
with open(geo_assign_path, 'wb') as out_file:
    pickle.dump(geo_assign, out_file)

# Microstate MSM built on the geometric assignments at a lag time of 1,
# with the ergodic cutoff turned off.
micro_msm = MarkovStateModel(
    lag_time=1,
    reversible_type='transpose',
    ergodic_cutoff='off',
    verbose=True,
).fit(geo_assign)
msm_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_msm_c'
            + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
with open(msm_path, 'wb') as out_file:
    pickle.dump(micro_msm, out_file)
# Load the TS2 projections listed in "ticproj_TS2" (one entry per line) and
# keep only the first `num_tics_for_clustering` columns of each.
TS2_ticproj_list_array = []
tica_TS2_sequences = []
# `with` closes the listing file once it has been read; the bare
# `for line in open(...)` form left the handle open.
with open("ticproj_TS2") as ticproj_listing:
    for line in ticproj_listing:
        ticproj_name = line.strip()
        TS2_ticproj_list_array.append(ticproj_name)
        temp1 = numpy.loadtxt("%s/TS2_project_onto_GS_tics/%s_ticproj.txt" %
                              (outputdir, ticproj_name))
        temp1 = temp1[:, 0:num_tics_for_clustering]
        tica_TS2_sequences.append(temp1)
tmp_counter = 0

kcenters = KCenters(n_clusters=nMicro)
# Alternative setting (disabled):
# kcenters = KCenters(n_clusters=num_tics_for_clustering)
# The clustering is fit on the ground-state tICA sequences
# (`tica_sequences`), not on the TS2 projections loaded above.
kcenters_sequences = kcenters.fit_predict(tica_sequences)

# Single-argument call form prints the same text under Python 2 and 3.
print("begin to plot the microstate implied timescale into the objective dir")

# Plot implied timescales.
lag_times = range(10, 100, 10)  # adjust variables
n_timescales = 5  # adjust variables
msm_timescales = implied_timescales(
    kcenters_sequences,
    lag_times,
    n_timescales=n_timescales,
    msm=MarkovStateModel(verbose=True, reversible_type='transpose'),
)
# NOTE(review): presumably the tail of a loop whose header lies outside this
# fragment -- confirm the indentation against the surrounding code.
distances.append(this_lig)

# Every pickle file below is written through a `with` block so the handle is
# flushed and closed deterministically instead of being left open.
dist_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/dist_to_binding'
             + '_s' + str(LOAD_STRIDE) + '.out')
with open(dist_path, 'wb') as out_file:
    pickle.dump(distances, out_file)

# Get N positions: featurize each simulation on its own (wrapped in a
# one-element list) and collect all resulting sequences.
sequences_all = []
for this_sim in simulations:
    this_seq = util.featurize_RawPos(inds_N, [this_sim])
    sequences_all.extend(this_seq)
seq_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/sequences'
            + '_s' + str(LOAD_STRIDE) + '.out')
with open(seq_path, 'wb') as out_file:
    pickle.dump(sequences_all, out_file)

# Geometric clustering of the sequences into N_CLUSTER states.
clustering = KCenters(n_clusters=N_CLUSTER)
geo_assign = clustering.fit_predict(sequences_all)
centers = clustering.cluster_centers_
geo_assign_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_geoassign_c'
                   + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
with open(geo_assign_path, 'wb') as out_file:
    pickle.dump(geo_assign, out_file)

# Microstate MSM built on the geometric assignments at a lag time of 1,
# with the ergodic cutoff turned off.
micro_msm = MarkovStateModel(
    lag_time=1,
    reversible_type='transpose',
    ergodic_cutoff='off',
    verbose=True,
).fit(geo_assign)
msm_path = ('/home/shenglan/TryMSMbuilder/output/ten_ligands/KC_msm_c'
            + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
with open(msm_path, 'wb') as out_file:
    pickle.dump(micro_msm, out_file)

# map assignments
# NOTE(review): presumably the tail of a loop whose header lies outside this
# fragment -- confirm the indentation against the surrounding code.
inds_N.append(iis)
# Single-argument call form prints the same text under Python 2 and 3.
print(inds_N)

# Sequences of coordinates of ligands: featurize with averaging over
# `inds_all` when use_COM is set, otherwise use the raw `inds_N` positions.
sequences_all = []
for this_sim in simulations:
    if use_COM:
        this_seq = util.featurize_RawPos(inds_all, this_sim, average=True)
    else:
        this_seq = util.featurize_RawPos(inds_N, this_sim)
    sequences_all.extend(this_seq)

# Pickle files are written through `with` blocks so each handle is closed
# as soon as the dump finishes.
seqfile = ('/home/shenglan/TryMSMbuilder/output/sequences'
           + '_s' + str(LOAD_STRIDE) + '.out')
with open(seqfile, 'wb') as out_file:
    pickle.dump(sequences_all, out_file)

KC_clustering = KCenters(n_clusters=N_CLUSTER)
KC_assignments = KC_clustering.fit_predict(sequences_all)
KC_centers = KC_clustering.cluster_centers_

# NOTE(review): the "KM" model is also built with KCenters, so it repeats the
# KC clustering above. The naming suggests a different estimator may have
# been intended -- confirm with the author before changing it.
KM_clustering = KCenters(n_clusters=N_CLUSTER)
KM_assignments = KM_clustering.fit_predict(sequences_all)
KM_centers = KM_clustering.cluster_centers_

# Cluster centers are saved as text.
KC_output_file = ('/home/shenglan/TryMSMbuilder/output/KC_centers_c'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
KM_output_file = ('/home/shenglan/TryMSMbuilder/output/KM_centers_c'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
np.savetxt(KC_output_file, KC_centers, fmt='%10.4g')
np.savetxt(KM_output_file, KM_centers, fmt='%10.4g')

# Assignments are pickled.
KC_assign_file = ('/home/shenglan/TryMSMbuilder/output/KC_assign_'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
KM_assign_file = ('/home/shenglan/TryMSMbuilder/output/KM_assign_'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
with open(KC_assign_file, 'wb') as out_file:
    pickle.dump(KC_assignments, out_file)
with open(KM_assign_file, 'wb') as out_file:
    pickle.dump(KM_assignments, out_file)
def test_kcenters_5():
    """Euclidean and squared-Euclidean metrics must give the same labels.

    Both models use the same seed and are fit on the same seeded data, so
    their cluster assignments are compared directly.
    """
    shared_kwargs = dict(n_clusters=10, random_state=0)
    euclidean_model = KCenters(metric='euclidean', **shared_kwargs)
    squared_model = KCenters(metric='sqeuclidean', **shared_kwargs)

    points = np.random.RandomState(0).randn(100, 2)

    euclidean_labels = euclidean_model.fit_predict([points])[0]
    squared_labels = squared_model.fit_predict([points])[0]
    eq(euclidean_labels, squared_labels)
# Single-argument call form prints the same text under Python 2 and 3.
print(inds_N)

# Sequences of coordinates of ligands: featurize with averaging over
# `inds_all` when use_COM is set, otherwise use the raw `inds_N` positions.
sequences_all = []
for this_sim in simulations:
    if use_COM:
        this_seq = util.featurize_RawPos(inds_all, this_sim, average=True)
    else:
        this_seq = util.featurize_RawPos(inds_N, this_sim)
    sequences_all.extend(this_seq)

# Write the pickle through a `with` block so the handle is closed as soon as
# the dump finishes instead of being left open.
seqfile = ('/home/shenglan/TryMSMbuilder/output/sequences'
           + '_s' + str(LOAD_STRIDE) + '.out')
with open(seqfile, 'wb') as out_file:
    pickle.dump(sequences_all, out_file)

KC_clustering = KCenters(n_clusters=N_CLUSTER)
KC_assignments = KC_clustering.fit_predict(sequences_all)
KC_centers = KC_clustering.cluster_centers_

# NOTE(review): the "KM" model is also built with KCenters, so it repeats the
# KC clustering above. The naming suggests a different estimator may have
# been intended -- confirm with the author before changing it.
KM_clustering = KCenters(n_clusters=N_CLUSTER)
KM_assignments = KM_clustering.fit_predict(sequences_all)
KM_centers = KM_clustering.cluster_centers_

# Cluster centers are saved as text.
KC_output_file = ('/home/shenglan/TryMSMbuilder/output/KC_centers_c'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
KM_output_file = ('/home/shenglan/TryMSMbuilder/output/KM_centers_c'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')
np.savetxt(KC_output_file, KC_centers, fmt='%10.4g')
np.savetxt(KM_output_file, KM_centers, fmt='%10.4g')

KC_assign_file = ('/home/shenglan/TryMSMbuilder/output/KC_assign_'
                  + str(N_CLUSTER) + '_s' + str(LOAD_STRIDE) + '.out')