def test_score_1():
    """GMRQ on the training data must equal the sum of the leading eigenvalues.

    A model is fitted for several ``n_timescales`` settings on one short
    three-state trajectory and is then scored against that same trajectory.
    """
    trajectory = [
        0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 1, 1,
        0, 0, 0, 1, 2, 2, 2, 1, 1, 1, 0, 0,
    ]

    for n_timescales in (0, 1, 2):
        msm = MarkovStateModel(verbose=False, n_timescales=n_timescales)
        msm.fit([trajectory])

        # Scoring the data the model was fitted on should reproduce the
        # sum of the model's eigenvalues ...
        score_vs_eigenvalues = msm.score([trajectory])
        assert_approx_equal(score_vs_eigenvalues, msm.eigenvalues_.sum())

        # ... and agree with the model's own ``score_`` attribute.
        score_vs_attribute = msm.score([trajectory])
        assert_approx_equal(score_vs_attribute, msm.score_)
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    """Score one candidate feature subset and record the result.

    The columns selected by ``population_dihedral`` are taken from every
    array in ``diheds``, projected with tICA, clustered with
    MiniBatchKMeans, and used to cross-validate a MarkovStateModel with a
    5-fold ``KFold`` over the clustered trajectories. The median test
    score over the folds is stored in ``score_global`` under key ``i``.

    Parameters
    ----------
    population_dihedral
        Column selector applied to each array as
        ``X[:, population_dihedral]``.
    diheds
        Iterable of arrays that support two-axis indexing (presumably one
        feature array per trajectory -- confirm against the caller).
    score_global
        Mapping with an ``update`` method that receives ``{i: score}``.
    i
        Key under which the score for this candidate is stored.
    lock
        Object exposing ``acquire()`` / ``release()``; held while
        ``score_global`` is updated.

    Returns
    -------
    None
        The result is reported through ``score_global`` only.
    """
    import numpy as np
    import pandas as pd
    from msmbuilder.cluster import MiniBatchKMeans
    from msmbuilder.decomposition import tICA
    from msmbuilder.msm import MarkovStateModel
    from msmbuilder.preprocessing import RobustScaler
    from sklearn.cross_validation import KFold

    # Keep only the candidate's feature columns in every trajectory.
    new_diheds = [X[:, population_dihedral] for X in diheds]

    # NOTE(review): the original code ran RobustScaler and then immediately
    # overwrote its output with the unscaled features, so tICA has always
    # been fitted on unscaled data. That behaviour is preserved here;
    # decide whether scaling should really be applied or the call dropped.
    RobustScaler().fit_transform(new_diheds)
    tica_input = new_diheds

    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(tica_input)
    tica_trajs = tica_model.transform(tica_input)

    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)
    clustered_trajs = clusterer.fit_transform(tica_trajs)

    msm = MarkovStateModel(lag_time=50, n_timescales=5)

    n_states = [4]
    cv = KFold(len(clustered_trajs), n_folds=5)
    results = []
    for n in n_states:
        msm.n_states_ = n
        for fold, (train_index, test_index) in enumerate(cv):
            train_data = [clustered_trajs[idx] for idx in train_index]
            test_data = [clustered_trajs[idx] for idx in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            print(time_score)
            print(test_score)
            # Mean of the slowest timescale and the held-out score.
            av_score = (time_score + test_score) / 2
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': av_score,
                'n_states': n,
                'fold': fold
            })
            # NOTE(review): the source formatting was lost; confirm that
            # this print belongs inside the fold loop rather than after it.
            print(msm.timescales_)

    # Median of every score column over the folds, one row per n_states.
    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median)
            .drop('fold', axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)

    # Release the lock even if the update raises, so that other users of
    # the same lock are not blocked forever.
    lock.acquire()
    try:
        score_global.update({i: best_scorent})
    finally:
        lock.release()
# NOTE(review): this fragment relies on names bound outside the visible
# source (tica_data, nFolds, fold, n, n_tIC, kmeans, lagtime, results,
# sub_sampling_data, KFold, MarkovStateModel, pd, pickle). Its original
# indentation was lost; `fold` and `n` look like variables of enclosing
# loops -- confirm which of the statements below belong inside them.

# For every tICA trajectory, keep the train/test frames of fold `fold`.
train_data = []
test_data = []
for traj in tica_data:
    cv = KFold(len(traj), n_folds=nFolds)
    for current_fold, (train_index, test_index) in enumerate(cv):
        if current_fold == fold:
            train_data.append(traj[train_index])
            test_data.append(traj[test_index])

# Fit the clusterer on the output of sub_sampling_data (defined elsewhere;
# presumably thins the training frames by `stride`), then assign all frames.
reduced_train_data = sub_sampling_data(train_data, stride=100)
kmeans.fit(reduced_train_data)
assignments_train = kmeans.predict(train_data)
assignments_test = kmeans.predict(test_data)

# Build the MSM on the training assignments and score both splits.
msm = MarkovStateModel(lag_time=lagtime)
msm.fit(assignments_train)
train_score = msm.score_
test_score = msm.score(assignments_test)

results.append({
    'train_score': train_score,
    'test_score': test_score,
    'n_states': n,
    'fold': fold,
    'timescales': msm.timescales_
})

results = pd.DataFrame(results)
# Parenthesised form: same output on Python 2, valid syntax on Python 3.
print(results)

output_fn = "GMRQ_MSMs_score_for_tica_n_%d_%d.pkl" % (n_tIC, n)
# The with-block closes the file on exit; no explicit close() is needed.
with open(output_fn, 'wb') as result_fn:
    pickle.dump(results, result_fn)
kcenters = KCenters(n_clusters=n_Micro, metric='euclidean', random_state=0) kcenters.fit(train_data_projection) train_data_sequence = kcenters.predict( train_data_projection) test_data_sequence = kcenters.predict(test_data_projection) msm = MarkovStateModel( n_timescales=3, lag_time=100, reversible_type='transpose', verbose=False, sliding_window=True, ergodic_cutoff='on') #the parameters may change msm.fit(train_data_sequence) train_score = msm.score(train_data_sequence) test_score = msm.score(test_data_sequence) f1 = open( sub_resultdir + '/Fold_%d_tica_lagtime_%d_ntics_%d_nMicro_%d_gmrq.summary' % (fold, tica_correlation_time, n_tics, n_Micro), 'w') f1.write('train_score:%f' % (train_score)) f1.write('\n') f1.write('test_score:%f' % (test_score)) f1.write('\n') f1.close() print( 'computing implied timescale for training data' ) #the x-range to plot implied timescale should also change train_msm_timescales = implied_timescales( train_data_sequence,