def scale_data(self, scaler='Robust'):
    """Scale the featurized sequences and store the result on the instance.

    Parameters
    ----------
    scaler : str or object, optional
        Either the name ``'Robust'`` (default), which builds an
        ``msmbuilder.preprocessing.RobustScaler``, or an already constructed
        scaler object exposing ``fit_transform``.

    Raises
    ------
    ValueError
        If ``scaler`` is a string other than ``'Robust'``.

    Side effects
    ------------
    Sets ``self.scaled_data`` to ``scaler.fit_transform(self.sim_seqs)`` and
    prints progress messages to stdout.
    """
    print('Scale featurized data been called\n')
    print('-------------------------------\n')

    if isinstance(scaler, str):
        if scaler != 'Robust':
            # Fail loudly instead of calling .fit_transform on a plain string.
            raise ValueError(
                "Unsupported scaler name {!r}; expected 'Robust' or a scaler "
                "object with a fit_transform method".format(scaler))
        # Imported lazily so that a caller-supplied scaler object does not
        # require msmbuilder to be importable.
        from msmbuilder.preprocessing import RobustScaler
        scaler = RobustScaler()

    self.scaled_data = scaler.fit_transform(self.sim_seqs)
    print('scaled ', self.scaled_data[0].shape)
    print('-----------------------------------\n')
def calculate_fitness(population_dihedral, diheds, score_global, i, lock):
    """Score one candidate feature subset and record it in a shared mapping.

    The columns selected by ``population_dihedral`` are taken from every array
    in ``diheds``.  The selection is projected with tICA, clustered with
    mini-batch k-means, and a Markov state model is scored with 5-fold
    cross-validation over the trajectories.  The median test score across
    folds is written to ``score_global`` under key ``i``.

    Parameters
    ----------
    population_dihedral
        Column indexer applied as ``X[:, population_dihedral]``.
    diheds
        Sequence of 2-D arrays (one per trajectory).  They are used as given;
        no scaling is applied here.
    score_global
        Mutable mapping with an ``update`` method (e.g. a manager dict).
    i
        Key under which the score is stored.
    lock
        Lock usable as a context manager (e.g. ``multiprocessing.Lock``);
        held while ``score_global`` is updated.

    Returns
    -------
    None
        The result is communicated through ``score_global``.
    """
    import numpy as np
    import pandas as pd
    from msmbuilder.cluster import MiniBatchKMeans
    from msmbuilder.decomposition import tICA
    from msmbuilder.msm import MarkovStateModel

    pop_index = i

    # The input is deliberately used unscaled here: the earlier code fit a
    # RobustScaler and immediately overwrote its output with the raw
    # selection, so the scaling step was pure wasted work.
    selected = [X[:, population_dihedral] for X in diheds]

    tica_model = tICA(lag_time=2, n_components=5)
    tica_model.fit(selected)
    tica_trajs = tica_model.transform(selected)

    clusterer = MiniBatchKMeans(n_clusters=200, random_state=42)
    clustered_trajs = clusterer.fit_transform(tica_trajs)

    msm = MarkovStateModel(lag_time=50, n_timescales=5)

    # Build the folds once.  sklearn.cross_validation was removed in
    # scikit-learn 0.20, so prefer model_selection and only fall back to the
    # legacy module on very old installations.
    n_trajs = len(clustered_trajs)
    try:
        from sklearn.model_selection import KFold
    except ImportError:
        from sklearn.cross_validation import KFold
        folds = list(KFold(n_trajs, n_folds=5))
    else:
        folds = list(KFold(n_splits=5).split(np.arange(n_trajs)))

    n_states = [4]
    results = []
    for n in n_states:
        msm.n_states_ = n
        for fold, (train_index, test_index) in enumerate(folds):
            train_data = [clustered_trajs[k] for k in train_index]
            test_data = [clustered_trajs[k] for k in test_index]
            msm.fit(train_data)
            train_score = msm.score(train_data)
            test_score = msm.score(test_data)
            time_score = msm.timescales_[0]
            print(time_score)
            print(test_score)
            results.append({
                'train_score': train_score,
                'test_score': test_score,
                'time_score': time_score,
                'av_score': (time_score + test_score) / 2,
                'n_states': n,
                'fold': fold
            })
            print(msm.timescales_)

    results = pd.DataFrame(results)
    avgs = (results.groupby('n_states').aggregate(np.median).drop('fold',
                                                                  axis=1))
    best_nt = avgs['test_score'].idxmax()
    best_scorent = avgs.loc[best_nt, 'test_score']
    print(best_scorent)

    # The context manager guarantees the lock is released even if the update
    # raises.
    with lock:
        score_global.update({pop_index: best_scorent})
def _count_clusters(labels):
    """Return the number of distinct labels, not counting the noise label -1."""
    return len(set(labels)) - (1 if -1 in labels else 0)


def main():
    """Run multi-resolution DBSCAN on the alanine dipeptide example data.

    Steps, in order:

    1. Parse the command-line options (``--eps``, ``--min_samples``,
       ``--nlist``, ``--nprobe``).
    2. Load the msmbuilder ``AlanineDipeptide`` example trajectories,
       featurize phi/psi dihedrals and project them with a 2-component tICA.
    3. Estimate candidate ``eps`` values from the sorted pairwise distances
       of a random sample of frames of the first trajectory.
    4. Cluster the first trajectory with ``Faiss_DBSCAN`` at the first
       ``eps``, then re-cluster at each further (eps, min_samples) pair and
       merge with the first-pass labels via ``merge_assignments``, plotting
       every pass with ``plot_cluster``.
    5. Write ``eps_list.txt`` and ``min_samples_list.txt``.

    The ``--eps`` and ``--min_samples`` values are only echoed in the summary
    line; clustering uses the estimated ``eps`` list and fixed
    ``min_samples`` values.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-e', '--eps', help='eps', default=1, type=float)
    cli.add_argument('-m',
                     '--min_samples',
                     help='min_samples',
                     default=5,
                     type=int)
    cli.add_argument('-l', '--nlist', help='nlist', default=1000, type=int)
    cli.add_argument('-p', '--nprobe', help='nprob', default=10, type=int)

    # Parse up front so that --help or a malformed argument is reported
    # before the dataset is loaded and tICA is fit.
    args = cli.parse_args()
    eps = args.eps
    min_samples = args.min_samples
    nlist = args.nlist
    nprobe = args.nprobe
    IVFFlat = True

    # Download example dataset
    from msmbuilder.example_datasets import AlanineDipeptide
    ala2 = AlanineDipeptide(verbose=False)
    xyz = ala2.get().trajectories
    print(ala2.description())
    print("{} trajectories".format(len(xyz)))

    # msmbuilder does not keep track of units! You must keep track of your
    # data's timestep
    to_ns = 0.5
    print("with length {} ns".format(set(len(x) * to_ns for x in xyz)))

    from msmbuilder.featurizer import DihedralFeaturizer
    featurizer = DihedralFeaturizer(types=['phi', 'psi'])
    diheds = featurizer.fit_transform(xyz)
    print(xyz[0].xyz.shape)
    print(diheds[0].shape)

    from msmbuilder.preprocessing import RobustScaler
    scaler = RobustScaler()
    scaled_diheds = scaler.fit_transform(diheds)
    print(diheds[0].shape)
    print(scaled_diheds[0].shape)

    from msmbuilder.decomposition import tICA
    tica_model = tICA(lag_time=2, n_components=2)
    # NOTE(review): tICA is fit on the unscaled `diheds`; `scaled_diheds` is
    # only used for the shape print above. Confirm this is intended.
    tica_model.fit(diheds)
    tica_trajs = tica_model.transform(diheds)

    # Re-featurize without the sin/cos expansion to get raw angles for
    # plotting.
    featurizer = DihedralFeaturizer(types=['phi', 'psi'], sincos=False)
    diheds = featurizer.fit_transform(xyz)
    print(diheds[0].shape)
    print(tica_trajs[0].shape)

    phi_angles = np.degrees(diheds[0][:, 0])
    psi_angles = np.degrees(diheds[0][:, 1])
    print(phi_angles)

    # Only the first trajectory is clustered.
    X = tica_trajs[0].astype(np.float32)
    n_size = X.shape[0]
    dimension = X.shape[1]
    print('n_size = %d,\t dimension = %d,\t eps = %f, min_samples = %d' %
          (n_size, dimension, eps, min_samples))

    # ------------------------------------------------------------------
    # Estimate eps candidates from sampled pairwise distances.
    import random
    # Clamp so that random.sample does not raise on short trajectories.
    n_samples = min(1000, len(X))
    whole_samples = random.sample(list(X), n_samples)

    from metrics.pairwise import pairwise_distances
    sample_dist_metric = pairwise_distances(whole_samples,
                                            whole_samples,
                                            metric='l2')
    print(sample_dist_metric.shape)

    # Each unordered pair once: the strict upper triangle (i < j).
    upper = np.triu_indices(n_samples, k=1)
    sorted_sample_dist = np.sort(sample_dist_metric[upper])
    print("Len of samples:", len(sorted_sample_dist),
          np.max(sorted_sample_dist), np.min(sorted_sample_dist))

    eps_list = []
    len_samples = len(sorted_sample_dist)
    for percent in [0.30, 0.20, 0.10]:
        index = int(round(len_samples * percent))
        if index == len_samples:
            index -= 1
        eps_list.append(sorted_sample_dist[index])
    print(eps_list)

    # ------------------------------------------------------------------
    # do Clustering using MR -DBSCAN method
    clustering_name = "mr-dbscan_iter_"
    remove_outliers = False
    potential = False
    eps = eps_list[0]
    min_samples = 1
    len_frames = len(X)
    print("Total frames:", len_frames)
    print("Running first calculation")
    db = Faiss_DBSCAN(eps=eps,
                      min_samples=min_samples,
                      nlist=nlist,
                      nprobe=nprobe,
                      metric="l2",
                      GPU=False,
                      IVFFlat=IVFFlat)
    db.fit(X)
    old_assignments = db.labels_
    n_microstates = _count_clusters(old_assignments)
    print('Estimated number of clusters: %d' % n_microstates)

    # Calculating percentage of each states (outliers, label < 0, excluded)
    frame_bincount = np.bincount(old_assignments[old_assignments >= 0])
    frame_freq_index_sorted = np.argsort(frame_bincount)[::-1]  # descending
    frame_freq_percent_sorted = frame_bincount[
        frame_freq_index_sorted] / np.float32(len_frames)
    print(frame_freq_percent_sorted[0:10])
    print(frame_freq_index_sorted[0:10])

    iter_name = clustering_name + '0' + '_eps_' + str(
        eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
            n_microstates)
    plot_cluster(labels=old_assignments,
                 phi_angles=phi_angles,
                 psi_angles=psi_angles,
                 name=iter_name,
                 potential=potential)

    n_iterations = len(eps_list)
    print("n_iterations:", n_iterations)
    min_samples_list = [50, 30, 10]

    for i in range(1, n_iterations):
        eps = eps_list[i]
        min_samples = min_samples_list[i]
        db = Faiss_DBSCAN(eps=eps,
                          min_samples=min_samples,
                          nlist=nlist,
                          nprobe=nprobe,
                          metric="l2",
                          GPU=False,
                          IVFFlat=IVFFlat).fit(X)
        new_assignments = db.labels_

        # Outliers are only removed on the final pass.  Compare by value:
        # `is` on integers only works by accident of small-int caching.
        if i == n_iterations - 1:
            remove_outliers = True

        # NOTE(review): old_assignments is never updated inside this loop, so
        # every pass is merged against the first-pass labels. Confirm this is
        # intended.
        assignments = merge_assignments(new_assignments,
                                        old_assignments,
                                        remove_outliers=remove_outliers)
        n_microstates = _count_clusters(assignments)
        print("Iter:", i, "Running MR-DBSCAN at eps:", eps, 'min_sampes:',
              min_samples, 'Estimated number of clusters:', n_microstates)
        iter_name = clustering_name + str(i) + '_eps_' + str(
            eps) + '_min_samples_' + str(min_samples) + '_n_states_' + str(
                n_microstates)
        plot_cluster(labels=assignments,
                     phi_angles=phi_angles,
                     psi_angles=psi_angles,
                     name=iter_name,
                     potential=potential)

    np.savetxt("eps_list.txt", eps_list, fmt="%f", delimiter=",")
    np.savetxt("min_samples_list.txt",
               min_samples_list,
               fmt="%d",
               delimiter=",")
if __name__ == "__main__":
    # Script entry point: load trajectories, featurize raw positions against
    # a reference structure, plot per-feature variance, robust-scale the
    # features and project them with tICA.

    # Hard-coded, machine-specific input locations.
    trajectory_dir = '/Volumes/REA_Data/AADH/traj_5_rxts'
    topology_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.psf'
    reference_file = '/Users/robert_arbon/Code/AADH/Analysis/MSM_Reactants_Only/2agy_rxt.pdb'
    reference_traj = md.load(reference_file)

    # Load the meta data
    meta = load_metadata(traj_dir=trajectory_dir, top=topology_file)

    # Featurize: raw positions relative to the reference structure
    feature = RawPositionsFeaturizer(ref_traj=reference_traj)
    ftrajs = featurize(featurizer=feature, meta_data=meta)

    # Summarize: variance of each feature (axis 0 of the combined
    # trajectories), written to Variance.png
    variance = np.var(combine(ftrajs), axis=0)
    plot_features(variance, name='Variance.png', feature_name='Variance', ordered=False)

    # Normalize
    scaler = RobustScaler()
    strajs = scaler.fit_transform(ftrajs)

    # Perform tICA on the scaled features (10 components, lag time 10,
    # kinetic mapping enabled)
    tica_obj = tICA(n_components=10, lag_time=10, kinetic_mapping=True)
    tica_traj = tica_obj.fit_transform(strajs)
#featurizer = DihedralFeaturizer(types=['chi1', 'chi2'], resids= 73,74,75,76,77,78,79,80,81,82,83) diheds = featurizer.fit_transform(ds) dump(diheds, "features.pkl") #print(ds[0].shape) print(diheds[0].shape) # this basically maps every feature to atom indices. df1 = pd.DataFrame(featurizer.describe_features(ds)) dump(df1, "feature_descriptor.pkl") #Robust scaling from msmbuilder.preprocessing import RobustScaler scaler = RobustScaler() scaled_diheds = scaler.fit_transform(diheds) print(diheds[0].shape) print(scaled_diheds[0].shape) #Reducing dimension tica_model = tICA(lag_time=1, n_components=10) # fit and transform can be done in seperate steps: tica_model.fit(diheds) tica_trajs = tica_model.transform(diheds) print(diheds[0].shape) print(tica_trajs[0].shape) #lets dump the tica mdl for future use verbosedump(tica_model, "tica_mdl_flapchi1angle.pkl")
def main_modified(generations):
    """Run the genetic-algorithm feature selection for ``generations`` rounds.

    The dihedral features are robust-scaled once.  Each generation scores
    every individual of the population in its own process through
    ``calculate_fitness`` (started in batches of ``cpu_count()``), then builds
    the next population from crossover and binary-mutation offspring.

    Parameters
    ----------
    generations
        Number of generations to run; also embedded in the output file name.

    Returns
    -------
    tuple
        ``(for_each_gen_score, population_each_gen, scaled_feature,
        imp_features)`` where ``for_each_gen_score`` holds one
        ``{population index: score}`` dict per generation (ordered by
        ascending score) and ``population_each_gen`` holds the population
        evaluated in each generation.

    Side effects
    ------------
    Sets ``OMP_NUM_THREADS=1`` in the environment, prints process
    bookkeeping to stdout and appends the per-generation scores to
    ``benzamidine_diheds_ga_score<generations>.txt``.
    """
    import numpy as np
    from msmbuilder.preprocessing import RobustScaler
    import multiprocessing
    import operator
    import os

    # Set after the imports, as before; worker processes inherit it.
    os.environ["OMP_NUM_THREADS"] = "1"

    diheds = Get_dihedral_features_villin()
    scaled_feature = RobustScaler().fit_transform(diheds)

    # Laplacian_score returns an indexable result; only element 1
    # (imp_features) is needed here.
    imp_features = Laplacian_score(scaled_feature)[1]

    for_each_gen_score = []
    population_each_gen = []
    population_dihedral = initial_population(imp_features)
    cross_probability = 0.8
    numberOfThreads = multiprocessing.cpu_count()
    current_gen = 0

    score_file = "benzamidine_diheds_ga_score" + str(generations) + ".txt"
    # The file is opened up front so an unwritable destination fails before
    # any fitness evaluation.  A single manager serves all generations and is
    # shut down on exit instead of leaking one server process per generation.
    with open(score_file, "a") as f, multiprocessing.Manager() as manager:
        while current_gen < generations:
            score = manager.dict()
            lock = multiprocessing.Lock()
            processes = [
                multiprocessing.Process(target=calculate_fitness,
                                        args=(individual, scaled_feature,
                                              score, idx, lock))
                for idx, individual in enumerate(population_dihedral)
            ]

            # Run the workers in batches so that at most cpu_count()
            # processes are alive at once (chunks is defined elsewhere).
            for batch in chunks(processes, numberOfThreads):
                p_count = 0
                for process in batch:
                    process.start()
                    p_count = p_count + 1
                    print("the started process are" + str(p_count))
                for process in batch:
                    process.join()
                    p_count = p_count - 1
                    print("the joined process are" + str(p_count))
                # join() has already returned for each process, so this is
                # only a safeguard.
                for process in batch:
                    process.terminate()
                    p_count = p_count + 1
                    print("the terminated process are" + str(p_count))

            # Copy out of the manager proxy into a plain dict ordered by
            # ascending score.
            scored_population = dict(
                sorted(score.items(), key=operator.itemgetter(1)))
            for_each_gen_score.append(scored_population)
            population_each_gen.append(population_dihedral)

            parents = select_parents_rank_based(scored_population,
                                                population_dihedral,
                                                cross_probability)
            offsprings_1 = crossover(parents, population_dihedral)

            # Mutation fills whatever crossover left of the population size.
            parents_binary = parents_binarize(parents, imp_features)
            count_mutation = len(population_dihedral) - len(offsprings_1)
            offsprings_2_binary = mutation_binary_offspring(
                parents_binary, 4, count_mutation)
            offsprings_2 = binary_to_pop_dih(offsprings_2_binary)

            population_dihedral = (
                [np.asarray(child) for child in offsprings_1] +
                [np.asarray(child) for child in offsprings_2])
            current_gen = current_gen + 1

        print(for_each_gen_score, file=f)

    return for_each_gen_score, population_each_gen, scaled_feature, imp_features
import matplotlib
# Select the non-interactive Agg backend before the plotting interface is
# imported, so figures can be written to file.
matplotlib.use('Agg')
from matplotlib.pylab import plt
from utilities import plot_box

if __name__ == '__main__':
    # Script entry point: robust-scale previously saved feature
    # trajectories, box-plot a strided sample of the scaled values and save
    # both the scaled trajectories and the fitted scaler.

    # Load
    feature_name = 'Positions'
    meta, feature_trajs = load_trajs('Unscaled-{}-ftraj'.format(feature_name))

    # Select scaler
    featurizer = RobustScaler()

    # Transform values: fit on all trajectories (the return value is
    # discarded), then transform each trajectory individually so results
    # stay keyed like the input.
    featurizer.fit_transform(feature_trajs.values())
    scaled_trajs = {}
    for k, v in feature_trajs.items():
        scaled_trajs[k] = featurizer.partial_transform(v)

    # Plot scaled features, using every 100th frame of each trajectory
    ftrajs = np.concatenate([fx[::100] for fx in scaled_trajs.values()])
    fig, ax = plt.subplots(figsize=(15, 5))
    plot_box(ax, fxx=ftrajs, feature_name='Scaled {}'.format(feature_name))
    fig.tight_layout()
    fig.savefig("Scaled-{}-box.pdf".format(feature_name))

    # Save
    save_trajs(scaled_trajs, 'Scaled-{}-ftraj'.format(feature_name), meta)
    save_generic(featurizer, 'Scaled-{}-featurizer.pickl'.format(feature_name))