def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
    '''Project the features onto tICA components, cluster them and save the results.

    The features are loaded (see ``feats`` / ``calculate_strides``), projected
    onto ``tica_components`` tICs at lag time ``tica_lagtime`` and clustered
    into ``n_clusters`` microstates. The following files are written below
    ``self.tICA_dir``:

    * ``lag_<lag>_coord_<n>.npy`` - tICA coordinates of every trajectory
    * ``lag_<lag>_clusters_<k>_sequences.npy`` - cluster label of every frame
    * ``lag_<lag>_clusters_<k>_center.npy`` - cluster centers
    * ``cluster_centers/populations.dat`` - normalized cluster populations
    * ``tica_<a>_<b>.png`` - one hexbin plot per pair of components (a < b)
    * ``cluster_centers/state_<label>_<percent>.pdb`` - one structure per
      cluster center (existing ``.pdb`` files in that directory are removed)

    Args:
        cluster_method: ``'kcenters'`` or ``'kmeans'``.
        calculate_strides: when True, ``self.calculate_stride_distances`` is
            called first and its saved output is used as the feature source.
        feats: optional path to a ``.npy`` feature file. When None the
            features come from ``self.data`` (or from the stride distances).

    Raises:
        SystemExit: if ``cluster_method`` is not a supported method.
        ValueError: if no feature trajectory could be assembled.

    The analysis parameters (lag time, number of components, ...) are
    hard-coded at the top of the function. ``base_dir`` is not defined here
    and has to be available at module scope.
    '''
    # tICA parameters
    tica_lagtime = 10            # determine from implied timescales
    tica_components = 8          # how many tICs to compute
    n_clusters = 100             # denotes number of microstates
    md_time_step = 0.02          # ns
    subsampled_time_step = 1.    # ns
    stride = int(subsampled_time_step / md_time_step)  # time step stride for sub-sampling
    equil_steps = 1              # time steps to be removed from start
    cluster_percentage_cutoff = 5  # clusters with a relative population less than this
                                   # number will not be labeled on plot i.e.
                                   # 0 : all clusters labeled
    verbose = False

    # Validate the requested method up front. It used to be overwritten by a
    # hard-coded 'kcenters', which silently ignored the caller's choice.
    if cluster_method not in ('kcenters', 'kmeans'):
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")

    print("\nCalculating tICA components...")

    # Load in feature files THIS WILL NEED TO BE CHANGED
    if feats is None:
        if calculate_strides:
            self.calculate_stride_distances(stride, equil_steps)
            data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
        else:
            data = self.data
    else:
        data = np.load(feats)

    # data is iterated as data[run][clone][gen]; every (run, clone) with at
    # least one usable gen becomes one feature trajectory. The originating
    # indices are remembered so that the matching xtc files can be located
    # later even when some clones contribute nothing.
    features = []
    feature_origins = []
    for run_index, run in enumerate(data):
        for clone_index, clone in enumerate(run):
            gen_seq = []
            for gen in clone:
                if gen is not None and gen[0] is not None:
                    if calculate_strides or feats is not None:
                        gen_seq.append(gen)
                    else:
                        gen_seq.append(gen[::stride])
            if len(gen_seq) > 0:
                gen_cat = np.concatenate(gen_seq)
                if calculate_strides:
                    features.append(gen_cat)
                else:
                    features.append(gen_cat[equil_steps:])
                feature_origins.append((run_index, clone_index))

    if not features:
        raise ValueError('No feature trajectories were found to analyze.')

    # The trajectories stay in a plain list: they may differ in length, so
    # they cannot be stacked into one regular array.
    print(len(features))
    print(features[0].shape)

    tica_coordinates = tICA(lag_time=tica_lagtime,
                            n_components=int(tica_components)).fit_transform(features)
    np.save('%s/lag_%d_coord_%d.npy' % (self.tICA_dir, tica_lagtime, tica_components),
            tica_coordinates)

    # All frames of all trajectories stacked; column k holds component k+1.
    all_coordinates = np.concatenate(tica_coordinates)

    # Perform clustering based on the cluster_method parameter.
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    else:
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)

    # Determine cluster assignment for each frame.
    sequences = clusters.fit_transform(tica_coordinates)
    np.save('%s/lag_%d_clusters_%d_sequences.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
            sequences)
    np.save('%s/lag_%d_clusters_%d_center.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
            clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as percentages for
    # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
    # Finally, save normalized counts.
    print("\nDetermining cluster populations...")
    center_dir = '%s/cluster_centers' % self.tICA_dir
    if not os.path.exists(center_dir):
        os.makedirs(center_dir)
    all_labels = np.concatenate(sequences)  # concatenated once, not once per cluster
    counts = np.array([np.count_nonzero(all_labels == i) for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]
    np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)

    # Plot all unique combinations of tICA components
    print("\nPlotting tICA components with cluster centers...")
    centers = np.asarray(clusters.cluster_centers_)
    high_pop = [i for i in range(len(centers)) if percentages[i] > cluster_percentage_cutoff]
    population_labels = [[i, "%.2f" % percentages[i]] for i in high_pop]
    component_pairs = list(itertools.combinations(range(1, tica_components + 1), 2))
    for figure_number, (tic_x, tic_y) in enumerate(tqdm.tqdm(component_pairs)):
        plt.figure(figure_number, figsize=(20, 16))
        plt.hexbin(all_coordinates[:, tic_x - 1], all_coordinates[:, tic_y - 1], bins='log')
        x_centers = centers[:, tic_x - 1]
        y_centers = centers[:, tic_y - 1]
        plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
        # The black star marks the very first frame of the data set.
        plt.plot(all_coordinates[0, tic_x - 1], all_coordinates[0, tic_y - 1],
                 color='k', marker='*', markersize=24)
        plt.xlabel('tic' + str(tic_x))
        plt.ylabel('tic' + str(tic_y))
        plt.title(self.proj_num)
        # Add labels for high-population cluster centers
        for label, center in zip(population_labels, high_pop):
            plt.annotate(
                label,
                xy=(x_centers[center], y_centers[center]), xytext=(-15, 15),
                textcoords='offset points', ha='right', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        plt.savefig('%s/tica_%d_%d.png' % (self.tICA_dir, tic_x, tic_y))
        plt.close()

    # Remove PDBs left over from a previous run before writing new ones.
    for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
        if filename.endswith('.pdb'):
            os.remove(self.tICA_dir + '/cluster_centers/' + filename)

    # Write out PDBs for each cluster center
    print("Performing cluster analytics and saving center PDBs...\n")
    for i, (run_index, clone_index) in enumerate(feature_origins):
        distances = clusters.distances_[i]
        n_snapshots = len(distances)
        # Determine frames that are cluster centers
        cluster_indices = np.arange(n_snapshots)[distances < 1e-6]
        if cluster_indices.size == 0:
            continue
        # Determine number of each cluster, correlates to populations.dat
        cluster_labels = sequences[i][cluster_indices]

        # Load every gen of this clone, stopping at the first one that
        # cannot be loaded (normally the first missing results directory).
        print('x: %d, y: %d, z: %d' % (run_index, clone_index, 0))
        traj_cat = []
        gen_index = 0
        while True:
            traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (
                self.proj_num, run_index, clone_index, gen_index)
            try:
                traj_cat.append(md.load(traj, top=self.gro_file))
            except Exception:
                break
            gen_index += 1
        if not traj_cat:
            # Without a trajectory there is nothing to extract a frame from;
            # reusing the previous clone's trajectory would save wrong states.
            print('No trajectory found for RUN%d CLONE%d; skipping its cluster centers.'
                  % (run_index, clone_index))
            continue
        trajectory_file = md.join(traj_cat)
        xtc_len = len(trajectory_file)

        # map the strided frame number back to xtc frame number
        # NOTE(review): without calculate_strides/feats the features drop
        # equil_steps *strided* frames, while this mapping skips equil_steps
        # *raw* frames before striding - confirm which one is intended.
        strided_frames = range(xtc_len)[equil_steps::stride]
        for j, (frame_index, label) in enumerate(zip(cluster_indices, cluster_labels)):
            xtc_frame = strided_frames[frame_index]
            cluster_traj = trajectory_file[xtc_frame]
            cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb' % (
                self.tICA_dir, label, percentages[label]))
            if verbose:
                print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)' % (
                    label, percentages[label]))
                print('traj_file: %s (%d/%d)' % (trajectory_file, i, len(features)))
                print('frame: %d (%d/%d centers from this trajectory)' % (
                    frame_index, j, len(cluster_indices)))
                print('strided: npy_frame/npy_len = %d/%d = %f' % (
                    frame_index, n_snapshots, frame_index / n_snapshots))
                print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n' % (
                    xtc_frame, xtc_len, xtc_frame / xtc_len))
def calculate_tica_components():
    """Run tICA on ``out*npy`` features, cluster, plot and save cluster centers.

    All settings (``tica_lagtime``, ``tica_components``, ``cluster_method``,
    ``n_clusters``, ``lagtimes``, ``n_timescales``, ``time_step``) and the
    lists ``tica_1`` ... ``tica_4`` are read from module scope. Everything is
    written to the current working directory:

    * tICA coordinates (``.npy`` and a joblib dump)
    * cluster sequences / centers (``.npy`` and a joblib dump)
    * ``populations.dat`` with the normalized cluster populations
    * ``tica_1_2.png``, ``tica_1_3.png``, ``tica_2_3.png``
    * ``state_<n>.pdb`` for every cluster center (``n`` is the label + 1)
    * implied timescales (joblib dump, ``.npy``, ``lagtimes.txt`` and a plot)

    The values of this run's components are appended to the module-level
    lists ``tica_1`` ... ``tica_4`` (as many as were computed); the plots use
    only the coordinates of this run.

    Raises:
        SystemExit: if ``cluster_method`` is neither 'kcenters' nor 'kmeans'.
    """
    print("Calculating tICA components...")
    # Sorted so that feature file i lines up with sorted("traj*xtc")[i] below
    # (presumably both sets share the same numbering - verify the naming).
    in_files = sorted(glob.glob("out*npy"))
    loaded_files = [np.load(filename) for filename in in_files]
    tica = tICA(lag_time=tica_lagtime, n_components=int(tica_components)).fit_transform(loaded_files)
    np.save('lag_%d_comp_%d.npy' % (tica_lagtime, tica_components), tica)
    tica_data = 'data_lag_%d_comp_%d' % (tica_lagtime, tica_components)
    joblib.dump(tica, tica_data)

    # Stack all frames once instead of re-reading the file that was just
    # written; column k holds component k+1.
    all_coords = np.concatenate(tica)
    n_available = all_coords.shape[1]
    for component, values in enumerate((tica_1, tica_2, tica_3, tica_4)):
        if component < n_available:
            values.extend(all_coords[:, component])

    # Clustering via KCenters or KMeans
    if cluster_method == 'kcenters':
        print("Clustering via KCenters...")
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        print("Clustering via KMeans...")
        clusters = KMeans(n_clusters)
    else:
        sys.exit("Invalid cluster_method. Use kmeans or kcenters.")
    sequences = clusters.fit_transform(tica)
    np.save('lag_%d_clusters_%d_sequences.npy' % (tica_lagtime, n_clusters), sequences)
    np.save('lag_%d_clusters_%d_center.npy' % (tica_lagtime, n_clusters), clusters.cluster_centers_)
    cluster_data = 'lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(sequences, cluster_data)

    # Determining cluster populations (how many frames are in each cluster)
    print("Determining cluster populations...")
    all_labels = np.concatenate(sequences)  # concatenated once, not once per cluster
    counts = np.array([np.count_nonzero(all_labels == i) for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]
    np.savetxt('populations.dat', normalized_counts)

    # Plotting the tICA components
    print("Plotting tICA components with cluster centers...")
    centers = np.asarray(clusters.cluster_centers_)
    percentage_labels = ["%.4f" % i for i in percentages]
    # (figure number, x component, y component), components are 0-based
    for figure_number, comp_x, comp_y in ((0, 0, 1), (1, 0, 2), (2, 1, 2)):
        if comp_y >= n_available:
            continue  # fewer than three components were computed
        plt.figure(figure_number)
        plt.hexbin(all_coords[:, comp_x], all_coords[:, comp_y], bins='log')
        x_centers = centers[:, comp_x]
        y_centers = centers[:, comp_y]
        plt.plot(x_centers, y_centers, 'wo')
        # adds percentage contribution for each cluster
        for label, x, y in zip(percentage_labels, x_centers, y_centers):
            plt.annotate(
                label,
                xy=(x, y), xytext=(-20, 20),
                textcoords='offset points', ha='right', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        plt.savefig('tica_%d_%d.png' % (comp_x + 1, comp_y + 1))

    # NOTE: the cluster entropy calculation was disabled by the original
    # author because it raised errors; it is intentionally not performed.

    # Writing out PDBs for cluster centers
    print("Performing cluster analytics and saving center PDBs...\n")
    xtc_files = sorted(glob.glob("traj*xtc"))
    for i, xtcfile in enumerate(xtc_files):
        distances = clusters.distances_[i]
        n_snapshots = len(distances)
        cluster_indices = np.arange(n_snapshots)[distances < 1e-6]  # frames that have centers
        cluster_labels = sequences[i][cluster_indices]              # number of cluster
        # Nothing is printed or saved for trajectories without a center.
        for frame, label in zip(cluster_indices, cluster_labels):
            print('Cluster center', label, 'was found in trajectory', str(i) + '.')
            print('It is found on frame', frame, 'and has a relative population of',
                  "%.4f" % percentages[label], '%.')
            cluster_traj = md.load_frame(xtcfile, frame, top='structure.gro')
            # Parenthesized on purpose: '%' binds tighter than '+', so the
            # unparenthesized form tried to add 1 to the formatted string.
            cluster_traj.save_pdb('state_%d.pdb' % (label + 1))

    # Calculating IPTs
    print("\nCalculating Implied Timescales...")
    timescales = implied_timescales(sequences, lagtimes, n_timescales=n_timescales,
                                    msm=MarkovStateModel(verbose=False))
    implied_timescale_data = 'ipt_lag_%d_clusters_%d.pkl' % (tica_lagtime, n_clusters)
    joblib.dump(timescales, implied_timescale_data)
    numpy_timescale_data = 'lag_%d_clusters_%d_timescales.npy' % (tica_lagtime, n_clusters)
    np.savetxt('lagtimes.txt', lagtimes)
    np.save(numpy_timescale_data, timescales)

    # Plotting IPTs (lagtimes and timescales)
    print("Plotting Implied Timescales...")
    plt.figure(42)
    for i in range(n_timescales):
        plt.plot(lagtimes * time_step, timescales[:, i] * time_step, 'o-')
    plt.yscale('log')
    plt.xlabel('lagtime (ns)')
    plt.ylabel('Implied timescales (ns)')
    plt.savefig('lag_%d_clusters_%d_.png' % (tica_lagtime, n_clusters))
def compute_tica_components():
    '''Run tICA on all features, cluster the unbiased data and save the results.

    Feature files ``<project_title>/features/P*R<run>_*npy`` are loaded for
    every run, projected with tICA, and the trajectories of the unbiased
    run are clustered. All remaining trajectories are assigned to the
    nearest of those cluster centers. Below ``<project_title>/tica_<n_clusters>``
    the function writes the tICA coordinates and model, the assignments and
    cluster centers, ``<cluster_method>_clusters/populations.dat``, one plot
    per pair in ``all_ticas`` and, for every cluster center, a PDB file, a
    record in ``cluster.dat`` and its feature vector in ``cluster_features.npy``.

    All settings (``project_title``, ``runs``, ``unbiased_runs``,
    ``tica_lagtime``, ``n_components``, ``n_clusters``, ``cluster_method``,
    ``cluster_percentage_cutoff``, ``all_ticas``, ``stride``, ``equil_steps``,
    ``structure_file``) are read from module scope.

    Returns:
        The value of ``score(features)`` of the fitted tICA model.

    Raises:
        SystemExit: if ``cluster_method`` is not kcenters/kmeans/kmedoids.
        ValueError: if none of the runs is listed in ``unbiased_runs``.
    '''
    verbose = False
    save_pdb = True
    color_by = 'cluster'

    if verbose:
        print("\nCalculating tICA components...")
    if not os.path.exists(project_title + '/tica_%d' % n_clusters):
        os.mkdir(project_title + '/tica_%d' % n_clusters)

    # load in feature files and determine indices of unbiased ensembles
    feature_files = []
    feature_runs = []  # run number of every entry of feature_files
    unbiased_indices = None
    for i in range(runs):
        run_files = sorted(glob.glob(project_title + '/features/' + "P*R%d_*npy" % i))
        feature_files += run_files
        feature_runs += [i] * len(run_files)
        if i in unbiased_runs:
            # NOTE(review): with several unbiased runs only the range of the
            # last one is kept - confirm that this is intended.
            unbiased_indices = [len(feature_files) - len(run_files), len(feature_files)]
    if unbiased_indices is None:
        raise ValueError('None of the %d runs is listed in unbiased_runs.' % runs)
    first_unbiased, end_unbiased = unbiased_indices
    features = [np.load(x) for x in feature_files]

    # perform tICA calculation (a single fit serves projection and score)
    tica_components = tICA(lag_time=tica_lagtime, n_components=int(n_components))
    tica_coordinates = tica_components.fit_transform(features)
    tica_score = tica_components.score(features)
    np.save('%s/tica_%d/tica_coords-lag_%d-comp_%d.npy' % (
        project_title, n_clusters, tica_lagtime, n_components), tica_coordinates)
    np.save('%s/tica_%d/tica_comps-lag_%d-comp_%d.npy' % (
        project_title, n_clusters, tica_lagtime, n_components), tica_components)

    # Perform clustering based on the cluster_method parameter.
    if verbose:
        print('Clustering via %s' % cluster_method)
    if cluster_method == 'kcenters':
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        clusters = KMeans(n_clusters)
    elif cluster_method == 'kmedoids':
        clusters = KMedoids(n_clusters)
    else:
        sys.exit('Invalid cluster_method. Use kcenters/kmeans/kmedoids.')

    # Cluster unbiased data and fit biased data to these centers.
    # sequences / clusters.distances_ hold one entry per *unbiased*
    # trajectory: entry k belongs to trajectory first_unbiased + k.
    sequences = clusters.fit_transform(tica_coordinates[first_unbiased:end_unbiased])
    new_assignments = []
    for i in tqdm.tqdm_notebook(range(len(tica_coordinates))):
        if first_unbiased <= i < end_unbiased:
            # unbiased trajectories keep the labels found by the clustering
            new_assignments.append(sequences[i - first_unbiased])
            continue
        tica_traj = tica_coordinates[i]
        if isinstance(tica_traj, np.ndarray):
            if not (tica_traj.dtype == 'float32' or tica_traj.dtype == 'float64'):
                tica_traj = tica_traj.astype('float64')
        labels, inertia = msmbuilder.libdistance.assign_nearest(
            tica_traj, clusters.cluster_centers_, metric='euclidean')
        new_assignments.append(labels)
    # new_assignments now has one entry per trajectory, in the order of
    # feature_files, so it stays aligned with tica_coordinates.
    np.save('%s/tica_%d/lag_%d_clusters_%d_assignments.npy' % (
        project_title, n_clusters, tica_lagtime, n_clusters), new_assignments)
    np.save('%s/tica_%d/lag_%d_clusters_%d_center.npy' % (
        project_title, n_clusters, tica_lagtime, n_clusters), clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as percentages for
    # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
    # Finally, save normalized counts.
    if verbose:
        print("\nDetermining cluster populations...")
    if not os.path.exists('%s/tica_%d/%s_clusters' % (project_title, n_clusters, cluster_method)):
        os.mkdir('%s/tica_%d/%s_clusters' % (project_title, n_clusters, cluster_method))
    if not os.path.exists('%s/tica_%d/plots' % (project_title, n_clusters)):
        os.mkdir('%s/tica_%d/plots' % (project_title, n_clusters))
    unbiased_labels = np.concatenate(sequences)  # concatenated once, not once per cluster
    counts = np.array([np.count_nonzero(unbiased_labels == i) for i in range(n_clusters)])
    normalized_counts = counts / float(counts.sum())
    percentages = [i * 100 for i in normalized_counts]
    np.savetxt('%s/tica_%d/%s_clusters/populations.dat' % (
        project_title, n_clusters, cluster_method), normalized_counts)

    # Plot all unique combinations of tICA components
    if verbose:
        print("\nPlotting tICA components...")
    all_coordinates = np.concatenate(tica_coordinates)
    all_assignments = np.concatenate(new_assignments)
    cluster_colors = matplotlib.cm.rainbow(np.linspace(0, 1, n_clusters))
    centers = np.asarray(clusters.cluster_centers_)
    high_pop = [i for i in range(len(centers)) if percentages[i] > cluster_percentage_cutoff]
    population_labels = [[i, "%.2f" % percentages[i]] for i in high_pop]
    # Only every (stride*2)-th frame is drawn as a colored point.
    sample_coordinates = all_coordinates[::stride * 2]
    sample_assignments = all_assignments[::stride * 2]
    for j in tqdm.tqdm_notebook(range(len(all_ticas)), leave=False):
        if not all_ticas[j][0] < all_ticas[j][1]:
            continue  # each pair is plotted once, with the lower tIC on x
        plt.figure(j, figsize=(20, 16))
        tICx, tICy = all_ticas[j][0] - 1, all_ticas[j][1] - 1
        plt.hexbin(all_coordinates[:, tICx], all_coordinates[:, tICy], bins='log')
        if color_by == 'cluster':
            # one plot call per cluster instead of one per frame
            for cluster in np.unique(sample_assignments):
                members = sample_assignments == cluster
                plt.plot(sample_coordinates[members, tICx], sample_coordinates[members, tICy],
                         color=cluster_colors[cluster], linestyle="", marker="o")
        x_centers = centers[:, tICx]
        y_centers = centers[:, tICy]
        plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
        # The black star marks the very first frame of the data set.
        plt.plot(all_coordinates[0, tICx], all_coordinates[0, tICy],
                 color='k', marker='*', markersize=24)
        plt.xlabel('tIC' + str(all_ticas[j][0]))
        plt.ylabel('tIC' + str(all_ticas[j][1]))
        plt.title(project_title)
        # Add labels for high-population cluster centers
        for label, center in zip(population_labels, high_pop):
            plt.annotate(
                label,
                xy=(x_centers[center], y_centers[center]), xytext=(-15, 15),
                textcoords='offset points', ha='right', va='bottom',
                bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
        plt.savefig('%s/tica_%d/plots/tica_%d_%d.png' % (
            project_title, n_clusters, all_ticas[j][0], all_ticas[j][1]))
        plt.close()

    # Write out PDBs for each cluster center
    if verbose:
        print("Performing cluster analytics and saving center PDBs...\n")
    if save_pdb:
        # xtc file belonging to every feature file (same order as features)
        trajectory_files = [
            re.sub('features', 'traj_data/RUN%d' % run, re.sub('npy', 'xtc', x))
            for x, run in zip(feature_files, feature_runs)]
        cluster_features = []
        records = []  # text blocks for cluster.dat, one per saved center
        for local_index in tqdm.tqdm_notebook(range(len(sequences)), leave=False):
            i = first_unbiased + local_index  # index into the full file lists
            distances = clusters.distances_[local_index]
            n_snapshots = len(distances)
            # Determine frames that are cluster centers
            cluster_indices = np.arange(n_snapshots)[distances < 1e-6]
            if cluster_indices.size == 0:
                continue
            # Determine number of each cluster, correlates to populations.dat
            cluster_labels = sequences[local_index][cluster_indices]
            # load center-containing xtcs to check length
            xtc_len = len(md.load(trajectory_files[i], top=structure_file))
            # map strided frame number back to xtc frame number
            strided_frames = range(xtc_len)[equil_steps::stride]
            for j, (frame_index, label) in enumerate(zip(cluster_indices, cluster_labels)):
                xtc_frame = strided_frames[frame_index]
                cluster_traj = md.load_frame(trajectory_files[i], xtc_frame, top=structure_file)
                cluster_features.append(features[i][frame_index])
                cluster_traj.save_pdb('%s/tica_%d/%s_clusters/state_%d.pdb' % (
                    project_title, n_clusters, cluster_method, label))
                records.append(
                    '\nSuccessfully saved PDB for cluster: %d, (rel.pop: %.3f)\n' % (
                        label, percentages[label])
                    + 'traj_file: %s (%d/%d)\n' % (trajectory_files[i], i, len(features))
                    + 'frame: %d (%d/%d centers from this trajectory)\n' % (
                        frame_index, j, len(cluster_indices))
                    + 'strided: npy_frame/npy_len = %d/%d = %f\n' % (
                        frame_index, n_snapshots, frame_index / n_snapshots)
                    + 're-mapped: orig_frame/xtc_len = %d/%d = %f\n' % (
                        xtc_frame, xtc_len, xtc_frame / xtc_len))
        # save cluster information; written once so that the records of all
        # centers are kept instead of only the last one
        if records:
            with open('%s/tica_%d/cluster.dat' % (project_title, n_clusters), 'w') as f:
                f.writelines(records)
        # save features corresponding to each cluster center
        np.save('%s/tica_%d/cluster_features.npy' % (project_title, n_clusters), cluster_features)

    return tica_score