def test_statistical_inefficiency_multiple():
    X, Y, energy = generate_data()
    timeseries.statisticalInefficiencyMultiple(X)
    timeseries.statisticalInefficiencyMultiple(X**2)
    timeseries.statisticalInefficiencyMultiple(X[0, :]**2)
    timeseries.statisticalInefficiencyMultiple(X[0:2, :]**2)
    timeseries.statisticalInefficiencyMultiple(energy)

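For reference, here is a minimal self-contained sketch of what such a test exercises; the generate_uncorrelated_data helper is hypothetical (the original generate_data is not shown here), and pymbar 3's camelCase timeseries API is assumed.

import numpy as np
from pymbar import timeseries

def generate_uncorrelated_data(K=10, N=1000, seed=0):
    # Hypothetical stand-in for generate_data(): K independent Gaussian series of length N.
    rng = np.random.RandomState(seed)
    X = rng.normal(size=(K, N)) / 10.0
    Y = rng.normal(size=(K, N))
    energy = 10.0 * (X**2) / 2.0 + (Y**2) / 2.0
    return X, Y, energy

X, Y, energy = generate_uncorrelated_data()
# For uncorrelated samples the statistical inefficiency should come out close to 1.
print("g(X)      = %.2f" % timeseries.statisticalInefficiencyMultiple(X))
print("g(energy) = %.2f" % timeseries.statisticalInefficiencyMultiple(energy))
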
heavyIndices = np.array(heavyIndices)
cuIndices = np.array(cuIndices)

# Load in the potential energies, INCLUDING RESTRAINT, at all states for this simulation to figure out frames to skip
alcDat = np.loadtxt(alchemicalFile)
startTime = alcDat[0, 1]
startFrame = int(startTime) - 1
# Be careful here... the write frequency in the alchemical file needs to match exactly with the positions,
# AND we assume output was written in 1 ps increments.
# Also, the first frame in the trajectory is NOT at time zero, so subtract 1.
if endTime == -1:
    thisPot = alcDat[:, 3:-1]
else:
    thisPot = alcDat[:endTime, 3:-1]
thisg = timeseries.statisticalInefficiencyMultiple(thisPot)
print("Statistical inefficiency for this set of potential energies: %f" % thisg)
# print(startTime)
# print(startFrame)
# print(thisPot.shape)

# Next load in the trajectory and get all solute coordinates that matter
top.rb_torsions = pmd.TrackedList([])
top = pt.load_parmed(top, traj=False)
if endTime == -1:
    traj = pt.iterload(trajFile, top, frame_slice=(startFrame, -1))
else:
    traj = pt.iterload(trajFile, top,

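The statistical inefficiency computed above is typically used to pick uncorrelated frames before analyzing the trajectory. A sketch of that step, assuming the thisPot, thisg, and startFrame variables from the snippet above and pymbar's subsampleCorrelatedData:

from pymbar import timeseries

# Subsample one potential-energy column using the joint inefficiency thisg;
# the returned indices are row indices into thisPot.
uncorr_indices = timeseries.subsampleCorrelatedData(thisPot[:, 0], g=thisg)

# Shift by startFrame so the indices refer to the frames loaded with pt.iterload.
traj_frames = [startFrame + idx for idx in uncorr_indices]
print("Keeping %d of %d frames" % (len(traj_frames), thisPot.shape[0]))
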
def summarize_timeseries(concentrations):
    """
    Use bootstrap sampling together with statisticalInefficiencyMultiple to calculate the mean
    correlation function and its 95% confidence intervals, as well as the estimated
    autocorrelation time.

    Parameters
    ----------
    concentrations: np.ndarray
        Array with a different time series in each row.

    Returns
    -------
    mean_corr_func, lower, upper: numpy.ndarray
        The mean, 2.5th percentile, and 97.5th percentile of the estimated autocorrelation function.
    auto_corr_time_mean, auto_corr_time_std
        The mean autocorrelation time with its standard error.
    """
    boot_samples = 50
    auto_corr_time = np.zeros(boot_samples)
    corr_data = []
    for sample in range(boot_samples):
        ints = np.random.choice(3, 3)
        concs = [
            concentrations[ints[0], :], concentrations[ints[1], :],
            concentrations[ints[2], :]
        ]
        g, c = timeseries.statisticalInefficiencyMultiple(
            concs, return_correlation_function=True, fast=False)
        auto_corr_time[sample] = (g - 1) / 2.0
        corr_data += c

    # The correlation function may be computed up to a different maximum time for different
    # bootstrap samples, so find the overall maximum time.
    max_time = 0
    for tup in corr_data:
        if tup[0] > max_time:
            max_time = tup[0]

    # Unpack each bootstrapped correlation function for easier analysis.
    unpacked_corr_func = {}
    for i in range(max_time):
        unpacked_corr_func[i] = []
    for data in corr_data:
        unpacked_corr_func[data[0] - 1].append(data[1])

    # Work out the confidence intervals.
    mean_corr_func = np.zeros(max_time)
    lower = np.zeros(max_time)
    upper = np.zeros(max_time)
    for i in range(max_time):
        mean_corr_func[i] = np.mean(unpacked_corr_func[i])
        lower[i] = np.percentile(unpacked_corr_func[i], q=2.5)
        upper[i] = np.percentile(unpacked_corr_func[i], q=97.5)

    # When the first lower estimate hits zero, ensure that all subsequent points are also zero.
    zero_from = np.where(lower <= 0.0)[0][0]
    lower[zero_from:] = 0

    return mean_corr_func, lower, upper, auto_corr_time.mean(), auto_corr_time.std()

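One way to exercise summarize_timeseries is with synthetic data whose correlation time is known by construction; the AR(1) traces below are illustrative only and assume the function and its numpy/pymbar imports are already in scope.

import numpy as np

# Three correlated traces generated by an AR(1) process; with phi = 0.9 the
# autocorrelation time is of order 10 steps, so the bootstrap estimate should be comparable.
rng = np.random.RandomState(42)
n_steps, phi = 5000, 0.9
concentrations = np.zeros((3, n_steps))
for row in range(3):
    for t in range(1, n_steps):
        concentrations[row, t] = phi * concentrations[row, t - 1] + rng.normal()

mean_cf, lower, upper, tau_mean, tau_std = summarize_timeseries(concentrations)
print("estimated autocorrelation time: %.1f +/- %.1f steps" % (tau_mean, tau_std))
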
def umbrella_PMF(x_kn,
                 data_path,
                 eq,
                 k_bias,
                 save_path,
                 temps_to_use='All',
                 save=True,
                 max_time=None):
    """
    CURRENT FUNCTION USED TO COMPUTE SUBSTRUCTURE AND 1D PMF

    x_kn is a list of coordinate values along which you want to compute free energies.
    We assume that x_kn is either an array where each row is a timecourse at a given condition,
    with the rows in order of increasing reporter values (e.g. temp or setpoint),
    OR that x_kn is a 1D array where a new trajectory starts every t values, with the
    trajectories in order of increasing reporter values.
    You can also enter None, in which case x_kn is simply the native contacts.
    Alternatively you can enter x_kn as a string, for instance 'rmsd', in which case the
    program extracts that variable from the data.

    IMPORTANT: YOU NEED TO MAKE SURE THAT THE POINTS IN X_KN CORRESPOND TO THE SAME TIMEPOINTS
    AS THE POINTS IN THE DATA FILE... IF THEY DON'T, THE FUNCTION WILL TRY TO FIX IT BY
    SUBSAMPLING X_KN, BUT I DON'T TRUST THIS...

    data_path tells you where the native contacts data is located.
    eq tells you how many steps to leave out initially while the simulations equilibrate.
    k_bias is the spring constant.
    save_path is where you want to save the results, for instance
    'ADK_umbrella_multistart/Substructure_PMF.dat'.

    As a model for this, see pymbar/examples/umbrella-sampling-pmf/umbrella-sampling.py
    (may need to download this from GitHub; not sure it's on the home computer).

    On 3/17/20, added a parameter max_time, which is the last MC step to be used in the
    equilibrium calculations. Typically we use everything from eq to the end of the simulation
    (which is the case if max_time keeps its default value of None). But if you set a numerical
    value for max_time, a different range is used. For instance, if eq = 0 and
    max_time = 100000000, only the first 100000000 MC timesteps are used to compute the PMF,
    whereas if eq is set to, say, 150000000 and max_time is kept at its default value of None,
    everything from 150000000 onward is used to compute the PMF.
    """
    print("# loading data...")
    log_file_data, temperatures, setpoints, log_files, times, variables = load_data.load_log_data(
        data_path)
    energies_index = variables.index('energy')
    energies = log_file_data[:, :, energies_index]

    if 'natives' in variables:
        natives_index = variables.index('natives')
        natives = log_file_data[:, :, natives_index]
    else:  # assume no umbrella biasing
        natives = np.zeros(np.shape(energies))
        k_bias = 0

    if type(x_kn) == str:
        x_kn = log_file_data[:, :, variables.index(x_kn)]
    elif np.shape(x_kn) == ():
        x_kn = natives

    setpoints = np.array(setpoints)
    temperatures = np.array(temperatures)
    n_conditions = np.shape(natives)[0]
    x_kn = np.array(x_kn)
    del log_file_data

    if x_kn.ndim != 2:
        x_kn = np.reshape(x_kn, (n_conditions, int(len(x_kn) / n_conditions)))

    if temps_to_use != "All":
        indices_to_use = [
            t for t, temp in enumerate(temperatures) if temp in temps_to_use
        ]
        natives = natives[indices_to_use, :]
        energies = energies[indices_to_use, :]
        x_kn = x_kn[indices_to_use, :]
        n_conditions = len(indices_to_use)
        temperatures = temperatures[indices_to_use]
        setpoints = setpoints[indices_to_use]

    sample_frequency = int(np.shape(natives)[1] / np.shape(x_kn)[1])
    keep = np.arange(0, np.shape(natives)[1], sample_frequency)
    natives = natives[:, keep]
    energies = energies[:, keep]
    times = np.array([times[t] for t in range(len(times)) if t in keep])
    eq_index = np.where(times == eq)[0][0]
    if max_time == None:
        natives = natives[:, eq_index:]
        energies = energies[:, eq_index:]
        x_kn = x_kn[:, eq_index:]
    else:
        max_index = np.where(times == max_time)[0][0]
        natives = natives[:, eq_index:max_index]
        energies = energies[:, eq_index:max_index]
        x_kn = x_kn[:, eq_index:max_index]
        print(times[eq_index:max_index])

    n_timepoints = np.shape(natives)[1]

    print("# calculating potential...")
    N_k = np.array([n_timepoints for k in range(n_conditions)], np.int32)
    # u_kn=np.zeros()
    u_kln = np.zeros((n_conditions, n_conditions, n_timepoints))
    # u_kln tells you the reduced potential energy (energy/kbT + spring cost) that
    # point n from condition k would experience if it were to occur in some (other)
    # condition l.
    for k in range(n_conditions):
        for n in range(n_timepoints):
            u_kln[k, :, n] = energies[k, n] / temperatures + k_bias * (
                natives[k, n] - setpoints)**2
            # Have to add the bias in by hand, since the energies from the log files do not include it!

    print("# Computing normalizations...")
    mbar = pymbar.MBAR(u_kln, N_k)
    # This initialization computes the log partition functions for all conditions
    # (temperature/bias combinations).
    # dF = mbar.getFreeEnergyDifferences()[0][0,:]
    # In the previous steps, we computed the full trace (partition function) for all conditions
    # (setpoint and temperature combinations)...
    # We will now compute the free energies (partial trace over only snapshots assigned to a state)
    # under a DIFFERENT condition, which was not represented in the conditions whose normalizations
    # we just calculated: namely, the state in which there is no bias.

    unique_temperatures = np.unique(temperatures)
    unique_x = np.unique(x_kn)
    print('Computing state free energies...')
    x_n = x_kn.flatten()
    nbins = len(unique_x)
    bin_n = np.array([np.where(unique_x == x)[0][0] for x in x_n])
    free_energies = np.zeros((len(unique_temperatures), len(unique_x)))
    uncertainties = np.zeros((len(unique_temperatures), len(unique_x)))
    for t, temp in enumerate(unique_temperatures):
        print("Computing free energy at T={}".format(temp))
        u_n = energies.flatten() / unique_temperatures[t]
        # Reduced potential energy at the temperature we care about.
        # We do NOT include the bias in the formula above because we want the PMF specifically
        # under the condition of no bias.
        # f_i, df_i = mbar.computePMF(u_n, bin_n, nbins, uncertainties='from-normalization')
        f_i, df_i = mbar.computePMF(u_n, bin_n, nbins, uncertainties='from-lowest')
        # f_i, df_i = mbar.computePMF(u_n, bin_n, nbins, uncertainties='all-differences')
        f_i = f_i - np.min(f_i)  # set the lowest free energy to 0
        # f_i = f_i + np.log(np.sum(np.exp(-f_i)))  # normalize--this doesn't work well because you get overflow errors
        free_energies[t, :] = f_i
        uncertainties[t, :] = df_i
    """
    As for the uncertainties:
    Since I was not computing these for many of my previous proteins, I do not want to make a new
    variable, to avoid creating confusion with the number of variables to be loaded by joblib.
    Rather, what I will do from now on is append the uncertainties as a second page of the
    free_energies array.
    Also, the uncertainties are scaled by sqrt(N/N_eff), where N is the total number of samples
    and N_eff is the effective number of uncorrelated samples.
    """
    # First, compute the statistical inefficiency using all the data
    g = timeseries.statisticalInefficiencyMultiple(natives)
    NNN = len(x_n)
    N_eff = NNN / g
    uncertainties = uncertainties * np.sqrt(NNN / N_eff)
    free_energies = np.stack((free_energies, uncertainties), axis=2)

    if save:
        joblib.dump([unique_x, free_energies, temperatures], save_path)
    return unique_x, unique_temperatures, free_energies

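A small aside on the uncertainty scaling at the end of umbrella_PMF: since N_eff = N / g, the factor sqrt(N / N_eff) reduces to sqrt(g), i.e. the uncertainties are simply inflated by the square root of the statistical inefficiency. A tiny check with made-up numbers:

import numpy as np

g = 25.0      # hypothetical statistical inefficiency
NNN = 10000   # hypothetical total number of samples
N_eff = NNN / g
assert np.isclose(np.sqrt(NNN / N_eff), np.sqrt(g))
print("uncertainties are inflated by sqrt(g) = %.1f" % np.sqrt(g))
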
psi_kt_replica = psi_kt.copy()
for iteration in range(niterations):
    # Determine which snapshot indices are associated with this iteration
    snapshot_indices = iteration * trajectory_segment_length + numpy.arange(0, trajectory_segment_length)
    for k in range(K):
        # Determine which replica generated the data from temperature k at this iteration
        replica_index = replica_ik[iteration, k]
        # Reconstruct this portion of the replica trajectory.
        U_kt_replica[replica_index, snapshot_indices] = U_kt[k, snapshot_indices]
        phi_kt_replica[replica_index, snapshot_indices] = phi_kt[k, snapshot_indices]
        psi_kt_replica[replica_index, snapshot_indices] = psi_kt[k, snapshot_indices]

# Estimate the statistical inefficiency of the simulation by analyzing the timeseries of interest.
# We use the max over cos and sin of the phi and psi timeseries because they are periodic angles.
print "Computing statistical inefficiencies..."
g_cosphi = timeseries.statisticalInefficiencyMultiple(numpy.cos(phi_kt_replica * numpy.pi / 180.0))
print "g_cos(phi) = %.1f" % g_cosphi
g_sinphi = timeseries.statisticalInefficiencyMultiple(numpy.sin(phi_kt_replica * numpy.pi / 180.0))
print "g_sin(phi) = %.1f" % g_sinphi
g_cospsi = timeseries.statisticalInefficiencyMultiple(numpy.cos(psi_kt_replica * numpy.pi / 180.0))
print "g_cos(psi) = %.1f" % g_cospsi
g_sinpsi = timeseries.statisticalInefficiencyMultiple(numpy.sin(psi_kt_replica * numpy.pi / 180.0))
print "g_sin(psi) = %.1f" % g_sinpsi

# Subsample the data with the maximum of all correlation times.
print "Subsampling data..."
g = numpy.max(numpy.array([g_cosphi, g_sinphi, g_cospsi, g_sinpsi]))
indices = timeseries.subsampleCorrelatedData(U_kt[k, :], g=g)
print "Using g = %.1f to obtain %d uncorrelated samples per temperature" % (g, len(indices))
N_max = int(numpy.ceil(T / g))  # max number of samples per temperature
U_kn = numpy.zeros([K, N_max], numpy.float64)
phi_kn = numpy.zeros([K, N_max], numpy.float64)

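The cos/sin construction above is how periodic dihedral angles are made safe for autocorrelation analysis. A compact standalone illustration with a synthetic angle trajectory (the AR(1) angle series below is made up for the example):

import numpy
from pymbar import timeseries

# Synthetic replica trajectories of a periodic angle, in degrees (illustrative only).
numpy.random.seed(1)
K, T = 4, 2000
angles_kt = numpy.zeros([K, T])
for t in range(1, T):
    angles_kt[:, t] = 0.95 * angles_kt[:, t - 1] + numpy.random.normal(scale=10.0, size=K)

# Analyze cos and sin separately and keep the larger (more conservative) inefficiency.
g_cos = timeseries.statisticalInefficiencyMultiple(numpy.cos(angles_kt * numpy.pi / 180.0))
g_sin = timeseries.statisticalInefficiencyMultiple(numpy.sin(angles_kt * numpy.pi / 180.0))
print("g = %.1f" % max(g_cos, g_sin))
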
K = 10

if __name__ == "__main__":
    var = numpy.ones(N)
    for replica in xrange(2, K + 1):
        var = numpy.concatenate((var, numpy.ones(N)))
    X = numpy.random.normal(numpy.zeros(K * N), var).reshape((K, N)) / 10.0
    Y = numpy.random.normal(numpy.zeros(K * N), var).reshape((K, N))
    # X = numpy.random.normal(numpy.zeros(K*N), var).reshape((K,N))
    # Y = numpy.random.normal(numpy.zeros(K*N), var).reshape((K,N))
    # print "X.shape = "
    # print X.shape
    energy = 10 * (X**2) / 2.0 + (Y**2) / 2.0

    print "statisticalInefficiencyMultiple(X)"
    print timeseries.statisticalInefficiencyMultiple(X)
    print "statisticalInefficiencyMultiple(X**2)"
    print timeseries.statisticalInefficiencyMultiple(X**2)
    print "statisticalInefficiencyMultiple(X[0,:]**2)"
    print timeseries.statisticalInefficiencyMultiple(X[0, :]**2)
    print "statisticalInefficiencyMultiple(X[0:2,:]**2)"
    print timeseries.statisticalInefficiencyMultiple(X[0:2, :]**2)
    print "statisticalInefficiencyMultiple(energy)"
    print timeseries.statisticalInefficiencyMultiple(energy)

    # Exit with success.
    # TODO: Add some checks to test that statistical inefficiencies are within the normal expected range.
    sys.exit(0)

pickle_file = open(data_pickle_fn, 'wb')
dump((U_kn_correlated, A_ikn_correlated), pickle_file)
pickle_file.close()
print ""

#######################################################################
#        Subsample {U,A}_kn_correlated to be uncorrelated             #
#######################################################################
print "Subsampling to achieve uncorrelated data"
if stat_inefficiency == None:
    print "(1 of 2) Calculating statistical inefficiency (i = ",
    stdout.flush()
    for d in range(N_CVs):
        statnew = timeseries.statisticalInefficiencyMultiple(A_ikn_correlated[d])
        stat_inefficiency = max([stat_inefficiency, statnew])
    print stat_inefficiency, ")"
else:
    print "(1 of 2) Using given statistical inefficiency (i =", str(stat_inefficiency) + ")"

indices = timeseries.subsampleCorrelatedData(U_kn_correlated[0, :], g=stat_inefficiency)
N_uncorrelated_samples = len(indices)
print "(2 of 2) Subsampling to achieve", N_uncorrelated_samples, "samples per replica"

U_kn = zeros([N_replicas + N_output_temps, N_uncorrelated_samples], float32)
A_ikn = zeros([N_CVs, N_replicas + N_output_temps, N_uncorrelated_samples], float32)
for k in range(N_replicas):
    U_kn[k] = U_kn_correlated[k][indices]

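A compact illustration of the pattern used above, taking the most conservative statistical inefficiency over several collective variables and then subsampling with it; all names and data below are hypothetical, and pymbar 3's camelCase timeseries API is assumed.

import numpy as np
from pymbar import timeseries

# Hypothetical correlated data: N_CVs collective variables, N_replicas replicas, T samples each.
# Repeating each draw 10 times gives block-correlated series with g of roughly 10.
np.random.seed(0)
N_CVs, N_replicas, T = 2, 4, 5000
A_ikn = np.repeat(np.random.normal(size=(N_CVs, N_replicas, T // 10)), 10, axis=2)

# Use the largest inefficiency across all collective variables...
g = max(timeseries.statisticalInefficiencyMultiple(A_ikn[d]) for d in range(N_CVs))

# ...and subsample one replica's timeseries with it.
indices = timeseries.subsampleCorrelatedData(A_ikn[0, 0, :], g=g)
print("g = %.1f, keeping %d of %d samples" % (g, len(indices), T))
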