def make_replica_dcd_files(topology, timestep=5 * unit.femtosecond, time_interval=200, output_dir="output", output_data="output.nc", checkpoint_data="output_checkpoint.nc", frame_begin=0, frame_stride=1): """ Make dcd files from replica exchange simulation trajectory data. :param topology: OpenMM Topology :type topology: `Topology() <https://simtk.org/api_docs/openmm/api4_1/python/classsimtk_1_1openmm_1_1app_1_1topology_1_1Topology.html>`_ :param timestep: Time step used in the simulation (default=5*unit.femtosecond) :type timestep: `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>` float * simtk.unit :param time_interval: frequency, in number of time steps, at which positions were recorded (default=200) :type time_interval: int :param output_directory: path to which we will write the output (default='output') :type output_directory: str :param output_data: name of output .nc data file (default='output.nc') :type output_data: str :param checkpoint_data: name of checkpoint .nc data file (default='output_checkpoint.nc') :type checkpoint_data: str :param frame_begin: Frame at which to start writing the dcd trajectory (default=0) :type frame_begin: int :param frame_stride: advance by this many time intervals when writing dcd trajectories (default=1) :type frame_stride: int """ file_list = [] output_data_path = os.path.join(output_dir, output_data) # Get number of replicas: reporter = MultiStateReporter(output_data_path, open_mode="r") states = reporter.read_thermodynamic_states()[0] n_replicas = len(states) sampler_states = reporter.read_sampler_states(iteration=0) xunit = sampler_states[0].positions[0].unit for replica_index in range(n_replicas): replica_trajectory = extract_trajectory( topology, replica_index=replica_index, output_data=output_data_path, checkpoint_data=checkpoint_data, frame_begin=frame_begin, frame_stride=frame_stride) file_name = f"{output_dir}/replica_{replica_index+1}.dcd" file = open(file_name, "wb") dcd_file = DCDFile(file, topology, timestep, firstStep=frame_begin, interval=time_interval) for positions in replica_trajectory: # Add the units consistent with replica_energies positions *= xunit DCDFile.writeModel(dcd_file, positions) file.close() file_list.append(file_name) return file_list
def process_replica_exchange_data( output_data="output/output.nc", output_directory="output", series_per_page=4, write_data_file=True, plot_production_only=False, print_timing=False, equil_nskip=1, frame_begin=0, frame_end=-1, ): """ Read replica exchange simulation data, detect equilibrium and decorrelation time, and plot replica exchange results. :param output_data: path to output .nc file from replica exchange simulation, (default='output/output.nc') :type output_data: str :param output_directory: path to which output files will be written (default='output') :type output_directory: stry :param series_per_page: number of replica data series to plot per pdf page (default=4) :type series_per_page: int :param write_data_file: Option to write a text data file containing the state_energies array (default=True) :type write_data_file: Boolean :param plot_production_only: Option to plot only the production region, as determined from pymbar detectEquilibration (default=False) :type plot_production_only: Boolean :param equil_nskip: skip this number of frames to sparsify the energy timeseries for pymbar detectEquilibration (default=1) - this is used only when frame_begin=0 and the trajectory has less than 40000 frames. :type equil_nskip: Boolean :param frame_begin: analyze starting from this frame, discarding all prior as equilibration period (default=0) :type frame_begin: int :param frame_end: analyze up to this frame only, discarding the rest (default=-1). :type frame_end: int :returns: - replica_energies ( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ( np.float( [number_replicas,number_simulation_steps] ), simtk.unit ) ) - The potential energies for all replicas at all (printed) time steps - replica_state_indices ( np.int64( [number_replicas,number_simulation_steps] ), simtk.unit ) - The thermodynamic state assignments for all replicas at all (printed) time steps - production_start ( int - The frame at which the production region begins for all replicas, as determined from pymbar detectEquilibration - sample_spacing ( int - The number of frames between uncorrelated state energies, estimated using heuristic algorithm ) - n_transit ( np.float( [number_replicas] ) ) - Number of half-transitions between state 0 and n for each replica - mixing_stats ( tuple ( np.float( [number_replicas x number_replicas] ) , np.float( [ number_replicas ] ) , float( statistical inefficiency ) ) ) - transition matrix, corresponding eigenvalues, and statistical inefficiency """ t1 = time.perf_counter() # Read the simulation coordinates for individual temperature replicas reporter = MultiStateReporter(output_data, open_mode="r") t2 = time.perf_counter() if print_timing: print(f"open data time: {t2-t1}") # figure out what the time between output is. # We assume all use the same time step (which i think is required) mcmove = reporter.read_mcmc_moves()[0] time_interval = mcmove.n_steps * mcmove.timestep t3 = time.perf_counter() if print_timing: print(f"read_mcmc_moves time: {t3-t2}") # figure out what the temperature list is states = reporter.read_thermodynamic_states()[0] t4 = time.perf_counter() if print_timing: print(f"read_thermodynamics_states time: {t4-t3}") temperature_list = [] for s in states: temperature_list.append(s.temperature) analyzer = ReplicaExchangeAnalyzer(reporter) t5 = time.perf_counter() ( replica_energies, unsampled_state_energies, neighborhoods, replica_state_indices, ) = analyzer.read_energies() # Truncate output of read_energies() to last frame of interest if frame_end > 0: # Use frames from frame_begin to frame_end replica_energies = replica_energies[:, :, :frame_end] unsampled_state_energies = unsampled_state_energies[:, :, :frame_end] neighborhoods = neighborhoods[:, :, :frame_end] replica_state_indices = replica_state_indices[:, :frame_end] t6 = time.perf_counter() if print_timing: print(f"read_energies time: {t6-t5}") n_particles = np.shape( reporter.read_sampler_states(iteration=0)[0].positions)[0] temps = np.array([temp._value for temp in temperature_list]) beta_k = 1 / (kB * temps) n_replicas = len(temperature_list) for k in range(n_replicas): replica_energies[:, k, :] *= beta_k[k]**(-1) t7 = time.perf_counter() if print_timing: print(f"reduce replica energies time: {t7-t6}") total_steps = len(replica_energies[0][0]) state_energies = np.zeros([n_replicas, total_steps]) t8 = time.perf_counter() # there must be some better way to do this as list comprehension. for step in range(total_steps): for state in range(n_replicas): state_energies[state, step] = replica_energies[np.where( replica_state_indices[:, step] == state)[0], 0, step] t9 = time.perf_counter() if print_timing: print(f"assign state energies time: {t9-t8}") # can run physical-valication on these state_energies # Use pymbar timeseries module to detect production period t10 = time.perf_counter() # Start of equilibrated data: t0 = np.zeros((n_replicas)) # Statistical inefficiency: g = np.zeros((n_replicas)) subsample_indices = {} # If sufficiently large, discard the first 20000 frames as equilibration period and use # subsampleCorrelatedData to get the energy decorrelation time. if total_steps >= 40000 or frame_begin > 0: if frame_begin > 0: # If specified, use frame_begin as the start of the production region production_start = frame_begin else: # Otherwise, use frame 20000 production_start = 20000 for state in range(n_replicas): subsample_indices[state] = timeseries.subsampleCorrelatedData( state_energies[state][production_start:], conservative=True, ) g[state] = subsample_indices[state][1] - subsample_indices[state][0] else: # For small trajectories, use detectEquilibration for state in range(n_replicas): t0[state], g[state], Neff_max = timeseries.detectEquilibration( state_energies[state], nskip=equil_nskip) # Choose the latest equil timestep to apply to all states production_start = int(np.max(t0)) # Assume a normal distribution (very rough approximation), and use mean plus # the number of standard deviations which leads to (n_replica-1)/n_replica coverage # For 12 replicas this should be the mean + 1.7317 standard deviations # x standard deviations is the solution to (n_replica-1)/n_replica = erf(x/sqrt(2)) # This is equivalent to a target of 23/24 CDF value print(f"g: {g.astype(int)}") def erf_fun(x): return np.power((erf(x / np.sqrt(2)) - (n_replicas - 1) / n_replicas), 2) # x must be larger than zero opt_g_results = minimize_scalar(erf_fun, bounds=(0, 10)) if not opt_g_results.success: print("Error solving for correlation time, exiting...") print(f"erf opt results: {opt_g_results}") exit() sample_spacing = int(np.ceil(np.mean(g) + opt_g_results.x * np.std(g))) t11 = time.perf_counter() if print_timing: print(f"detect equil and subsampling time: {t11-t10}") print("state mean energies variance") for state in range(n_replicas): state_mean = np.mean(state_energies[state, production_start::sample_spacing]) state_std = np.std(state_energies[state, production_start::sample_spacing]) print(f" {state:4d} {state_mean:10.6f} {state_std:10.6f}") t12 = time.perf_counter() if write_data_file == True: f = open(os.path.join(output_directory, "replica_energies.dat"), "w") for step in range(total_steps): f.write(f"{step:10d}") for replica_index in range(n_replicas): f.write( f"{replica_energies[replica_index,replica_index,step]:12.6f}" ) f.write("\n") f.close() t13 = time.perf_counter() if print_timing: print(f"Optionally write .dat file: {t13-t12}") t14 = time.perf_counter() if plot_production_only == True: plot_replica_exchange_energies( state_energies[:, production_start:], temperature_list, series_per_page, time_interval=time_interval, time_shift=production_start * time_interval, file_name=f"{output_directory}/rep_ex_ener.pdf", ) plot_replica_exchange_energy_histograms( state_energies[:, production_start:], temperature_list, file_name=f"{output_directory}/rep_ex_ener_hist.pdf", ) plot_replica_exchange_summary( replica_state_indices[:, production_start:], temperature_list, series_per_page, time_interval=time_interval, time_shift=production_start * time_interval, file_name=f"{output_directory}/rep_ex_states.pdf", ) plot_replica_state_matrix( replica_state_indices[:, production_start:], file_name=f"{output_directory}/state_probability_matrix.pdf", ) else: plot_replica_exchange_energies( state_energies, temperature_list, series_per_page, time_interval=time_interval, file_name=f"{output_directory}/rep_ex_ener.pdf", ) plot_replica_exchange_energy_histograms( state_energies, temperature_list, file_name=f"{output_directory}/rep_ex_ener_hist.pdf", ) plot_replica_exchange_summary( replica_state_indices, temperature_list, series_per_page, time_interval=time_interval, file_name=f"{output_directory}/rep_ex_states.pdf", ) plot_replica_state_matrix( replica_state_indices, file_name=f"{output_directory}/state_probability_matrix.pdf", ) t15 = time.perf_counter() if print_timing: print(f"plotting time: {t15-t14}") # Analyze replica exchange state transitions # For each replica, how many times does the thermodynamic state go between state 0 and state n # For consistency with the other mixing statistics, use only the production region here replica_state_indices_prod = replica_state_indices[:, production_start:] # Number of one-way transitions from states 0 to n or states n to 0 n_transit = np.zeros((n_replicas, 1)) # Replica_state_indices is [n_replicas x n_iterations] for rep in range(n_replicas): last_bound = None for i in range(replica_state_indices_prod.shape[1]): if replica_state_indices_prod[ rep, i] == 0 or replica_state_indices_prod[rep, i] == ( n_replicas - 1): if last_bound is None: # This is the first time state 0 or n is visited pass else: if last_bound != replica_state_indices_prod[rep, i]: # This is a completed transition from 0 to n or n to 0 n_transit[rep] += 1 last_bound = replica_state_indices_prod[rep, i] t16 = time.perf_counter() if print_timing: print(f"replica transition analysis: {t16-t15}") # Compute transition matrix from the analyzer mixing_stats = analyzer.generate_mixing_statistics( number_equilibrated=production_start) t17 = time.perf_counter() if print_timing: print(f"compute transition matrix: {t17-t16}") print(f"total time elapsed: {t17-t1}") return (replica_energies, replica_state_indices, production_start, sample_spacing, n_transit, mixing_stats)
def make_state_pdb_files(topology, output_dir="output", output_data="output.nc", checkpoint_data="output_checkpoint.nc", frame_begin=0, frame_stride=1, center=True): """ Make pdb files by state from replica exchange simulation trajectory data. Note: these are discontinuous trajectories with constant temperature state. :param topology: OpenMM Topology :type topology: `Topology() <https://simtk.org/api_docs/openmm/api4_1/python/classsimtk_1_1openmm_1_1app_1_1topology_1_1Topology.html>`_ :param output_directory: path to which we will write the output (default='output') :type output_directory: str :param output_data: name of output .nc data file (default='output.nc') :type output_data: str :param checkpoint_data: name of checkpoint .nc data file (default='output_checkpoint.nc') :type checkpoint_data: str :param frame_begin: Frame at which to start writing the pdb trajectory (default=0) :type frame_begin: int :param frame_stride: advance by this many frames when writing pdb trajectories (default=1) :type frame_stride: int :param center: align the center of mass of each structure in the discontinuous state trajectory (default=True) :type center: Boolean :returns: - file_list ( List( str ) ) - A list of names for the files that were written """ file_list = [] output_data_path = os.path.join(output_dir, output_data) # Get number of states: reporter = MultiStateReporter(output_data_path, open_mode="r") states = reporter.read_thermodynamic_states()[0] sampler_states = reporter.read_sampler_states(iteration=0) xunit = sampler_states[0].positions[0].unit for state_index in range(len(states)): state_trajectory = extract_trajectory(topology, state_index=state_index, output_data=output_data_path, checkpoint_data=checkpoint_data, frame_begin=frame_begin, frame_stride=frame_stride) file_name = f"{output_dir}/state_{state_index+1}.pdb" file = open(file_name, "w") PDBFile.writeHeader(topology, file=file) modelIndex = 1 # TODO: replace this with MDTraj alignment tool if center == True: center_x = np.mean(state_trajectory[0, :, 0]) center_y = np.mean(state_trajectory[0, :, 1]) center_z = np.mean(state_trajectory[0, :, 2]) for positions in state_trajectory: if center == True: positions[:, 0] += (center_x - np.mean(positions[:, 0])) positions[:, 1] += (center_y - np.mean(positions[:, 1])) positions[:, 2] += (center_z - np.mean(positions[:, 2])) # Add the units consistent with replica_energies positions *= xunit PDBFile.writeModel(topology, positions, file=file, modelIndex=modelIndex) PDBFile.writeFooter(topology, file=file) file.close() file_list.append(file_name) return file_list
def make_state_dcd_files(topology, timestep=5 * unit.femtosecond, time_interval=200, output_dir="output", output_data="output.nc", checkpoint_data="output_checkpoint.nc", frame_begin=0, frame_stride=1, center=True): """ Make dcd files by state from replica exchange simulation trajectory data. Note: these are discontinuous trajectories with constant temperature state. :param topology: OpenMM Topology :type topology: `Topology() <https://simtk.org/api_docs/openmm/api4_1/python/classsimtk_1_1openmm_1_1app_1_1topology_1_1Topology.html>`_ :param timestep: Time step used in the simulation (default=5*unit.femtosecond) :type timestep: `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>` float * simtk.unit :param time_interval: frequency, in number of time steps, at which positions were recorded (default=200) :type time_interval: int :param output_directory: path to which we will write the output (default='output') :type output_directory: str :param output_data: name of output .nc data file (default='output.nc') :type output_data: str :param checkpoint_data: name of checkpoint .nc data file (default='output_checkpoint.nc') :type checkpoint_data: str :param frame_begin: Frame at which to start writing the dcd trajectory (default=0) :type frame_begin: int :param frame_stride: advance by this many time intervals when writing dcd trajectories (default=1) :type frame_stride: int :param center: align the center of mass of each structure in the discontinuous state trajectory (default=True) :type center: Boolean """ file_list = [] output_data_path = os.path.join(output_dir, output_data) # Get number of states: reporter = MultiStateReporter(output_data_path, open_mode="r") states = reporter.read_thermodynamic_states()[0] sampler_states = reporter.read_sampler_states(iteration=0) xunit = sampler_states[0].positions[0].unit for state_index in range(len(states)): state_trajectory = extract_trajectory(topology, state_index=state_index, output_data=output_data_path, checkpoint_data=checkpoint_data, frame_begin=frame_begin, frame_stride=frame_stride) file_name = f"{output_dir}/state_{state_index+1}.dcd" file = open(file_name, "wb") dcd_file = DCDFile(file, topology, timestep, firstStep=frame_begin, interval=time_interval) # TODO: replace this with MDTraj alignment tool if center == True: center_x = np.mean(state_trajectory[0, :, 0]) center_y = np.mean(state_trajectory[0, :, 1]) center_z = np.mean(state_trajectory[0, :, 2]) for positions in state_trajectory: if center == True: positions[:, 0] += (center_x - np.mean(positions[:, 0])) positions[:, 1] += (center_y - np.mean(positions[:, 1])) positions[:, 2] += (center_z - np.mean(positions[:, 2])) # Add the units consistent with replica_energies positions *= xunit DCDFile.writeModel(dcd_file, positions) file.close() file_list.append(file_name) return file_list
def bootstrap_free_energy_folding(Q, Q_folded, output_data="output/output.nc", frame_begin=0, sample_spacing=1, n_trial_boot=200, num_intermediate_states=0, conf_percent='sigma', plotfile_dir="output"): """ Function for computing uncertainty of free energy, entropy, and enthalpy using bootstrapping with varying starting frames. :param Q: native contact fraction array of size [n_frames x n_states] (with equilibration region already trimmed) :type Q: 2D numpy array ( float ) :param Q_folded: threshold for a native contact fraction corresponding to a folded state (Q[i,j] is folded if Q[i,j] >= Q_folded) :type Q_folded: float :param output_data: Path to the simulation .nc file. :type output_data: str :param frame_begin: index of first frame defining the range of samples to use as a production period (default=0) :type frame_begin: int :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData :type sample_spacing: int :param n_trial_boot: number of trials to run for generating bootstrapping uncertainties (default=200) :type n_trial_boot: int :param num_intermediate_states: Number of unsampled thermodynamic states between sampled states to include in the calculation :type num_intermediate_states: int :returns: - full_T_list - A 1D numpy array listing of all temperatures, including sampled and intermediate unsampled - deltaF_values - A dictionary of the form {"statei_statej": 1D numpy array}, containing free energy change for each T in full_T_list, for each conformational state transition. - deltaF uncertainty - A dictionary containing tuple of 1D numpy arrays of lower/upper of uncertainties corresponding to deltaF_values - deltaS_values - A dictionary of the form {"statei_statej": 1D numpy array}, containing entropy change for each T in full_T_list, for each conformational state transition. - deltaS uncertainty - A dictionary containing tuple of 1D numpy arrays of lower/upper uncertainties corresponding to deltaS_values - deltaU_values - A dictionary of the form {"statei_statej": 1D numpy array}, containing enthalpy change for each T in full_T_list, for each conformational state transition. - deltaU uncertainty - A dictionary containing tuple of 1D numpy arrays of lower/upper of uncertainties corresponding to deltaU_values """ # extract reduced energies and the state indices from the .nc reporter = MultiStateReporter(output_data, open_mode="r") analyzer = ReplicaExchangeAnalyzer(reporter) ( replica_energies_all, unsampled_state_energies, neighborhoods, replica_state_indices, ) = analyzer.read_energies() # Get temperature_list from .nc file: states = reporter.read_thermodynamic_states()[0] temperature_list = [] for s in states: temperature_list.append(s.temperature) # Select production frames to analyze replica_energies_prod = replica_energies_all[:, :, frame_begin::] # For shifting reference frame bootstrap, we need the entire Q and energy arrays starting from frame_start if np.shape(replica_energies_prod)[2] != np.shape(Q)[0]: print( f'Error: Q array of shape {Q.shape} incompatible with energies array of shape{replica_energies_prod.shape}' ) exit() Q_all = Q # Overall results: deltaF_values = {} deltaF_uncertainty = {} deltaS_values = {} deltaS_uncertainty = {} deltaU_values = {} deltaU_uncertainty = {} # Uncertainty for each sampling trial: deltaF_values_boot = {} deltaF_uncertainty_boot = {} deltaS_values_boot = {} deltaS_uncertainty_boot = {} deltaU_values_boot = {} deltaU_uncertainty_boot = {} # Get units: F_unit = (kB * unit.kelvin).unit # units of free energy T_unit = temperature_list[0].unit S_unit = F_unit / T_unit U_unit = F_unit for i_boot in range(n_trial_boot): # Here we can potentially change the reference frame for each bootstrap trial. # This requires the array slicing to be done here, not above. ref_shift = np.random.randint(sample_spacing) # Replica energies and Q already have equilibration period removed: replica_energies = replica_energies_prod[:, :, ref_shift::sample_spacing] Q = Q_all[ref_shift::sample_spacing, :] # Get all possible sample indices sample_indices_all = np.arange(0, len(replica_energies[0, 0, :])) # n_samples should match the size of the sliced replica energy dataset sample_indices = resample(sample_indices_all, replace=True, n_samples=len(sample_indices_all)) n_states = len(Q[0, :]) replica_energies_resample = np.zeros_like(replica_energies) # replica_energies is [n_states x n_states x n_frame] # Q is [nframes x n_states] Q_resample = np.zeros((len(sample_indices), n_states)) # Select the sampled frames from Q and replica_energies: j = 0 for i in sample_indices: replica_energies_resample[:, :, j] = replica_energies[:, :, i] Q_resample[j, :] = Q[i, :] j += 1 # Run free energy expectation calculation: full_T_list, deltaF_values_boot[i_boot], deltaF_uncertainty_boot[ i_boot] = expectations_free_energy( Q_resample, Q_folded, temperature_list, bootstrap_energies=replica_energies_resample, num_intermediate_states=num_intermediate_states, ) # Get entropy/enthalpy for fitting current free energy data: # The inner dictionary keys will be transition names deltaS_values_boot[i_boot] = {} deltaU_values_boot[i_boot] = {} deltaS_values_boot[i_boot], deltaU_values_boot[ i_boot] = get_entropy_enthalpy(deltaF_values_boot[i_boot], full_T_list) arr_deltaF_values_boot = {} arr_deltaS_values_boot = {} arr_deltaU_values_boot = {} # Loop over all conformational transitions: for key, value in deltaF_values_boot[0].items(): arr_deltaF_values_boot[key] = np.zeros( (n_trial_boot, len(full_T_list))) arr_deltaS_values_boot[key] = np.zeros( (n_trial_boot, len(full_T_list))) arr_deltaU_values_boot[key] = np.zeros( (n_trial_boot, len(full_T_list))) # Compute mean values: # Free energy: for i_boot in range(n_trial_boot): for key, value in deltaF_values_boot[i_boot].items(): arr_deltaF_values_boot[key][i_boot, :] = value.value_in_unit( F_unit) deltaF_values = {} for key, value in arr_deltaF_values_boot.items(): deltaF_values[key] = np.mean(value, axis=0) * F_unit # Entropy: for i_boot in range(n_trial_boot): arr_deltaS_values_boot[key][ i_boot, :] = deltaS_values_boot[i_boot][key].value_in_unit(S_unit) deltaS_values = {} for key, value in arr_deltaS_values_boot.items(): deltaS_values[key] = np.mean(value, axis=0) * S_unit # Enthalpy: for i_boot in range(n_trial_boot): arr_deltaU_values_boot[key][ i_boot, :] = deltaU_values_boot[i_boot][key].value_in_unit(U_unit) deltaU_values = {} for key, value in arr_deltaU_values_boot.items(): deltaU_values[key] = np.mean(value, axis=0) * U_unit # Compute confidence intervals: deltaF_uncertainty = {} deltaS_uncertainty = {} deltaU_uncertainty = {} if conf_percent == 'sigma': # Use analytical standard deviation instead of percentile method: # Free energy: for key, value in arr_deltaF_values_boot.items(): F_std = np.std(value, axis=0) * F_unit deltaF_uncertainty[key] = (-F_std, F_std) # Entropy: for key, value in arr_deltaS_values_boot.items(): S_std = np.std(value, axis=0) * S_unit deltaS_uncertainty[key] = (-S_std, S_std) # Enthalpy: for key, value in arr_deltaU_values_boot.items(): U_std = np.std(value, axis=0) * U_unit deltaU_uncertainty[key] = (-U_std, U_std) else: # Compute specified confidence interval: p_lo = (100 - conf_percent) / 2 p_hi = 100 - p_lo # Free energy: for key, value in arr_deltaF_values_boot.items(): F_diff = value - np.mean(value, axis=0) F_conf_lo = np.percentile( F_diff, p_lo, axis=0, interpolation='linear') * F_unit F_conf_hi = np.percentile( F_diff, p_hi, axis=0, interpolation='linear') * F_unit deltaF_uncertainty[key] = (F_conf_lo, F_conf_hi) # Entropy: for key, value in arr_deltaS_values_boot.items(): S_diff = value - np.mean(value, axis=0) S_conf_lo = np.percentile( S_diff, p_lo, axis=0, interpolation='linear') * S_unit S_conf_hi = np.percentile( S_diff, p_hi, axis=0, interpolation='linear') * S_unit deltaS_uncertainty[key] = (S_conf_lo, S_conf_hi) # Enthalpy: for key, value in arr_deltaU_values_boot.items(): U_diff = value - np.mean(value, axis=0) U_conf_lo = np.percentile( U_diff, p_lo, axis=0, interpolation='linear') * U_unit U_conf_hi = np.percentile( U_diff, p_hi, axis=0, interpolation='linear') * U_unit deltaU_uncertainty[key] = (U_conf_lo, U_conf_hi) # Plot results: # Free energy: plot_free_energy_results(full_T_list, deltaF_values, deltaF_uncertainty, plotfile=f"{plotfile_dir}/free_energy_boot.pdf") # Entropy and enthalpy: plot_entropy_enthalpy( full_T_list, deltaS_values, deltaU_values, deltaS_uncertainty=deltaS_uncertainty, deltaU_uncertainty=deltaU_uncertainty, plotfile_entropy=f"{plotfile_dir}/entropy_boot.pdf", plotfile_enthalpy=f"{plotfile_dir}/enthalpy_boot.pdf") return full_T_list, deltaF_values, deltaF_uncertainty, deltaS_values, deltaS_uncertainty, deltaU_values, deltaU_uncertainty
def get_heat_capacity(frame_begin=0, sample_spacing=1, frame_end=-1, output_data="output/output.nc", num_intermediate_states=0, frac_dT=0.05, plot_file=None, bootstrap_energies=None): """ Given a .nc output and a number of intermediate states to insert for the temperature list, this function calculates and plots the heat capacity profile. :param frame_begin: index of first frame defining the range of samples to use as a production period (default=0) :type frame_begin: int :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData (default=1) :type sample_spacing: int :param frame_end: index of last frame to include in heat capacity calculation (default=-1) :type frame_end: int :param output_data: Path to the output data for a NetCDF-formatted file containing replica exchange simulation data (default = "output/output.nc") :type output_data: str :param num_intermediate_states: The number of states to insert between existing states in 'temperature_list' (default=0) :type num_intermediate_states: int :param frac_dT: The fraction difference between temperatures points used to calculate finite difference derivatives (default=0.05) :type num_intermediate_states: float :param plotfile: path to filename to output plot :type plotfile: str :param bootstrap_energies: a custom replica_energies array to be used for bootstrapping calculations. Used instead of the energies in the .nc file. :type bootstrap_energies: 2d numpy array (float) :returns: - C_v ( List( float ) ) - The heat capacity values for all (including inserted intermediates) states - dC_v ( List( float ) ) - The uncertainty in the heat capacity values for intermediate states - new_temp_list ( List( float * unit.simtk.temperature ) ) - The temperature list corresponding to the heat capacity values in 'C_v' """ if bootstrap_energies is not None: # Use a subsampled replica_energy matrix instead of reading from file replica_energies = bootstrap_energies # Still need to get the thermodynamic states reporter = MultiStateReporter(output_data, open_mode="r") else: # extract reduced energies and the state indices from the .nc reporter = MultiStateReporter(output_data, open_mode="r") analyzer = ReplicaExchangeAnalyzer(reporter) ( replica_energies_all, unsampled_state_energies, neighborhoods, replica_state_indices, ) = analyzer.read_energies() # Select production frames to analyze if frame_end > 0: replica_energies = replica_energies_all[:, :, frame_begin: frame_end:sample_spacing] else: replica_energies = replica_energies_all[:, :, frame_begin:: sample_spacing] # Get the temperature list from .nc file: states = reporter.read_thermodynamic_states()[0] temperature_list = [] for s in states: temperature_list.append(s.temperature) # determine the numerical values of beta at each state in units consistent with the temperature Tunit = temperature_list[0].unit temps = np.array([temp.value_in_unit(Tunit) for temp in temperature_list ]) # should this just be array to begin with beta_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole / Tunit) * temps) # convert the energies from replica/evaluated state/sample form to evaluated state/sample form replica_energies = pymbar.utils.kln_to_kn(replica_energies) n_samples = len(replica_energies[0, :]) # calculate the number of states we need expectations at. We want it at all of the original # temperatures, each intermediate temperature, and then temperatures +/- from the original # to take finite derivatives. # create an array for the temperature and energy for each state, including the # finite different state. num_sampled_T = len(temps) n_unsampled_states = 3 * (num_sampled_T + (num_sampled_T - 1) * num_intermediate_states) unsampled_state_energies = np.zeros([n_unsampled_states, n_samples]) full_T_list = np.zeros(n_unsampled_states) # delta is the spacing between temperatures. delta = np.zeros(num_sampled_T - 1) # fill in a list of temperatures at all original temperatures and all intermediate states. full_T_list[0] = temps[0] t = 0 for i in range(num_sampled_T - 1): delta[i] = (temps[i + 1] - temps[i]) / (num_intermediate_states + 1) for j in range(num_intermediate_states + 1): full_T_list[t] = temps[i] + delta[i] * j t += 1 full_T_list[t] = temps[-1] n_T_vals = t + 1 # add additional states for finite difference calculation and the requested spacing/ full_T_list[n_T_vals] = full_T_list[0] - delta[0] * frac_dT full_T_list[2 * n_T_vals] = full_T_list[0] + delta[0] * frac_dT for i in range(1, n_T_vals - 1): ii = i // (num_intermediate_states + 1) full_T_list[i + n_T_vals] = full_T_list[i] - delta[ii] * frac_dT full_T_list[i + 2 * n_T_vals] = full_T_list[i] + delta[ii] * frac_dT full_T_list[2 * n_T_vals - 1] = full_T_list[n_T_vals - 1] - delta[-1] * frac_dT full_T_list[3 * n_T_vals - 1] = full_T_list[n_T_vals - 1] + delta[-1] * frac_dT # calculate betas of all of these temperatures beta_full_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole / Tunit) * full_T_list) ti = 0 N_k = np.zeros(n_unsampled_states) for k in range(n_unsampled_states): # Calculate the reduced energies at all temperatures, sampled and unsample. unsampled_state_energies[ k, :] = replica_energies[0, :] * (beta_full_k[k] / beta_k[0]) if ti < len(temps): # store in N_k which states do and don't have samples. if full_T_list[k] == temps[ti]: ti += 1 N_k[k] = n_samples // len( temps) # these are the states that have samples # call MBAR to find weights at all states, sampled and unsampled mbarT = pymbar.MBAR(unsampled_state_energies, N_k, verbose=False, relative_tolerance=1e-12) for k in range(n_unsampled_states): # get the 'unreduced' potential -- we can't take differences of reduced potentials # because the beta is different; math is much more confusing with derivatives of the reduced potentials. unsampled_state_energies[k, :] /= beta_full_k[k] # we don't actually need these expectations, but this code can be used to validate #results = mbarT.computeExpectations(unsampled_state_energies, state_dependent=True) #E_expect = results[0] #dE_expect = results[1] # expectations for the differences between states, which we need for numerical derivatives results = mbarT.computeExpectations(unsampled_state_energies, output="differences", state_dependent=True) DeltaE_expect = results[0] dDeltaE_expect = results[1] # Now calculate heat capacity (with uncertainties) using the finite difference approach. Cv = np.zeros(n_T_vals) dCv = np.zeros(n_T_vals) for k in range(n_T_vals): im = k + n_T_vals # +/- delta up and down. ip = k + 2 * n_T_vals Cv[k] = (DeltaE_expect[im, ip]) / (full_T_list[ip] - full_T_list[im]) dCv[k] = (dDeltaE_expect[im, ip]) / (full_T_list[ip] - full_T_list[im]) # add units so the plot has the right units. Cv *= unit.kilojoule_per_mole / Tunit # always kJ/mol, since the OpenMM output is in kJ/mol. dCv *= unit.kilojoule_per_mole / Tunit full_T_list *= Tunit # plot and return the heat capacity (with units) if plot_file is not None: plot_heat_capacity(Cv, dCv, full_T_list[0:n_T_vals], file_name=plot_file) return (Cv, dCv, full_T_list[0:n_T_vals])
def physical_validation_ensemble(output_data="output.nc", output_directory="ouput", plotfile='ensemble_check', pairs='single', ref_state_index=0): """ Run ensemble physical validation test for 2 states in replica exchange simulation :param output_data: Path to the output data for a NetCDF-formatted file containing replica exchange simulation data :type output_data: str :param plotfile: Filename for outputting ensemble check plot :type plotfile: str :param pairs: Option for running ensemble validation on all replica pair combinations ('all'), adjacent pairs ('adjacent'), or single pair with optimal spacing ('single') :param ref_state_index: Index in temperature_list to use as one of the states in the ensemble check. The other state will be chosen based on the energy standard deviation at the reference state. Ignored if pairs='all' :type ref_state_index: int """ # Get temperature list and read the energies for individual temperature replicas reporter = MultiStateReporter(output_data, open_mode="r") analyzer = ReplicaExchangeAnalyzer(reporter) states = reporter.read_thermodynamic_states()[0] temperature_list = [] for s in states: temperature_list.append(s.temperature) ( replica_energies, unsampled_state_energies, neighborhoods, replica_state_indices, ) = analyzer.read_energies() n_particles = np.shape( reporter.read_sampler_states(iteration=0)[0].positions)[0] T_unit = temperature_list[0].unit temps = np.array([temp.value_in_unit(T_unit) for temp in temperature_list]) beta_k = 1 / (kB.value_in_unit(unit.kilojoule_per_mole / T_unit) * temps) n_replicas = len(temperature_list) for k in range(n_replicas): replica_energies[:, k, :] *= beta_k[k]**(-1) total_steps = len(replica_energies[0][0]) state_energies = np.zeros([n_replicas, total_steps]) for step in range(total_steps): for state in range(n_replicas): state_energies[state, step] = replica_energies[np.where( replica_state_indices[:, step] == state)[0], 0, step] state_energies *= unit.kilojoule_per_mole T_array = np.zeros(len(temperature_list)) for i in range(len(temperature_list)): T_array[i] = temperature_list[i].value_in_unit(T_unit) if pairs.lower() != 'single' and pairs.lower( ) != 'adjacent' and pairs.lower() != 'all': print( f"Error: Pair option '{pairs}' not recognized, using default option 'single'" ) pairs = 'single' if pairs.lower() == 'single': # Run ensemble validation on one optimally spaced temperature pair quantiles = {} # Find optimal state pair for ensemble check: # Compute standard deviations of each energy distribution: state_energies_std = np.std(state_energies, axis=1) # Select reference state point T_ref = temperature_list[ref_state_index] std_ref = state_energies_std[ref_state_index] # Compute optimal spacing: deltaT = 2 * kB * T_ref**2 / std_ref #print("DeltaT: %r" %deltaT) # Find closest match T_diff = np.abs(T_ref.value_in_unit(T_unit) - T_array) T_opt_index = np.argmin(np.abs(deltaT.value_in_unit(T_unit) - T_diff)) T_opt = temperature_list[T_opt_index] # Set SimulationData for physical validation state1_index = ref_state_index state2_index = T_opt_index sim_data1, sim_data2 = set_simulation_data(state_energies, T_array, state1_index, state2_index) # Run physical validation try: quantiles_ij = pv.ensemble.check(sim_data1, sim_data2, total_energy=False, filename=plotfile) quantiles[ f"state{state1_index}_state{state2_index}"] = quantiles_ij[0] except InputError: print( f"Insufficient overlap between trajectories for states {state1_index} and {state2_index}. Skipping..." ) elif pairs.lower() == 'adjacent': # Run ensemble validation on all adjacent temperature pairs quantiles = {} for i in range(len(temperature_list) - 1): # Set SimulationData for physical validation state1_index = i state2_index = i + 1 sim_data1, sim_data2 = set_simulation_data(state_energies, T_array, state1_index, state2_index) # Run physical validation try: quantiles_ij = pv.ensemble.check( sim_data1, sim_data2, total_energy=False, filename=f"{plotfile}_{state1_index}_{state2_index}") quantiles[ f"state{state1_index}_state{state2_index}"] = quantiles_ij[ 0] except InputError: print( f"Insufficient overlap between trajectories for states {state1_index} and {state2_index}. Skipping..." ) elif pairs.lower() == 'all': # Run ensemble validation on all combinations of temperature pairs quantiles = {} for i in range(len(temperature_list)): for j in range(i + 1, len(temperature_list)): # Set SimulationData for physical validation state1_index = i state2_index = j sim_data1, sim_data2 = set_simulation_data( state_energies, T_array, state1_index, state2_index) # Run physical validation try: quantiles_ij = pv.ensemble.check( sim_data1, sim_data2, total_energy=False, filename=f"{plotfile}_{state1_index}_{state2_index}") quantiles[ f"state{state1_index}_state{state2_index}"] = quantiles_ij[ 0] except InputError: print( f"Insufficient overlap between trajectories for states {state1_index} and {state2_index}. Skipping..." ) return quantiles