def get_equilibration_points(df, column_name=None):
    """
    Directly uses pymbar's timeseries utility.
    Source: https://github.com/choderalab/pymbar
    https://pymbar.readthedocs.io/en/master/timeseries.html and references therein
    """
    equilibration_data = dict([])
    for name in df.columns.values:
        # t        - t_0, starting point of the equilibrated part of the series
        # g        - the statistical inefficiency = 2 * correlation time + 1
        # Neff_max - effective number of uncorrelated samples
        if isinstance(df.loc[0, name], complex):
            [t, g, Neff_max] = timeseries.detectEquilibration(
                np.array([x.real for x in df.loc[:, name]]))
            equilibration_data[name + "_real"] = [t, g, Neff_max]
            [t, g, Neff_max] = timeseries.detectEquilibration(
                np.array([x.imag for x in df.loc[:, name]]))
            equilibration_data[name + "_imag"] = [t, g, Neff_max]
        else:
            [t, g, Neff_max] = timeseries.detectEquilibration(df.loc[:, name])
            equilibration_data[name] = [t, g, Neff_max]
    print(equilibration_data)
    return equilibration_data
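A minimal usage sketch for the helper above. It assumes pymbar 3.x (where `timeseries.detectEquilibration` is available), `numpy` and `pandas` installed, and the function in scope with its module-level `np`/`timeseries` imports; the DataFrame and its column names are made up for illustration. As in the snippet itself, plain-float columns are handed to pymbar as pandas Series.

import numpy as np
import pandas as pd
from pymbar import timeseries

# Build a toy DataFrame with one real and one complex column.
rng = np.random.default_rng(0)
n = 500
drift = np.linspace(1.0, 0.0, n)                      # initial transient
df = pd.DataFrame({
    "energy": drift + rng.normal(scale=0.1, size=n),  # real-valued series
    "signal": (drift + rng.normal(scale=0.1, size=n)) +
              1j * rng.normal(scale=0.1, size=n),     # complex-valued series
})

eq = get_equilibration_points(df)
# eq maps "energy" -> [t0, g, Neff_max], and "signal_real"/"signal_imag" likewise.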
def test_detectEquil_constant_trailing():
    # This explicitly tests issue #122, see https://github.com/choderalab/pymbar/issues/122
    x = np.random.normal(size=100) * 0.01
    x[50:] = 3.0
    # The input data mimic an MCMC chain whose trailing end is a constant sequence.
    (t, g, Neff_max) = timeseries.detectEquilibration(x)
def is_converged(series: Series, frac_min=0.5):
    '''
    Determine whether a time series has converged or not

    Parameters
    ----------
    series : Series
        Time series
    frac_min : float
        Consider this time series converged only if the fraction of the
        converged part relative to the full series is larger than this threshold

    Returns
    -------
    converged : bool
        Converged or not
    when : float
        From when this time series converged
    '''
    from pymbar import timeseries

    n_points = len(series)
    array = np.array(series)
    t0, g, Neff_max = timeseries.detectEquilibration(array, nskip=max(1, n_points // 100))
    if t0 > n_points * (1 - frac_min):
        return False, series.index[t0]
    return True, series.index[t0]
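A quick usage sketch, assuming `pandas`, `numpy`, and pymbar 3.x are installed and the function above is in scope; the series is synthetic, and the exact detected index depends on the noise.

import numpy as np
import pandas as pd

# A series with a transient over the first 100 points, then stationary noise.
rng = np.random.default_rng(1)
values = np.concatenate([np.linspace(5.0, 0.0, 100), rng.normal(size=900)])
series = pd.Series(values, index=np.arange(1000) * 0.1)  # index in, say, ns

converged, when = is_converged(series, frac_min=0.5)
print(converged, when)  # expected: True, with `when` near the end of the transient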
def calc_df(u_kln):
    """
    u_kln should be (nstates) x (nstates) x (nframes), already normalized by kT,
    where each element is a config from frame `n` of a trajectory conducted with
    state `k`, with energy recalculated using the parameters of state `l`.
    """
    dims = u_kln.shape
    if dims[0] != dims[1]:
        raise ValueError(
            "dimensions {} of u_kln should be square in the first two indices".format(dims))
    nstates = dims[0]
    N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
    for k in range(nstates):
        [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
    # Compute free energy differences and statistical uncertainties
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()
    # save data?
    return DeltaF_ij, dDeltaF_ij
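A hedged usage sketch for `calc_df` with toy data. It assumes the same module-level `np`, `timeseries`, and `MBAR` names as the snippet, plus the older pymbar API the snippet targets, in which `getFreeEnergyDifferences()` returns three arrays. Because the toy reduced potentials at different states differ only by constants, the recovered free energy difference equals the offset difference.

import numpy as np

# Toy data: 3 states, 1000 frames; u_kln[k, l, n] is the reduced energy of
# frame n (sampled at state k) re-evaluated at state l.
rng = np.random.default_rng(2)
nstates, nframes = 3, 1000
offsets = np.array([0.0, 1.0, 2.0])  # per-state free energy offsets, in kT
u_kln = np.zeros((nstates, nstates, nframes))
for k in range(nstates):
    x = rng.normal(size=nframes)
    for l in range(nstates):
        u_kln[k, l, :] = 0.5 * x**2 + offsets[l]

DeltaF_ij, dDeltaF_ij = calc_df(u_kln)
print(DeltaF_ij[0, -1], "+/-", dDeltaF_ij[0, -1])  # should be close to 2.0 kT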
def get_decorrelated_samples(replica_positions, replica_energies, temperature_list):
    """
    Given a set of replica exchange trajectories, energies, and associated temperatures,
    this function returns decorrelated samples, as obtained from pymbar with
    timeseries.subsampleCorrelatedData.

    :param replica_positions: Positions array for the replica exchange data for which we will write PDB files
    :type replica_positions: `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ( np.array( [n_replicas,cgmodel.num_beads,3] ), simtk.unit )

    :param replica_energies: List of dimension num_replicas X simulation_steps, which gives the energies for all replicas at all simulation steps
    :type replica_energies: List( List( float * simtk.unit.energy for simulation_steps ) for num_replicas )

    :param temperature_list: List of temperatures for the simulation data.
    :type temperature_list: List( float * simtk.unit.temperature )

    :returns:
        - configurations ( List( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ (n_decorrelated_samples,cgmodel.num_beads,3), simtk.unit ) ) - A list of decorrelated samples
        - energies ( List( `Quantity() <http://docs.openmm.org/development/api-python/generated/simtk.unit.quantity.Quantity.html>`_ ) ) - The energies for the decorrelated samples (configurations)
    """
    all_poses = []
    all_energies = []

    for replica_index in range(len(replica_positions)):
        energies = replica_energies[replica_index][replica_index]
        [t0, g, Neff_max] = timeseries.detectEquilibration(energies)
        energies_equil = energies[t0:]
        poses_equil = replica_positions[replica_index][t0:]
        indices = timeseries.subsampleCorrelatedData(energies_equil)
        for index in indices:
            all_energies.append(energies_equil[index])
            all_poses.append(poses_equil[index])

    all_energies = np.array([float(energy) for energy in all_energies])

    return (all_poses, all_energies)
def collatedata(dictionary):
    # Read in the large data dictionaries, estimate the equilibrated ("final")
    # value (U or N) for each Markov chain and simulation step, then average
    # those estimates across parallel Markov chains.
    simsteps = list(dictionary.keys())
    print(simsteps)
    timesteps = list(dictionary[simsteps[0]].keys())
    print(timesteps)
    parallelsims = len(dictionary[simsteps[0]][timesteps[0]])
    print(dictionary[simsteps[0]][timesteps[0]])
    resultstot = {}
    for t in simsteps:
        resultstot[t] = {'Ave': [], 'Std': []}
    for q in range(parallelsims):
        print(q)
        for r in simsteps:
            placeholder = []
            for s in timesteps:
                placeholder.append(dictionary[r][s][q])
            datalist = np.asarray(placeholder, dtype=float)
            [t0, g, Neff_max] = timeseries.detectEquilibration(datalist)
            avg = np.mean(datalist[t0:])
            sdv = np.std(datalist[t0:])
            resultstot[r]['Ave'].append(round(avg, 3))
            resultstot[r]['Std'].append(round(sdv, 3))
    return resultstot
def calcTension(energy_data, verbose=False):
    dE1 = energy_data[:, 1] - energy_data[:, 0]
    dE2 = energy_data[:, 2] - energy_data[:, 0]
    BdE1 = dE1 / kTkJmol
    BdE2 = dE2 / kTkJmol

    nstates = 2
    nframes = len(dE1)
    u_kln = np.zeros([nstates, nstates, nframes], np.float64)
    u_kln[0, 1, :] = BdE1
    u_kln[1, 0, :] = BdE2

    N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
    for k in range(nstates):
        [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
    if verbose:
        print("...found {} uncorrelated samples out of {} total samples...".format(N_k, nframes))

    if verbose:
        print("=== Computing free energy differences ===")
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()

    # Tension in J/m^2; note da already has a factor of two for the two areas!
    tension = DeltaF_ij[0, 1] / da * 1e18 * kT
    tensionError = dDeltaF_ij[0, 1] / da * 1e18 * kT
    if verbose:
        print('tension (pymbar): {} +/- {} N/m'.format(tension, tensionError))
    return tension, tensionError
def production(self):
    if os.path.exists(self.production_dcd_filename) and os.path.exists(self.production_data_filename):
        return

    prmtop = app.AmberPrmtopFile(self.prmtop_filename)
    pdb = app.PDBFile(self.equil_pdb_filename)

    system = prmtop.createSystem(nonbondedMethod=app.PME, nonbondedCutoff=CUTOFF, constraints=app.HBonds)

    integrator = mm.LangevinIntegrator(self.temperature, FRICTION, TIMESTEP)
    system.addForce(mm.MonteCarloBarostat(PRESSURE, self.temperature, BAROSTAT_FREQUENCY))

    simulation = app.Simulation(prmtop.topology, system, integrator)
    simulation.context.setPositions(pdb.positions)
    simulation.context.setPeriodicBoxVectors(*pdb.topology.getPeriodicBoxVectors())
    simulation.context.setVelocitiesToTemperature(self.temperature)

    print('Production.')
    simulation.reporters.append(app.DCDReporter(self.production_dcd_filename, OUTPUT_FREQUENCY))
    simulation.reporters.append(app.StateDataReporter(self.production_data_filename, OUTPUT_DATA_FREQUENCY,
                                                      step=True, potentialEnergy=True, temperature=True, density=True))

    converged = False
    while not converged:
        simulation.step(N_STEPS)
        d = pd.read_csv(self.production_data_filename, names=["step", "U", "Temperature", "Density"], skiprows=1)
        density_ts = np.array(d.Density)
        [t0, g, Neff] = ts.detectEquilibration(density_ts, nskip=1000)
        density_ts = density_ts[t0:]
        density_mean_stderr = density_ts.std() / np.sqrt(Neff)
        if density_mean_stderr < STD_ERROR_TOLERANCE:
            converged = True
def compute_timeseries(reduced_potentials):
    """
    Use pymbar timeseries to compute the uncorrelated samples in an array of
    reduced potentials. Returns the uncorrelated sample indices.

    Arguments
    ---------
    reduced_potentials : np.array of floats
        reduced potentials from which a timeseries is to be extracted

    Returns
    -------
    t0 : int
        production region index
    g : float
        statistical inefficiency
    Neff_max : int
        effective number of samples in production region
    A_t : np.array
        the equilibrated, decorrelated samples
    full_uncorrelated_indices : list of ints
        uncorrelated indices
    """
    from pymbar import timeseries

    t0, g, Neff_max = timeseries.detectEquilibration(reduced_potentials)  # compute indices of uncorrelated timeseries
    A_t_equil = reduced_potentials[t0:]
    uncorrelated_indices = timeseries.subsampleCorrelatedData(A_t_equil, g=g)
    A_t = A_t_equil[uncorrelated_indices]
    full_uncorrelated_indices = [i + t0 for i in uncorrelated_indices]

    return [t0, g, Neff_max, A_t, full_uncorrelated_indices]
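A usage sketch on synthetic reduced potentials, assuming pymbar 3.x and the function above in scope; the exact numbers depend on the noise.

import numpy as np

# Synthetic reduced potentials: a short transient followed by correlated noise.
rng = np.random.default_rng(3)
noise = rng.normal(size=2000)
correlated = np.convolve(noise, np.ones(10) / 10.0, mode="same")  # smoothing introduces correlation
reduced_potentials = np.concatenate([np.linspace(3.0, 0.0, 200), np.zeros(1800)]) + correlated

t0, g, Neff_max, A_t, idx = compute_timeseries(reduced_potentials)
print(f"t0={t0}, g={g:.1f}, Neff={Neff_max:.0f}, kept {len(idx)} frames")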
def equilibrium_detection(df, series=None, lower=None, upper=None, step=None):
    """Subsample a DataFrame using automated equilibrium detection on a timeseries.

    If `series` is ``None``, then this function will behave the same as
    :func:`slicing`.

    Parameters
    ----------
    df : DataFrame
        DataFrame to subsample according to equilibrium detection on `series`.
    series : Series
        Series to detect equilibration on. If ``None``, no equilibrium
        detection-based subsampling will be performed.
    lower : float
        Lower bound to pre-slice `series` data from.
    upper : float
        Upper bound to pre-slice `series` to (inclusive).
    step : int
        Step between `series` items to pre-slice by.

    Returns
    -------
    DataFrame
        `df` subsampled according to subsampled `series`.

    See Also
    --------
    pymbar.timeseries.detectEquilibration : detailed background

    """
    if _check_multiple_times(df):
        raise KeyError("Duplicate time values found; equilibrium detection "
                       "is only meaningful for a single, contiguous, "
                       "and sorted timeseries.")

    if not _check_sorted(df):
        raise KeyError("Equilibrium detection only works as expected if "
                       "values are sorted by time, increasing.")

    if series is not None:
        series = slicing(series, lower=lower, upper=upper, step=step)

        # calculate statistical inefficiency of series, with equilibrium detection
        t, statinef, Neff_max = detectEquilibration(series.values)

        # round the statistical inefficiency to the nearest integer
        statinef = int(np.rint(statinef))

        # subsample according to statistical inefficiency
        series = series.iloc[t::statinef]

        df = df.loc[series.index]
    else:
        df = slicing(df, lower=lower, upper=upper, step=step)

    return df
def get_equilibration_data(timeseries_to_analyze):
    """
    Detect the equilibration point of a timeseries.

    See the ``pymbar.timeseries.detectEquilibration`` function for full documentation
    """
    [n_equilibration, g_t, n_effective_max] = timeseries.detectEquilibration(timeseries_to_analyze)
    return n_equilibration, g_t, n_effective_max
def is_equilibrated(data, threshold_fraction=0.50, threshold_neff=50, nskip=1):
    """Check if a dataset is equilibrated based on a fraction of equil data.

    Using the `pymbar.timeseries` module, check if a timeseries dataset has
    enough equilibrated data based on two threshold values. The
    threshold_fraction value translates to the fraction of total data from
    the dataset 'a_t' that can be thought of as being in the 'production'
    region. The threshold_neff is the minimum number of effectively
    uncorrelated samples 'a_t' must contain to be considered equilibrated.

    The `pymbar.timeseries` module returns the starting index of the
    'production' region from 'a_t'. The fraction of 'production' data is
    then compared to the threshold value. If the fraction of 'production'
    data is >= threshold_fraction, this will return [True, t0, g, Neff],
    and [False, None, None, None] otherwise.

    Parameters
    ----------
    data : numpy.typing.ArrayLike
        1-D time dependent data to check for equilibration.
    threshold_fraction : float, optional, default=0.50
        Fraction of data expected to be equilibrated.
    threshold_neff : int, optional, default=50
        Minimum number of effectively uncorrelated samples required to
        consider a_t 'equilibrated'.
    nskip : int, optional, default=1
        Since the statistical inefficiency is computed for every time origin
        in a call to timeseries.detectEquilibration, for larger datasets
        (> a few hundred), increasing nskip might speed this up, while
        discarding more data.

    Returns
    -------
    list : [True, t0, g, Neff]
        If the data set is considered properly equilibrated
    list : [False, None, None, None]
        If the data set is not considered properly equilibrated
    """
    if threshold_fraction < 0.0 or threshold_fraction > 1.0:
        raise ValueError(
            f"Passed 'threshold_fraction' value: {threshold_fraction}, "
            "expected value between 0.0-1.0."
        )
    threshold_neff = int(threshold_neff)
    if threshold_neff < 1:
        raise ValueError(
            f"Passed 'threshold_neff' value: {threshold_neff}, expected value "
            "1 or greater."
        )
    [t0, g, Neff] = timeseries.detectEquilibration(data, nskip=nskip)
    frac_equilibrated = 1.0 - (t0 / np.shape(data)[0])

    if (frac_equilibrated >= threshold_fraction) and (Neff >= threshold_neff):
        return [True, t0, g, Neff]
    else:
        return [False, None, None, None]
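A short, illustrative check of `is_equilibrated` on synthetic data, assuming `numpy` and the snippet's module-level `timeseries` import; the exact t0/g/Neff values depend on the noise realization.

import numpy as np

# Equilibrated case: pure noise, so nearly all of the data is production.
rng = np.random.default_rng(4)
flat = rng.normal(size=1000)
print(is_equilibrated(flat))                              # expected: [True, t0, g, Neff]

# Unequilibrated case: a long drift dominates the series.
drifting = np.concatenate([np.linspace(10, 0, 900), rng.normal(size=100)])
print(is_equilibrated(drifting, threshold_fraction=0.5))  # expected: [False, None, None, None]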
from typing import Tuple

def is_converged(series: Series, frac_min=0.5) -> Tuple[bool, float]:
    from pymbar import timeseries

    n_points = len(series)
    array = np.array(series)
    t0, g, Neff_max = timeseries.detectEquilibration(array, nskip=max(1, n_points // 100))
    if t0 > n_points * (1 - frac_min):
        return False, series.index[t0]
    return True, series.index[t0]
def subsample(enthalpies):
    """
    Subsamples the enthalpies using John Chodera's code.
    This is probably better than the simple cutoff we normally use.
    Returns the indices of uncorrelated samples; the indices are relative
    to the equilibrated portion enthalpies[t0:], and the input is not modified.
    """
    # Use automatic equilibration detection and pymbar.timeseries to subsample
    [t0, g, Neff_max] = timeseries.detectEquilibration(enthalpies)
    enthalpies = enthalpies[t0:]
    return timeseries.subsampleCorrelatedData(enthalpies, g=g)
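A usage sketch, assuming pymbar 3.x and the function above in scope. Note the caveat from the docstring: the returned indices point into the equilibrated slice, so the same cut must be re-applied to recover values.

import numpy as np
from pymbar import timeseries

rng = np.random.default_rng(5)
enthalpies = np.concatenate([np.linspace(2.0, 0.0, 100), rng.normal(size=900)])

indices = subsample(enthalpies)
# The indices are relative to enthalpies[t0:], so re-apply the equilibration cut:
t0, g, _ = timeseries.detectEquilibration(enthalpies)
decorrelated = enthalpies[t0:][indices]
print(len(decorrelated), "uncorrelated samples")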
def production(self):
    utils.make_path('production/')
    self.production_dcd_filename = "production/" + self.identifier + "_production.dcd"
    self.production_pdb_filename = "production/" + self.identifier + "_production.pdb"
    self.production_data_filename = "production/" + self.identifier + "_production.csv"

    utils.make_path(self.production_dcd_filename)

    if os.path.exists(self.production_pdb_filename):
        return

    if self.ran_equilibrate:
        pdb = app.PDBFile(self.equil_pdb_filename)
        topology = pdb.topology
        positions = pdb.positions
    else:
        positions = self.packed_trj.openmm_positions(0)
        topology = self.packed_trj.top.to_openmm()
        topology.setUnitCellDimensions(mm.Vec3(*self.packed_trj.unitcell_lengths[0]) * u.nanometer)

    ff = self.ffxml

    system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds)

    integrator = mm.LangevinIntegrator(self.temperature, self.friction, self.timestep)
    system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency))

    simulation = app.Simulation(topology, system, integrator)
    simulation.context.setPositions(positions)

    if not self.ran_equilibrate:
        print('Minimizing.')
        simulation.minimizeEnergy()

    simulation.context.setVelocitiesToTemperature(self.temperature)

    print('Production.')
    simulation.reporters.append(app.DCDReporter(self.production_dcd_filename, self.output_frequency))
    simulation.reporters.append(app.StateDataReporter(self.production_data_filename, self.output_data_frequency,
                                                      step=True, potentialEnergy=True, temperature=True, density=True))

    converged = False
    while not converged:
        simulation.step(self.n_steps)
        d = pd.read_csv(self.production_data_filename, names=["step", "U", "Temperature", "Density"], skiprows=1)
        density_ts = np.array(d.Density)
        [t0, g, Neff] = ts.detectEquilibration(density_ts, nskip=1000)
        density_ts = density_ts[t0:]
        density_mean_stderr = density_ts.std() / np.sqrt(Neff)
        if density_mean_stderr < self.stderr_tolerance:
            converged = True

    del simulation
    if self.ran_equilibrate:
        traj = md.load(self.production_dcd_filename, top=self.equil_pdb_filename)[-1]
    else:
        traj = md.load(self.production_dcd_filename, top=self.box_pdb_filename)[-1]
    traj.save(self.production_pdb_filename)
def calc_statistics(_data):
    t0, g, Neff = timeseries.detectEquilibration(_data)
    data_equil = _data[t0:]
    indices_subsampled = timeseries.subsampleCorrelatedData(data_equil, g=g)
    sub_data = data_equil[indices_subsampled]

    avg = sub_data.mean()
    std = sub_data.std()
    err = sub_data.std() / np.sqrt(len(indices_subsampled))

    summary = [avg, std, err, t0, g, Neff]
    return summary
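A small usage sketch for `calc_statistics`, assuming `numpy` and the snippet's module-level `timeseries` import; the data are synthetic.

import numpy as np

rng = np.random.default_rng(6)
data = np.concatenate([np.linspace(1.0, 0.0, 50), rng.normal(scale=0.2, size=950)])

avg, std, err, t0, g, Neff = calc_statistics(data)
print(f"mean = {avg:.3f} +/- {err:.3f} (t0={t0}, g={g:.1f}, Neff={Neff:.0f})")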
def _construct_decorrelation_mask(self, sim_collection, rep, skip):
    enes = sim_collection.reps_energies[rep]
    ops = sim_collection.reps_order_params[rep]
    steps = enes.steps
    rpots = utility.calc_reduced_potentials(enes, ops, sim_collection.conditions)
    start_i, g, Neff = timeseries.detectEquilibration(rpots, nskip=skip)
    template = '{:<8} {:<8} {:<3} {:<4.1f} {:<.1f}'
    print(template.format(sim_collection.conditions.fileformat, steps, start_i, g, Neff))
    indices = timeseries.subsampleCorrelatedData(rpots[start_i:], g=skip * g)
    return [i + start_i for i in indices]
def decorrelate_data(self, dHdl_data=None, u_nk_data=None):
    dHdl, u_nk = [], []
    if dHdl_data is not None:
        dHdl.append(equilibrium_detection(dHdl_data, dHdl_data.iloc[:, 0]))
        # logger(f'Subsampling dHdl data of the {ordinal(self.n_state)} state ...')
        _, g1, _ = detectEquilibration(dHdl_data.iloc[:, 0].values)
        dHdl = pd.concat(dHdl)
        setattr(dHdl, 'statineff', g1)
    if u_nk_data is not None:
        u_nk.append(equilibrium_detection(u_nk_data, u_nk_data.iloc[:, 0]))
        # logger(f'Subsampling u_nk data of the {ordinal(self.n_state)} state ...\n')
        t2, g2, N2 = detectEquilibration(u_nk_data.iloc[:, 0].values)
        u_nk = pd.concat(u_nk)
        setattr(u_nk, 'statineff', g2)

    logger("Data preprocessing completed!\n")
    if os.path.isfile('temporary.xvg'):
        os.remove('temporary.xvg')

    return dHdl, u_nk
def compute_timeseries(reduced_potentials: np.ndarray) -> list:
    """
    Use pymbar timeseries to compute the uncorrelated samples in an array of
    reduced potentials. Returns the uncorrelated sample indices.
    """
    from pymbar import timeseries

    t0, g, Neff_max = timeseries.detectEquilibration(reduced_potentials)  # compute indices of uncorrelated timeseries
    A_t_equil = reduced_potentials[t0:]
    uncorrelated_indices = timeseries.subsampleCorrelatedData(A_t_equil, g=g)
    A_t = A_t_equil[uncorrelated_indices]
    full_uncorrelated_indices = [i + t0 for i in uncorrelated_indices]

    return [t0, g, Neff_max, A_t, full_uncorrelated_indices]
def _calc_stat_neff(self):
    """ Estimate the statistical inefficiency of the salt occupancy. """
    stat_ineff = []
    for counts in self.nsalt:
        t, g, Neff = ts.detectEquilibration(counts, fast=True)
        stat_ineff.append(g)
    stat_ineff = np.array(stat_ineff)
    # detectEquilibration returns a statistical inefficiency of 1.0 when there
    # were no acceptances; mark those entries as effectively infinite instead.
    stat_ineff[np.where(stat_ineff == 1.0)] = np.inf
    return stat_ineff
def gather_dg(self, u_kln, nstates):
    # Subsample data to extract uncorrelated equilibrium timeseries
    N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
    for k in range(nstates):
        [_, g, __] = timeseries.detectEquilibration(u_kln[k, k, :])
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
    # Compute free energy differences and statistical uncertainties
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, _] = mbar.getFreeEnergyDifferences()
    print("Number of uncorrelated samples per state: {}".format(N_k))
    return DeltaF_ij, dDeltaF_ij
def get_stats(data):
    """
    Later, this can be generalized to use one column for decorrelating and
    getting reference indices.
    """
    [t0, g, Neff] = timeseries.detectEquilibration(data)
    data_equil = data[t0:]
    indices = timeseries.subsampleCorrelatedData(data_equil, g=g)
    sub_data = data_equil[indices]

    avg = sub_data.mean()
    std = sub_data.std()
    err = sub_data.std() / np.sqrt(len(indices))

    return avg, std, err, t0, g, Neff, sub_data
def test_analyze_time_series():
    """Compare the output of the ``analyze_time_series`` utility with ``pymbar``."""
    np.random.seed(4)

    random_array = np.random.rand(10)
    statistics = analyze_time_series(random_array, minimum_samples=3)

    expected_index, expected_value, _ = detectEquilibration(random_array, fast=False)

    assert expected_index == statistics.equilibration_index
    assert np.isclose(statistics.statistical_inefficiency, expected_value)
    assert statistics.n_total_points == 10
    assert 0 < statistics.n_uncorrelated_points <= 10
    assert 0 <= statistics.equilibration_index < 10
def production(in_top, in_pdb, out_dcd, out_csv, temperature):
    temperature = temperature * u.kelvin  # TODO: recycle John's simtk.unit parser

    pdb = app.PDBFile(in_pdb)

    top = app.GromacsTopFile(in_top)
    top.topology.setPeriodicBoxVectors(pdb.topology.getPeriodicBoxVectors())

    system = top.createSystem(nonbondedMethod=app.PME, nonbondedCutoff=CUTOFF, constraints=app.HBonds)

    integrator = mm.LangevinIntegrator(temperature, FRICTION, TIMESTEP)
    system.addForce(mm.MonteCarloBarostat(PRESSURE, temperature, BAROSTAT_FREQUENCY))

    simulation = app.Simulation(top.topology, system, integrator)
    simulation.context.setPositions(pdb.positions)
    simulation.context.setPeriodicBoxVectors(*pdb.topology.getPeriodicBoxVectors())
    simulation.context.setVelocitiesToTemperature(temperature)

    print('Production.')
    simulation.reporters.append(app.DCDReporter(out_dcd, OUTPUT_FREQUENCY))
    simulation.reporters.append(app.StateDataReporter(out_csv, OUTPUT_DATA_FREQUENCY,
                                                      step=True, potentialEnergy=True, temperature=True, density=True))

    converged = False
    while not converged:
        simulation.step(N_STEPS)
        d = pd.read_csv(out_csv, names=["step", "U", "Temperature", "Density"], skiprows=1)
        density_ts = np.array(d.Density)
        [t0, g, Neff] = ts.detectEquilibration(density_ts, nskip=1000)
        density_ts = density_ts[t0:]
        density_mean_stderr = density_ts.std() / np.sqrt(Neff)
        if density_mean_stderr < STD_ERROR_TOLERANCE:
            converged = True
def find_equilibrium(all_timeseries, column_name=None):
    """
    Directly uses pymbar's timeseries utility.
    Source: https://github.com/choderalab/pymbar
    https://pymbar.readthedocs.io/en/master/timeseries.html and references therein
    """
    equilibration_data = dict([])
    for name in all_timeseries:
        [t, g, Neff_max] = timeseries.detectEquilibration(np.array(all_timeseries[name]))
        # t        - t_0, starting point of the equilibrated part of the series
        # g        - the statistical inefficiency = 2 * correlation time + 1
        # Neff_max - effective number of uncorrelated samples
        equilibration_data[name] = [t, g, Neff_max]
    return equilibration_data
def read_concentration(files, discard=10, fast=False):
    """
    Calculate the mean concentration and standard error from numerous
    simulations, where each simulation has a fixed chemical potential.
    Timeseries analysis is used to determine equilibrium properties.

    Parameters
    ----------
    files: list of str
        the path to each results file that will be analysed.
    discard: int
        the initial amount of data to throw away
    fast: bool
        whether to perform the fast variant of the time series analysis
    """
    concentration = np.zeros(len(files))
    standard_error = np.zeros(len(files))
    delta_mu = np.zeros(len(files))
    lower = np.zeros(len(files))
    upper = np.zeros(len(files))
    for i in range(len(files)):
        ncfile = Dataset(files[i], 'r')
        volume = ncfile.groups['Sample state data']['volume'][:]
        nsalt = np.min(ncfile.groups['Sample state data']['species counts'][:, 1:2], axis=1)
        delta_mu[i] = ncfile.groups['Control parameters']['delta_chem'][0]
        ncfile.close()
        # Get the concentration in molarity
        c = 1.0 * nsalt / volume * 1.66054
        # Estimate the mean and standard error with timeseries analysis
        t_equil, stat_ineff, n_eff = timeseries.detectEquilibration(c[discard:], fast=fast)
        c_equil = c[(discard + t_equil):]
        concentration[i] = np.mean(c_equil)
        independent_inds = timeseries.subsampleCorrelatedData(c_equil, g=stat_ineff, conservative=True)
        mu_samps = misc_tools.bootstrap_estimates(c_equil[independent_inds])
        lower[i] = np.percentile(mu_samps, 2.5)
        upper[i] = np.percentile(mu_samps, 97.5)
        standard_error[i] = mu_samps.std()
    return concentration, standard_error, delta_mu, lower, upper
def equilibrate(traj, verbose=False, name=None):
    traj = np.array(traj)
    if traj.ndim == 1:
        t0, g, n_eff = timeseries.detectEquilibration(traj)
        if t0 == 0 and traj.size > 10:
            # See https://github.com/choderalab/pymbar/issues/277
            t0x, gx, n_effx = timeseries.detectEquilibration(traj[10:])
            if t0x != 0:
                t0 = t0x + 10
        n = traj.size
        res = traj[t0:]
    elif traj.ndim == 2 and traj.shape[0] == 2:
        t01, g1, n_eff1 = timeseries.detectEquilibration(traj[0])
        t02, g2, n_eff2 = timeseries.detectEquilibration(traj[1])
        t0 = max(t01, t02)
        if t0 == 0 and traj.shape[1] > 10:
            # See https://github.com/choderalab/pymbar/issues/277
            t01x, g1x, n_eff1x = timeseries.detectEquilibration(traj[0, 10:])
            t02x, g2x, n_eff2x = timeseries.detectEquilibration(traj[1, 10:])
            t0x = max(t01x, t02x)
            if t0x != 0:
                t0 = t0x + 10
        n = traj.shape[1]
        res = traj[:, t0:]
    elif traj.ndim == 2:
        raise NotImplementedError(
            'trajectory.equilibrate() in 2 dimensions is only '
            'implemented for exactly two timeseries.')
    else:
        raise NotImplementedError(
            'trajectory.equilibrate() is not implemented for '
            'trajectories with more than 2 dimensions.')

    if verbose:
        if not name:
            name = 'Trajectory'
        if t0 == 0:
            print('{:s} equilibration: No frames discarded for burn-in.'.format(name))
        elif t0 == 1:
            print('{:s} equilibration: First frame ({:.1%} of '
                  'trajectory) discarded for burn-in.'.format(name, 1 / n))
        else:
            print('{:s} equilibration: First {:d} frames ({:.1%} of '
                  'trajectory) discarded for burn-in.'.format(name, t0, t0 / n))

    return res
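A usage sketch for both branches of `equilibrate`, assuming the defining module imports `timeseries` from pymbar as the snippet implies; the trajectories are synthetic.

import numpy as np

rng = np.random.default_rng(7)
# One-dimensional trajectory with a burn-in transient.
traj_1d = np.concatenate([np.linspace(4.0, 0.0, 50), rng.normal(size=450)])
res = equilibrate(traj_1d, verbose=True, name='Potential energy')

# Two coupled timeseries (e.g. volume and energy) trimmed to a common burn-in.
traj_2d = np.vstack([traj_1d, traj_1d + rng.normal(scale=0.1, size=500)])
res_2d = equilibrate(traj_2d, verbose=True, name='Volume/energy')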
def __init__(self,
             ani_model: AlchemicalANI,
             ani_trajs: list,
             potential_energy_trajs: list,
             lambdas,
             max_snapshots_per_window=50,
             ):
    K = len(lambdas)
    assert (len(ani_trajs) == K)
    assert (len(potential_energy_trajs) == K)

    self.ani_model = ani_model
    self.ani_trajs = ani_trajs
    self.potential_energy_trajs = potential_energy_trajs
    self.lambdas = lambdas

    # Thin each trajectory based on automatic equilibration detection
    N_k = []
    snapshots = []
    for i in range(K):
        traj = self.ani_trajs[i]

        equil, g = detectEquilibration(self.potential_energy_trajs[i])[:2]
        thinning = int(g)
        if len(traj[equil::thinning]) > max_snapshots_per_window:
            # what thinning will give me len(traj[equil::thinning]) == max_snapshots_per_window?
            thinning = int((len(traj) - equil) / max_snapshots_per_window)

        new_snapshots = list(traj[equil::thinning].xyz * unit.nanometer)[:max_snapshots_per_window]
        N_k.append(len(new_snapshots))
        snapshots.extend(new_snapshots)

    self.snapshots = snapshots

    N = len(snapshots)
    u_kn = np.zeros((K, N))
    for k in range(K):
        lamb = lambdas[k]
        self.ani_model.lambda_value = lamb
        for n in range(N):
            u_kn[k, n] = self.ani_model.calculate_energy(snapshots[n]) / kT

    self.mbar = MBAR(u_kn, N_k)
def test_compare_detectEquil(show_hist=False):
    """
    compare detectEquilibration implementations (with and without binary search + fft)
    """
    t_res = []
    N = 100
    for _ in range(100):
        A_t = testsystems.correlated_timeseries_example(N=N, tau=5.0) + 2.0
        B_t = testsystems.correlated_timeseries_example(N=N, tau=5.0) + 1.0
        C_t = testsystems.correlated_timeseries_example(N=N * 2, tau=5.0)
        # concatenate and add a flat region to one end (common in MC data)
        D_t = np.concatenate([A_t, B_t, C_t, np.zeros(20)])
        bs_de = timeseries.detectEquilibration_binary_search(D_t, bs_nodes=10)
        std_de = timeseries.detectEquilibration(D_t, fast=False, nskip=1)
        t_res.append(bs_de[0] - std_de[0])
    t_res_mode = float(stats.mode(t_res)[0][0])
    eq(t_res_mode, 0., decimal=1)
    if show_hist:
        import matplotlib.pyplot as plt
        plt.hist(t_res)
        plt.show()
def test_compare_detectEquil(show_hist=False):
    """
    compare detectEquilibration implementations (with and without binary search + fft)
    """
    t_res = []
    N = 100
    for _ in range(100):
        A_t = testsystems.correlated_timeseries_example(N=N, tau=5.0) + 2.0
        B_t = testsystems.correlated_timeseries_example(N=N, tau=5.0) + 1.0
        C_t = testsystems.correlated_timeseries_example(N=N * 2, tau=5.0)
        D_t = np.concatenate([A_t, B_t, C_t])
        bs_de = timeseries.detectEquilibration_binary_search(D_t, bs_nodes=10)
        std_de = timeseries.detectEquilibration(D_t, fast=False, nskip=1)
        t_res.append(bs_de[0] - std_de[0])
    t_res_mode = float(stats.mode(t_res)[0][0])
    eq(t_res_mode, 0., decimal=1)
    if show_hist:
        import matplotlib.pyplot as plt
        plt.hist(t_res)
        plt.show()
def individual_analysis_procedure(temperature):
    ###
    #
    # This subroutine analyzes a timeseries for 'temperature',
    # and generates a set of decorrelated sample energies and distances,
    # which are used in later sampling to generate a free energy surface.
    #
    ###
    # Equivalent to: skip only if we search for existing data and it is already present.
    if not search_for_existing_data or not os.path.exists(
            str(output_dir + str(temperature) + "/uncorrelated_distances.dat")):
        output_obj = open(str(output_dir + str(temperature) + "/sim_data.dat"), 'r')
        # E_total_all_temp temporarily stores the total energies from the NaCl simulation output
        E_total_all_temp = np.array([l.split(',')[3] for l in output_obj.readlines()])
        output_obj.close()
        # Read in the distances
        distances = util.get_distances(str(output_dir + str(temperature) + "/coordinates.pdb"), simulation_steps)
        # E_total_all stores the total energies from the NaCl simulation output, after re-typing
        E_total_all = np.array(np.delete(E_total_all_temp, 0, 0), dtype=float)
        # Identify the equilibration time (t0) and statistical inefficiency (g)
        [t0, g, Neff_max] = timeseries.detectEquilibration(E_total_all, nskip=nskip)
        # Using the index for the equilibration time (t0), truncate the time-series data before this index
        E_total_equil = E_total_all[t0:]
        # Determine indices of uncorrelated samples
        uncorrelated_energy_indices = timeseries.subsampleCorrelatedData(E_total_equil, g=g)
        # Write uncorrelated total energies to file
        np.savetxt(str(output_dir + str(temperature) + '/uncorrelated_total_energies.dat'),
                   E_total_equil[uncorrelated_energy_indices])
        # Write uncorrelated Na-Cl distances to file; the same equilibration cut is
        # applied so the indices align with the equilibrated energies
        np.savetxt(str(output_dir + str(temperature) + '/uncorrelated_distances.dat'),
                   distances[t0:][uncorrelated_energy_indices])
    return
def gather_dg(self, u_kln, nstates):
    u_kln = np.vstack(u_kln)
    # Subsample data to extract uncorrelated equilibrium timeseries
    N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
    for k in range(nstates):
        [_, g, __] = timeseries.detectEquilibration(u_kln[k, k, :])
        indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
        N_k[k] = len(indices)
        u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T
    # Compute free energy differences and statistical uncertainties
    mbar = MBAR(u_kln, N_k)
    [DeltaF_ij, dDeltaF_ij, _] = mbar.getFreeEnergyDifferences()
    logger.debug("Number of uncorrelated samples per state: {}".format(N_k))
    logger.debug("Relative free energy change for {0} = {1} +- {2}".format(
        self.name, DeltaF_ij[0, nstates - 1] * self.kTtokcal, dDeltaF_ij[0, nstates - 1] * self.kTtokcal))
    return DeltaF_ij[0, nstates - 1] * self.kTtokcal, dDeltaF_ij[0, nstates - 1] * self.kTtokcal
def run_endpoint_perturbation(lambda_thermodynamic_state, nonalchemical_thermodynamic_state,
                              initial_hybrid_sampler_state, mc_move, n_iterations, factory,
                              lambda_index=0, print_work=False, write_system=False,
                              write_state=False, write_trajectories=False):
    """
    Parameters
    ----------
    lambda_thermodynamic_state : ThermodynamicState
        The thermodynamic state corresponding to the hybrid system at a lambda endpoint
    nonalchemical_thermodynamic_state : ThermodynamicState
        The nonalchemical thermodynamic state for the relevant endpoint
    initial_hybrid_sampler_state : SamplerState
        Starting positions for the sampler. Must be compatible with lambda_thermodynamic_state
    mc_move : MCMCMove
        The MCMove that will be used for sampling at the lambda endpoint
    n_iterations : int
        The number of iterations
    factory : HybridTopologyFactory
        The hybrid topology factory
    lambda_index : int, optional, default=0
        The index, 0 or 1, at which to retrieve nonalchemical positions
    print_work : bool, optional, default=False
        If True, will print work values
    write_system : bool, optional, default=False
        If True, will write alchemical and nonalchemical System XML files
    write_state : bool, optional, default=False
        If True, write alchemical (hybrid) State XML files each iteration
    write_trajectories : bool, optional, default=False
        If True, will write trajectories

    Returns
    -------
    df : float
        Free energy difference between alchemical and nonalchemical systems, estimated with EXP
    ddf : float
        Standard deviation of estimate, corrected for correlation, from EXP estimator.
    """
    import mdtraj as md

    # Run an initial minimization:
    mcmc_sampler = mcmc.MCMCSampler(lambda_thermodynamic_state, initial_hybrid_sampler_state, mc_move)
    mcmc_sampler.minimize(max_iterations=20)
    new_sampler_state = mcmc_sampler.sampler_state

    if write_system:
        with open(f'hybrid{lambda_index}-system.xml', 'w') as outfile:
            outfile.write(openmm.XmlSerializer.serialize(lambda_thermodynamic_state.system))
        with open(f'nonalchemical{lambda_index}-system.xml', 'w') as outfile:
            outfile.write(openmm.XmlSerializer.serialize(nonalchemical_thermodynamic_state.system))

    # Initialize work array
    w = np.zeros([n_iterations])
    non_potential = np.zeros([n_iterations])
    hybrid_potential = np.zeros([n_iterations])

    # Run n_iterations of the endpoint perturbation:
    hybrid_trajectory = unit.Quantity(
        np.zeros([n_iterations, lambda_thermodynamic_state.system.getNumParticles(), 3]),
        unit.nanometers)  # DEBUG
    nonalchemical_trajectory = unit.Quantity(
        np.zeros([n_iterations, nonalchemical_thermodynamic_state.system.getNumParticles(), 3]),
        unit.nanometers)  # DEBUG
    for iteration in range(n_iterations):
        # Generate a new sampler state for the hybrid system
        mc_move.apply(lambda_thermodynamic_state, new_sampler_state)

        # Compute the hybrid reduced potential at the new sampler state
        hybrid_context, integrator = cache.global_context_cache.get_context(lambda_thermodynamic_state)
        new_sampler_state.apply_to_context(hybrid_context, ignore_velocities=True)
        hybrid_reduced_potential = lambda_thermodynamic_state.reduced_potential(hybrid_context)

        if write_state:
            state = hybrid_context.getState(getPositions=True, getParameters=True)
            state_xml = openmm.XmlSerializer.serialize(state)
            with open(f'state{iteration}_l{lambda_index}.xml', 'w') as outfile:
                outfile.write(state_xml)

        # Construct a sampler state for the nonalchemical system
        if lambda_index == 0:
            nonalchemical_positions = factory.old_positions(new_sampler_state.positions)
        elif lambda_index == 1:
            nonalchemical_positions = factory.new_positions(new_sampler_state.positions)
        else:
            raise ValueError("The lambda index needs to be either one or zero for this to be meaningful")
        nonalchemical_sampler_state = SamplerState(nonalchemical_positions, box_vectors=new_sampler_state.box_vectors)

        if write_trajectories:
            state = hybrid_context.getState(getPositions=True)
            hybrid_trajectory[iteration, :, :] = state.getPositions(asNumpy=True)
            nonalchemical_trajectory[iteration, :, :] = nonalchemical_positions

        # Compute the nonalchemical reduced potential
        nonalchemical_context, integrator = cache.global_context_cache.get_context(nonalchemical_thermodynamic_state)
        nonalchemical_sampler_state.apply_to_context(nonalchemical_context, ignore_velocities=True)
        nonalchemical_reduced_potential = nonalchemical_thermodynamic_state.reduced_potential(nonalchemical_context)

        # Compute and store the work
        w[iteration] = nonalchemical_reduced_potential - hybrid_reduced_potential
        non_potential[iteration] = nonalchemical_reduced_potential
        hybrid_potential[iteration] = hybrid_reduced_potential

        if print_work:
            print(f'{iteration:8d} {hybrid_reduced_potential:8.3f} {nonalchemical_reduced_potential:8.3f} => {w[iteration]:8.3f}')

    if write_trajectories:
        if lambda_index == 0:
            nonalchemical_mdtraj_topology = md.Topology.from_openmm(factory._topology_proposal.old_topology)
        elif lambda_index == 1:
            nonalchemical_mdtraj_topology = md.Topology.from_openmm(factory._topology_proposal.new_topology)
        md.Trajectory(hybrid_trajectory / unit.nanometers, factory.hybrid_topology).save(f'hybrid{lambda_index}.pdb')
        md.Trajectory(nonalchemical_trajectory / unit.nanometers, nonalchemical_mdtraj_topology).save(f'nonalchemical{lambda_index}.pdb')

    # Analyze data and return results
    [t0, g, Neff_max] = timeseries.detectEquilibration(w)
    w_burned_in = w[t0:]
    [df, ddf] = pymbar.EXP(w_burned_in)
    ddf_corrected = ddf * np.sqrt(g)

    results = [df, ddf_corrected, t0, Neff_max]

    return results, non_potential, hybrid_potential
def overlap_check(reference_system, positions, platform_name=None, precision=None, nsteps=50, nsamples=200,
                  factory_args=None, cached_trajectory_filename=None):
    """
    Test overlap between reference system and alchemical system by running a short simulation.

    Parameters
    ----------
    reference_system : simtk.openmm.System
        The reference System object to compare with
    positions : simtk.unit.Quantity with units compatible with nanometers
        The positions to assess energetics for.
    platform_name : str, optional, default=None
        The name of the platform to use for benchmarking.
    nsteps : int, optional, default=50
        Number of molecular dynamics steps between samples.
    nsamples : int, optional, default=200
        Number of samples to collect.
    factory_args : dict(), optional, default=None
        Arguments passed to AbsoluteAlchemicalFactory.
    cached_trajectory_filename : str, optional, default=None
        If specified, attempt to cache (or reuse) trajectory.
    """

    # Create a fully-interacting alchemical state.
    factory = AbsoluteAlchemicalFactory(reference_system, **factory_args)
    alchemical_state = AlchemicalState()
    alchemical_system = factory.createPerturbedSystem(alchemical_state)

    temperature = 300.0 * unit.kelvin
    collision_rate = 5.0 / unit.picoseconds
    timestep = 2.0 * unit.femtoseconds
    kT = (kB * temperature)

    # Select platform.
    platform = None
    if platform_name:
        platform = openmm.Platform.getPlatformByName(platform_name)

    # Create integrators.
    reference_integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    alchemical_integrator = openmm.VerletIntegrator(timestep)

    # Create contexts.
    if platform:
        reference_context = openmm.Context(reference_system, reference_integrator, platform)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator, platform)
    else:
        reference_context = openmm.Context(reference_system, reference_integrator)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator)

    ncfile = None
    if cached_trajectory_filename:
        cache_mode = 'write'

        # Try reading from cache
        from netCDF4 import Dataset
        if os.path.exists(cached_trajectory_filename):
            try:
                ncfile = Dataset(cached_trajectory_filename, 'r')
                if (ncfile.variables['positions'].shape == (nsamples, reference_system.getNumParticles(), 3)):
                    # Read the cache if everything matches
                    cache_mode = 'read'
            except:
                pass

        if cache_mode == 'write':
            # If anything went wrong, create a new cache.
            try:
                (pathname, filename) = os.path.split(cached_trajectory_filename)
                if not os.path.exists(pathname):
                    os.makedirs(pathname)
                ncfile = Dataset(cached_trajectory_filename, 'w', format='NETCDF4')
                ncfile.createDimension('samples', 0)
                ncfile.createDimension('atoms', reference_system.getNumParticles())
                ncfile.createDimension('spatial', 3)
                ncfile.createVariable('positions', 'f4', ('samples', 'atoms', 'spatial'))
            except Exception as e:
                logger.info(str(e))
                logger.info('Could not create a trajectory cache (%s).' % cached_trajectory_filename)
                ncfile = None

    # Collect simulation data.
    reference_context.setPositions(positions)
    du_n = np.zeros([nsamples], np.float64)  # du_n[n] is the alchemical - reference potential difference of sample n, in kT
    print()
    import click
    with click.progressbar(range(nsamples)) as bar:
        for sample in bar:
            if cached_trajectory_filename and (cache_mode == 'read'):
                # Load cached frames.
                positions = unit.Quantity(ncfile.variables['positions'][sample, :, :], unit.nanometers)
                reference_context.setPositions(positions)
            else:
                # Run dynamics.
                reference_integrator.step(nsteps)

            # Get reference energies.
            reference_state = reference_context.getState(getEnergy=True, getPositions=True)
            reference_potential = reference_state.getPotentialEnergy()
            if np.isnan(reference_potential / kT):
                raise Exception("Reference potential is NaN")

            # Get alchemical energies.
            alchemical_context.setPositions(reference_state.getPositions(asNumpy=True))
            alchemical_state = alchemical_context.getState(getEnergy=True)
            alchemical_potential = alchemical_state.getPotentialEnergy()
            if np.isnan(alchemical_potential / kT):
                raise Exception("Alchemical potential is NaN")

            du_n[sample] = (alchemical_potential - reference_potential) / kT

            if cached_trajectory_filename and (cache_mode == 'write') and (ncfile is not None):
                ncfile.variables['positions'][sample, :, :] = reference_state.getPositions(asNumpy=True) / unit.nanometers

    # Clean up.
    del reference_context, alchemical_context
    if cached_trajectory_filename and (ncfile is not None):
        ncfile.close()

    # Discard data to equilibration and subsample.
    from pymbar import timeseries
    [t0, g, Neff] = timeseries.detectEquilibration(du_n)
    indices = timeseries.subsampleCorrelatedData(du_n, g=g)
    du_n = du_n[indices]

    # Compute statistics.
    from pymbar import EXP
    [DeltaF, dDeltaF] = EXP(du_n)

    # Raise an exception if the error is larger than 3 kT.
    MAX_DEVIATION = 3.0  # kT
    if (dDeltaF > MAX_DEVIATION):
        report = "DeltaF = %12.3f +- %12.3f kT (%5d samples, g = %6.1f)" % (DeltaF, dDeltaF, Neff, g)
        raise Exception(report)

    return
from pymbar import timeseries

# determine equilibrated region
[t, g, Neff_max] = timeseries.detectEquilibration(A_t)

# extract equilibrated region
A_t_equilibrated = A_t[t:]
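The equilibration-detection step above is usually paired with decorrelation via `subsampleCorrelatedData`, as the other snippets in this collection do. A self-contained sketch of the full pattern, assuming pymbar 3.x; the data are synthetic.

import numpy as np
from pymbar import timeseries

# Synthetic observable with an initial transient
rng = np.random.default_rng(8)
A_t = np.concatenate([np.linspace(1.0, 0.0, 100), rng.normal(scale=0.1, size=900)])

# determine equilibrated region
[t, g, Neff_max] = timeseries.detectEquilibration(A_t)
A_t_equilibrated = A_t[t:]

# subsample the equilibrated region to obtain uncorrelated samples
indices = timeseries.subsampleCorrelatedData(A_t_equilibrated, g=g)
A_n = A_t_equilibrated[indices]
print(len(A_n), "uncorrelated samples; g =", round(g, 2))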
print "%16s %8d" % (dimension_name, len(ncfile.dimensions[dimension_name])) # Read dimensions. niterations = ncfile.variables['positions'].shape[0] nstates = ncfile.variables['positions'].shape[1] natoms = ncfile.variables['positions'].shape[2] print "Read %(niterations)d iterations, %(nstates)d states" % vars() # Read reference PDB file. reference_pdb_filename = os.path.join(source_directory, "complex.pdb") atoms = read_pdb(reference_pdb_filename) # Choose number of samples to discard to equilibration u_n = extract_u_n(ncfile) if numpy.any(numpy.isnan(u_n)): continue nskip = int(len(u_n) / 100.0) [nequil, g_t, Neff_max] = timeseries.detectEquilibration(u_n, nskip) print [nequil, Neff_max] # Resample configurations for state 0. state = 0 nsamples = 5000 output_pdb_filename = os.path.join(source_directory, 'resampled.pdb') write_pdb_resampled(ncfile, atoms, output_pdb_filename, state, nequil, nsamples) # Close input NetCDF file. ncfile.close() #except Exception as e: # print str(e) # pass
kT = unit.AVOGADRO_CONSTANT_NA * unit.BOLTZMANN_CONSTANT_kB * integrator.getTemperature()
for k in range(nstates):
    for iteration in range(niterations):
        print('state %5d iteration %5d / %5d' % (k, iteration, niterations))
        # Set alchemical state
        context.setParameter('lambda', lambdas[k])
        # Run some dynamics
        integrator.step(nsteps)
        # Compute energies at all alchemical states
        for l in range(nstates):
            context.setParameter('lambda', lambdas[l])
            u_kln[k, l, iteration] = context.getState(getEnergy=True).getPotentialEnergy() / kT

# Estimate free energy of Lennard-Jones particle insertion
from pymbar import MBAR, timeseries

# Subsample data to extract uncorrelated equilibrium timeseries
N_k = np.zeros([nstates], np.int32)  # number of uncorrelated samples
for k in range(nstates):
    [nequil, g, Neff_max] = timeseries.detectEquilibration(u_kln[k, k, :])
    indices = timeseries.subsampleCorrelatedData(u_kln[k, k, :], g=g)
    N_k[k] = len(indices)
    u_kln[k, :, 0:N_k[k]] = u_kln[k, :, indices].T

# Compute free energy differences and statistical uncertainties
mbar = MBAR(u_kln, N_k)
[DeltaF_ij, dDeltaF_ij, Theta_ij] = mbar.getFreeEnergyDifferences()

print('DeltaF_ij (kT):')
print(DeltaF_ij)
print('dDeltaF_ij (kT):')
print(dDeltaF_ij)
def analyze(source_directory):
    """
    Analyze contents of store files to compute free energy differences.

    Parameters
    ----------
    source_directory : string
        The location of the NetCDF simulation storage files.

    """
    # Storage for different phases.
    data = dict()

    phase_prefixes = ['solvent', 'complex']
    suffixes = ['explicit', 'implicit']

    DeltaF_restraints = None

    # Process each netcdf file.
    netcdf_files_found = 0
    for phase in phase_prefixes:
        for suffix in suffixes:
            # Construct full path to NetCDF file.
            fullpath = os.path.join(source_directory, '%s-%s.nc' % (phase, suffix))
            logger.debug("Attempting to open %s..." % fullpath)

            # Skip if the file doesn't exist.
            if (not os.path.exists(fullpath)):
                continue

            # Open NetCDF file for reading.
            logger.info("Opening NetCDF trajectory file '%(fullpath)s' for reading..." % vars())
            try:
                ncfile = netcdf.Dataset(fullpath, 'r')
            except Exception as e:
                logger.error(e.message)
                raise Exception("Error opening NetCDF trajectory file '%(fullpath)s' for reading..." % vars())

            # DEBUG
            logger.info("dimensions:")
            for dimension_name in ncfile.dimensions.keys():
                logger.info("%16s %8d" % (dimension_name, len(ncfile.dimensions[dimension_name])))

            # Read dimensions.
            niterations = ncfile.variables['positions'].shape[0]
            nstates = ncfile.variables['positions'].shape[1]
            natoms = ncfile.variables['positions'].shape[2]
            logger.info("Read %(niterations)d iterations, %(nstates)d states" % vars())

            # Increment number of netcdf files found.
            netcdf_files_found += 1

            # Read standard state correction free energy.
            if phase == 'complex':
                DeltaF_restraints = ncfile.groups['metadata'].variables['standard_state_correction'][0]

            # Choose number of samples to discard to equilibration
            MIN_ITERATIONS = 10  # minimum number of iterations to use automatic detection
            if niterations > MIN_ITERATIONS:
                from pymbar import timeseries
                u_n = extract_u_n(ncfile)
                u_n = u_n[1:]  # discard initial frame of zero energies TODO: Get rid of initial frame of zero energies
                [nequil, g_t, Neff_max] = timeseries.detectEquilibration(u_n)
                nequil += 1  # account for initial frame of zero energies
                logger.info([nequil, Neff_max])
            else:
                nequil = 1  # discard first frame
                g_t = 1
                Neff_max = niterations

            # Examine acceptance probabilities.
            show_mixing_statistics(ncfile, cutoff=0.05, nequil=nequil)

            # Estimate free energies.
            (Deltaf_ij, dDeltaf_ij) = estimate_free_energies(ncfile, ndiscard=nequil, g=g_t)

            # Estimate average enthalpies
            (DeltaH_i, dDeltaH_i) = estimate_enthalpies(ncfile, ndiscard=nequil, g=g_t)

            # Accumulate free energy differences
            entry = dict()
            entry['DeltaF'] = Deltaf_ij[0, nstates - 1]
            entry['dDeltaF'] = dDeltaf_ij[0, nstates - 1]
            entry['DeltaH'] = DeltaH_i[nstates - 1] - DeltaH_i[0]
            entry['dDeltaH'] = np.sqrt(dDeltaH_i[0]**2 + dDeltaH_i[nstates - 1]**2)
            data[phase] = entry

            # Get temperatures.
            ncvar = ncfile.groups['thermodynamic_states'].variables['temperatures']
            temperature = ncvar[0] * units.kelvin
            kT = kB * temperature

            # Close input NetCDF file.
            ncfile.close()

    # Give the user a useful warning if no NetCDF files found.
    if netcdf_files_found == 0:
        raise Exception("No YANK output files were found in the specified store directory (%s)" % source_directory)

    if DeltaF_restraints is None:
        raise Exception("DeltaF_restraints not found.")

    # Compute binding free energy.
    DeltaF = data['solvent']['DeltaF'] - DeltaF_restraints - data['complex']['DeltaF']
    dDeltaF = np.sqrt(data['solvent']['dDeltaF']**2 + data['complex']['dDeltaF']**2)
    logger.info("")
    logger.info("Binding free energy : %16.3f +- %.3f kT (%16.3f +- %.3f kcal/mol)"
                % (DeltaF, dDeltaF, DeltaF * kT / units.kilocalories_per_mole, dDeltaF * kT / units.kilocalories_per_mole))
    logger.info("")
    logger.info("DeltaG solvent      : %16.3f +- %.3f kT" % (data['solvent']['DeltaF'], data['solvent']['dDeltaF']))
    logger.info("DeltaG complex      : %16.3f +- %.3f kT" % (data['complex']['DeltaF'], data['complex']['dDeltaF']))
    logger.info("DeltaG restraint    : %16.3f kT" % DeltaF_restraints)
    logger.info("")

    # Compute binding enthalpy
    DeltaH = data['solvent']['DeltaH'] - DeltaF_restraints - data['complex']['DeltaH']
    dDeltaH = np.sqrt(data['solvent']['dDeltaH']**2 + data['complex']['dDeltaH']**2)
    logger.info("Binding enthalpy    : %16.3f +- %.3f kT (%16.3f +- %.3f kcal/mol)"
                % (DeltaH, dDeltaH, DeltaH * kT / units.kilocalories_per_mole, dDeltaH * kT / units.kilocalories_per_mole))
def test_detectEquil():
    x = np.random.normal(size=10000)
    (t, g, Neff_max) = timeseries.detectEquilibration(x)
def generate_simulation_data(database, parameters, cid):
    """
    Regenerate simulation data for given parameters.

    ARGUMENTS

    database (dict) - database of molecules
    parameters (dict) - dictionary of GBSA parameters keyed on GBSA atom types

    """
    platform = openmm.Platform.getPlatformByName("Reference")

    from pymbar import timeseries

    entry = database[cid]
    molecule = entry["molecule"]
    iupac_name = entry["iupac"]

    # Retrieve vacuum system.
    vacuum_system = copy.deepcopy(entry["system"])

    # Retrieve OpenMM System.
    solvent_system = copy.deepcopy(entry["system"])

    # Get nonbonded force.
    forces = {
        solvent_system.getForce(index).__class__.__name__: solvent_system.getForce(index)
        for index in range(solvent_system.getNumForces())
    }
    nonbonded_force = forces["NonbondedForce"]

    # Add GBSA term
    gbsa_force = openmm.GBSAOBCForce()
    gbsa_force.setNonbondedMethod(openmm.GBSAOBCForce.NoCutoff)  # set no cutoff
    gbsa_force.setSoluteDielectric(1)
    gbsa_force.setSolventDielectric(78)

    # Build indexable list of atoms.
    atoms = [atom for atom in molecule.GetAtoms()]
    natoms = len(atoms)

    # Assign GBSA parameters.
    for (atom_index, atom) in enumerate(atoms):
        [charge, sigma, epsilon] = nonbonded_force.getParticleParameters(atom_index)
        atomtype = atom.GetStringData("gbsa_type")  # GBSA atomtype
        radius = parameters["%s_%s" % (atomtype, "radius")] * units.angstroms
        scalingFactor = parameters["%s_%s" % (atomtype, "scalingFactor")]
        gbsa_force.addParticle(charge, radius, scalingFactor)

    # Add the force to the system.
    solvent_system.addForce(gbsa_force)

    # Create context for solvent system.
    timestep = 2.0 * units.femtosecond
    collision_rate = 20.0 / units.picoseconds
    temperature = entry["temperature"]
    integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    context = openmm.Context(vacuum_system, integrator, platform)

    # Set the coordinates.
    positions = entry["positions"]
    context.setPositions(positions)

    # Minimize.
    openmm.LocalEnergyMinimizer.minimize(context)

    # Simulate, saving periodic snapshots of configurations.
    kT = kB * temperature
    beta = 1.0 / kT

    initial_time = time.time()
    nsteps_per_iteration = 2500
    niterations = 200
    x_n = np.zeros([niterations, natoms, 3], np.float32)  # positions, in nm
    u_n = np.zeros([niterations], np.float64)  # energy differences, in kT
    for iteration in range(niterations):
        integrator.step(nsteps_per_iteration)
        state = context.getState(getEnergy=True, getPositions=True)
        x_n[iteration, :, :] = state.getPositions(asNumpy=True) / units.nanometers
        u_n[iteration] = beta * state.getPotentialEnergy()

    if np.any(np.isnan(u_n)):
        raise Exception("Encountered NaN for molecule %s | %s" % (cid, iupac_name))

    final_time = time.time()
    elapsed_time = final_time - initial_time

    # Clean up.
    del context, integrator

    # Discard initial transient to equilibration.
    [t0, g, Neff_max] = timeseries.detectEquilibration(u_n)
    x_n = x_n[t0:, :, :]
    u_n = u_n[t0:]

    # Subsample to remove correlation.
    indices = timeseries.subsampleCorrelatedData(u_n, g=g)
    x_n = x_n[indices, :, :]
    u_n = u_n[indices]

    # Store data.
    entry["x_n"] = x_n
    entry["u_n"] = u_n

    print("%48s | %64s | simulation %12.3f s | %5d samples discarded | %5d independent samples remain"
          % (cid, iupac_name, elapsed_time, t0, len(indices)))

    return [cid, entry]
def analyze(source_directory, verbose=False):
    """
    Analyze contents of store files to compute free energy differences.

    Parameters
    ----------
    source_directory : string
        The location of the NetCDF simulation storage files.
    verbose : bool, optional, default=False
        If True, verbose output will be generated.

    """
    # Turn on debug info.
    # TODO: Control verbosity of logging output using the `verbose` optional flag.
    logging.basicConfig(level=logging.DEBUG)

    # Storage for different phases.
    data = dict()

    phase_prefixes = ['solvent', 'complex']
    suffixes = ['explicit', 'implicit']

    # Process each netcdf file.
    for phase in phase_prefixes:
        for suffix in suffixes:
            # Construct full path to NetCDF file.
            fullpath = os.path.join(source_directory, '%s-%s.nc' % (phase, suffix))
            if verbose:
                print("Attempting to open %s..." % fullpath)

            # Skip if the file doesn't exist.
            if not os.path.exists(fullpath):
                continue

            # Open NetCDF file for reading.
            logger.info("Opening NetCDF trajectory file '%(fullpath)s' for reading..." % vars())
            ncfile = netcdf.Dataset(fullpath, 'r')

            # DEBUG
            logger.info("dimensions:")
            for dimension_name in ncfile.dimensions.keys():
                logger.info("%16s %8d" % (dimension_name, len(ncfile.dimensions[dimension_name])))

            # Read dimensions.
            niterations = ncfile.variables['positions'].shape[0]
            nstates = ncfile.variables['positions'].shape[1]
            natoms = ncfile.variables['positions'].shape[2]
            logger.info("Read %(niterations)d iterations, %(nstates)d states" % vars())

            # Read reference PDB file.
            #if phase in ['vacuum', 'solvent']:
            #    reference_pdb_filename = os.path.join(source_directory, "ligand.pdb")
            #else:
            #    reference_pdb_filename = os.path.join(source_directory, "complex.pdb")
            #atoms = read_pdb(reference_pdb_filename)

            # Check to make sure no self-energies go nan.
            #check_energies(ncfile, atoms)

            # Check to make sure no positions are nan.
            #check_positions(ncfile)

            # Choose number of samples to discard to equilibration.
            from pymbar import timeseries
            u_n = extract_u_n(ncfile)
            [nequil, g_t, Neff_max] = timeseries.detectEquilibration(u_n)
            logger.info([nequil, Neff_max])

            # Examine acceptance probabilities.
            show_mixing_statistics(ncfile, cutoff=0.05, nequil=nequil)

            # Estimate free energies.
            (Deltaf_ij, dDeltaf_ij) = estimate_free_energies(ncfile, ndiscard=nequil)

            # Estimate average enthalpies.
            (DeltaH_i, dDeltaH_i) = estimate_enthalpies(ncfile, ndiscard=nequil)

            # Accumulate free energy differences.
            entry = dict()
            entry['DeltaF'] = Deltaf_ij[0, nstates - 1]
            entry['dDeltaF'] = dDeltaf_ij[0, nstates - 1]
            entry['DeltaH'] = DeltaH_i[nstates - 1] - DeltaH_i[0]
            entry['dDeltaH'] = np.sqrt(dDeltaH_i[0]**2 + dDeltaH_i[nstates - 1]**2)
            data[phase] = entry

            # Get temperatures.
            ncvar = ncfile.groups['thermodynamic_states'].variables['temperatures']
            temperature = ncvar[0] * units.kelvin
            kT = kB * temperature

            # Close input NetCDF file.
            ncfile.close()

    # Compute hydration free energy (free energy of transfer from vacuum to water).
    #DeltaF = data['vacuum']['DeltaF'] - data['solvent']['DeltaF']
    #dDeltaF = np.sqrt(data['vacuum']['dDeltaF']**2 + data['solvent']['dDeltaF']**2)
    #print("Hydration free energy: %.3f +- %.3f kT (%.3f +- %.3f kcal/mol)" % (DeltaF, dDeltaF, DeltaF * kT / units.kilocalories_per_mole, dDeltaF * kT / units.kilocalories_per_mole))

    # Compute enthalpy of transfer from vacuum to water.
    #DeltaH = data['vacuum']['DeltaH'] - data['solvent']['DeltaH']
    #dDeltaH = np.sqrt(data['vacuum']['dDeltaH']**2 + data['solvent']['dDeltaH']**2)
    #print("Enthalpy of hydration: %.3f +- %.3f kT (%.3f +- %.3f kcal/mol)" % (DeltaH, dDeltaH, DeltaH * kT / units.kilocalories_per_mole, dDeltaH * kT / units.kilocalories_per_mole))

    # Read standard state correction free energy.
    DeltaF_restraints = 0.0
    phase = 'complex'
    fullpath = os.path.join(source_directory, phase + '.nc')
    ncfile = netcdf.Dataset(fullpath, 'r')
    DeltaF_restraints = ncfile.groups['metadata'].variables['standard_state_correction'][0]
    ncfile.close()

    # Compute binding free energy.
    DeltaF = data['solvent']['DeltaF'] - DeltaF_restraints - data['complex']['DeltaF']
    dDeltaF = np.sqrt(data['solvent']['dDeltaF']**2 + data['complex']['dDeltaF']**2)
    logger.info("")
    logger.info("Binding free energy : %16.3f +- %.3f kT (%16.3f +- %.3f kcal/mol)"
                % (DeltaF, dDeltaF, DeltaF * kT / units.kilocalories_per_mole,
                   dDeltaF * kT / units.kilocalories_per_mole))
    logger.info("")
    #logger.info("DeltaG vacuum       : %16.3f +- %.3f kT" % (data['vacuum']['DeltaF'], data['vacuum']['dDeltaF']))
    logger.info("DeltaG solvent      : %16.3f +- %.3f kT" % (data['solvent']['DeltaF'], data['solvent']['dDeltaF']))
    logger.info("DeltaG complex      : %16.3f +- %.3f kT" % (data['complex']['DeltaF'], data['complex']['dDeltaF']))
    logger.info("DeltaG restraint    : %16.3f kT" % DeltaF_restraints)
    logger.info("")

    # Compute binding enthalpy.
    DeltaH = data['solvent']['DeltaH'] - DeltaF_restraints - data['complex']['DeltaH']
    dDeltaH = np.sqrt(data['solvent']['dDeltaH']**2 + data['complex']['dDeltaH']**2)
    logger.info("Binding enthalpy    : %16.3f +- %.3f kT (%16.3f +- %.3f kcal/mol)"
                % (DeltaH, dDeltaH, DeltaH * kT / units.kilocalories_per_mole,
                   dDeltaH * kT / units.kilocalories_per_mole))
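# A toy numeric check of the combination rule above: phase free energies
# combine linearly, and their independent uncertainties add in quadrature.
# All values here are invented for illustration.
import numpy as np

solvent = {'DeltaF': -52.3, 'dDeltaF': 0.4}
complex_ = {'DeltaF': -61.8, 'dDeltaF': 0.6}
DeltaF_restraints = 2.1  # standard state correction, in kT

DeltaF = solvent['DeltaF'] - DeltaF_restraints - complex_['DeltaF']
dDeltaF = np.sqrt(solvent['dDeltaF']**2 + complex_['dDeltaF']**2)
print("Binding free energy: %.3f +- %.3f kT" % (DeltaF, dDeltaF))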
def overlap_check(reference_system, positions, receptor_atoms, ligand_atoms, platform_name=None,
                  annihilate_electrostatics=True, annihilate_sterics=False,
                  precision=None, nsteps=50, nsamples=200):
    """
    Test overlap between reference system and alchemical system by running a short simulation.

    Parameters
    ----------
    reference_system : simtk.openmm.System
        The reference System object to compare with.
    positions : simtk.unit.Quantity with units compatible with nanometers
        The positions to assess energetics for.
    receptor_atoms : list of int
        The list of receptor atoms.
    ligand_atoms : list of int
        The list of ligand atoms to alchemically modify.
    platform_name : str, optional, default=None
        The name of the platform to use for benchmarking.
    annihilate_electrostatics : bool, optional, default=True
        If True, electrostatics will be annihilated; if False, decoupled.
    annihilate_sterics : bool, optional, default=False
        If True, sterics will be annihilated; if False, decoupled.
    nsteps : int, optional, default=50
        Number of molecular dynamics steps between samples.
    nsamples : int, optional, default=200
        Number of samples to collect.

    """
    # Create a fully-interacting alchemical state.
    factory = AbsoluteAlchemicalFactory(reference_system, ligand_atoms=ligand_atoms)
    alchemical_state = AlchemicalState()
    alchemical_system = factory.createPerturbedSystem(alchemical_state)

    temperature = 300.0 * units.kelvin
    collision_rate = 5.0 / units.picoseconds
    timestep = 2.0 * units.femtoseconds
    kT = kB * temperature

    # Select platform.
    platform = None
    if platform_name:
        platform = openmm.Platform.getPlatformByName(platform_name)

    # Create integrators.
    reference_integrator = openmm.LangevinIntegrator(temperature, collision_rate, timestep)
    alchemical_integrator = openmm.VerletIntegrator(timestep)

    # Create contexts.
    if platform:
        reference_context = openmm.Context(reference_system, reference_integrator, platform)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator, platform)
    else:
        reference_context = openmm.Context(reference_system, reference_integrator)
        alchemical_context = openmm.Context(alchemical_system, alchemical_integrator)

    # Collect simulation data.
    reference_context.setPositions(positions)
    du_n = np.zeros([nsamples], np.float64)  # du_n[n] is the reduced potential difference (alchemical - reference) for sample n
    for sample in range(nsamples):
        # Run dynamics.
        reference_integrator.step(nsteps)

        # Get reference energies.
        reference_state = reference_context.getState(getEnergy=True, getPositions=True)
        reference_potential = reference_state.getPotentialEnergy()

        # Get alchemical energies.
        alchemical_context.setPositions(reference_state.getPositions())
        alchemical_state = alchemical_context.getState(getEnergy=True)
        alchemical_potential = alchemical_state.getPotentialEnergy()

        du_n[sample] = (alchemical_potential - reference_potential) / kT

    # Clean up.
    del reference_context, alchemical_context

    # Discard data to equilibration and subsample.
    from pymbar import timeseries
    [t0, g, Neff] = timeseries.detectEquilibration(du_n)
    indices = timeseries.subsampleCorrelatedData(du_n, g=g)
    du_n = du_n[indices]

    # Compute statistics.
    from pymbar import EXP
    [DeltaF, dDeltaF] = EXP(du_n)

    # Raise an exception if the error is larger than 3 kT.
    MAX_DEVIATION = 3.0  # kT
    if dDeltaF > MAX_DEVIATION:
        report = "DeltaF = %12.3f +- %12.3f kT (%5d samples, g = %6.1f)" % (DeltaF, dDeltaF, Neff, g)
        raise Exception(report)

    return
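# A sketch of the same overlap test in isolation: exponential averaging (EXP)
# on decorrelated reduced energy differences, rejecting if the estimated
# uncertainty exceeds a threshold. The Gaussian du_n data are synthetic, and
# here the burn-in is also sliced off before subsampling; pymbar 3.x API as
# used elsewhere in this file.
import numpy as np
from pymbar import EXP, timeseries

rng = np.random.RandomState(1)
du_n = rng.normal(loc=2.0, scale=1.0, size=500)  # fake reduced energy differences

t0, g, Neff = timeseries.detectEquilibration(du_n)
du_equil = du_n[t0:]
du_sub = du_equil[timeseries.subsampleCorrelatedData(du_equil, g=g)]
DeltaF, dDeltaF = EXP(du_sub)

MAX_DEVIATION = 3.0  # kT
if dDeltaF > MAX_DEVIATION:
    raise Exception("poor overlap: DeltaF = %.3f +- %.3f kT" % (DeltaF, dDeltaF))
print("overlap OK: DeltaF = %.3f +- %.3f kT (%d samples, g = %.1f)" % (DeltaF, dDeltaF, Neff, g))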
def analyze(source_directory):
    """
    Analyze contents of store files to compute free energy differences.

    Parameters
    ----------
    source_directory : string
        The location of the NetCDF simulation storage files.

    """
    analysis_script_path = os.path.join(source_directory, 'analysis.yaml')
    if not os.path.isfile(analysis_script_path):
        err_msg = 'Cannot find analysis.yaml script in {}'.format(source_directory)
        logger.error(err_msg)
        raise RuntimeError(err_msg)
    with open(analysis_script_path, 'r') as f:
        analysis = yaml.safe_load(f)  # safe_load avoids executing arbitrary YAML tags
    phases = [phase_name for phase_name, sign in analysis]

    # Storage for different phases.
    data = dict()

    # Process each netcdf file.
    for phase in phases:
        ncfile_path = os.path.join(source_directory, phase + '.nc')

        # Open NetCDF file for reading.
        logger.info("Opening NetCDF trajectory file %(ncfile_path)s for reading..." % vars())
        ncfile = netcdf.Dataset(ncfile_path, 'r')
        try:
            logger.debug("dimensions:")
            for dimension_name in ncfile.dimensions.keys():
                logger.debug("%16s %8d" % (dimension_name, len(ncfile.dimensions[dimension_name])))

            # Read dimensions.
            niterations = ncfile.variables['positions'].shape[0]
            nstates = ncfile.variables['positions'].shape[1]
            logger.info("Read %(niterations)d iterations, %(nstates)d states" % vars())

            DeltaF_restraints = 0.0
            if 'metadata' in ncfile.groups:
                # Read phase direction and standard state correction free energy.
                # Yank sets the correction to 0 if there are no restraints.
                DeltaF_restraints = ncfile.groups['metadata'].variables['standard_state_correction'][0]

            # Choose number of samples to discard to equilibration.
            MIN_ITERATIONS = 10  # minimum number of iterations to use automatic detection
            if niterations > MIN_ITERATIONS:
                from pymbar import timeseries
                u_n = extract_u_n(ncfile)
                u_n = u_n[1:]  # discard initial frame of zero energies TODO: Get rid of initial frame of zero energies
                [nequil, g_t, Neff_max] = timeseries.detectEquilibration(u_n)
                nequil += 1  # account for initial frame of zero energies
                logger.info([nequil, Neff_max])
            else:
                nequil = 1  # discard first frame
                g_t = 1
                Neff_max = niterations

            # Examine acceptance probabilities.
            show_mixing_statistics(ncfile, cutoff=0.05, nequil=nequil)

            # Extract equilibrated, decorrelated energies; check for a fully interacting state.
            (u_kln, N_k, u_n) = extract_ncfile_energies(ncfile, ndiscard=nequil, g=g_t)

            # Create MBAR object to use for free energy and entropy estimates.
            mbar = initialize_MBAR(ncfile, u_kln=u_kln, N_k=N_k)

            # Estimate free energies, using the fully interacting state if present.
            (Deltaf_ij, dDeltaf_ij) = estimate_free_energies(ncfile, mbar=mbar)

            # Estimate average enthalpies.
            (DeltaH_i, dDeltaH_i) = estimate_enthalpies(ncfile, mbar=mbar)

            # Accumulate free energy differences.
            entry = dict()
            entry['DeltaF'] = Deltaf_ij[0, -1]
            entry['dDeltaF'] = dDeltaf_ij[0, -1]
            entry['DeltaH'] = DeltaH_i[0, -1]
            entry['dDeltaH'] = dDeltaH_i[0, -1]
            entry['DeltaF_restraints'] = DeltaF_restraints
            data[phase] = entry

            # Get temperatures.
            ncvar = ncfile.groups['thermodynamic_states'].variables['temperatures']
            temperature = ncvar[0] * units.kelvin
            kT = kB * temperature
        finally:
            ncfile.close()

    # Compute free energy and enthalpy.
    DeltaF = 0.0
    dDeltaF = 0.0
    DeltaH = 0.0
    dDeltaH = 0.0
    for phase, sign in analysis:
        DeltaF -= sign * (data[phase]['DeltaF'] + data[phase]['DeltaF_restraints'])
        dDeltaF += data[phase]['dDeltaF']**2
        DeltaH -= sign * (data[phase]['DeltaH'] + data[phase]['DeltaF_restraints'])
        dDeltaH += data[phase]['dDeltaH']**2
    dDeltaF = np.sqrt(dDeltaF)
    dDeltaH = np.sqrt(dDeltaH)

    # Attempt to guess the type of calculation.
    calculation_type = ''
    for phase in phases:
        if 'complex' in phase:
            calculation_type = ' of binding'
        elif 'solvent1' in phase:
            calculation_type = ' of solvation'

    # Print energies.
    logger.info("")
    logger.info("Free energy{}: {:16.3f} +- {:.3f} kT ({:16.3f} +- {:.3f} kcal/mol)".format(
        calculation_type, DeltaF, dDeltaF,
        DeltaF * kT / units.kilocalories_per_mole, dDeltaF * kT / units.kilocalories_per_mole))
    logger.info("")

    for phase in phases:
        logger.info("DeltaG {:<25} : {:16.3f} +- {:.3f} kT".format(phase, data[phase]['DeltaF'],
                                                                   data[phase]['dDeltaF']))
        if data[phase]['DeltaF_restraints'] != 0.0:
            logger.info("DeltaG {:<25} : {:25.3f} kT".format('restraint',
                                                             data[phase]['DeltaF_restraints']))
    logger.info("")

    logger.info("Enthalpy{}: {:16.3f} +- {:.3f} kT ({:16.3f} +- {:.3f} kcal/mol)".format(
        calculation_type, DeltaH, dDeltaH,
        DeltaH * kT / units.kilocalories_per_mole, dDeltaH * kT / units.kilocalories_per_mole))
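# A toy illustration of the signed phase combination driven by analysis.yaml
# above: each phase carries a sign, and uncertainties accumulate in
# quadrature. The numbers are invented; only the structure mirrors the loop.
import numpy as np

analysis = [('complex', 1), ('solvent', -1)]
data = {
    'complex': {'DeltaF': -61.8, 'dDeltaF': 0.6, 'DeltaF_restraints': 2.1},
    'solvent': {'DeltaF': -52.3, 'dDeltaF': 0.4, 'DeltaF_restraints': 0.0},
}

DeltaF, dDeltaF = 0.0, 0.0
for phase, sign in analysis:
    DeltaF -= sign * (data[phase]['DeltaF'] + data[phase]['DeltaF_restraints'])
    dDeltaF += data[phase]['dDeltaF']**2
dDeltaF = np.sqrt(dDeltaF)
print("DeltaF = %.3f +- %.3f kT" % (DeltaF, dDeltaF))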
def extract_trajectory(output_path, nc_path, state_index=None, replica_index=None,
                       start_frame=0, end_frame=-1, skip_frame=1, keep_solvent=True,
                       discard_equilibration=False, image_molecules=False):
    """Extract phase trajectory from the NetCDF4 file.

    Parameters
    ----------
    output_path : str
        Path to the trajectory file to be created. The extension of the file
        determines the format.
    nc_path : str
        Path to the NetCDF4 file containing the trajectory.
    state_index : int, optional
        The index of the alchemical state for which to extract the trajectory.
        One and only one between state_index and replica_index must be not None
        (default is None).
    replica_index : int, optional
        The index of the replica for which to extract the trajectory. One and
        only one between state_index and replica_index must be not None
        (default is None).
    start_frame : int, optional
        Index of the first frame to include in the trajectory (default is 0).
    end_frame : int, optional
        Index of the last frame to include in the trajectory. If negative, will
        count from the end (default is -1).
    skip_frame : int, optional
        Extract one frame every skip_frame (default is 1).
    keep_solvent : bool, optional
        If False, solvent molecules are ignored (default is True).
    discard_equilibration : bool, optional
        If True, initial equilibration frames are discarded (see the method
        pymbar.timeseries.detectEquilibration() for details, default is False).

    """
    # Check correct input.
    if (state_index is None) == (replica_index is None):
        raise ValueError('One and only one between "state_index" and '
                         '"replica_index" must be specified.')
    if not os.path.isfile(nc_path):
        raise ValueError('Cannot find file {}'.format(nc_path))

    # Import simulation data.
    nc_file = netcdf.Dataset(nc_path, 'r')
    try:
        # Extract topology and system serialization.
        serialized_system = nc_file.groups['metadata'].variables['reference_system'][0]
        serialized_topology = nc_file.groups['metadata'].variables['topology'][0]

        # Determine if system is periodic.
        from simtk import openmm
        reference_system = openmm.XmlSerializer.deserialize(str(serialized_system))
        is_periodic = reference_system.usesPeriodicBoundaryConditions()
        logger.info('Detected periodic boundary conditions: {}'.format(is_periodic))

        # Get dimensions.
        n_iterations = nc_file.variables['positions'].shape[0]
        n_atoms = nc_file.variables['positions'].shape[2]
        logger.info('Number of iterations: {}, atoms: {}'.format(n_iterations, n_atoms))

        # Determine frames to extract.
        if start_frame <= 0:
            # TODO yank saves first frame with 0 energy!
            start_frame = 1
        if end_frame < 0:
            end_frame = n_iterations + end_frame + 1
        frame_indices = range(start_frame, end_frame, skip_frame)
        if len(frame_indices) == 0:
            raise ValueError('No frames selected')
        logger.info('Extracting frames from {} to {} every {}'.format(
            start_frame, end_frame, skip_frame))

        # Discard equilibration samples.
        if discard_equilibration:
            u_n = extract_u_n(nc_file)[frame_indices]
            n_equil, g, n_eff = timeseries.detectEquilibration(u_n)
            logger.info(("Discarding initial {} equilibration samples (leaving {} "
                         "effectively uncorrelated samples)...").format(n_equil, n_eff))
            frame_indices = frame_indices[n_equil:-1]

        # Extract state positions and box vectors.
        positions = np.zeros((len(frame_indices), n_atoms, 3))
        if is_periodic:
            box_vectors = np.zeros((len(frame_indices), 3, 3))
        if state_index is not None:
            logger.info('Extracting positions of state {}...'.format(state_index))

            # Deconvolute state indices: find which replica sampled the requested
            # state at each iteration. Use an integer dtype so the entries can be
            # used directly as indices.
            state_indices = np.zeros(len(frame_indices), dtype=int)
            for i, iteration in enumerate(frame_indices):
                replica_indices = nc_file.variables['states'][iteration, :]
                state_indices[i] = np.where(replica_indices == state_index)[0][0]

            # Extract state positions and box vectors.
            for i, iteration in enumerate(frame_indices):
                replica_index = state_indices[i]
                positions[i, :, :] = nc_file.variables['positions'][iteration, replica_index, :, :].astype(np.float32)
                if is_periodic:
                    box_vectors[i, :, :] = nc_file.variables['box_vectors'][iteration, replica_index, :, :].astype(np.float32)
        else:
            # Extract replica positions and box vectors.
            logger.info('Extracting positions of replica {}...'.format(replica_index))
            for i, iteration in enumerate(frame_indices):
                positions[i, :, :] = nc_file.variables['positions'][iteration, replica_index, :, :].astype(np.float32)
                if is_periodic:
                    box_vectors[i, :, :] = nc_file.variables['box_vectors'][iteration, replica_index, :, :].astype(np.float32)
    finally:
        nc_file.close()

    # Create trajectory object.
    logger.info('Creating trajectory object...')
    topology = utils.deserialize_topology(serialized_topology)
    trajectory = mdtraj.Trajectory(positions, topology)
    if is_periodic:
        trajectory.unitcell_vectors = box_vectors

    # Apply periodic boundary conditions to molecule positions.
    if image_molecules:
        logger.info('Applying periodic boundary conditions to molecule positions...')
        trajectory.image_molecules(inplace=True)

    # Remove solvent.
    if not keep_solvent:
        logger.info('Removing solvent molecules...')
        trajectory = trajectory.remove_solvent()

    # Detect format.
    extension = os.path.splitext(output_path)[1][1:]  # remove dot
    try:
        save_function = getattr(trajectory, 'save_' + extension)
    except AttributeError:
        raise ValueError('Cannot detect format from extension of file {}'.format(output_path))

    # Create output directory and save trajectory.
    logger.info('Creating trajectory file: {}'.format(output_path))
    output_dir = os.path.dirname(output_path)
    if output_dir != '' and not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    save_function(output_path)
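# A minimal sketch of the state-index deconvolution step above: the `states`
# variable maps replica -> thermodynamic state at each iteration, and we
# invert that mapping to follow one state through replica space. The data
# here are synthetic permutations standing in for the NetCDF contents.
import numpy as np

rng = np.random.RandomState(2)
n_iterations, n_replicas = 5, 4
# states[iteration, replica] = thermodynamic state occupied by that replica
states = np.array([rng.permutation(n_replicas) for _ in range(n_iterations)])

state_index = 0
replica_of_state = np.array(
    [np.where(states[it, :] == state_index)[0][0] for it in range(n_iterations)]
)
print(replica_of_state)  # which replica to read positions from at each iteration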
# Note: `x` (the dataset exposing .logd), `mc` (the pymc MCMC sampler), and the
# `to_precision` significant-figure helper are defined earlier in the original script.
import numpy
import pymc
from pymbar.timeseries import detectEquilibration, subsampleCorrelatedData
from uncertainties import ufloat

out = open("logd_bayes.txt", 'w')
debug = open("logd_bayes_debug.txt", 'w')  # the original snippet wrote to `debug` without opening it; filename assumed
used_samples = open("mcmc_sampling_details.txt", 'w')

out.write("Molecule, Log D +/-, HPD95%[low, high]\n")
debug.write("Molecule mean - median = difference\n")
used_samples.write("Molecule, equilibration, N samples\n")

# curdir = os.getcwd()
# os.makedirs("plots", )
# os.chdir("plots")

for mol in sorted(list(x.logd.keys())):
    print("Processing {}".format(mol))
    # sns.plt.figure()
    trace = numpy.asarray(mc.trace("LogD_{}".format(mol))[:])

    # Burn-in and thinning estimated using pymbar.
    burnin = detectEquilibration(trace)[0]
    trace = trace[burnin:]
    uncorrelated_indices = subsampleCorrelatedData(trace)
    trace = trace[uncorrelated_indices]

    median = pymc.utils.quantiles(trace)[50]
    mean = numpy.mean(trace)
    lower, upper = pymc.utils.hpd(trace, 0.05)
    lower_s = to_precision(lower, 2)  # string of number with 2 significant digits
    upper_s = to_precision(upper, 2)
    logd = ufloat(mean, numpy.std(trace))

    # Format the mean and error with the correct number of significant digits.
    out.write("{0}, {1:.1u}, [{2}, {3}]\n".format(mol, logd, lower_s, upper_s))
    debug.write("{}: {} - {} = {}\n".format(mol, mean, median, mean - median))
    used_samples.write("{}, {}, {}\n".format(mol, burnin, len(uncorrelated_indices)))
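# A self-contained sketch of the burn-in / thinning / reporting pattern above,
# on a synthetic trace. numpy.percentile is used here as a simple stand-in for
# pymc.utils.hpd (a central 95% interval rather than a true HPD interval), and
# the `uncertainties` package handles significant-digit formatting as in the
# original snippet.
import numpy
from pymbar.timeseries import detectEquilibration, subsampleCorrelatedData
from uncertainties import ufloat

rng = numpy.random.RandomState(3)
trace = 1.5 + 0.2 * rng.standard_normal(2000)
trace[:200] += numpy.linspace(2.0, 0.0, 200)  # fake un-equilibrated start

burnin = detectEquilibration(trace)[0]
trace = trace[burnin:]
trace = trace[subsampleCorrelatedData(trace)]

lower, upper = numpy.percentile(trace, [2.5, 97.5])  # central 95% interval
logd = ufloat(numpy.mean(trace), numpy.std(trace))
print("Log D = {0:.1u}, 95% interval [{1:.2f}, {2:.2f}]".format(logd, lower, upper))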