def test_statistical_inefficiency_fft():
    X, Y, energy = generate_data()
    timeseries.statisticalInefficiency_fft(X[0])
    timeseries.statisticalInefficiency_fft(X[0] ** 2)
    timeseries.statisticalInefficiency_fft(energy[0])
    g0 = timeseries.statisticalInefficiency_fft(X[0])
    g1 = timeseries.statisticalInefficiency(X[0])
    g2 = timeseries.statisticalInefficiency(X[0], X[0])
    g3 = timeseries.statisticalInefficiency(X[0], fft=True)
    eq(g0, g1)
    eq(g0, g2)
    eq(g0, g3)
def detect_equilibration(A_t):
    """
    Automatically detect equilibrated region.

    ARGUMENTS
    A_t (np.array) - timeseries

    RETURNS
    t (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples
    """
    T = A_t.size

    # Special case if timeseries is constant.
    if A_t.std() == 0.0:
        return (0, 1, T)

    g_t = np.ones([T - 1], np.float32)
    Neff_t = np.ones([T - 1], np.float32)
    for t in range(T - 1):
        g_t[t] = timeseries.statisticalInefficiency(A_t[t:T])
        Neff_t[t] = (T - t + 1) / g_t[t]

    Neff_max = Neff_t.max()
    t = Neff_t.argmax()
    g = g_t[t]

    return (t, g, Neff_max)
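# A minimal usage sketch for detect_equilibration() above, assuming numpy and
# pymbar.timeseries are available. The AR(1) series is purely illustrative and
# not part of the original code.
import numpy as np
from pymbar import timeseries

rng = np.random.default_rng(0)
noise = rng.normal(size=2000)
A_t = np.empty(2000)
A_t[0] = noise[0]
for i in range(1, 2000):
    A_t[i] = 0.9 * A_t[i - 1] + noise[i]   # correlated but stationary series

t0, g, Neff = detect_equilibration(A_t)
print("equilibration starts at index %d, g = %.2f, Neff = %.1f" % (t0, g, Neff))
# Uncorrelated samples can then be drawn from the equilibrated region:
indices = timeseries.subsampleCorrelatedData(A_t[t0:], g=g)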
def get_decorrelation_time(timeseries_to_analyze):
    """
    Compute the decorrelation time of a given timeseries.

    See ``pymbar.timeseries.statisticalInefficiency`` for full documentation
    """
    return timeseries.statisticalInefficiency(timeseries_to_analyze)
def prepWindow(filename, tstart=0, tstop=None):
    """
    Read window .traj file, compute correlation times, subsample data.

    Parameters
    ----------
    filename: string
        name of the file to process. For a *.traj file, assumes all lines
        are data (e.g. no comment lines).
    tstart: integer
        nanosecond start time
    tstop: integer
        nanosecond stop time

    Returns
    -------
    counts: int, number of entries for this particular window
    winZ: numpy list containing SUBSAMPLED data for this window from tstart to tstop
    """
    # Parse data.
    n, z_sub = parseWindow(filename, tstart, tstop)

    # Compute correlation times for z (actual spring center position) timeseries.
    g = timeseries.statisticalInefficiency(z_sub)
    print "Correlation time for %s is %10.3f" % (re.split('\W+', filename)[1], g)
    indices = timeseries.subsampleCorrelatedData(z_sub, g)

    # Subsample data.
    zsublen = len(indices)
    z_sub = z_sub[indices]

    return zsublen, z_sub
def test_statistical_inefficiency():
    """Test the statistical inefficiency calculation utility."""
    data_size = 200000
    random_array = np.random.rand(data_size)

    numpy_vector_array = []
    for i in range(data_size):
        numpy_vector_array.append([random_array[i]])
    a = np.array(numpy_vector_array)

    statistical_inefficiency = timeseries.calculate_statistical_inefficiency(
        a, minimum_samples=3)
    pymbar_statistical_inefficiency = pymbar_timeseries.statisticalInefficiency(
        a, mintime=3)

    print("utils: {}, pymbar: {}".format(
        statistical_inefficiency, pymbar_statistical_inefficiency))

    assert abs(statistical_inefficiency - pymbar_statistical_inefficiency) < 0.00001
def subsample(x, y_mat, num_cols=None):
    """
    Parameters
    ----------
    x : numpy array
        1-dimensional array with x-data, such as timestep.
    y_mat : can take various forms:
        - list of numpy arrays, such as grouping 1-column data into smaller data series
        - 1D numpy array, such as subsampling 1-column data
        - multidimensional numpy array, if data has many columns
    num_cols : int (opt.)
        Number of data series for the input y_mat. Use this value to loop over
        the input data, since it can be formatted as a 1- or N-dimensional list
        or numpy array. If num_cols is not specified, the value will be
        extracted from the input data using the find_num_cols function.

    Returns
    -------
    x_mat : list
        multi-dimensional array of the same shape as z_mat
    z_mat : list
        multi-dimensional array in which z_mat[i][j] is the jth value in the
        ith data series.
    """
    from pymbar import timeseries

    x_mat = []
    z_mat = []  # subsampled y_mat

    if num_cols is None:
        num_cols = find_num_cols(y_mat)

    for i in range(num_cols):
        # list of np arrays
        if type(y_mat) is list and len(y_mat[0]) > 1:
            y = y_mat[i]
        # 1D np array
        elif type(y_mat) is np.ndarray and len(y_mat.shape) == 1:
            y = y_mat
        # multidimensional np array
        else:
            y = y_mat[:, i]

        # compute correlation times
        g = timeseries.statisticalInefficiency(y)
        indices = timeseries.subsampleCorrelatedData(y, g)

        # subsample data
        y_sub = y[indices]
        x_sub = x[indices]
        z_mat.append(y_sub)
        x_mat.append(x_sub)

        print("\nLength of original timeseries data: %d" % len(y))
        print("\nLength of subsampled timeseries data: %d" % len(y_sub))

    return x_mat, z_mat
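# Hedged usage sketch for subsample() above: two observable columns sampled on
# a common time axis. The synthetic arrays are illustrative only; find_num_cols
# (referenced in the docstring) is not needed when num_cols is given explicitly.
import numpy as np

time = np.arange(10000)
observables = np.column_stack([np.random.normal(size=10000),
                               np.random.normal(size=10000)])
x_sub, y_sub = subsample(time, observables, num_cols=2)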
def equilibrium_detection(df, series=None, lower=None, upper=None, step=None):
    """Subsample a DataFrame using automated equilibrium detection on a timeseries.

    If `series` is ``None``, then this function will behave the same as
    :func:`slicing`.

    Parameters
    ----------
    df : DataFrame
        DataFrame to subsample according to equilibrium detection on `series`.
    series : Series
        Series to detect equilibration on. If ``None``, no equilibrium
        detection-based subsampling will be performed.
    lower : float
        Lower bound to pre-slice `series` data from.
    upper : float
        Upper bound to pre-slice `series` to (inclusive).
    step : int
        Step between `series` items to pre-slice by.

    Returns
    -------
    DataFrame
        `df` subsampled according to subsampled `series`.

    See Also
    --------
    pymbar.timeseries.detectEquilibration : detailed background

    """
    if _check_multiple_times(df):
        raise KeyError("Duplicate time values found; equilibrium detection "
                       "is only meaningful for a single, contiguous, "
                       "and sorted timeseries.")

    if not _check_sorted(df):
        raise KeyError("Equilibrium detection only works as expected if "
                       "values are sorted by time, increasing.")

    if series is not None:
        series = slicing(series, lower=lower, upper=upper, step=step)

        # calculate statistical inefficiency of series
        statinef = statisticalInefficiency(series)

        # calculate statistical inefficiency of series, with equilibrium detection
        t, statinef, Neff_max = detectEquilibration(series.values)

        # we round up
        statinef = int(np.rint(statinef))

        # subsample according to statistical inefficiency
        series = series.iloc[t::statinef]

        df = df.loc[series.index]
    else:
        df = slicing(df, lower=lower, upper=upper, step=step)

    return df
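# Hedged usage sketch for equilibrium_detection(): subsample a DataFrame by
# detecting equilibration on one of its columns. This assumes the module-level
# helpers referenced above (slicing, _check_multiple_times, _check_sorted,
# statisticalInefficiency, detectEquilibration) are importable from the same
# module; the column name "dHdl" and the synthetic data are illustrative only.
import numpy as np
import pandas as pd

frames = pd.DataFrame({"dHdl": np.random.normal(size=2000)},
                      index=pd.Index(np.arange(2000), name="time"))
subsampled = equilibrium_detection(frames, series=frames["dHdl"])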
def mean_and_uncertainty(series: Series, inefficiency=None) -> (float, float):
    from pymbar import timeseries

    ave = np.mean(series)
    array = np.array(series)
    if inefficiency is None:
        inefficiency = timeseries.statisticalInefficiency(array)
    return ave, np.std(array, ddof=1) / math.sqrt(len(array) / inefficiency)
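# The uncertainty returned above is the standard error of the mean corrected
# for correlation: s / sqrt(N_eff) with N_eff = N / g. A quick illustrative
# call (synthetic data; assumes numpy, math, and pandas' Series are imported
# as in the snippet above):
import numpy as np
import pandas as pd

values = pd.Series(np.random.normal(loc=1.0, scale=0.5, size=10000))
mean, err = mean_and_uncertainty(values)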
def subsample_gradients(self):
    r''' Method to subsample gradients and get a better estimate.
    '''
    if self.percentage == 100 and not self.subsample:
        warnings.warn(
            "You are not subsampling your data according to the statistical inefficiency nor are "
            "you discarding initial data. Please set percentage to another value than 100!")

    percentage_removal = (self._N_k * (1 - self.percentage / 100.0)).astype('int32')
    self._subsampled_N_k_gradients = self._N_k - percentage_removal
    N_max = int(numpy.max(self._subsampled_N_k_gradients))
    self._subsampled_grad_kn = numpy.zeros(shape=(self._N_k.shape[0], N_max))
    for p in range(percentage_removal.shape[0]):
        start = percentage_removal[p]
        finish = percentage_removal[p] + N_max
        self._subsampled_grad_kn[p, :] = self._gradients_kn[p, start:finish]
    if N_max <= 50:
        warnings.warn(
            "You have reduced your data to less than 50 samples, the results from these might not "
            "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")

    # if subsampling is percentage only, then we are done here; otherwise we now subsample according to timeseries
    if self.subsample:
        print("#Subsampling gradients according to statistical inefficiency")

        # first we compute the statistical inefficiency
        self._gradients_kn = self._subsampled_grad_kn.copy()
        self._N_k = self._subsampled_N_k_gradients.copy()

        g_k = numpy.zeros(shape=(self._gradients_kn.shape[0]))
        self._subsampled_N_k_gradients = numpy.zeros(shape=(self._gradients_kn.shape[0]))
        for i in range(g_k.shape[0]):
            g_k[i] = timeseries.statisticalInefficiency(self._gradients_kn[i, :])
        g = int(numpy.max(g_k))

        # now we need to figure out what the indices in the data are for subsampling
        indices_k = []
        for i in range(g_k.shape[0]):
            indices_k.append(
                timeseries.subsampleCorrelatedData(self._gradients_kn[i, :], g=g))
            self._subsampled_N_k_gradients[i] = len(indices_k[i])
        N_max = int(numpy.max(self._subsampled_N_k_gradients))
        if N_max <= 50:
            warnings.warn(
                "You have reduced your data to less than 50 samples, the results from these might not "
                "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.")
        self._subsampled_grad_kn = numpy.zeros(
            [self._gradients_kn.shape[0], N_max], numpy.float64)
        for k in range(self._gradients_kn.shape[0]):
            self._subsampled_grad_kn[k, :] = self._gradients_kn[k, indices_k[k]]
def si_data_bar_dhdl(data_bar_dhdl):
    si_l = {}
    for l in data_bar_dhdl:
        si_l[l] = []
        for i in range(len(data_bar_dhdl[l])):
            temp_si = timeseries.statisticalInefficiency(
                data_bar_dhdl[l][i].loc[:, l])
            si_l[l].append(temp_si)
    return si_l
def decorrelate(traj, verbose=False, name=None):
    traj = np.array(traj)
    if traj.ndim == 1:
        idx = timeseries.subsampleCorrelatedData(traj)
        n0 = traj.size
        n1 = len(idx)
        res = traj[idx]
    elif traj.ndim == 2:
        # pymbar doesn't offer to decorrelate two samples, so let's do it ourselves
        # and just use the decorrelation of the sample more strongly correlated
        #
        # calculate (maximal) inefficiency
        g1 = timeseries.statisticalInefficiency(traj[0])
        g2 = timeseries.statisticalInefficiency(traj[1])
        g = np.max([g1, g2])
        # calculate index
        n0 = traj.shape[1]
        idx = np.unique(
            np.array(np.round(np.arange(0, int(n0 / g + .5)) * g), dtype=int))
        idx = idx[idx < n0]
        n1 = len(idx)
        res = traj[:, idx]
    else:
        raise NotImplementedError(
            'trajectory.decorrelate() is not implemented for '
            'trajectories with more than 1 dimension.')
    if verbose:
        n = n0 - n1
        if not name:
            name = 'Trajectory'
        if n == 0:
            print('{:s} decorrelation: No frames discarded for decorrelation.'.format(name))
        elif n == 1:
            print('{:s} decorrelation: 1 frame ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(name, 1 / n0))
        else:
            print('{:s} decorrelation: {:d} frames ({:.1%} of '
                  'trajectory) discarded for decorrelation.'.format(name, n, n / n0))
    return res
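# Hedged usage sketch for decorrelate() above: two correlated work series
# decorrelated together, using the larger of the two statistical
# inefficiencies. The repeated-normal construction (g ~ 3) is synthetic and
# for illustration only.
import numpy as np

fwd = np.repeat(np.random.normal(size=2000), 3)
rev = np.repeat(np.random.normal(size=2000), 3)
both = decorrelate(np.vstack([fwd, rev]), verbose=True, name='Work')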
def si_skips_data_dEs(dEs, nfr_mul, skip=1):
    c = 0
    skips = []
    for i in range(len(nfr_mul)):
        n_frms_list = nfr_mul[i]
        for n_frms in n_frms_list:
            if n_frms:
                c_end = c + n_frms
                temp_si = timeseries.statisticalInefficiency(dEs[i][c:c_end])
                skips.append(int(temp_si * skip) + 1)
                c = c_end
    return skips
def test_statistical_inefficiency_fft_gaussian():
    # Run multiple times to get things with and without negative "spikes" at C(1)
    for i in range(5):
        x = np.random.normal(size=100000)
        g0 = timeseries.statisticalInefficiency(x, fast=False)
        g1 = timeseries.statisticalInefficiency(x, x, fast=False)
        g2 = timeseries.statisticalInefficiency_fft(x)
        g3 = timeseries.statisticalInefficiency(x, fft=True)
        eq(g0, g1, decimal=5)
        eq(g0, g2, decimal=5)
        eq(g0, g3, decimal=5)
        eq(np.log(g0), np.log(1.0), decimal=1)

    for i in range(5):
        x = np.random.normal(size=100000)
        # Construct a correlated gaussian, e.g. [a, b, c] -> [a, a, a, b, b, b, c, c, c]
        x = np.repeat(x, 3)
        g0 = timeseries.statisticalInefficiency(x, fast=False)
        g1 = timeseries.statisticalInefficiency(x, x, fast=False)
        g2 = timeseries.statisticalInefficiency_fft(x)
        g3 = timeseries.statisticalInefficiency(x, fft=True)
        eq(g0, g1, decimal=5)
        eq(g0, g2, decimal=5)
        eq(g0, g3, decimal=5)
        eq(np.log(g0), np.log(3.0), decimal=1)
def subsampletimeseries(timeser, xyzn, N_k): """ Return a subsampled timeseries based on statistical inefficiency calculations. Parameters ---------- timeser: the timeseries to be subsampled xyzn: the coordinates associated with each frame of the timeseries to be subsampled N_k: original # of samples in each timeseries Returns --------- N_k_sub: new number of samples per timeseries ts_sub: the subsampled timeseries xyz_sub: the subsampled configuration series """ # Make a copy of the timeseries and make sure is numpy array of floats ts = timeser xyz = xyzn # initialize array of statistical inefficiencies g = np.zeros(len(ts), np.float64) for i, t in enumerate(ts): if np.count_nonzero(t) == 0: g[i] = np.float(1.) print "WARNING FLAG" else: g[i] = timeseries.statisticalInefficiency(t) N_k_sub = np.array([ len(timeseries.subsampleCorrelatedData(t, g=b)) for t, b in zip(ts, g) ]) ind = [timeseries.subsampleCorrelatedData(t, g=b) for t, b in zip(ts, g)] if (N_k_sub == N_k).all(): ts_sub = ts xyz_sub = xyz print "No sub-sampling occurred" else: print "Sub-sampling..." ts_sub = np.array([ t[timeseries.subsampleCorrelatedData(t, g=b)] for t, b in zip(ts, g) ]) #for c in xyz: # xyz_sub = [c[timeseries.subsampleCorrelatedData(t,g=b)] for t,b in zip(ts,g)] for i, j in enumerate(xyz): xyz_sub = [j[ii] for ii in ind[i]] return ts_sub, N_k_sub, xyz_sub, ind
def getNkandUkln():
    # u_kln = u_klt
    # N_k = [maxn]*K
    # return (N_k, u_kln)
    """Identifies uncorrelated samples and updates the arrays of the reduced potential energy
    and dhdlt, retaining data entries of these samples only."""
    u_kln = np.zeros([K, K, maxn], np.float64)  # u_kln[k,m,n] is the reduced potential energy of uncorrelated sample index n from state k evaluated at state m
    N_k = np.zeros(K, int)  # N_k[k] is the number of uncorrelated samples from state k
    g = np.zeros(K, float)  # autocorrelation times for the data
    print "Number of correlated and uncorrelated samples:\n\n%8s %10s %12s %12s" % ('Lambda', 'N', 'N_k', 'N/N_k')
    for k in range(K):
        if k == 0:
            g[k] = timeseries.statisticalInefficiency(u_klt[k, k+1, :])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k, k+1, :]))  # indices of uncorrelated samples
        else:
            g[k] = timeseries.statisticalInefficiency(u_klt[k, k-1, :])
            indices = np.array(timeseries.subsampleCorrelatedData(u_klt[k, k-1, :]))
        N = len(indices)  # number of uncorrelated samples
        N_k[k] = N  # Store the number of uncorrelated samples from state k.
        for l in range(K):
            u_kln[k, l, 0:N] = u_klt[k, l, indices]
        print "%6.2f %12s %12s %12.2f" % (l_list[k], maxn, N_k[k], g[k])
    print ''
    return (N_k, u_kln)
def subsample_energies(self): r''' This subsamples u_kln according to percentage, i.e. remove initial equilibration data and then can additionally subsample according to timeseries ''' #removing percent if self.percentage == 100 and not self.subsample: warnings.warn("You are not subsampling your data according to the statistical inefficiency nor are " "you discarding initial data. Please set percentage to another value than 100!") percentage_removal = (self._N_k*(1-self.percentage/100.0)).astype('int32') self._subsampled_N_k_energies = self._N_k-percentage_removal N_max = int(numpy.max(self._subsampled_N_k_energies)) self._subsampled_u_kln = numpy.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max)) self._subsampled_energies_kn = numpy.zeros(shape=(self._N_k.shape[0], N_max)) for k in range(0, self._N_k.shape[0]): self._subsampled_u_kln[k] = self._u_kln[k,:,percentage_removal[k]:percentage_removal[k]+N_max] self._subsampled_energies_kn[k] = self._energies_kn[k,percentage_removal[k]:percentage_removal[k]+N_max] if N_max <=50: warnings.warn("You have reduced your data to less than 50 samples, the results from these might not " "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.") #Now we are doing some additional subsampling according to timeseries analysis if self.subsample: print("#Subsampling energies according to statistical inefficiency for pymbar") self._u_kln = self._subsampled_u_kln.copy() self._N_k = self._subsampled_N_k_energies.copy() self._energies_kn = self._subsampled_energies_kn.copy() #first we compute statistical inefficiency g_k = numpy.zeros(shape=(self._energies_kn.shape[0])) for i in range(g_k.shape[0]): g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i,percentage_removal[i]:]) g = numpy.max(g_k) #now we need to figure out what the indices in the data are for subsampling indices_k = [] self._subsampled_N_k_energies = numpy.zeros(shape=(self._energies_kn.shape[0])) for i in range(g_k.shape[0]): indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i,:], g=g)) self._subsampled_N_k_energies[i]=len(indices_k[i]) #self._subsampled_N_k_energies = (numpy.ceil(self._N_k / g)).astype(int) N_max = int(numpy.max(self._subsampled_N_k_energies)) if N_max <=50: warnings.warn("You have reduced your data to less than 50 samples, the results from these might not " "be trustworthy. If you don't want to add more samples consider rerunning the analysis using the percentage option.") self._subsampled_u_kln = numpy.zeros([self._gradients_kn.shape[0],self._gradients_kn.shape[0], N_max], numpy.float64) for k in range(self._gradients_kn.shape[0]): self._subsampled_u_kln[k,:,:] = self._u_kln[k,:,indices_k[k]].transpose()
def detect_equilibration(A_t, nskip=1, method='fft'):
    """
    Automatically detect equilibrated region.

    ARGUMENTS
    A_t (numpy.array) - timeseries

    OPTIONAL ARGUMENTS
    nskip (int) - resolution of analysis for determining equilibration (default: 1)
    method (string) - method to use for statistical inefficiency calculation (default: 'fft')

    RETURNS
    t0 (int) - start of equilibrated data
    g (float) - statistical inefficiency of equilibrated data
    Neff_max (float) - number of uncorrelated samples
    """
    T = A_t.size

    # Special case if timeseries is constant.
    if A_t.std() == 0.0:
        return (0, 1, T)

    indices = range(0, T - 1, nskip)
    N = len(indices)
    t0_n = numpy.ones([N], numpy.float32)
    g_n = numpy.ones([N], numpy.float32)
    Neff_n = numpy.ones([N], numpy.float32)
    for n in range(N):
        t0 = nskip * n
        t0_n[n] = t0
        g_n[n] = timeseries.statisticalInefficiency(A_t[t0:T], method=method)
        Neff_n[n] = (T - t0) / g_n[n]

    Neff_max = Neff_n.max()
    n = Neff_n.argmax()
    t0 = t0_n[n]
    g = g_n[n]

    return (t0, g, Neff_max)
def subsample_energies(self): if self.subsample_method!='timeseries': print("We are only eliminating samples from the beginning of the data and are still working with highly" " correlated data!") if self.percentage ==100: RuntimeWarning("You are not subsampling your data according to the statistical inefficiency nor are" "you discarding initial data. Please set percentage to another value than 100!") percentage_removal = self._N_k*(1-self.percentage/100.0) self._subsampled_N_k_energies = self._N_k-percentage_removal N_max = np.max(self._subsampled_N_k_energies) self._subsampled_u_kln = np.zeros(shape=(self._N_k.shape[0], self._N_k.shape[0], N_max)) for i in range(percentage_removal.shape[0]): for j in range(percentage_removal.shape[0]): self._subsampled_u_kln[i,j,:] = self._u_kln[i,j,percentage_removal[j]:] if N_max <=100: RuntimeWarning("You have reduced your data to less than 100 samples, the results from these might not " "be trustworthy. ") else: print("We are doing a timeseries analysis using the timeseries analysis module in pymbar and will subsample" " according to that.") #first we compute statistical inefficiency g_k = np.zeros(shape=(self._energies_kn.shape[0])) for i in range(g_k.shape[0]): g_k[i] = timeseries.statisticalInefficiency(self._energies_kn[i,:]) g = np.max(g_k) #now we need to figure out what the indices in the data are for subsampling indices_k = [] self._subsampled_N_k_energies = np.zeros(shape=(self._gradients_kn.shape[0])) for i in range(g_k.shape[0]): indices_k.append(timeseries.subsampleCorrelatedData(self._energies_kn[i,:], g=g)) self._subsampled_N_k_energies[i]=len(indices_k[i]) #self._subsampled_N_k_energies = (np.ceil(self._N_k / g)).astype(int) N_max = np.max(self._subsampled_N_k_energies) if N_max <=100: RuntimeWarning("You have reduced your data to less than 100 samples, the results from these might not " "be trustworthy. ") self._subsampled_u_kln = np.zeros([self._gradients_kn.shape[0],self._gradients_kn.shape[0], N_max], np.float64) for k in range(self._gradients_kn.shape[0]): self._subsampled_u_kln[k,:,:] = self._u_kln[k,:,indices_k[k]].transpose()
def get_equilibration_data_per_sample(timeseries_to_analyze, fast=True, nskip=1):
    """
    Compute the correlation time and n_effective per sample.

    This is exactly what ``pymbar.timeseries.detectEquilibration`` does, but it returns the per-sample data.

    See the ``pymbar.timeseries.detectEquilibration`` function for full documentation
    """
    A_t = timeseries_to_analyze
    T = A_t.size
    g_t = np.ones([T - 1], np.float32)
    Neff_t = np.ones([T - 1], np.float32)
    for t in range(0, T - 1, nskip):
        try:
            g_t[t] = timeseries.statisticalInefficiency(A_t[t:T], fast=fast)
        except:
            g_t[t] = (T - t + 1)
        Neff_t[t] = (T - t + 1) / g_t[t]
    return g_t, Neff_t
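# Hedged sketch of how the per-sample output above is typically consumed: the
# equilibration point is the index that maximizes the number of effective
# samples, mirroring pymbar.timeseries.detectEquilibration. Synthetic data for
# illustration only.
import numpy as np

g_t, Neff_t = get_equilibration_data_per_sample(np.random.normal(size=2000))
t0 = int(Neff_t.argmax())
g = g_t[t0]
Neff_max = Neff_t[t0]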
def subsampling(self, integratedACF=True):
    """
    Performs inline subsampling based on the statistical inefficiency ``g``
    of the specified attribute `acfun` of :class:`sample`, aiming at
    obtaining a sample of :term:`IID` configurations. Subsampling is done
    via jumps of varying sizes around ``g``, so that the sample size decays
    by a factor of approximately ``1/g``.

    Parameters
    ----------
    integratedACF : bool, optional, default=True
        If true, the integrated :term:`ACF` method :cite:`Chodera_2007`
        will be used for computing the statistical inefficiency. Otherwise,
        the :term:`OBM` method will be used instead.

    Returns
    -------
    :class:`sample`
        Although the subsampling is done inline, the new sample is returned
        for chaining purposes.

    """
    n = len(self.dataset)
    if mics.verbose:
        info("\n=== Subsampling via %s ===" % ("integrated ACF" if integratedACF else "OBM"))
        info("Original sample size:", n)
    if integratedACF:
        y = multimap([self.acfun.lambdify()], self.dataset)
        g = timeseries.statisticalInefficiency(y[0])
    else:
        g = n / self.neff
    new = timeseries.subsampleCorrelatedData(self.dataset.index, g)
    self.dataset = self.dataset.reindex(new)
    self.neff = len(new)
    if mics.verbose:
        info("Statistical inefficiency:", g)
        info("New sample size:", self.neff)
    return self
def get_equilibration_data_per_sample(timeseries_to_analyze, fast=True, max_subset=100): """ Compute the correlation time and n_effective per sample with tuning to how you want your data formatted This is a modified pass-through to ``pymbar.timeseries.detectEquilibration`` does, returning the per sample data. It has been modified to specify the maximum number of time points to consider, evenly spaced over the timeseries. This is different than saying "I want analysis done every X for total points Y = len(timeseries)/X", this is "I want Y total analysis points" See the ``pymbar.timeseries.detectEquilibration`` function for full algorithm documentation Parameters ---------- timeseries_to_analyze : np.ndarray 1-D timeseries to analyze for equilibration max_subset : int >= 1 or None, optional, default: 100 Maximum number of points in the ``timeseries_to_analyze`` on which to analyze the equilibration on. These are distributed uniformly over the timeseries so the final output will be size max_subset where indices are placed approximately every ``(len(timeseries_to_analyze) - 1) / max_subset``. The full timeseries is used if the timeseries is smaller than ``max_subset`` or if ``max_subset`` is None fast : bool, optional. Default: True If True, will use faster (but less accurate) method to estimate correlation time passed on to timeseries module. Returns ------- i_t : np.ndarray of int Indices of the timeseries which were sampled from g_i : np.ndarray of float Estimated statistical inefficiency at t in units of index count. Equal to 1 + 2 tau, where tau is the correlation time Will always be >= 1 e.g. If g_i[x] = 4.3, then choosing x as your equilibration point means the every ``ceil(4.3)`` in ``timeseries_to_analyze`` will be decorrelated, so the fully equilibrated decorrelated timeseries would be indexed by [x, x+5, x+10, ..., X) where X is the final point in the ``timeseries_to_analyze``. The "index count" in this case is the by count of the ``timeseries_to_analyze`` indices, NOT the ``i_t`` n_effective_i : np.ndarray of float Number of effective samples by subsampling every ``g_i`` from index t, does include fractional value, so true number of points will be the floor of this output. The "index count" in this case is the by count of the ``timeseries_to_analyze`` indices, NOT the ``i_t`` """ # Cast to array if not already series = np.array(timeseries_to_analyze) # Special trap for constant series time_size = series.size set_size = time_size - 1 # Cannot analyze the last entry # Set maximum if max_subset is None or set_size < max_subset: max_subset = set_size # Special trap for series of size 1 if max_subset == 0: max_subset = 1 # Special trap for constant or size 1 series if series.std() == 0.0 or max_subset == 1: return (np.arange(max_subset, dtype=int), # i_t np.array([1]*max_subset), # g_i np.arange(time_size, time_size-max_subset, -1) # n_effective_i ) g_i = np.ones([max_subset], np.float32) n_effective_i = np.ones([max_subset], np.float32) counter = np.arange(max_subset) i_t = np.floor(counter * time_size / max_subset).astype(int) for i, t in enumerate(i_t): try: g_i[i] = timeseries.statisticalInefficiency(series[t:], fast=fast) except: g_i[i] = (time_size - t + 1) n_effective_i[i] = (time_size - t + 1) / g_i[i] return i_t, g_i, n_effective_i
# Parse data.
n = 0
for line in lines:
    if line[0] != '#' and line[0] != '@':
        tokens = line.split()
        print(tokens)
        # reduced potential energy without umbrella restraint
        u_kn[k, n] = beta_k[k] * (float(tokens[2]) - float(tokens[1]))
        n += 1

# Compute correlation times for potential energy and chi timeseries.
# If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
if (DifferentTemperatures):
    g_k[k] = timeseries.statisticalInefficiency(u_kn[k, :], u_kn[k, 0:N_k[k]])
    print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
    indices = timeseries.subsampleCorrelatedData(u_kn[k, 0:N_k[k]])
else:
    chi_radians = chi_kn[k, 0:N_k[k]] / (180.0 / numpy.pi)
    g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
    g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
    print("g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin))
    g_k[k] = max(g_cos, g_sin)
    print("Correlation time for set %5d is %10.3f" % (k, g_k[k]))
    indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k])

# Subsample data.
N_k[k] = len(indices)
u_kn[k, 0:N_k[k]] = u_kn[k, indices]
chi_kn[k, 0:N_k[k]] = chi_kn[k, indices]
cluster_bin_kn = -1 * numpy.ones([K, N_samples], numpy.int32)  # cluster_bin_kn[k,n] is the cluster bin index of snapshot n of umbrella simulation k
N_k = numpy.zeros([K], numpy.int32)  # N_k[k] is the number of uncorrelated samples from simulation index k
reduced_expectation_data = []
if len(expectation_columns) > 0:
    for i in range(len(expectation_columns)):
        reduced_expectation_data.append(numpy.zeros([K, N_samples], numpy.float64))
reduced_fep_data = []
if len(fep_columns) > 0:
    for i in range(len(fep_columns)):
        reduced_fep_data.append(numpy.zeros([K, N_samples], numpy.float64))
for k in range(K):
    # Extract timeseries.
    A_t = biasing_variable_kt[0][k, :]

    # Compute statistical inefficiency.
    try:
        g = timeseries.statisticalInefficiency(A_t)
    except Exception as e:
        print str(e)
        print A_t

    # Subsample data.
    if subsample_trajectories:
        indices = timeseries.subsampleCorrelatedData(A_t, g=g)
    else:
        indices = timeseries.subsampleCorrelatedData(A_t, g=1)
    N = len(indices)  # number of uncorrelated samples
    print "k = %5d : g = %.1f, N = %d" % (k, g, N)
    for i in range(nbiases):
        biasing_variable_kn[i][k, 0:N] = biasing_variable_kt[i][k, indices]
    for i in range(nperturbations + 1):
        U_kn[i][k, 0:N] = U_kt[i][k, indices]
#========================================================================

#------------------------------------------------------------------------
# Read Data From File
#------------------------------------------------------------------------
print("")
print("Preparing data:")
T_from_file = read_simulation_temps(simulation, NumTemps)
E_from_file = read_total_energies(simulation, TE_COL_NUM)
K = len(T_from_file)
N_k = numpy.zeros(K, numpy.int32)
g = numpy.zeros(K, numpy.float64)

for k in range(K):  # subsample the energies
    g[k] = timeseries.statisticalInefficiency(E_from_file[k])
    indices = numpy.array(timeseries.subsampleCorrelatedData(E_from_file[k], g=g[k]))  # indices of uncorrelated samples
    N_k[k] = len(indices)  # number of uncorrelated samples
    E_from_file[k, 0:N_k[k]] = E_from_file[k, indices]

#------------------------------------------------------------------------
# Insert Intermediate T's and corresponding blank U's and E's
#------------------------------------------------------------------------
Temp_k = T_from_file
minT = T_from_file[0]
maxT = T_from_file[len(T_from_file) - 1]
#beta = 1/(k*BT)
#T = 1/(kB*beta)
if dertype == 'temperature':
    minv = minT
    maxv = maxT
def subtest_mcmc_expectation(testsystem, move_set): if debug: print testsystem.__class__.__name__ print str(move_set) # Test settings. temperature = 298.0 * units.kelvin pressure = 1.0 * units.atmospheres nequil = 10 # number of equilibration iterations niterations = 20 # number of production iterations # Retrieve system and positions. [system, positions] = [testsystem.system, testsystem.positions] platform_name = 'Reference' from simtk.openmm import Platform platform = Platform.getPlatformByName(platform_name) # Compute properties. kB = units.BOLTZMANN_CONSTANT_kB * units.AVOGADRO_CONSTANT_NA kT = kB * temperature ndof = 3*system.getNumParticles() - system.getNumConstraints() # Create thermodynamic state from repex.thermodynamics import ThermodynamicState thermodynamic_state = ThermodynamicState(system=testsystem.system, temperature=temperature, pressure=pressure) # Create MCMC sampler. from repex.mcmc import MCMCSampler sampler = MCMCSampler(thermodynamic_state, move_set=move_set, platform=platform) # Create sampler state. from repex.mcmc import SamplerState sampler_state = SamplerState(system=testsystem.system, positions=testsystem.positions, platform=platform) # Equilibrate for iteration in range(nequil): #print "equilibration iteration %d / %d" % (iteration, nequil) # Update sampler state. sampler_state = sampler.run(sampler_state, 1) # Accumulate statistics. x_n = np.zeros([niterations], np.float64) # x_n[i] is the x position of atom 1 after iteration i, in angstroms potential_n = np.zeros([niterations], np.float64) # potential_n[i] is the potential energy after iteration i, in kT kinetic_n = np.zeros([niterations], np.float64) # kinetic_n[i] is the kinetic energy after iteration i, in kT temperature_n = np.zeros([niterations], np.float64) # temperature_n[i] is the instantaneous kinetic temperature from iteration i, in K volume_n = np.zeros([niterations], np.float64) # volume_n[i] is the volume from iteration i, in K for iteration in range(niterations): if debug: print "iteration %d / %d" % (iteration, niterations) # Update sampler state. sampler_state = sampler.run(sampler_state, 1) # Get statistics. potential_energy = sampler_state.potential_energy kinetic_energy = sampler_state.kinetic_energy total_energy = sampler_state.total_energy instantaneous_temperature = kinetic_energy * 2.0 / ndof / (units.BOLTZMANN_CONSTANT_kB * units.AVOGADRO_CONSTANT_NA) volume = sampler_state.volume #print "potential %8.1f kT | kinetic %8.1f kT | total %8.1f kT | volume %8.3f nm^3 | instantaneous temperature: %8.1f K" % (potential_energy/kT, kinetic_energy/kT, total_energy/kT, volume/(units.nanometers**3), instantaneous_temperature/units.kelvin) # Accumulate statistics. x_n[iteration] = sampler_state.positions[0,0] / units.angstroms potential_n[iteration] = potential_energy / kT kinetic_n[iteration] = kinetic_energy / kT temperature_n[iteration] = instantaneous_temperature / units.kelvin volume_n[iteration] = volume / (units.nanometers**3) # Compute expected statistics. if ('get_potential_expectation' in dir(testsystem)): # Skip this check if the std dev is zero. skip_test = False if (potential_n.std() == 0.0): skip_test = True if debug: print "Skipping potential test since variance is zero." 
if not skip_test: potential_expectation = testsystem.get_potential_expectation(thermodynamic_state) / kT potential_mean = potential_n.mean() g = timeseries.statisticalInefficiency(potential_n, fast=True) dpotential_mean = potential_n.std() / np.sqrt(niterations / g) potential_error = potential_mean - potential_expectation nsigma = abs(potential_error) / dpotential_mean test_passed = True if (nsigma > NSIGMA_CUTOFF): test_passed = False if debug or (test_passed is False): print "Potential energy expectation" print "observed %10.5f +- %10.5f kT | expected %10.5f | error %10.5f +- %10.5f (%.1f sigma)" % (potential_mean, dpotential_mean, potential_expectation, potential_error, dpotential_mean, nsigma) if test_passed: print "TEST PASSED" else: print "TEST FAILED" print "----------------------------------------------------------------------------" if ('get_volume_expectation' in dir(testsystem)): # Skip this check if the std dev is zero. skip_test = False if (volume_n.std() == 0.0): skip_test = True if debug: print "Skipping volume test." if not skip_test: volume_expectation = testsystem.get_volume_expectation(thermodynamic_state) / (units.nanometers**3) volume_mean = volume_n.mean() g = timeseries.statisticalInefficiency(volume_n, fast=True) dvolume_mean = volume_n.std() / np.sqrt(niterations / g) volume_error = volume_mean - volume_expectation nsigma = abs(volume_error) / dvolume_mean test_passed = True if (nsigma > NSIGMA_CUTOFF): test_passed = False if debug or (test_passed is False): print "Volume expectation" print "observed %10.5f +- %10.5f kT | expected %10.5f | error %10.5f +- %10.5f (%.1f sigma)" % (volume_mean, dvolume_mean, volume_expectation, volume_error, dvolume_mean, nsigma) if test_passed: print "TEST PASSED" else: print "TEST FAILED" print "----------------------------------------------------------------------------"
def dA_Lambda_MBAR(plot_out=True, MinL=0, MaxL=100, dL=5, GAMMA=100, exponent=4, polymorphs='p1 p2', Molecules=72, Independent=4, Temp=200, Pressure=1, potential='oplsaa', hinge='DefaultHinge'): if (plot_out): import matplotlib # for making plots, version 'matplotlib-1.1.0-1'; errors may pop up when using earlier versions import matplotlib.pyplot as plt font = {'family': 'normal', 'weight': 'normal', 'size': 16} matplotlib.rc('font', **font) # ============================================================================================= # ENSURE THAT USER INPUTS ARE SENSIBLE # ============================================================================================= # TEMPERATURE if Temp < 0: print("Invalid Temperature: " + str(Temp)) sys.exit() if Pressure < 0: print("Invalid Pressure: " + str(Pressure)) sys.exit() # LAMBDA if (MinL == -1) and (MaxL == -1) and (dL == -1) and (exponent == 1): print("Using default values!") # The Lambda points sampled Lambdas = [ '000L', '010L', '020L', '030L', '040L', '050L', '060L', '070L', '080L', '090L', '100L' ] elif MinL < 0 or MaxL < 0 or dL < 0 or MinL > MaxL: print("Invalid Lambda Specifications") sys.exit() else: RawLambda = 0 Lambdas = [] lambda_names = np.arange(MinL, MaxL + dL, dL) Lambda_names = [] Lambda_indicies = [] index = 0 while RawLambda < MaxL: if RawLambda >= MinL: Lambda_indicies.append(index) index += 1 else: index += 1 RawLambda = RawLambda + dL continue if exponent >= 0: Lambda = int(100 * (float(RawLambda) / float(MaxL))**abs(exponent)) else: Lambda = int( 100 * (1 - (float(MaxL - RawLambda) / float(MaxL))**abs(exponent))) Lambdas.append(Lambda) # Format the lambda point name if RawLambda < 10: Lambda_names.append('00' + str(int(RawLambda)) + 'L') elif RawLambda < 100: Lambda_names.append('0' + str(int(RawLambda)) + 'L') else: Lambda_names.append('100L') RawLambda = RawLambda + dL # Catch the final lambda point Lambdas.append(MaxL) Lambda_indicies.append(index) if MaxL < 10: Lambda_names.append('00' + str(int(MaxL)) + 'L') elif MaxL < 100: Lambda_names.append('0' + str(int(MaxL)) + 'L') else: Lambda_names.append('100L') # GAMMA if GAMMA < 0 or GAMMA > 100: print("Invalid Gamma Point: " + str(GAMMA)) sys.exit() # POLYMORPH polymorphs = polymorphs.split() polymorph = [] polymorph_short = [] for i, token in enumerate(polymorphs): polymorph.append('Polymorph ' + str(token)) polymorph_short.append(token) # POTENTIAL if potential not in [ "oplsaa", "gromos", "designeda", "oplsaafakeg", "oplsaafakea" ]: print("Invalid Potential") print( "Supported potentials: oplsaa gromos designeda oplsaafakeg oplsaafakea" ) sys.exit() # ============================================================================================= # FORMAT INPUTS # ============================================================================================= # POTENTIAL PotNAME = "" if potential == "oplsaa": PotNAME = "OPLS" elif potential == "gromos": PotNAME = "GROM" elif potential == "designeda": PotNAME = "DESA" elif potential == "oplsaafakeg": PotNAME = "FAKEG" elif potential == "oplsaafakea": PotNAME = "FAKEA" # OPTIONAL HINGE if str(GAMMA) == "100": hingeLetter = "L" else: hingeLetter = "R" if hinge == "DefaultHinge": hinges = ["_" + hingeLetter] else: # Read in each job hinges = [] hingevect = hinge.split() for i, token in enumerate(hingevect): hinges.append("_" + hingeLetter + "_" + str(token)) # ============================================================================================= # READ IN RAW DATA # 
============================================================================================= # Constants. kB = 1.3806488e-23 * 6.0221413e23 / (1000.0 * 4.184 ) # Boltzmann constant in kcal/mol omitK = [] # Parameters T_k = Temp * np.ones(len(Lambdas), float) # Convert temperatures to floats g_k = np.zeros([len(Lambdas)], float) K = len(Lambdas) # How many states? # total number of states examined; none are unsampled Kbig = K + 0 # maximum number of snapshots/simulation (could make this automated) - doesn't matter, as long as it's long enough. N_max = 200000 # beta factor for the different temperatures beta_k = 1.0 / (kB * T_k) dA = np.zeros([len(polymorph), len(Lambdas)], float) ddA = np.zeros([len(polymorph), len(Lambdas)], float) convert_units = (0.2390057) * np.ones( len(Lambdas), float) # Convert all energies to kcal/mol # Lines to ignore when reading in energies for i, poly in enumerate(polymorph): # Allocate storage for simulation data # N_k[k] is the total number of snapshots from alchemical state k N_k = np.zeros([Kbig], np.int32) # N_k_s[k,s] is the total number of snapshots from alchemical state k from seed s in 'unflipped segment j' N_ksj = np.zeros([Kbig, len(hinges), 100], np.int32) # u_kln[k,l,n] is the adjusted energy of snapshot n from simulation k u_kln = np.zeros([K, Kbig, N_max], np.float64) # dhdl_kln[k,l,n] is the restraint energy value of snapshop n from simulation k dhdl_kln = np.zeros([K, Kbig, N_max], np.float64) # dhdl_kn[k,n] is the derivative of energy with respect to lambda of snapshot n from simulation k dhdl_kn = np.zeros([K, N_max], np.float64) # Load in the data for each run for k in range(K): n = 0 for s, hinge in enumerate(hinges): keepconfigs = np.arange( N_max ) # The index of each configuration to keep in the MBAR analysis # cycle through all the input total energy data dirpath = polymorph_short[i] + '/restraints/' + str( lambda_names[k]) fname = dirpath + '/PROD.edr' dhdlname = dirpath + '/dhdl_PROD.xvg' if k not in omitK: potential_energy = panedr.edr_to_df( fname)['Potential'].values print("loading " + fname) dhdl_energy = np.loadtxt(dhdlname, comments=['#', '$', '@', '!']) print("loading " + dhdlname) # Removing any non-equilibrated points of the simulation [start_production, _, _] = timeseries.detectEquilibration(potential_energy) potential_energy = potential_energy[start_production:] dhdl_energy = dhdl_energy[start_production:] # the energy of every configuration from each state evaluated at its sampled state n = len(potential_energy) u_kln[k, :, :n] = (float(Independent) / Molecules) * ( potential_energy.reshape( (n, 1)) + dhdl_energy[:, 5:]).T * convert_units[k] dhdl_kln[k, :, :n] = dhdl_energy[:, 5:].T * convert_units[k] dhdl_kn[k, :n] = ( float(Independent) / Molecules) * dhdl_energy[:, 4].T * convert_units[k] # NSA: Can this go? 
symbolcounter = 0 # Truncate the kept configuration list to be less than n keepconfigs = [ j for j in keepconfigs if j < (len(potential_energy) - symbolcounter) and j >= 0 ] # Split up the retained configurations into connected segments j = 0 for a in range(len(keepconfigs)): if a == 0: continue elif int(keepconfigs[a - 1]) + 1 != int( keepconfigs[a]): N_ksj[k, s, j] = a - (sum(N_ksj[k, s, 0:j])) j += 1 # Catch the final segment N_ksj[k, s, j] = len(keepconfigs) - sum(N_ksj[k, s, 0:j]) j += 1 N_k[k] = n # convert to nondimensional units from kcal/mol u_kln *= beta_k[0] # all data loaded from the three sets u_kln_save = u_kln.copy() g_k = np.zeros([K]) # Ignore the first state due to jumping print("Number of retained samples") print(N_k) # ============================================================================================= # COMPUTE FREE ENERGY DIFFERENCE USING MBAR # ============================================================================================= # Initialize MBAR. print("Running MBAR...") # generate the weights of each of the umbrella set mbar = pymbar.MBAR(u_kln, N_k, verbose=True, subsampling_protocol=[{ 'method': 'L-BFGS-B' }]) print("MBAR Converged...") for k in range(Kbig): w = np.exp(mbar.Log_W_nk[:, k]) print("max weight in state %d is %12.7f" % (k, np.max(w))) neff = 1 / np.sum(w**2) print("Effective number of sample in state %d is %10.3f" % (k, neff)) print("Efficiency for state %d is %d/%d = %10.4f" % (k, neff, len(w), neff / len(w))) # extract self-consistent weights and uncertainties (df_i, ddf_i, theta_i) = mbar.getFreeEnergyDifferences() print("Free Energies Optained...") # convert PMF to kcal/mol and normalize by the number of molecules df_i /= (beta_k[0] * float(Independent)) ddf_i /= (beta_k[0] * float(Independent)) dA[i, :] = df_i[-1] # ============================================================================================= # COMPUTE UNCERTAINTY USING THE UNCORRELATED DATA # ============================================================================================= for k in range(K): # For each restraint state N_k[k] = 0 n_old = 0 if k not in omitK: for s in range( len(hinges) ): # For each independent trajectory of this restraint state for j in range( 100 ): # For each untossed segment of each independent trajectory of this restraint state if N_ksj[k, s, j] == 0: continue # Feed in the segment and calculate correlation time g_k[k] = timeseries.statisticalInefficiency( dhdl_kn[k, n_old:(n_old + N_ksj[k, s, j])]) print( "Correlation time for sampled state %d is %10.3f" % (k, g_k[k])) # subsample the data to get statistically uncorrelated data # subsample indices within the segment indices = np.array( timeseries.subsampleCorrelatedData( u_kln[k, k, n_old:(n_old + N_ksj[k, s, j])], g=g_k[k])).astype(int) # Apphend the uncorrelated configurations in the segment to the u_kln matrix u_kln[k, :, N_k[k]:(N_k[k] + len(indices))] = u_kln_save[k, :, ( indices + n_old)].transpose() N_k[k] = N_k[k] + len(indices) n_old += N_ksj[k, s, j] print("Number of retained samples") print(N_k) print("Number of retained samples from each seed") print(N_ksj) # generate the weights of each of the umbrella set mbar = pymbar.MBAR(u_kln, N_k, verbose=True, subsampling_protocol=[{ 'method': 'L-BFGS-B' }]) print("MBAR Converged...") # testing # extract self-consistent weights and uncertainties (df_u, ddf_u, theta_i) = mbar.getFreeEnergyDifferences() print("Free Energies Optained...") # convert PMF to kcal/mol and normalize by the number of molecules df_u /= (beta_k[0] * 
float(Independent)) ddf_u /= (beta_k[0] * float(Independent)) ddA[i, :] = ddf_u[-1] # Write out free energy differences print("Free Energy Difference (in units of kcal/mol)") print(" dA(Lambda) = A(Lambda) - A(Fully Restrained)") for k in range(Kbig): print("%8.3f %8.3f" % (df_i[k, -1], ddf_u[k, -1])) # ============================================================================================= # PRINT THE FINAL DATA # ============================================================================================= out_dA = np.zeros(len(polymorph)) out_ddA = np.zeros(len(polymorph)) for i, poly in enumerate(polymorph): out_dA[i] = dA[i, 0] #Kbig - 1] out_ddA[i] = ddA[i, 0] #Kbig - 1] # ============================================================================================= # PLOT THE FINAL DATA # ============================================================================================= if (plot_out) and polymorphs == 'all': # now plot the free energy change as a function of temperature fig = plt.figure(4) ax = fig.add_subplot(111) xlabel = 'Restraint Strength, $\lambda$' ylabel = 'Relative Free Energy (kcal/mol)' plt.xlabel(xlabel) plt.ylabel(ylabel) Xaxis = [float(j / 100.0) for j in Lambdas] if os.path.isfile('BootstrapStd_' + PotNAME + '_Polymorph1_' + str(Molecules) + '_' + Tname + '_' + Pname + '_dAvsL_All'): ddA[0, :] = MBARBootstrap.ExtractBootstrap('BootstrapStd_' + PotNAME + '_Polymorph1_' + str(Molecules) + '_' + Tname + '_' + Pname + '_dAvsL_All') elif os.path.isfile('BootstrapStd_' + PotNAME + '_Polymorph2_' + str(Molecules) + '_' + Tname + '_' + Pname + '_dAvsL_All'): ddA[1, :] = MBARBootstrap.ExtractBootstrap('BootstrapStd_' + PotNAME + '_Polymorph2_' + str(Molecules) + '_' + Tname + '_' + Pname + '_dAvsL_All') elif os.path.isfile('BootstrapStd_' + PotNAME + '_Polymorph2_' + str(Molecules) + '_' + Tname + '_' + Pname + '_dAvsL_All'): ddA[2, :] = MBARBootstrap.ExtractBootstrap('BootstrapStd_' + PotNAME + '_Polymorph3_' + str(Molecules) + '_' + Tname + '_' + Pname + '_dAvsL_All') ax.errorbar(Xaxis, dA[0, :], color='b', yerr=ddA[0, :], label='Benzene I') ax.errorbar(Xaxis, dA[1, :], color='g', yerr=ddA[1, :], label='Benzene II') ax.errorbar(Xaxis, dA[2, :], color='r', yerr=ddA[2, :], label='Benzene III') plt.legend(loc='upper left') if len(hinges) > 1: filename = PotNAME + '_' + str( Molecules) + '_' + Tname + '_dAvsL.pdf' else: filename = PotNAME + '_' + str( Molecules) + '_' + Tname + hinge + '_dAvsL.pdf' plt.show() return out_dA, out_ddA
def DoBAR(fwds, revs, label, verbose): """ BAR to combine fwd and rev data of dGs. Here, don't multiply dGs_R by -1 since BAR calls for reverse work value. Parameters ---------- fwds: dictionary of forward work values for each window revs: dictionary of reverse work values for each window label: string label of what it is (only for printing output) Returns ------- dgs: 1D list of accumulated list of energy values. Ex. if each step was 2, then dgs would be [0,2,4...] gsdlist: 1D list of accompanying stdevs to the dgs list """ fwd_ss = {} # subsampled version of fwds rev_ss = {} # subsampled version of revs dg_bar = np.zeros([len(fwds)], np.float64) # allocate storage: dG steps gsd_bar = np.zeros([len(fwds)], np.float64) # allocate storage: dG stdev steps dgs = np.zeros([len(fwds)], np.float64) # allocate storage: dG accumulated gsdlist = np.zeros([len(fwds)], np.float64) # allocate storage: dG stdev accum #corr_time = np.zeros([len(fwds)], np.float64) corr_time = {} for key, value in fwds.items(): # this notation changes in python3: http://tinyurl.com/j3uq3me # compute correlation time g = timeseries.statisticalInefficiency(value) corr_time[key] = [g] # compute indices of UNcorrelated timeseries, then extract those samples indices = timeseries.subsampleCorrelatedData(value, g) fwd_ss[key] = value[indices] for key, value in revs.items(): # this notation changes in python3: http://tinyurl.com/j3uq3me # compute correlation time g = timeseries.statisticalInefficiency(value) corr_time[key].append(g) # compute indices of UNcorrelated timeseries, then extract those samples indices = timeseries.subsampleCorrelatedData(value, g) rev_ss[key] = value[indices] bar = {} # then apply BAR estimator to get dG for each step for kF, kR in zip(sorted(fwd_ss.keys()), sorted(list(rev_ss.keys()), reverse=True)): dg_bar[kF], gsd_bar[kF] = BAR(fwd_ss[kF],rev_ss[kR]) bar[kF] = [ np.sum(dg_bar), dg_bar[kF], gsd_bar[kF] ] # calculate the net dG standard deviation = sqrt[ sum(s_i^2) ] gsd = (np.sum(np.power(gsd_bar, 2)))**0.5 net = 0. netsd = 0. for i, g in enumerate(dg_bar): # accumulate net dGs into running sums (plot this) dgs[i] = dg_bar[i] + net net = dgs[i] # combine the stdevs: s = sqrt(s1^2 + s2^2 + ...) gsdlist[i] = ((gsd_bar[i])**2.+(netsd)**2.)**0.5 netsd = gsdlist[i] if verbose == True: print('\n\n#####---Correlation Times for dG_{}--#####'.format(label)) print('Window'.rjust(3), 'F'.rjust(5), 'R'.rjust(9)) for k,v in corr_time.items(): print("{:3d} {:10.3f} {:10.3f}".format(k, v[0], v[1]) ) print("\n\n#####---BAR estimator for dG_{}---#####".format(label)) print('Window'.rjust(3), 'dG'.rjust(5), 'ddG'.rjust(11), "Uncert.".rjust(11)) print("---------------------------------------------------------") for k, v in bar.items(): str = '{:3d} {:10.4f} {:10.4f} +- {:3.4f}'.format(k, v[0], v[1], v[2]) print(str) print(("\nNet dG_{} energy difference = {:.4f} +- {:.4f} kcal/mol".format(label, np.sum(dg_bar), gsd))) return dgs, gsdlist
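# Hedged sketch of the core pattern inside DoBAR() above: decorrelate forward
# and reverse work values using the statistical inefficiency, then feed the
# subsampled arrays to pymbar's BAR estimator. The work values below are
# synthetic and for illustration only.
import numpy as np
from pymbar import BAR, timeseries

w_F = np.random.normal(loc=1.0, size=2000)   # forward work values (illustrative)
w_R = np.random.normal(loc=-0.8, size=2000)  # reverse work values (illustrative)
g_F = timeseries.statisticalInefficiency(w_F)
g_R = timeseries.statisticalInefficiency(w_R)
w_F = w_F[timeseries.subsampleCorrelatedData(w_F, g=g_F)]
w_R = w_R[timeseries.subsampleCorrelatedData(w_R, g=g_R)]
dG, ddG = BAR(w_F, w_R)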
infile = open(filename, 'r')
lines = infile.readlines()
infile.close()

# Parse data.
n = 0
for line in lines:
    if line[0] != '#' and line[0] != '@':
        tokens = line.split()
        # reduced potential energy without umbrella restraint
        u_kn[k, n] = beta_k[k] * (float(tokens[2]) - float(tokens[1]))
        n += 1

# Compute correlation times for potential energy and chi timeseries.
# If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi
if (DifferentTemperatures):
    g_k[k] = timeseries.statisticalInefficiency(u_kn[k, :], u_kn[k, 0:N_k[k]])
    print "Correlation time for set %5d is %10.3f" % (k, g_k[k])
    indices = timeseries.subsampleCorrelatedData(u_kn[k, 0:N_k[k]])
else:
    chi_radians = chi_kn[k, 0:N_k[k]] / (180.0 / numpy.pi)
    g_cos = timeseries.statisticalInefficiency(numpy.cos(chi_radians))
    g_sin = timeseries.statisticalInefficiency(numpy.sin(chi_radians))
    print "g_cos = %.1f | g_sin = %.1f" % (g_cos, g_sin)
    g_k[k] = max(g_cos, g_sin)
    print "Correlation time for set %5d is %10.3f" % (k, g_k[k])
    indices = timeseries.subsampleCorrelatedData(chi_radians, g=g_k[k])

# Subsample data.
N_k[k] = len(indices)
u_kn[k, 0:N_k[k]] = u_kn[k, indices]
chi_kn[k, 0:N_k[k]] = chi_kn[k, indices]
def doStatistics(filename):
    array = np.genfromtxt(filename, skip_header=100, usecols=1, dtype=float)
    return np.mean(array), np.std(array) / np.sqrt(len(array) / statisticalInefficiency(array))
#======================================================================== #------------------------------------------------------------------------ # Read Data From File #------------------------------------------------------------------------ print("") print("Preparing data:") T_from_file = read_simulation_temps(simulation, NumTemps) E_from_file = read_total_energies(simulation, TE_COL_NUM) K = len(T_from_file) N_k = numpy.zeros(K, numpy.int32) g = numpy.zeros(K, numpy.float64) for k in range(K): # subsample the energies g[k] = timeseries.statisticalInefficiency(E_from_file[k]) indices = numpy.array( timeseries.subsampleCorrelatedData( E_from_file[k], g=g[k])) # indices of uncorrelated samples N_k[k] = len(indices) # number of uncorrelated samples E_from_file[k, 0:N_k[k]] = E_from_file[k, indices] #------------------------------------------------------------------------ # Insert Intermediate T's and corresponding blank U's and E's #------------------------------------------------------------------------ Temp_k = T_from_file minT = T_from_file[0] maxT = T_from_file[len(T_from_file) - 1] #beta = 1/(k*BT) #T = 1/(kB*beta) if dtype == 'temperature':
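# Compact restatement of the subsample-in-place pattern above: each replica's
# energy row is decorrelated, the uncorrelated samples are packed into the
# front of the row, and N_k records how many survive. Synthetic AR(1) data only.
import numpy
from pymbar import timeseries

rng = numpy.random.RandomState(3)
K, T = 4, 5000
E = numpy.empty((K, T))
E[:, 0] = rng.normal(size=K)
for t in range(1, T):
    E[:, t] = 0.9 * E[:, t - 1] + rng.normal(size=K)

N_k = numpy.zeros(K, numpy.int32)
g = numpy.zeros(K, numpy.float64)
for k in range(K):
    g[k] = timeseries.statisticalInefficiency(E[k])
    indices = numpy.array(timeseries.subsampleCorrelatedData(E[k], g=g[k]))
    N_k[k] = len(indices)
    E[k, 0:N_k[k]] = E[k, indices]       # pack uncorrelated samples at the front
    print("replica %d: g = %.1f, kept %d of %d samples" % (k, g[k], N_k[k], T))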
def statistical_inefficiency(df, series=None, lower=None, upper=None, step=None,
                             conservative=True, drop_duplicates=False, sort=False):
    """Subsample a DataFrame based on the calculated statistical inefficiency
    of a timeseries.

    If `series` is ``None``, then this function will behave the same as
    :func:`slicing`.

    Parameters
    ----------
    df : DataFrame
        DataFrame to subsample according to the statistical inefficiency of `series`.
    series : Series
        Series to use for calculating statistical inefficiency. If ``None``,
        no statistical inefficiency-based subsampling will be performed.
    lower : float
        Lower bound to pre-slice `series` data from.
    upper : float
        Upper bound to pre-slice `series` to (inclusive).
    step : int
        Step between `series` items to pre-slice by.
    conservative : bool
        ``True`` use ``ceil(statistical_inefficiency)`` to slice the data in
        uniform intervals (the default). ``False`` will sample at non-uniform
        intervals to closely match the (fractional) statistical_inefficiency,
        as implemented in :func:`pymbar.timeseries.subsampleCorrelatedData`.
    drop_duplicates : bool
        Drop the duplicated lines based on time.
    sort : bool
        Sort the Dataframe based on the time column.

    Returns
    -------
    DataFrame
        `df` subsampled according to subsampled `series`.

    Warning
    -------
    The `series` and the data to be sliced, `df`, need to have the same number
    of elements because the statistical inefficiency is calculated based on the
    index of the series (and not an associated time). At the moment there is no
    automatic conversion from a time to an index.

    Note
    ----
    For a non-integer statistical inefficiency :math:`g`, the default value
    ``conservative=True`` will provide _fewer_ data points than allowed by
    :math:`g` and thus error estimates will be _higher_. For large numbers of
    data points and converged free energies, the choice should not make a
    difference. For small numbers of data points, ``conservative=True``
    decreases a false sense of accuracy and is deemed the more careful and
    conservative approach.

    See Also
    --------
    pymbar.timeseries.statisticalInefficiency : detailed background
    pymbar.timeseries.subsampleCorrelatedData : used for subsampling

    .. versionchanged:: 0.2.0
       The ``conservative`` keyword was added and the method is now using
       ``pymbar.timeseries.statisticalInefficiency()``; previously, the
       statistical inefficiency was _rounded_ (instead of ``ceil()``) and thus
       one could end up with correlated data.
    """
    if _check_multiple_times(df):
        if drop_duplicates:
            if isinstance(df, pd.Series):
                # remove the duplicate based on time
                drop_duplicates_series = df.reset_index('time', name='').\
                    drop_duplicates('time')
                # Reset the time index
                lambda_names = ['time', ]
                lambda_names.extend(drop_duplicates_series.index.names)
                df = drop_duplicates_series.set_index('time', append=True).\
                    reorder_levels(lambda_names)
            else:
                # remove the duplicate based on time
                drop_duplicates_df = df.reset_index('time').drop_duplicates('time')
                # Reset the time index
                lambda_names = ['time', ]
                lambda_names.extend(drop_duplicates_df.index.names)
                df = drop_duplicates_df.set_index('time', append=True).\
                    reorder_levels(lambda_names)
            # Do the same thing with the series
            if series is not None:
                # remove the duplicate based on time
                drop_duplicates_series = series.reset_index('time', name='').\
                    drop_duplicates('time')
                # Reset the time index
                lambda_names = ['time', ]
                lambda_names.extend(drop_duplicates_series.index.names)
                series = drop_duplicates_series.set_index('time', append=True).\
                    reorder_levels(lambda_names)
        else:
            raise KeyError(
                "Duplicate time values found; statistical inefficiency "
                "only works on a single, contiguous, "
                "and sorted timeseries.")

    if not _check_sorted(df):
        if sort:
            df = df.sort_index(level='time')
            if series is not None:
                series = series.sort_index(level='time')
        else:
            raise KeyError(
                "Statistical inefficiency only works as expected if "
                "values are sorted by time, increasing.")

    if series is not None:
        if (len(series) != len(df) or
                not all(series.reset_index()['time'] == df.reset_index()['time'])):
            raise ValueError("series and data must be sampled at the same times")

        series = slicing(series, lower=lower, upper=upper, step=step)

        # calculate statistical inefficiency of series (could use fft=True but needs test)
        statinef = statisticalInefficiency(series, fast=False)

        # use the subsampleCorrelatedData function to get the subsample index
        indices = subsampleCorrelatedData(series, g=statinef,
                                          conservative=conservative)
        df = df.iloc[indices]
    else:
        df = slicing(df, lower=lower, upper=upper, step=step)

    return df
infile.close() # Parse data. n = 0 for line in lines: if line[0] != '#' and line[0] != '@': tokens = line.split() u_kn[k, n] = beta_k[k] * ( float(tokens[2]) - float(tokens[1]) ) # reduced potential energy without umbrella restraint n += 1 # Compute correlation times for potential energy and chi # timeseries. If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi if (DifferentTemperatures): g_k[k] = timeseries.statisticalInefficiency(u_kn[k, :], u_kn[k, 0:N_k[k]]) print("Correlation time for set %5d is %10.3f" % (k, g_k[k])) indices = timeseries.subsampleCorrelatedData(u_kn[k, 0:N_k[k]]) else: d = d_kn[k, 0:N_k[k]] g_k[k] = timeseries.statisticalInefficiency(d) print("Correlation time for set %5d is %10.3f" % (k, g_k[k])) indices = timeseries.subsampleCorrelatedData(d, g=g_k[k]) # Subsample data. N_k[k] = len(indices) u_kn[k, 0:N_k[k]] = u_kn[k, indices] d_kn[k, 0:N_k[k]] = d_kn[k, indices] N_max = numpy.max(N_k) # shorten the array size u_kln = numpy.zeros( [K, K, N_max], numpy.float64
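# The loop above assumes the whole trajectory is production data. A common
# preliminary step (used by other snippets in this collection) is to let
# pymbar locate the equilibrated region first and only then subsample.
# Sketch on synthetic relaxing data; assumes pymbar's detectEquilibration,
# which returns (t0, g, Neff_max).
import numpy as np
from pymbar import timeseries

rng = np.random.RandomState(9)
A_t = 5.0 * np.exp(-np.arange(5000) / 500.0) + rng.normal(size=5000)  # initial drift + noise

t0, g, Neff_max = timeseries.detectEquilibration(A_t)
production = A_t[t0:]
indices = timeseries.subsampleCorrelatedData(production, g=g)
print("discard first %d frames, g = %.1f, keep %d uncorrelated samples" % (t0, g, len(indices)))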
def main(): usage = """ usage: %prog [options] <metadata file> """ parser = optparse.OptionParser(usage) parser.add_option("-o", "--outfile", dest="output_file", default='mbar_pmf.out', help="Output file for PMF [default: %default]") parser.add_option("-t", "--temperature", dest="temperature", default=300., type="float", help="Initial temperature in K [default: %default K]") parser.add_option("-b", "--bins", dest="bins", default=50, type="int", help="Number of bins for 1D PMF [default: %default]") parser.add_option("-d", "--double", dest="double_k", default=False, action='store_true', help="Double the k values [default: %default]") parser.add_option("-c", "--kcal", dest="kcal_k", default=False, action='store_true', help="Convert k values from kcal to kJ [default: %default]") parser.add_option("-s", "--skip-subsampling", dest="skip_subsampling", default=False, action='store_true', help="Skip data subsampling [default: %default]") parser.add_option("-v", "--verbose", dest="verbose", default=False, action="store_true", help="Verbose output from PyMBAR [default: %default]") (options, args) = parser.parse_args() if len(args) < 1: parser.error('No metadata file passed') elif not os.path.exists(args[0]): parser.error('Metadata file not found') metadata = [] # stores metadata per umbrella N_max = 0 # the max number of snapshots per umbrella different_temperatures = False # flag to know if we are reading in energies for the snapshots # open the wham metadata file print "Opening metadata file %s" % args[0] f = open(args[0], 'r') metadata_lines = f.readlines() f.close() # first get all the metadata and count the max number of snapshots per umbrella for line in metadata_lines: # skip comments if line.startswith('#'): continue # split lines based on spaces, but convert tabs to spaces first clean_split = filter(None, line.strip().expandtabs().split(' ')) if not os.path.exists(clean_split[0]): print "Data file %s doesn't exist, skipping this replica" % clean_split[0] continue else: # get the number of snapshots for the replica nsnapshots = file_len(clean_split[0]) # /path/to/timeseries/file loc_win_min spring [correl time] [temperature] k = float(clean_split[2]) if options.double_k: k = k*2.0 if options.kcal_k: k = k*4.184 current_meta = { 'path': clean_split[0], 'coord': float(clean_split[1]), 'k': k, 'n': nsnapshots } # K_k[k] = float(tokens[1]) * (numpy.pi/180)**2 # spring constant (read in kJ/mol/rad**2, converted to kJ/mol/deg**2) if len(clean_split) >= 4: # TODO: temperature the 4rd or 5th value??? # temperature might be the 4th value... 
current_meta['t'] = float(clean_split[3]) different_temperatures = True metadata.append(current_meta) N_max = numpy.max([ w['n'] for w in metadata ]) print "Max number of snapshots %d" % N_max # now allocate the memory for the arrays K = len(metadata) T_k = numpy.ones(K,float)*options.temperature # inital temperatures are all equal beta_k = 1.0/(kB*T_k) # beta factor for the different temperatures data = numpy.zeros([K,N_max], numpy.float64) # the snapshot data u_kn = numpy.zeros([K,N_max], numpy.float64) # u_kn[k,n] is the reduced potential energy without umbrella restraints of snapshot n of umbrella simulation k u_kln = numpy.zeros([K,K,N_max], numpy.float64) # u_kln[k,l,n] is the reduced potential energy of snapshot n from umbrella simulation k evaluated at umbrella l g_k = numpy.zeros([K],numpy.float32) # correlation time data_min = [] # will set the min and max data values later data_max = [] # Now loop through each datafile and extract the data for i, w in enumerate(metadata): print "Reading %s..." % w['path'] f = open(w['path'], 'r') lines = f.readlines() f.close() clean_split_lines = [ filter(None, line.strip().expandtabs().split(' ')) for line in lines if not line.startswith('#') ] if different_temperatures: raise Exception('Differen\'t temperatures aren\'t supported yet') # if different temperatures are specified the metadata file, # then we need the energies to compute the PMF, found in the third column # for j,l in enumerate(clean_split_lines): # data[i,j] = float(l[1]) # second column is the coordinate # # third column will be the system's potential energy # potential_energy = float(l[2]) # dchi = w['coord']-float(l[1]) # restraint_potential = k_multiplier*w['k']*(dchi**2) # # TODO: given the coordinate and the restraining potential, calculate the umbrella restraint # u_kn[i,j] = beta_k[i] * (potential_energy-restraint_potential) # reduced potential energy without umbrella restraint # # # Compute correlation times for potential energy and timeseries. # # If the temperatures differ, use energies to determine samples; otherwise, use the cosine of chi # g_k[i] = timeseries.statisticalInefficiency(u_kn[i,:], u_kn[i,:]) # indices = timeseries.subsampleCorrelatedData(u_kn[i,:]) else: # no temperature column for j,l in enumerate(clean_split_lines): data[i,j] = float(l[1]) dataset = numpy.cos(data[i,:w['n']]) g_k[i] = timeseries.statisticalInefficiency(dataset,dataset) if not options.skip_subsampling: indices = timeseries.subsampleCorrelatedData(dataset) if options.skip_subsampling: data_max.append(numpy.max(data[i])) data_min.append(numpy.min(data[i])) w['n'] = len(data[i]) u_kn[i,0:w['n']] = u_kn[i] data[i,0:w['n']] = data[i] else: # get min and max for data, used for binning ranges data_max.append(numpy.max(data[i,indices])) data_min.append(numpy.min(data[i,indices])) # Subsample the data w['n'] = len(indices) u_kn[i,0:w['n']] = u_kn[i,indices] data[i,0:w['n']] = data[i,indices] print "Correlation time for set %5d is %10.3f" % (i,g_k[i]) print "Finished reading data files" # Set zero of u_kn -- this is arbitrary. u_kn -= u_kn.min() # Construct torsion bins print "Binning data..." 
data_min = numpy.min(data_min) data_max = numpy.max(data_max) delta = (data_max - data_min) / float(options.bins) print "Min coord: %f" % data_min print "Max coord: %f" % data_max print "Delta for binning %f" % delta # compute bin centers bin_center_i = numpy.zeros([options.bins], numpy.float64) for i in range(options.bins): bin_center_i[i] = data_min + delta/2 + delta * i # Bin data bin_kn = numpy.zeros([K,N_max], numpy.int32)-1 # for each window for k in range(K): # for 0 to the number of snapshots in the window k for n in range(metadata[k]['n']): # Compute bin assignment. bin_kn[k,n] = int((data[k,n] - data_min) / delta) for l in range(K): # Compute minimum-image torsion deviation from umbrella center l dchi = data[k,n] - metadata[l]['coord'] # Compute energy of snapshot n from simulation k in umbrella potential l u_kln[k,l,n] = u_kn[k,n] + beta_k[k]*metadata[l]['k']*(dchi**2) for i in range(options.bins): if numpy.sum(bin_kn==i) == 0: for j in range(options.bins): print "Bin: %d" % j print numpy.sum(bin_kn==j) raise Exception("At least one bin has no samples. Adjust bin sizes or eliminate empty bins to ensure at least one sample per bin.") # Initialize MBAR. print "Running MBAR..." N_k = numpy.array([ w['n'] for w in metadata ], numpy.int32) mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose, initialize='BAR') #mbar = pymbar.MBAR(u_kln, N_k, verbose=options.verbose) #mbar = pymbar.MBAR(u_kln, N_k, verbose = True, method = 'Newton-Raphson') # Compute PMF in unbiased potential (in units of kT). (f_i, df_i) = mbar.computePMF(u_kn, bin_kn, options.bins) # Write out PMF and save to file print "Saving PMF to file: %s" % options.output_file f = open(options.output_file, 'w') print "PMF (in units of kT)" print "%8s %8s %8s" % ('bin', 'f', 'df') f.write("#Coor Free +/-\n") for i in range(options.bins): print "%8.1f %8.3f %8.3f" % (bin_center_i[i], f_i[i], df_i[i]) f.write("%8.1f %8.3f %8.3f\n" % (bin_center_i[i], f_i[i], df_i[i])) f.close()
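# Small sketch of the umbrella re-evaluation step used above: for a snapshot x
# taken in window k, the reduced potential in every other window l adds the
# harmonic bias beta * k_l * (x - x0_l)**2 on top of the unbiased part.
# Window centers and force constants here are invented for illustration.
import numpy as np

beta = 1.0 / (0.0083144621 * 300.0)       # 1/(kB*T) in (kJ/mol)^-1 at 300 K
centers = np.linspace(0.0, 1.0, 11)       # hypothetical umbrella centers
k_spring = np.full_like(centers, 1000.0)  # hypothetical spring constants, kJ/mol/nm^2

x = 0.43            # one snapshot's coordinate
u_unbiased = 0.0    # reduced potential without the restraint (placeholder)

# reduced potential of this snapshot evaluated in every umbrella l
u_l = u_unbiased + beta * k_spring * (x - centers) ** 2
print(np.array2string(u_l, precision=2))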
bp = get_probs(data_file)[skip:, :]
n_samples, n_bins = bp.shape
bi = np.arange(n_bins)
pmf = -0.6*np.log(np.mean(bp, axis=0))
pmf_mean = pmf

# Calculate statistical inefficiency
try:
    g = np.load(stat_ineff_file)
except IOError:  # only fall back to recomputing when the cache file is missing
    g = np.zeros((n_bins,))
    for k in xrange(n_bins):
        g[k] = timeseries.statisticalInefficiency(bp[:, k])
    np.save(stat_ineff_file, g)

N_eff = int(np.ceil(n_samples / np.max(g)))  # number of blocks must be an integer
tstat = scipy.stats.t.ppf(.975, N_eff - 1)
print 'N_eff: ', N_eff

pmf_err = np.empty((n_bins))
for k in xrange(n_bins):
    blks = np.array_split(bp[:, k], N_eff)
    blk_mean = np.array(map(np.mean, blks))
    blk_pmf = -0.6*np.log(blk_mean)
    pmf_err[k] = tstat*np.std(blk_pmf)/np.sqrt(blk_pmf.size)

pmf_min = np.min(pmf)
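# A compact, hedged helper for the block-averaging error bar used above: the
# series is split into N_eff ~ N / max(g) blocks and a two-sided 95% Student-t
# half-width is put on the block means. Helper name and toy data are ours.
import numpy as np
import scipy.stats
from pymbar import timeseries

def block_error(series, confidence=0.95):
    g = timeseries.statisticalInefficiency(series)
    n_eff = int(np.ceil(series.size / g))
    blocks = np.array_split(series, n_eff)
    block_means = np.array([b.mean() for b in blocks])
    tstat = scipy.stats.t.ppf(0.5 + confidence / 2.0, n_eff - 1)
    return tstat * block_means.std() / np.sqrt(block_means.size)

rng = np.random.RandomState(5)
y = np.repeat(rng.normal(size=500), 20) + 0.1 * rng.normal(size=10000)  # correlated toy data
print("mean = %.3f +- %.3f" % (y.mean(), block_error(y)))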
def test_statistical_inefficiency_single(): X, Y, energy = generate_data() timeseries.statisticalInefficiency(X[0]) timeseries.statisticalInefficiency(X[0], X[0]) timeseries.statisticalInefficiency(X[0]**2) timeseries.statisticalInefficiency(X[0]**2, X[0]**2) timeseries.statisticalInefficiency(energy[0]) timeseries.statisticalInefficiency(energy[0], energy[0]) timeseries.statisticalInefficiency(X[0], X[0]**2)
def test_statistical_inefficiency_single(): X, Y, energy = generate_data() timeseries.statisticalInefficiency(X[0]) timeseries.statisticalInefficiency(X[0], X[0]) timeseries.statisticalInefficiency(X[0] ** 2) timeseries.statisticalInefficiency(X[0] ** 2, X[0] ** 2) timeseries.statisticalInefficiency(energy[0]) timeseries.statisticalInefficiency(energy[0], energy[0]) timeseries.statisticalInefficiency(X[0], X[0] ** 2)
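# A hedged sanity check in the same spirit as the tests above: for white noise
# the statistical inefficiency should be close to 1, while duplicating each
# sample m times should push it toward m. The thresholds here are loose and ours.
import numpy as np
from pymbar import timeseries

rng = np.random.RandomState(6)
white = rng.normal(size=50000)
duplicated = np.repeat(rng.normal(size=5000), 10)   # each sample repeated 10x

g_white = timeseries.statisticalInefficiency(white)
g_dup = timeseries.statisticalInefficiency(duplicated)
assert abs(g_white - 1.0) < 0.2
assert abs(g_dup - 10.0) < 2.0
print("g(white) = %.2f, g(duplicated x10) = %.2f" % (g_white, g_dup))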
def dA_MBAR(minimum=0, maximum=100, spacing=10, exponent=2, polymorphs='p1 p2', Molecules=72, Independent=4, Temp=200, bonds=False, primary_directory='.', added_directories=[]): # ============================================================================================= # Setting up the values for gamma or lambda states # ============================================================================================= # raw_value = minimum # values = [] directory_names = np.arange(minimum, maximum + spacing, spacing) directory_names = np.sort(np.append(directory_names, added_directories)) # while raw_value <= maximum: # if exponent >= 0: # value = int(100 * (float(raw_value) / float(maximum)) ** abs(exponent)) # else: # value = int(100 * (1 - (float(maximum - raw_value) / float(maximum)) ** abs(exponent))) # values.append(value) # raw_value = raw_value + spacing # print(values) # print(directory_names) # exit() # POLYMORPH polymorphs = polymorphs.split() # ============================================================================================= # READ IN RAW DATA # ============================================================================================= # Constants. kB = 1.3806488e-23 * 6.0221413e23 / (1000.0 * 4.184 ) # Boltzmann constant in kcal/mol # Parameters T_k = Temp * np.ones(len(directory_names), float) # Convert temperatures to floats print(T_k) # print(values) K = len(directory_names) # How many states? # total number of states examined; 0 are unsampled if bonds are left on, 1 is unsampled if the bonds are removed Kbig = K # maximum number of snapshots/simulation (could make this automated) - doesn't matter, as long as it's long enough. N_max = 5000 # beta factor for the different temperatures beta_k = 1.0 / (kB * T_k) dA = np.zeros([len(polymorphs), Kbig], float) ddA = np.zeros([len(polymorphs), Kbig], float) convert_units = 0.2390057 * np.ones( Kbig, float) # Convert all energies to kcal/mol # Allocate storage for simulation data for i, poly in enumerate(polymorphs): # N_k[k] is the total number of snapshots from alchemical state k N_k = np.zeros([Kbig], np.int32) # N_k_s[k,s] is the total number of snapshots from alchemical state k from seed s N_k_s = np.zeros([Kbig], np.int32) # u_kln[k,l,n] is the adjusted energy of snapshot n from simulation k u_kln = np.zeros([K, Kbig, N_max], np.float64) # dhdl_kn[k,n] is the derivative of energy with respect to lambda of snapshot n from simulation k dhdl_kn = np.zeros([K, N_max], np.float64) #Load in the data for each run for k in range(K): n = 0 # cycle through all the input total energy data if directory_names[k] == int(directory_names[k]): dirpath = polymorphs[i] + '/' + primary_directory + '/' + str( int(directory_names[k])) else: dirpath = polymorphs[i] + '/' + primary_directory + '/' + str( directory_names[k]) if os.path.isdir(dirpath): fname = dirpath + '/PROD.edr' dhdlname = dirpath + '/dhdl_PROD.xvg' potential_energy = panedr.edr_to_df(fname)['Potential'].values print("loading " + fname) dhdl_energy = np.loadtxt(dhdlname, comments=['#', '$', '@', '!']) print("loading " + dhdlname) # Removing any non-equilibrated points of the simulation [start_production, _, _] = timeseries.detectEquilibration(potential_energy) potential_energy = potential_energy[start_production:] dhdl_energy = dhdl_energy[start_production:, :] # Cutting points if they exceed N_max if len(potential_energy) > N_max: potential_energy = potential_energy[len(potential_energy) - N_max:] dhdl_energy = dhdl_energy[len(dhdl_energy) - N_max:, :] # the energy of 
every configuration from each state evaluated at its sampled state n = len(potential_energy) dhdl_placement = len(dhdl_energy[0, :]) - K u_kln[k, :K, :n] = (potential_energy.reshape( (n, 1)) + dhdl_energy[:, dhdl_placement:] ).T * convert_units[k] dhdl_kn[k, :n] = (float(Independent) / Molecules) * \ np.sum(dhdl_energy[:, 2:dhdl_placement], axis=1) * convert_units[k] N_k_s[k] = n N_k[k] = n # convert to nondimensional units from kcal/mol u_kln *= beta_k[0] #u_kln_save = u_kln.copy() u_kln_save = u_kln[:] g_k = np.zeros([K]) print("Number of retained samples") print(N_k) print("Number of retained samples from each seed") print(N_k_s) # ============================================================================================= # COMPUTE FREE ENERGY DIFFERENCE USING MBAR # ============================================================================================= # Initialize MBAR. print("Running MBAR...") # generate the weights of each of the umbrella set mbar = pymbar.MBAR(u_kln, N_k, verbose=True, subsampling_protocol=[{ 'method': 'L-BFGS-B' }]) print("MBAR Converged...") # testing for k in range(Kbig): w = np.exp(mbar.Log_W_nk[:, k]) print("max weight in state %d is %12.7f" % (k, np.max(w))) neff = 1 / np.sum(w**2) print("Effective number of sample in state %d is %10.3f" % (k, neff)) print("Efficiency for state %d is %d/%d = %10.4f" % (k, neff, len(w), neff / len(w))) # extract self-consistent weights and uncertainties (df_i, ddf_i, theta_i) = mbar.getFreeEnergyDifferences() print("Free Energies Optained...") # convert PMF to kcal/mol and normalize by the number of molecules df_i /= (beta_k[0] * float(Independent)) ddf_i /= (beta_k[0] * float(Independent)) dA[i, :] = df_i[-1] # ============================================================================================= # COMPUTE UNCERTAINTY USING THE UNCORRELATED DATA # ============================================================================================= for k in range(K): N_k[k] = 0 n_old = 0 g_k[k] = timeseries.statisticalInefficiency( dhdl_kn[k, n_old:(n_old + N_k_s[k])]) print("Correlation time for sampled state %d is %10.3f" % (k, g_k[k])) # subsample the data to get statistically uncorrelated data indices = np.array( timeseries.subsampleCorrelatedData(u_kln[k, k, n_old:(n_old + N_k_s[k])], g=g_k[k])) # subsample # not sure why we have to transpose if indices != []: u_kln[k, :, N_k[k]:(N_k[k] + len(indices))] = u_kln_save[k, :, (indices + n_old)].transpose() N_k[k] = N_k[k] + len(indices) n_old += N_k_s[k] print("Number of retained samples") print(N_k) print("Number of retained samples from each seed") print(N_k_s) # generate the weights of each of the umbrella set mbar = pymbar.MBAR(u_kln, N_k, verbose=True, subsampling_protocol=[{ 'method': 'L-BFGS-B' }]) print("MBAR Converged...") # extract self-consistent weights and uncertainties try: (df_u, ddf_u, theta_i) = mbar.getFreeEnergyDifferences() except ValueError: pass print("Free Energies Optained...") # convert PMF to kcal/mol and normalize by the number of molecules df_u /= (beta_k[0] * float(Independent)) ddf_u /= (beta_k[0] * float(Independent)) ddA[i, :] = ddf_u[-1] # ddA[i, :] = ddf_i[-1] # Write out free energy differences print("Free Energy Difference (in units of kcal/mol)") print(" dA(Gamma) = A(Gamma) - A(Interactions Off)") for k in range(Kbig): print("%8.3f %8.3f" % (df_i[k, -1], ddf_u[k, -1])) del N_k del N_k_s del u_kln del dhdl_kn out_dA = np.zeros(len(polymorphs)) out_ddA = np.zeros(len(polymorphs)) for i, poly in enumerate(polymorphs): out_dA[i] = 
dA[i, 0] out_ddA[i] = ddA[i, 0] return out_dA, out_ddA
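# The function above snapshots u_kln into u_kln_save before overwriting rows in
# place during subsampling. Worth noting: for a NumPy array, ``a[:]`` is a view,
# not a copy, so the saved array would track the in-place edits; ``a.copy()``
# gives an independent snapshot. Tiny demonstration on a throwaway array.
import numpy as np

a = np.arange(6, dtype=float).reshape(2, 3)
view = a[:]          # shares memory with a
snapshot = a.copy()  # independent copy

a[0, 0] = 99.0
print(view[0, 0])      # 99.0 -- the "saved" view changed too
print(snapshot[0, 0])  # 0.0  -- the copy is unaffected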
mask_kt[k,0:T_k[k]] = True # Create a list from this mask. all_data_indices = where(mask_kt) # Construct equal-frequency extension bins print("binning data...") bin_kt = zeros([K, T_max], int32) (bin_left_boundary_i, bin_center_i, bin_width_i, bin_assignments) = construct_nonuniform_bins(x_kt[all_data_indices], nbins) bin_kt[all_data_indices] = bin_assignments # Compute correlation times. N_max = 0 g_k = zeros([K], float64) for k in range(K): # Compute statistical inefficiency for extension timeseries g = timeseries.statisticalInefficiency(x_kt[k,0:T_k[k]], x_kt[k,0:T_k[k]]) # store statistical inefficiency g_k[k] = g print("timeseries %d : g = %.1f, %.0f uncorrelated samples (of %d total samples)" % (k+1, g, floor(T_k[k] / g), T_k[k])) N_max = max(N_max, ceil(T_k[k] / g) + 1) # Subsample trajectory position data. x_kn = zeros([K, N_max], float64) bin_kn = zeros([K, N_max], int32) N_k = zeros([K], int32) for k in range(K): # Compute correlation times for potential energy and chi timeseries. indices = timeseries.subsampleCorrelatedData(x_kt[k,0:T_k[k]]) # Store subsampled positions. N_k[k] = len(indices) x_kn[k,0:N_k[k]] = x_kt[k,indices]
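# Sketch of the allocation rule used above: after measuring g for each window,
# the subsampled arrays are sized by the largest ceil(T_k / g_k) so every
# window's uncorrelated samples fit. Window lengths and data are synthetic.
from math import ceil

import numpy as np
from pymbar import timeseries

rng = np.random.RandomState(7)
T_k = [4000, 6000, 5000]                                       # hypothetical samples per window
x_kt = [np.repeat(rng.normal(size=t // 5), 5) for t in T_k]    # correlated toy data

g_k = [timeseries.statisticalInefficiency(x) for x in x_kt]
N_max = max(int(ceil(t / g)) + 1 for t, g in zip(T_k, g_k))
print("g_k =", ["%.1f" % g for g in g_k], "-> N_max =", N_max)

x_kn = np.zeros((len(T_k), N_max))
for k, x in enumerate(x_kt):
    indices = timeseries.subsampleCorrelatedData(x, g=g_k[k])
    x_kn[k, :len(indices)] = x[indices]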
) # N_k[k] is the number of uncorrelated samples from simulation index k reduced_expectation_data = [] if len(expectation_columns) > 0: for i in range(len(expectation_columns)): reduced_expectation_data.append( numpy.zeros([K, N_samples], numpy.float64)) reduced_fep_data = [] if len(fep_columns) > 0: for i in range(len(fep_columns)): reduced_fep_data.append(numpy.zeros([K, N_samples], numpy.float64)) for k in range(K): # Extract timeseries. A_t = biasing_variable_kt[0][k, :] # Compute statistical inefficiency. try: g = timeseries.statisticalInefficiency(A_t) except Exception as e: print str(e) print A_t # Subsample data. if subsample_trajectories: indices = timeseries.subsampleCorrelatedData(A_t, g=g) else: indices = timeseries.subsampleCorrelatedData(A_t, g=1) N = len(indices) # number of uncorrelated samples print "k = %5d : g = %.1f, N = %d" % (k, g, N) for i in range(nbiases): biasing_variable_kn[i][k, 0:N] = biasing_variable_kt[i][k, indices] for i in range(nperturbations + 1): U_kn[i][k, 0:N] = U_kt[i][k, indices]
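# Guarded wrapper in the spirit of the try/except above: if the statistical
# inefficiency cannot be estimated (e.g. a constant timeseries makes pymbar
# raise), fall back to g = 1 so every sample is kept. The wrapper name is ours.
import numpy as np
from pymbar import timeseries

def safe_statistical_inefficiency(A_t):
    try:
        return timeseries.statisticalInefficiency(A_t)
    except Exception as err:               # e.g. pymbar's ParameterError on zero variance
        print("falling back to g = 1: %s" % err)
        return 1.0

print(safe_statistical_inefficiency(np.random.RandomState(8).normal(size=1000)))
print(safe_statistical_inefficiency(np.ones(1000)))   # constant series -> g = 1 fallback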