def sample_indexes_by_cluster(self, clusters, nsample, replace=True):
    """Draw trajectory/time indexes for the requested clusters.

    Parameters
    ----------
    clusters : iterable of integers
        Indexes of the clusters to sample from.
    nsample : int
        Number of samples per cluster. With ``replace=False`` fewer
        samples may be returned for a cluster that has fewer than
        ``nsample`` indexes available.
    replace : boolean, optional
        Whether sampling is done with or without replacement.

    Returns
    -------
    indexes : list of ndarray( (N, 2) )
        One index array per requested cluster. Every row is a pair
        (i, t), with i the trajectory index and t the time index inside
        that trajectory.

    """
    # Build the state catalogue on first use; afterwards it is reused.
    catalogue_is_empty = len(self._index_states) == 0
    if catalogue_is_empty:
        self._index_states = index_states(self.dtrajs)

    # Restrict the catalogue to the requested clusters before sampling.
    selected = self._index_states[clusters]
    return sample_indexes_by_state(selected, nsample, replace=replace)
def test_sample_by_state_replace(self):
    """Sampling 5 frames per state returns 5 rows that all hit that state."""
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    state_index = dt.index_states(dtraj)
    samples = dt.sample_indexes_by_state(state_index, 5)
    for state in range(4):
        drawn = samples[state]
        assert drawn.shape[0] == 5
        for row in range(drawn.shape[0]):
            # column 1 holds the time index into the trajectory
            frame = drawn[row, 1]
            assert dtraj[frame] == state
def test_sample_by_state_replace(self):
    """Check the sample count and state membership for each of the 4 states."""
    n_wanted = 5
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    catalogue = dt.index_states(dtraj)
    sampled = dt.sample_indexes_by_state(catalogue, n_wanted)
    for state in range(4):
        rows = sampled[state]
        assert rows.shape[0] == n_wanted
        n_rows = rows.shape[0]
        for k in range(n_rows):
            # the sampled time index must point at a frame of this state
            assert dtraj[rows[k, 1]] == state
def test_sample_by_sequence(self):
    """Sampling along a state sequence yields one matching (traj, time) row per step."""
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    idx = dt.index_states(dtraj)
    seq = [0, 1, 1, 1, 0, 0, 0, 0, 1, 1]
    sidx = dt.sample_indexes_by_sequence(idx, seq)
    # ``shape`` is a plain tuple, so compare it directly; the former
    # np.alltrue wrapper is gone in NumPy 2.0 and was never needed here.
    assert sidx.shape == (len(seq), 2)
    for t in range(sidx.shape[0]):
        assert sidx[t, 0] == 0  # did we pick the right traj?
        assert dtraj[sidx[t, 1]] == seq[t]  # did we pick the right states?
def test_twotraj(self):
    """Indexing two trajectories returns, per state, every (traj, time) occurrence."""
    dtrajs = [[0, 1, 2, 3, 2, 1, 0], [3, 4, 5]]
    # no subset is given, so one index array per state is compared below
    res = dt.index_states(dtrajs)
    expected = [
        np.array([[0, 0], [0, 6]]),
        np.array([[0, 1], [0, 5]]),
        np.array([[0, 2], [0, 4]]),
        np.array([[0, 3], [1, 0]]),
        np.array([[1, 1]]),
        np.array([[1, 2]]),
    ]
    assert len(res) == len(expected)
    for got, want in zip(res, expected):
        assert got.shape == want.shape
        # np.alltrue was removed in NumPy 2.0; array_equal is the supported check
        assert np.array_equal(got, want)
def test_sample_by_state_replace_subset(self):
    """Sampling restricted to a subset returns 5 matching rows per subset state."""
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    state_index = dt.index_states(dtraj)
    subset = [1, 2]
    samples = dt.sample_indexes_by_state(state_index, 5, subset=subset)
    for pos in range(len(subset)):
        drawn = samples[pos]
        assert drawn.shape[0] == 5
        for row in range(drawn.shape[0]):
            # result position ``pos`` corresponds to state ``subset[pos]``
            frame = drawn[row, 1]
            assert dtraj[frame] == subset[pos]
def active_state_indexes(self):
    """Return the index catalogue of the connected states, building it on first use."""
    self._check_is_estimated()
    # Cached from an earlier call (or set elsewhere): hand it back untouched.
    if hasattr(self, '_active_state_indexes'):
        return self._active_state_indexes

    from pyemma.util.discrete_trajectories import index_states
    self._active_state_indexes = index_states(self.discrete_trajectories_active)
    return self._active_state_indexes
def test_sample_by_state_replace_subset(self):
    """With ``subset=[1, 2]``, every result slot holds 5 frames of its subset state."""
    n_wanted = 5
    subset = [1, 2]
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    catalogue = dt.index_states(dtraj)
    sampled = dt.sample_indexes_by_state(catalogue, n_wanted, subset=subset)
    for slot in range(len(subset)):
        rows = sampled[slot]
        assert rows.shape[0] == n_wanted
        n_rows = rows.shape[0]
        for k in range(n_rows):
            # time index in column 1 must land on the subset state of this slot
            assert dtraj[rows[k, 1]] == subset[slot]
def test_onetraj_sub(self):
    """Indexing with ``subset=[2, 3]`` returns only the occurrences of states 2 and 3."""
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    # the expected result below is one index array per subset state
    res = dt.index_states(dtraj, subset=[2, 3])
    expected = [np.array([[0, 2], [0, 4]]), np.array([[0, 3]])]
    assert len(res) == len(expected)
    for got, want in zip(res, expected):
        assert got.shape == want.shape
        # np.alltrue was removed in NumPy 2.0; array_equal is the supported check
        assert np.array_equal(got, want)
def test_onetraj_sub(self):
    """Restricting the index to states 2 and 3 yields exactly their (traj, time) rows."""
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    # the call is expected to succeed and return one array per subset state
    res = dt.index_states(dtraj, subset=[2, 3])
    expected = [
        np.array([[0, 2], [0, 4]]),
        np.array([[0, 3]]),
    ]
    assert len(res) == len(expected)
    for i in range(len(res)):
        assert res[i].shape == expected[i].shape
        # np.all replaces np.alltrue, which no longer exists in NumPy 2.0
        assert np.all(res[i] == expected[i])
def observable_state_indexes(self):
    """Return the index catalogue of the observable states, creating it if missing."""
    try:
        # fast path: the catalogue was already computed
        catalogue = self._observable_state_indexes
    except AttributeError:
        # first access: build the catalogue and remember it on the instance
        import pyemma.util.discrete_trajectories as dt
        self._observable_state_indexes = dt.index_states(self.discrete_trajectories_obs)
        catalogue = self._observable_state_indexes
    return catalogue
def active_state_indexes(self):
    """Return the index catalogue of the connected (active) states.

    The catalogue is computed once from the full discrete trajectories,
    restricted to ``self.active_set``, and then cached on the instance.
    """
    self._check_is_estimated()
    try:  # if we have this attribute, return it
        return self._active_state_indexes
    except AttributeError:
        # Only a missing attribute means "not computed yet". A bare except
        # would also hide unrelated failures (even KeyboardInterrupt).
        import pyemma.util.discrete_trajectories as dt
        self._active_state_indexes = dt.index_states(self.discrete_trajectories_full,
                                                     subset=self.active_set)
        return self._active_state_indexes
def test_performance(self):
    """Time both index_states implementations on the same input and compare results."""
    import pyemma.util.discrete_trajectories as dt
    state = np.random.RandomState(42)
    n_states = 10000
    dtrajs = [state.randint(0, n_states, size=100000) for _ in range(500)]
    # Draw the subset from the seeded generator as well; the global
    # np.random would make the selection differ from run to run.
    selection = state.choice(np.arange(n_states), size=(500,), replace=False)
    with timing('pyemma'):
        out2 = dt.index_states(dtrajs, selection)
    with timing('cpp'):
        out = sample.compute_index_states(dtrajs, selection)

    assert len(out) == len(out2)
    for o1, o2 in zip(out, out2):
        np.testing.assert_array_almost_equal(o1, o2)
def index_clusters(self):
    """Return the trajectory/time indexes of all clusters.

    Returns
    -------
    indexes : list of ndarray( (N_i, 2) )
        One array per state listing where that cluster occurs. The array
        has one row per occurrence, each row being a pair (i, t) with i
        the trajectory index and t the time index in that trajectory.

    """
    # No assignment has happened yet, so run it now.
    nothing_assigned = len(self._dtrajs) == 0
    if nothing_assigned:
        self._dtrajs = self.assign()

    # Build the catalogue only if it has never been computed.
    catalogue = self._index_states
    if len(catalogue) == 0:
        catalogue = index_states(self._dtrajs)
        self._index_states = catalogue
    return catalogue
def test_big(self):
    """Smoke test: index the 2well_traj_100K.dat trajectory read from disk."""
    trajectory_file = testpath + '2well_traj_100K.dat'
    big_dtraj = dt.read_discrete_trajectory(trajectory_file)
    # nothing is asserted; the test passes when no exception is raised
    dt.index_states(big_dtraj)
def test_big(self):
    """Smoke test: index the bundled two-well discrete trajectory."""
    import pyemma.datasets
    two_well = pyemma.datasets.load_2well_discrete()
    big_dtraj = two_well.dtraj_T100K_dt10
    # nothing is asserted; the test passes when no exception is raised
    dt.index_states(big_dtraj)
def _estimate(self, dtrajs):
    """Estimate the augmented Markov model (AMM) from discrete trajectories.

    Counts transitions, selects the active set, restricts the expectation
    matrix ``E`` to it, and then iterates updates of pihat, mhat, Q, X and
    the Lagrange multipliers until convergence, a failure condition, or
    ``self.maxiter`` iterations.

    Parameters
    ----------
    dtrajs : list of discrete trajectories
        Passed to the count statistics, to ``index_states`` and stored as
        ``self._dtrajs_full``.

    Returns
    -------
    self
        The estimator, with model parameters set via ``set_model_params``.

    Raises
    ------
    ValueError
        If ``self.E``, ``self.w`` or ``self.m`` is None.
    RuntimeError
        If the active set is empty.
    """
    if self.E is None or self.w is None or self.m is None:
        raise ValueError("E, w or m was not specified. Stopping.")

    # get trajectory counts. This sets _C_full and _nstates_full
    dtrajstats = self._get_dtraj_stats(dtrajs)
    self._C_full = dtrajstats.count_matrix()  # full count matrix
    self._nstates_full = self._C_full.shape[0]  # number of states

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity == 'largest':
        # statdist not given - full connectivity on all states
        self.active_set = dtrajstats.largest_connected_set
    else:
        # for 'None' and 'all' all visited states are active
        self.active_set = dtrajstats.visited_set

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    # is estimated
    self._is_estimated = True

    # if active set is empty, we can't do anything.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate AMM.')

    from pyemma.util.discrete_trajectories import index_states
    self._active_state_indexes = index_states(dtrajs, subset=self.active_set)

    # active count matrix and number of states
    self._C_active = dtrajstats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # computed derived quantities
    # back-mapping from full to lcs
    self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # slice out active states from E matrix
    # (rows of E are looked up by the position of each active state in the
    # list of distinct states seen in dtrajs)
    _dset = list(set(_np.concatenate(dtrajs)))
    _rras = [_dset.index(s) for s in self.active_set]
    self.E_active = self.E[_rras]

    if not self.sparse:
        self._C_active = self._C_active.toarray()
        self._C_full = self._C_full.toarray()

    # reversibly counted
    self._C2 = 0.5 * (self._C_active + self._C_active.T)
    self._nz = _np.nonzero(self._C2)
    self._csum = _np.sum(self._C_active, axis=1)  # row sums C

    # get ranges of Markov model expectation values
    if self.support_ci == 1:
        self.E_min = _np.min(self.E_active, axis=0)
        self.E_max = _np.max(self.E_active, axis=0)
    else:
        # PyEMMA confidence interval calculation fails sometimes with conf=1.0
        self.E_min, self.E_max = _ci(self.E_active, conf=self.support_ci)

    # dimensions of E matrix
    self.n_mstates_active, self.n_exp_active = _np.shape(self.E_active)

    assert self.n_exp_active == len(self.w)
    assert self.n_exp_active == len(self.m)

    self.count_outside = []
    self.count_inside = []
    self._lls = []

    i = 0
    # Determine which experimental values are outside the support as defined by the Confidence interval
    for emi, ema, mm, mw in zip(self.E_min, self.E_max, self.m, self.w):
        if mm < emi or ema < mm:
            self.logger.info(
                "Experimental value %f is outside the support (%f,%f)" % (mm, emi, ema))
            self.count_outside.append(i)
        else:
            self.count_inside.append(i)
        i = i + 1

    self.logger.info(
        "Total experimental constraints outside support %d of %d" % (len(self.count_outside), len(self.E_min)))

    # A number of initializations
    self.P, self.pi = msmest.tmatrix(self._C_active, reversible=True, return_statdist=True)
    self.lagrange = _np.zeros(self.m.shape)
    self._pihat = self.pi.copy()
    self._update_mhat()
    self._dmhat = 1e-1 * _np.ones(_np.shape(self.mhat))

    # Determine number of slices of R-tensors computable at once with the given cache size
    self._slicesz = _np.floor(self.max_cache / (self.P.nbytes / 1.e6)).astype(int)
    # compute first bundle of slices
    self._update_Rslices(0)

    self._ll_old = self._log_likelihood_biased(self._C_active, self.P, self.m, self.mhat, self.w)

    self._lls = [self._ll_old]

    # make sure everything is initialized
    self._update_pihat()
    self._update_mhat()
    self._update_Q()
    self._update_X_and_pi()

    self._ll_old = self._log_likelihood_biased(self._C_active, self.P, self.m, self.mhat, self.w)
    self._update_G()

    #
    # Main estimation algorithm
    # 2-step algorithm, lagrange multipliers and pihat have different convergence criteria
    # when the lagrange multipliers have converged, pihat is updated until the log-likelihood has converged (changes are smaller than 1e-3).
    # These do not always converge together, but usually within a few steps of each other.
    # A better heuristic for the latter may be necessary. For realistic cases (the two ubiquitin examples in [1])
    # this yielded results very similar to those with more stringent convergence criteria (changes smaller than 1e-9) with convergence times
    # which are seconds instead of tens of minutes.
    #

    converged = False  # Convergence flag for lagrange multipliers
    i = 0
    die = False
    while i <= self.maxiter:
        pihat_old = self._pihat.copy()
        self._update_pihat()
        if not _np.all(self._pihat > 0):
            # roll back to the last valid pihat and stop after this pass
            self._pihat = pihat_old.copy()
            die = True
            self.logger.warning(
                "pihat does not have a finite probability for all states, terminating"
            )
        self._update_mhat()
        self._update_Q()
        # NOTE(review): the X update and its revert check are placed under
        # ``i > 1`` so that X_old is always bound when it is read; the
        # original formatting was lost - confirm this nesting.
        if i > 1:
            X_old = self.X.copy()
            self._update_X_and_pi()
            if _np.any(self.X[self._nz] < 0) and i > 0:
                die = True
                self.logger.warning(
                    "Warning: new X is not proportional to C... reverting to previous step and terminating"
                )
                self.X = X_old.copy()

        if not converged:
            self._newton_lagrange()
        else:  # once Lagrange multipliers are converged compute likelihood here
            P = self.X / self.pi[:, None]
            _ll_new = self._log_likelihood_biased(self._C_active, P, self.m, self.mhat, self.w)
            self._lls.append(_ll_new)

        # General case fixed-point iteration
        if len(self.count_outside) > 0:
            if i > 1 and _np.all(
                    (_np.abs(self._dmhat) / self.sigmas) < self.eps) and not converged:
                self.logger.info(
                    "Converged Lagrange multipliers after %i steps..." % i)
                converged = True
        # Special case
        else:
            if _np.abs(self._lls[-2] - self._lls[-1]) < 1e-8:
                self.logger.info(
                    "Converged Lagrange multipliers after %i steps..." % i)
                converged = True
        # if Lagrange multipliers are converged, check whether log-likelihood has converged
        if converged and _np.abs(self._lls[-2] - self._lls[-1]) < 1e-8:
            self.logger.info("Converged pihat after %i steps..." % i)
            die = True
        if die:
            break
        if i == self.maxiter:
            # Report the bound the loop actually uses (self.maxiter); the
            # message previously read a differently spelled attribute.
            self.logger.info("Failed to converge within %i iterations. "
                             "Consider increasing max_iter(now=%i)" % (i, self.maxiter))
        i += 1

    _P = msmest.tmatrix(self._C_active, reversible=True, mu=self._pihat)

    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self.set_model_params(P=_P, pi=self._pihat, reversible=True,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self
# The temperature is stored as a single number in this file.
with open("Qtanh_0_05_profile/T_used.dat", "r") as fin:
    T = float(fin.read())

# One directory per run index (1, 2, 3), named after the temperature.
tempdirs = ["T_{:.2f}_{}".format(T, x) for x in (1, 2, 3)]
topfile = "/".join((tempdirs[0], topname))
trajfiles = ["/".join((x, trajname)) for x in tempdirs]

# initialize traj input info.
feat = coor.featurizer(topfile)
inp = coor.source(trajfiles, feat)

# Load MSM's that have already been calculated.
dirs, dtrajs, lagtimes, models = util.load_markov_state_models()
model_msm = models[7]  # lagtime of 200

# Determine the number of clusters by the number of timescales.
n_pcca = 2
n_sample = 100

# Grab frames from pcca clustering
model_msm.pcca(n_pcca)
pcca_dist = model_msm.metastable_distributions
active_state_indexes = dt.index_states(dtrajs)
pcca_samples = dt.sample_indexes_by_distribution(
    active_state_indexes, pcca_dist, n_sample)

# One output trajectory per metastable set, numbered from 1.
outfiles = ['msm/pcca{}.xtc'.format(x) for x in range(1, n_pcca + 1)]
coor.save_trajs(inp, pcca_samples, outfiles=outfiles)
def test_subset_error(self):
    """index_states must raise ValueError for a subset containing unvisited states."""
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    # should be a ValueError because this is not a subset
    self.assertRaises(ValueError, dt.index_states, dtraj, subset=[3, 4, 5])
def test_subset_error(self):
    """A subset with states missing from the trajectory is rejected with ValueError."""
    dtraj = [0, 1, 2, 3, 2, 1, 0]
    # states 4 and 5 never occur in dtraj, so this is not a subset
    not_a_subset = [3, 4, 5]
    with self.assertRaises(ValueError):
        dt.index_states(dtraj, subset=not_a_subset)