def calculate_ensemble_mfpt(sources, sinks, tprob, lag_time):
    """
    Calculates the average 'Folding Time' of an MSM defined by T and a LagTime.
    The Folding Time is the average of the MFPTs (to F) of all the states in U.

    Note here 'Folding Time' is defined as the avg MFPT of {U}, to {F}.
    Consider this carefully. This is probably NOT the experimental folding time!

    Parameters
    ----------
    sources : array, int
        indices of the source states
    sinks : array, int
        indices of the sink states
    tprob : matrix
        transition probability matrix
    lag_time : float
        the lag time used to create T (dictates units of the answer)

    Returns
    -------
    avg : float
        the average of the MFPTs
    std : float
        the standard deviation of the MFPTs

    References
    ----------
    .. [1] Metzner, P., Schutte, C. & Vanden-Eijnden, E. Transition path theory
           for Markov jump processes. Multiscale Model. Simul. 7, 1192–1219 (2009).
    .. [2] Berezhkovskii, A., Hummer, G. & Szabo, A. Reactive flux and folding
           pathways in network models of coarse-grained protein dynamics.
           J. Chem. Phys. 130, 205102 (2009).
    """

    sources, sinks = _check_sources_sinks(sources, sinks)
    msm_analysis.check_transition(tprob)

    X = calculate_mfpt(sinks, tprob, lag_time)
    times = np.zeros(len(sources))
    for i in range(len(sources)):
        times[i] = X[sources[i]]

    return np.average(times), np.std(times)
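
# Illustrative usage sketch (the 3-state transition matrix and the source/sink labels below
# are hypothetical, shown only to demonstrate the call signature):
#
#     T = np.array([[0.90, 0.10, 0.00],
#                   [0.10, 0.80, 0.10],
#                   [0.00, 0.20, 0.80]])
#     avg_fold, std_fold = calculate_ensemble_mfpt(sources=[0], sinks=[2], tprob=T, lag_time=1.0)
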
def find_top_paths(sources, sinks, tprob, num_paths=10, node_wipe=False, net_flux=None):
    r"""
    Calls the Dijkstra algorithm to find the top `num_paths` highest-flux paths.

    Does this recursively by first finding the top flux path, then cutting that
    path and relaxing to find the second top path. Continues until `num_paths`
    have been found.

    Parameters
    ----------
    sources : array_like, int
        The indices of the source states
    sinks : array_like, int
        Indices of sink states
    tprob : matrix
        The transition probability matrix
    num_paths : int
        The number of paths to find

    Returns
    -------
    Paths : list of lists
        The nodes traversed in each path
    Bottlenecks : list of tuples
        The nodes between which exists the path bottleneck
    Fluxes : list of floats
        The flux through each path

    Optional Parameters
    -------------------
    node_wipe : bool
        If true, removes the bottleneck-generating node from the graph, instead
        of just the bottleneck (not recommended, a debugging functionality)
    net_flux : sparse matrix
        Matrix of the net flux from `sources` to `sinks`, see
        `calculate_net_fluxes`. If not provided, is calculated from scratch.
        If provided, `tprob` is ignored.

    To Do
    -----
    -- Add periodic flow check

    References
    ----------
    .. [1] Dijkstra, E. W. (1959). "A note on two problems in connexion with
           graphs". Numerische Mathematik 1: 269–271. doi:10.1007/BF01386390.
    """

    # first, do some checking on the input, esp. `sources` and `sinks`
    # we want to make sure all objects are iterable and the sets are disjoint
    sources, sinks = _check_sources_sinks(sources, sinks)
    msm_analysis.check_transition(tprob)

    # check to see if we get net_flux for free, otherwise calculate it
    if net_flux is None:
        net_flux = calculate_net_fluxes(sources, sinks, tprob)

    # initialize objects
    paths = []
    fluxes = []
    bottlenecks = []

    if scipy.sparse.issparse(net_flux):
        net_flux = net_flux.tolil()

    # run the initial Dijkstra pass
    pi, b = Dijkstra(sources, sinks, net_flux)

    logger.info("Path Num | Path | Bottleneck | Flux")

    i = 1
    done = False
    while not done:

        # First find the highest flux pathway
        (path, (b1, b2), flux) = _backtrack(sinks, b, pi, net_flux)

        # Add each result to a Paths, Bottlenecks, Fluxes list
        if flux == 0:
            logger.info("Only %d possible pathways found. Stopping backtrack.", i)
            break
        paths.append(path)
        bottlenecks.append((b1, b2))
        fluxes.append(flux)
        logger.info("%s | %s | %s | %s ", i, path, (b1, b2), flux)

        # Cut the bottleneck, start relaxing from B side of the cut
        if node_wipe:
            net_flux[:, b2] = 0
            logger.info("Wiped node: %s", b2)
        else:
            net_flux[b1, b2] = 0

        G = scipy.sparse.find(net_flux)
        Q = [b2]
        b, pi, net_flux = _back_relax(b2, b, pi, net_flux)

        # Then relax the graph and repeat
        # But only if we still need to
        if i != num_paths - 1:
            while len(Q) > 0:
                w = Q.pop()
                for v in G[1][np.where(G[0] == w)]:
                    if pi[v] == w:
                        b, pi, net_flux = _back_relax(v, b, pi, net_flux)
                        Q.append(v)
                Q = sorted(Q, key=lambda v: b[v])

        i += 1
        if i == num_paths + 1:
            done = True
        if flux == 0:
            logger.info("Only %d possible pathways found. Stopping backtrack.", i)
            done = True

    return paths, bottlenecks, fluxes
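
# Illustrative usage sketch (hypothetical source/sink indices; `T` stands for any valid
# row-stochastic transition matrix, and the module's Dijkstra/_backtrack helpers are used
# internally):
#
#     paths, bottlenecks, path_fluxes = find_top_paths(sources=[0], sinks=[9], tprob=T,
#                                                      num_paths=5)
#     # paths[0] is the highest-flux pathway, path_fluxes[0] its flux
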
def calculate_all_to_all_mfpt(tprob, populations=None):
    """
    Calculate the all-states by all-states matrix of mean first passage times.

    This uses the fundamental matrix formalism, and should be much faster than
    `calculate_mfpt` for calculating many MFPTs.

    Parameters
    ----------
    tprob : matrix
        transition probability matrix
    populations : array_like, float
        optional argument, the populations of each state. If not supplied,
        it will be computed from scratch

    Returns
    -------
    MFPT : array, float
        MFPT in units of the lag time of `tprob`, square array for MFPT
        from i -> j

    See Also
    --------
    calculate_mfpt : function
        for calculating a subset of the MFPTs, with functionality for including
        a set of sinks

    References
    ----------
    .. [1] Metzner, P., Schutte, C. & Vanden-Eijnden, E. Transition path theory
           for Markov jump processes. Multiscale Model. Simul. 7, 1192–1219 (2009).
    .. [2] Berezhkovskii, A., Hummer, G. & Szabo, A. Reactive flux and folding
           pathways in network models of coarse-grained protein dynamics.
           J. Chem. Phys. 130, 205102 (2009).
    """

    msm_analysis.check_transition(tprob)

    if scipy.sparse.issparse(tprob):
        tprob = tprob.toarray()
        logger.warning('calculate_all_to_all_mfpt does not support sparse linear algebra')

    if populations is None:
        eigens = msm_analysis.get_eigenvectors(tprob, 1)
        if np.count_nonzero(np.imag(eigens[1][:, 0])) != 0:
            raise ValueError('First eigenvector has imaginary parts')
        populations = np.real(eigens[1][:, 0])

    # ensure that tprob is a transition matrix
    msm_analysis.check_transition(tprob)
    num_states = len(populations)
    if tprob.shape[0] != num_states:
        raise ValueError("Shape of tprob and populations vector don't match")

    eye = np.transpose(np.matrix(np.ones(num_states)))
    limiting_matrix = eye * populations
    z = scipy.linalg.inv(np.eye(num_states) - (tprob - limiting_matrix))

    # mfpt[i, j] = (z[j, j] - z[i, j]) / pi[j]
    mfpt = -z
    for j in range(num_states):
        mfpt[:, j] += z[j, j]
        mfpt[:, j] /= populations[j]

    return mfpt
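
# Illustrative usage sketch (hypothetical 3-state chain; the result is a dense square array
# in units of the lag time of `T`):
#
#     T = np.array([[0.90, 0.10, 0.00],
#                   [0.10, 0.80, 0.10],
#                   [0.00, 0.20, 0.80]])
#     mfpt_matrix = calculate_all_to_all_mfpt(T)  # mfpt_matrix[i, j]: MFPT from state i to j
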
def calculate_mfpt(sinks, tprob, lag_time=1.):
    """
    Gets the Mean First Passage Time (MFPT) for all states to a *set* of sinks.

    Parameters
    ----------
    sinks : array, int
        indices of the sink states
    tprob : matrix
        transition probability matrix
    lag_time : float
        the lag time used to create T (dictates units of the answer)

    Returns
    -------
    MFPT : array, float
        MFPT in time units of lag_time, for each state (in order of state index)

    See Also
    --------
    calculate_all_to_all_mfpt : function
        A more efficient way to calculate all the MFPTs in a network

    References
    ----------
    .. [1] Metzner, P., Schutte, C. & Vanden-Eijnden, E. Transition path theory
           for Markov jump processes. Multiscale Model. Simul. 7, 1192–1219 (2009).
    .. [2] Berezhkovskii, A., Hummer, G. & Szabo, A. Reactive flux and folding
           pathways in network models of coarse-grained protein dynamics.
           J. Chem. Phys. 130, 205102 (2009).
    """

    sinks = _ensure_iterable(sinks)

    msm_analysis.check_transition(tprob)

    n = tprob.shape[0]

    if scipy.sparse.isspmatrix(tprob):
        tprob = tprob.tolil()
    else:
        tprob = tprob.copy()  # work on a copy so the caller's matrix is not modified

    for state in sinks:
        tprob[state, :] = 0.0
        tprob[state, state] = 2.0

    if scipy.sparse.isspmatrix(tprob):
        tprob = tprob - scipy.sparse.eye(n, n)
        tprob = tprob.tocsr()
    else:
        tprob = tprob - np.eye(n)

    RHS = -1 * np.ones(n)
    for state in sinks:
        RHS[state] = 0.0

    if scipy.sparse.isspmatrix(tprob):
        MFPT = lag_time * scipy.sparse.linalg.spsolve(tprob, RHS)
    else:
        MFPT = lag_time * np.linalg.solve(tprob, RHS)

    return MFPT
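
# Illustrative usage sketch (hypothetical sink set; `T` is a row-stochastic transition matrix
# and `lag_time` carries the units of the answer):
#
#     mfpt_to_sinks = calculate_mfpt(sinks=[2], tprob=T, lag_time=1.0)
#     # mfpt_to_sinks[i]: mean first passage time from state i into the sink set
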
def calculate_avg_TP_time(sources, sinks, tprob, lag_time):
    """
    Calculates the average transition path (TP) time for an MSM, given its
    transition matrix T and lag time.

    The TP time is the average of the MFPTs (to F) of all the states
    immediately adjacent to U, with the U states effectively deleted.

    Note here 'TP Time' is defined as the avg MFPT of all adjacent states to
    {U}, to {F}, ignoring {U}. Consider this carefully.

    Parameters
    ----------
    sources : array, int
        indices of the unfolded states
    sinks : array, int
        indices of the folded states
    tprob : matrix
        transition probability matrix
    lag_time : float
        the lag time used to create T (dictates units of the answer)

    Returns
    -------
    avg : float
        the average of the MFPTs
    std : float
        the standard deviation of the MFPTs

    References
    ----------
    .. [1] Metzner, P., Schutte, C. & Vanden-Eijnden, E. Transition path theory
           for Markov jump processes. Multiscale Model. Simul. 7, 1192–1219 (2009).
    .. [2] Berezhkovskii, A., Hummer, G. & Szabo, A. Reactive flux and folding
           pathways in network models of coarse-grained protein dynamics.
           J. Chem. Phys. 130, 205102 (2009).
    """

    sources, sinks = _check_sources_sinks(sources, sinks)
    msm_analysis.check_transition(tprob)

    n = tprob.shape[0]
    if scipy.sparse.issparse(tprob):
        T = tprob.tolil()
        P = scipy.sparse.lil_matrix((n, n))
    else:
        T = tprob.copy()  # work on a copy so the original matrix is untouched
        P = np.zeros((n, n))

    # P[u, i]: probability of stepping from source u directly to a non-source state i
    for u in sources:
        for i in range(n):
            if i not in sources:
                P[u, i] = T[u, i]

    for u in sources:
        T[u, :] = np.zeros(n)
        T[:, u] = 0

    for i in sources:
        N = T[i, :].sum()
        T[i, :] = T[i, :] / N

    X = calculate_mfpt(sinks, tprob, lag_time)

    # TP[u]: probability-weighted MFPT (to the sinks) of the states reachable
    # in one step from source u
    TP = P.dot(X)
    TPtimes = []
    for time in TP:
        if time != 0:
            TPtimes.append(time)

    return np.average(TPtimes), np.std(TPtimes)
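
# Illustrative usage sketch (hypothetical unfolded/folded state sets):
#
#     avg_tp, std_tp = calculate_avg_TP_time(sources=[0, 1], sinks=[8, 9], tprob=T,
#                                            lag_time=1.0)
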
def calculate_net_fluxes(sources, sinks, tprob, populations=None, committors=None):
    """
    Computes the transition path theory net flux matrix.

    Parameters
    ----------
    sources : array_like, int
        The set of unfolded/reactant states.
    sinks : array_like, int
        The set of folded/product states.
    tprob : mm_matrix
        The transition matrix.

    Returns
    -------
    net_fluxes : mm_matrix
        The net flux matrix

    Optional Parameters
    -------------------
    populations : nd_array, float
        The equilibrium populations, if not provided is re-calculated
    committors : nd_array, float
        The committors associated with `sources`, `sinks`, and `tprob`.
        If not provided, is calculated from scratch. If provided, `sources`
        and `sinks` are ignored.

    References
    ----------
    .. [1] Metzner, P., Schutte, C. & Vanden-Eijnden, E. Transition path theory
           for Markov jump processes. Multiscale Model. Simul. 7, 1192–1219 (2009).
    .. [2] Berezhkovskii, A., Hummer, G. & Szabo, A. Reactive flux and folding
           pathways in network models of coarse-grained protein dynamics.
           J. Chem. Phys. 130, 205102 (2009).
    """

    sources, sinks = _check_sources_sinks(sources, sinks)

    msm_analysis.check_transition(tprob)

    dense = not scipy.sparse.issparse(tprob)
    n = tprob.shape[0]

    flux = calculate_fluxes(sources, sinks, tprob, populations, committors)
    ind = flux.nonzero()

    if dense:
        net_flux = np.zeros((n, n))
    else:
        net_flux = scipy.sparse.lil_matrix((n, n))

    for k in range(len(ind[0])):
        i, j = ind[0][k], ind[1][k]
        forward = flux[i, j]
        reverse = flux[j, i]
        net_flux[i, j] = max(0, forward - reverse)

    return net_flux
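
# Illustrative usage sketch (hypothetical reactant/product sets):
#
#     net_flux = calculate_net_fluxes(sources=[0], sinks=[2], tprob=T)
#     # net_flux[i, j]: net reactive flux from i to j (zero wherever the reverse flux dominates)
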
def calculate_hub_score(tprob, waypoint):
    """
    Calculate the hub score for the state `waypoint`.

    The "hub score" is a measure of how well traveled a certain state or
    set of states is in a network. Specifically, it is the fraction of
    times that a walker visits a state en route from some state A to another
    state B, averaged over all combinations of A and B.

    Parameters
    ----------
    tprob : matrix
        The transition probability matrix
    waypoint : int
        The index of the intermediate state

    Returns
    -------
    Hc : float
        The hub score for the state `waypoint`

    See Also
    --------
    calculate_fraction_visits : function
        Calculate the fraction of times a state is visited on pathways going
        from a set of "sources" to a set of "sinks".
    calculate_all_hub_scores : function
        A more efficient way to compute the hub score for every state in a
        network.

    Notes
    -----
    Employs dense linear algebra,
    memory use scales as N^2,
    cycle use scales as N^5

    References
    ----------
    .. [1] Dickson & Brooks (2012), J. Chem. Theory Comput.,
           Article ASAP DOI: 10.1021/ct300537s
    """

    msm_analysis.check_transition(tprob)

    # typecheck
    if type(waypoint) != int:
        if hasattr(waypoint, '__len__'):
            if len(waypoint) == 1:
                waypoint = waypoint[0]
            else:
                raise ValueError('Must pass waypoints as int or list/array of ints')
        else:
            raise ValueError('Must pass waypoints as int or list/array of ints')

    # find out which states to include in A, B (i.e. everything but C)
    N = tprob.shape[0]
    states_to_include = list(range(N))
    states_to_include.remove(waypoint)

    # calculate the hub score
    Hc = 0.0
    for s1 in states_to_include:
        for s2 in states_to_include:
            if (s1 != s2) and (s1 != waypoint) and (s2 != waypoint):
                Hc += calculate_fraction_visits(tprob, waypoint, s1, s2,
                                                return_cond_Q=False)

    Hc /= ((N - 1) * (N - 2))

    return Hc
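
# Illustrative usage sketch (hypothetical waypoint index; note the N^5 scaling above, so this
# is only practical for small models):
#
#     Hc = calculate_hub_score(T, waypoint=1)
#     # Hc: fraction of pathways between all other state pairs that visit state 1
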
def calculate_committors(sources, sinks, tprob):
    """
    Get the forward committors of the reaction sources -> sinks.

    Parameters
    ----------
    sources : array_like, int
        The set of unfolded/reactant states.
    sinks : array_like, int
        The set of folded/product states.
    tprob : mm_matrix
        The transition matrix.

    Returns
    -------
    Q : array_like
        The forward committors for the reaction U -> F.

    References
    ----------
    .. [1] Metzner, P., Schutte, C. & Vanden-Eijnden, E. Transition path theory
           for Markov jump processes. Multiscale Model. Simul. 7, 1192–1219 (2009).
    .. [2] Berezhkovskii, A., Hummer, G. & Szabo, A. Reactive flux and folding
           pathways in network models of coarse-grained protein dynamics.
           J. Chem. Phys. 130, 205102 (2009).
    """

    sources, sinks = _check_sources_sinks(sources, sinks)

    msm_analysis.check_transition(tprob)

    if scipy.sparse.issparse(tprob):
        dense = False
        tprob = tprob.tolil()
    else:
        dense = True

    # construct the committor problem
    n = tprob.shape[0]

    if dense:
        T = np.eye(n) - tprob
    else:
        T = scipy.sparse.eye(n, n, 0, format='lil') - tprob
        T = T.tolil()

    for a in sources:
        T[a, :] = 0.0
        T[:, a] = 0.0
        T[a, a] = 1.0

    for b in sinks:
        T[b, :] = 0.0
        T[:, b] = 0.0
        T[b, b] = 1.0

    IdB = np.zeros(n)
    IdB[sinks] = 1.0

    if dense:
        RHS = np.dot(tprob, IdB)
    else:
        RHS = tprob.dot(IdB)

    RHS[sources] = 0.0
    RHS[sinks] = 1.0

    # solve for the committors
    if not dense:
        Q = scipy.sparse.linalg.spsolve(T.tocsr(), RHS)
    else:
        Q = np.linalg.solve(T, RHS)

    epsilon = 0.001
    assert np.all(Q <= 1.0 + epsilon)
    assert np.all(Q >= 0.0 - epsilon)

    return Q
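
# Illustrative usage sketch (hypothetical reactant/product sets):
#
#     Q = calculate_committors(sources=[0], sinks=[2], tprob=T)
#     # Q[i]: probability that a trajectory started in state i reaches the sinks before
#     # returning to the sources
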
def calculate_fluxes(sources, sinks, tprob, populations=None, committors=None):
    """
    Compute the transition path theory flux matrix.

    Parameters
    ----------
    sources : array_like, int
        The set of unfolded/reactant states.
    sinks : array_like, int
        The set of folded/product states.
    tprob : mm_matrix
        The transition matrix.

    Returns
    -------
    fluxes : mm_matrix
        The flux matrix

    Optional Parameters
    -------------------
    populations : nd_array, float
        The equilibrium populations, if not provided is re-calculated
    committors : nd_array, float
        The committors associated with `sources`, `sinks`, and `tprob`.
        If not provided, is calculated from scratch. If provided, `sources`
        and `sinks` are ignored.

    References
    ----------
    .. [1] Metzner, P., Schutte, C. & Vanden-Eijnden, E. Transition path theory
           for Markov jump processes. Multiscale Model. Simul. 7, 1192–1219 (2009).
    .. [2] Berezhkovskii, A., Hummer, G. & Szabo, A. Reactive flux and folding
           pathways in network models of coarse-grained protein dynamics.
           J. Chem. Phys. 130, 205102 (2009).
    """

    sources, sinks = _check_sources_sinks(sources, sinks)

    msm_analysis.check_transition(tprob)

    dense = not scipy.sparse.issparse(tprob)

    # check if we got the populations
    if populations is None:
        eigens = msm_analysis.get_eigenvectors(tprob, 1)
        if np.count_nonzero(np.imag(eigens[1][:, 0])) != 0:
            raise ValueError('First eigenvector has imaginary components')
        populations = np.real(eigens[1][:, 0])

    # check if we got the committors
    if committors is None:
        committors = calculate_committors(sources, sinks, tprob)

    # perform the flux computation
    Indx, Indy = tprob.nonzero()

    n = tprob.shape[0]

    if dense:
        X = np.zeros((n, n))
        Y = np.zeros((n, n))
        X[(np.arange(n), np.arange(n))] = populations * (1.0 - committors)
        Y[(np.arange(n), np.arange(n))] = committors
    else:
        X = scipy.sparse.lil_matrix((n, n))
        Y = scipy.sparse.lil_matrix((n, n))
        X.setdiag(populations * (1.0 - committors))
        Y.setdiag(committors)

    if dense:
        fluxes = np.dot(np.dot(X, tprob), Y)
        fluxes[(np.arange(n), np.arange(n))] = np.zeros(n)
    else:
        fluxes = X.tocsr().dot(tprob.tocsr()).dot(Y.tocsr())
        fluxes = fluxes.tolil()
        fluxes.setdiag(np.zeros(n))

    return fluxes
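
# Illustrative usage sketch (hypothetical reactant/product sets; populations and committors
# are recomputed internally when not supplied):
#
#     flux = calculate_fluxes(sources=[0], sinks=[2], tprob=T)
#     # flux[i, j]: reactive flux from i to j, i.e. pi[i] * (1 - Q[i]) * T[i, j] * Q[j]
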
def calculate_fraction_visits(tprob, waypoint, source, sink, return_cond_Q=False):
    """
    Calculate the fraction of times a walker on `tprob` going from `source`
    to `sink` will travel through the state `waypoint` en route.

    Computes the conditional committors q^{ABC^+} and uses them to find the
    fraction of paths mentioned above. The conditional committors can be
    returned by setting `return_cond_Q`.

    Note that in the notation of Dickson et al. this computes h_c(A,B), with
        source   = A
        sink     = B
        waypoint = C

    Parameters
    ----------
    tprob : matrix
        The transition probability matrix
    waypoint : int
        The index of the intermediate state
    source : int
        The index of the source state
    sink : int
        The index of the sink state
    return_cond_Q : bool
        Whether or not to return the conditional committors

    Returns
    -------
    fraction_paths : float
        The fraction of times a walker going from `source` -> `sink` stops
        by `waypoint` on its way.
    cond_Q : nd_array, float (optional)
        Optionally returned (`return_cond_Q`)

    See Also
    --------
    calculate_hub_score : function
        Compute the 'hub score', the weighted fraction of visits for an
        entire network.
    calculate_all_hub_scores : function
        Wrapper to compute all the hub scores in a network.

    Notes
    -----
    Employs dense linear algebra,
    memory use scales as N^2,
    cycle use scales as N^3

    References
    ----------
    .. [1] Dickson & Brooks (2012), J. Chem. Theory Comput.,
           Article ASAP DOI: 10.1021/ct300537s
    """

    # do some typechecking - we need to be sure that the lumped sources are in
    # the second to last row, and the lumped sinks are in the last row

    # check `tprob`
    msm_analysis.check_transition(tprob)
    if type(tprob) != np.ndarray:
        try:
            tprob = tprob.todense()
        except AttributeError as e:
            raise TypeError('Argument `tprob` must be convertible to a dense '
                            'numpy array. \n%s' % e)

    # typecheck - unwrap length-1 containers so source/sink/waypoint end up as plain ints
    checked = []
    for data in [source, sink, waypoint]:
        if isinstance(data, int):
            checked.append(data)
        elif hasattr(data, '__len__') and len(data) == 1:
            checked.append(int(data[0]))
        else:
            raise TypeError('Arguments source/sink/waypoint must be an int')
    source, sink, waypoint = checked

    if (source == waypoint) or (sink == waypoint) or (sink == source):
        raise ValueError('source, sink, waypoint must all be disjoint!')

    N = tprob.shape[0]
    Q = calculate_committors([source], [sink], tprob)

    # permute the transition matrix into canonical form - send waypoint to the
    # last row, and source + sink to the end after that
    Bsink_indices = [source, sink, waypoint]
    perm = np.arange(N)
    perm = np.delete(perm, Bsink_indices)
    perm = np.append(perm, Bsink_indices)
    T = MSMLib.permute_mat(tprob, perm)

    # extract P, R
    n = N - len(Bsink_indices)
    P = T[:n, :n]
    R = T[:n, n:]

    # calculate the conditional committors ( B = N*R ), B[i,j] is the prob
    # state i ends in j, where j runs over the source + sink + waypoint
    # (waypoint is position -1)
    B = np.dot(np.linalg.inv(np.eye(n) - P), R)

    # add probs for the sinks, waypoint / b[i] is P( i --> {C & not A, B} )
    b = np.append(B[:, -1].flatten(), [0.0] * (len(Bsink_indices) - 1) + [1.0])
    cond_Q = b * Q[waypoint]

    epsilon = 1e-6  # some numerical give, hard-coded
    assert cond_Q.shape == (N,)
    assert np.all(cond_Q <= 1.0 + epsilon)
    assert np.all(cond_Q >= 0.0 - epsilon)
    assert np.all(cond_Q <= Q[perm] + epsilon)

    # finally, calculate the fraction of paths h_C(A,B) (eq. 7 in [1])
    fraction_paths = np.sum(T[-3, :] * cond_Q) / np.sum(T[-3, :] * Q[perm])

    assert fraction_paths <= 1.0
    assert fraction_paths >= 0.0

    if return_cond_Q:
        cond_Q = cond_Q[np.argsort(perm)]  # put back in orig. order
        return fraction_paths, cond_Q
    else:
        return fraction_paths
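
# Illustrative usage sketch (hypothetical state indices; source, sink, and waypoint must all
# be distinct):
#
#     h = calculate_fraction_visits(T, waypoint=1, source=0, sink=2)
#     # h: fraction of 0 -> 2 transition paths that pass through state 1
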
def _run_trial(arg_dict):

    # unpack the arguments we expect in `arg_dict` explicitly (exec-based injection
    # into the local namespace does not work under python 3)
    rounds_of_sampling = arg_dict['rounds_of_sampling']
    simultaneous_samplers = arg_dict['simultaneous_samplers']
    size_of_intial_data = arg_dict['size_of_intial_data']
    length_of_sampling_trajs = arg_dict['length_of_sampling_trajs']
    num_states = arg_dict['num_states']
    transition_matrix = arg_dict['transition_matrix']
    observable_function = arg_dict['observable_function']
    SamplerObject = arg_dict['SamplerObject']

    # initialize data structures to hold output
    distance_to_target = np.zeros(rounds_of_sampling)
    obs_distance = np.zeros(rounds_of_sampling)

    # the assignments array will hold all of the output of all simulations
    assignments = -1.0 * np.ones((rounds_of_sampling * simultaneous_samplers + 1,
                                  max(size_of_intial_data, length_of_sampling_trajs + 1)))

    # initialize the "true" transition matrix
    if transition_matrix is None:
        assert num_states > 0
        C_rand = np.random.randint(0, 100, (num_states, num_states))
        C_rand += C_rand.T
        T = MSMLib.estimate_transition_matrix(C_rand)
    else:
        T = transition_matrix
        num_states = T.shape[0]

    T = sparse.csr_matrix(T)
    msm_analysis.check_transition(T)

    if observable_function:
        try:
            obs_goal = observable_function(T)
        except Exception as e:
            print(e, file=sys.stderr)
            raise Exception("Error evaluating function: %s" % observable_function.__name__)

    assignments[0, :size_of_intial_data] = msm_analysis.sample(T, None, size_of_intial_data)

    # iterate, adding simulation time
    for sampling_round in range(rounds_of_sampling):

        # apply the adaptive sampling method - we need to be true to what a
        # real simulation would actually see for the counts matrix
        mod_assignments = assignments.copy()
        mapping = MSMLib.renumber_states(mod_assignments)
        C_mod = MSMLib.get_count_matrix_from_assignments(mod_assignments)
        T_mod = MSMLib.estimate_transition_matrix(C_mod)
        adaptive_sampling_multivariate = SamplerObject.sample(C_mod)

        # choose the states to sample from (in the original indexing)
        state_inds = np.arange(len(adaptive_sampling_multivariate))
        sampler = stats.rv_discrete(name='sampler',
                                    values=[state_inds, adaptive_sampling_multivariate])
        starting_states = sampler.rvs(size=simultaneous_samplers)
        starting_states = mapping[starting_states]

        # start new 'simulations' in each of those states
        for i, init_state in enumerate(starting_states):
            a_ind = sampling_round * simultaneous_samplers + i + 1
            s_ind = length_of_sampling_trajs + 1
            assignments[a_ind, :s_ind] = msm_analysis.sample(T, init_state, s_ind)

        # build a new MSM from all the simulation so far
        C_raw = MSMLib.get_count_matrix_from_assignments(assignments, n_states=num_states)
        C_raw = C_raw + C_raw.T  # might want to add trimming, etc.
        T_pred = MSMLib.estimate_transition_matrix(C_raw)

        # calculate the error between the real transition matrix and our best prediction
        assert T.shape == T_pred.shape
        distance_to_target[sampling_round] = \
            np.sqrt(((T_pred - T).data ** 2).sum()) / float(num_states)

        if observable_function:
            obs_distance[sampling_round] = np.abs(observable_function(T_mod) - obs_goal)

    return distance_to_target, obs_distance
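
# Illustrative usage sketch (every key below is hypothetical and simply mirrors what the
# function body reads from `arg_dict`; `my_sampler` stands in for any object exposing a
# `sample(counts)` method that returns a probability vector over states):
#
#     args = {'rounds_of_sampling': 10, 'simultaneous_samplers': 5,
#             'size_of_intial_data': 100, 'length_of_sampling_trajs': 50,
#             'num_states': 25, 'transition_matrix': None,
#             'observable_function': None, 'SamplerObject': my_sampler}
#     distance_to_target, obs_distance = _run_trial(args)
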