def get_count_matrix_from_assignments(assignments, n_states=None, lag_time=1,
                                      sliding_window=True, use_mask=False):
    """Calculate the transition count matrix from `assignments`.

    Parameters
    ----------
    assignments : ndarray
        A 2d ndarray containing the state assignments (or, more generally,
        an iterable of 1d integer arrays, one per trajectory).
    n_states : int, optional
        Can be automatically determined, unless you want a model with more
        states than are observed.
    lag_time : int, optional
        The lag time with which to estimate the count matrix. Default: 1
    sliding_window : bool, optional
        Use a sliding window. Default: True
    use_mask : bool, optional
        Use a mask object in computing the count matrix. Default: False

    Returns
    -------
    counts : sparse matrix
        `counts[i, j]` stores the number of times in the assignments that a
        trajectory went from state i to state j in `lag_time` frames.

    Raises
    ------
    ValueError
        If the number of states (given or inferred) is smaller than one.

    Notes
    -----
    Assignments are input as iterables over numpy 1-d arrays of integers,
    for example a 2-d array where assignments[i, j] gives the ith
    trajectory, jth frame.

    Leading negative ones in a trajectory are stripped here before counting.
    A trajectory that consists only of negative ones contributes nothing.

    If the number of states is not given explicitly, it is determined as
    one plus the largest state index found in the assignments.

    Sliding window yields non-independent samples, but wastes less data.
    """
    check_assignment_array_input(assignments)

    if not n_states:
        # Take the max per trajectory first: a single np.max cannot handle
        # a list of 1-d arrays of different lengths.
        n_states = 1 + int(np.max([np.max(a) for a in assignments]))

    if n_states < 1:
        raise ValueError(
            "n_states must be at least 1 (got %r); the assignments contain "
            "no non-negative state index" % (n_states,))

    n_states = int(n_states)
    # The accumulator starts out as a float32 LIL matrix.
    counts = scipy.sparse.lil_matrix((n_states, n_states), dtype='float32')

    # enumerate() supplies the trajectory index directly; the trimmed
    # trajectory can no longer be compared against `assignments` to find it.
    for traj_num, traj in enumerate(assignments):
        data_frames = np.where(traj != -1)[0]
        if len(data_frames) == 0:
            # Nothing but padding in this trajectory.
            continue

        # Skip pre-padded negative ones so that they do not disturb the
        # results of Tarjan trimming.
        traj = traj[data_frames[0]:]

        extra_kwargs = {}
        if use_mask:
            # NOTE(review): `mask` has to resolve to a module-level object
            # providing get_dihedral_mask(); it is not defined in this
            # function and must not be shadowed by a local of the same name
            # (doing so made this call fail on None). TODO confirm which
            # object is meant and whether the returned mask has to be
            # trimmed like the trajectory.
            extra_kwargs['mask'] = mask.get_dihedral_mask(traj_num)

        # `mask` is only forwarded when requested, so the default path also
        # works with a get_counts_from_traj() that has no `mask` parameter.
        counts = counts + get_counts_from_traj(
            traj, n_states, lag_time=lag_time,
            sliding_window=sliding_window, **extra_kwargs)

    return counts
def get_count_matrix_from_assignments(assignments, n_states=None, lag_time=1, sliding_window=True):
    """Accumulate the transition counts of all trajectories in `assignments`.

    Parameters
    ----------
    assignments : ndarray
        2d array of state assignments; more generally an iterable of 1d
        integer arrays, where assignments[i][j] is frame j of trajectory i.
    n_states : int, optional
        Number of states of the model. When omitted it is inferred as one
        plus the largest state index present in `assignments`.
    lag_time : int, optional
        Lag time, in frames, used for counting transitions. Default: 1
    sliding_window : bool, optional
        Whether to count with a sliding window. Default: True

    Returns
    -------
    counts : sparse matrix
        `counts[i, j]` is the number of observed transitions from state i
        to state j separated by `lag_time` frames.

    Raises
    ------
    ValueError
        If the inferred number of states is smaller than one.

    Notes
    -----
    Leading negative ones (padding) of every trajectory are cut off before
    the trajectory is handed to `get_counts_from_traj`; a trajectory made
    only of negative ones is skipped.

    A sliding window yields non-independent samples, but wastes less data.
    """
    check_assignment_array_input(assignments)

    if not n_states:
        # Reduce trajectory by trajectory: one global np.max cannot cope
        # with a list of 1-d arrays of different lengths.
        per_traj_max = [np.max(traj) for traj in assignments]
        n_states = int(np.max(per_traj_max)) + 1
        if n_states < 1:
            raise ValueError()

    dim = int(n_states)
    # The running total starts as a float32 LIL matrix.
    total = scipy.sparse.lil_matrix((dim, dim), dtype='float32')

    for traj in assignments:
        data_frames = np.where(traj != -1)[0]
        if len(data_frames) < 1:
            # Only padding in this trajectory.
            continue

        # Drop the pre-padded negative ones (they interfere with the
        # results of Tarjan trimming).
        trimmed = traj[data_frames[0]:]
        total = total + get_counts_from_traj(
            trimmed, n_states, lag_time=lag_time, sliding_window=sliding_window)

    return total
def get_counts_from_traj(states, n_states=None, lag_time=1, sliding_window=True):
    """Compute the transition count matrix for a single trajectory.

    Parameters
    ----------
    states : array
        A one-dimensional array of integers representing the sequence of
        states. Valid state indices lie in the range [0, n_states);
        negative entries are treated as padding.
    n_states : int, optional
        The total number of states. If not specified, the largest integer
        in the states array plus one will be used.
    lag_time : int, optional
        The time delay, in frames, over which transitions are counted.
    sliding_window : bool, optional
        If True, every frame starts a transition. Otherwise only every
        `lag_time`-th frame does.

    Returns
    -------
    C : scipy.sparse.coo_matrix of integers
        The transition count matrix of shape (n_states, n_states). One
        entry with value 1 is stored per counted transition.

    Raises
    ------
    ValueError
        If `lag_time` is smaller than one.

    Notes
    -----
    Transitions into or out of a negative state index are ignored.
    """
    check_assignment_array_input(states, ndim=1)

    # A zero or negative lag would pair the wrong frames (or none at all),
    # so reject it explicitly instead of relying on an assert.
    if lag_time < 1:
        raise ValueError("lag_time must be at least 1, got %r" % (lag_time,))

    states = np.asarray(states)

    if not n_states:
        n_states = np.max(states) + 1

    stride = 1 if sliding_window else lag_time
    from_states = states[:-lag_time:stride]
    to_states = states[lag_time::stride]

    # Filter padding up front: the sparse constructor rejects negative
    # indices, and dropping them here avoids storing placeholder entries.
    observed = (from_states >= 0) & (to_states >= 0)
    rows = from_states[observed]
    cols = to_states[observed]
    counts = np.ones(rows.shape[0], dtype=int)

    return scipy.sparse.coo_matrix((counts, (rows, cols)),
                                   shape=(n_states, n_states))
def invert_assignments(assignments):
    """Invert an assignments array.

    Produces a mapping from state -> (trajectory indices, frame indices).

    Parameters
    ----------
    assignments : np.ndarray
        2D array of MSMBuilder assignments

    Returns
    -------
    inverse_mapping : collections.defaultdict
        Mapping from state -> (trajs, frames), a pair of equally long numpy
        arrays such that assignments[trajs[k], frames[k]] == state for
        every k. `assignments[inverse_mapping[s]]` therefore selects the
        conformations assigned to state s.

    Notes
    -----
    The assignments array may contain -1's, which are simply placeholders;
    they are not added to the inverted assignments.

    The returned object is a defaultdict: looking up a state that does not
    occur (including -1) does not raise a KeyError but returns, and
    stores, a pair of empty lists. Use `state in inverse_mapping` to test
    whether a state is populated.
    """
    check_assignment_array_input(assignments)

    inverse_mapping = defaultdict(lambda: ([], []))
    # Rows of (trajectory, frame) for every entry that is not a -1.
    non_neg_inds = np.array(np.where(assignments != -1)).T

    for (i, j) in non_neg_inds:
        trajs, frames = inverse_mapping[assignments[i, j]]
        trajs.append(i)
        frames.append(j)

    # Convert the lists to numpy arrays. dict.items() exists on both
    # Python 2 and 3 (iteritems() is Python 2 only); iterate a snapshot
    # because the values are replaced inside the loop.
    for state, (trajs, frames) in list(inverse_mapping.items()):
        inverse_mapping[state] = (np.array(trajs), np.array(frames))

    return inverse_mapping
def invert_assignments(assignments):
    """Build the state -> (trajectories, frames) lookup for `assignments`.

    Parameters
    ----------
    assignments : np.ndarray
        2D array of MSMBuilder assignments

    Returns
    -------
    inverse_mapping : collections.defaultdict
        For every state s that occurs, inverse_mapping[s] is a pair of
        numpy arrays (trajs, frames) listing where s is found, so that
        assignments[inverse_mapping[s]] selects the conformations assigned
        to state s.

    Notes
    -----
    Entries equal to -1 are placeholders and are left out of the result.

    Because the result is a defaultdict, asking for a state that is not
    present (including -1) returns and stores a pair of empty lists
    instead of raising a KeyError.
    """
    check_assignment_array_input(assignments)

    inverse_mapping = defaultdict(lambda: ([], []))

    # Walk over the positions of all entries that are not -1.
    traj_idx, frame_idx = np.where(assignments != -1)
    for traj, frame in zip(traj_idx, frame_idx):
        traj_list, frame_list = inverse_mapping[assignments[traj, frame]]
        traj_list.append(traj)
        frame_list.append(frame)

    # Swap the accumulated lists for numpy arrays.
    for state, pair in iteritems(inverse_mapping):
        inverse_mapping[state] = (np.array(pair[0]), np.array(pair[1]))

    return inverse_mapping
def apply_mapping_to_assignments(assignments, mapping):
    """Relabel the states of an assignments array in place.

    Parameters
    ----------
    assignments : ndarray
        Standard 2D assignments array. It is overwritten with the new
        state labels.
    mapping : ndarray
        1D numpy array indexed by the current state labels.
        mapping[a] = b means that the frames currently in state a will be
        put into state b; b = -1 marks a state that is removed.

    Returns
    -------
    None
        The result is written into `assignments`; nothing is returned.

    Notes
    -----
    This function is useful after performing PCCA or Ergodic Trimming.
    The state -1 is treated specially: frames that are -1 stay -1 and are
    not remapped, and frames whose state maps to -1 become -1.
    """
    check_assignment_array_input(assignments)

    # Eliminated states are routed through a label one past the largest
    # target label, so they can be located again after the remapping.
    remap = mapping.copy()
    remap[mapping == -1] = mapping.max() + 1

    # Remember the placeholder frames before the array is overwritten.
    was_padding = assignments == -1

    assignments[:] = remap[assignments]
    was_eliminated = assignments == (mapping.max() + 1)

    # Dangling 'tails' of trajectories (no actual data) stay state -1.
    assignments[was_padding] = -1
    # Frames of states that were "deleted", typically by ergodic trimming;
    # these can sit at the beginning or the end of a trajectory.
    assignments[was_eliminated] = -1
def renumber_states(assignments):
    """Renumber states to be consecutive integers (0, 1, ... , n).

    The transformation is performed in place.

    Parameters
    ----------
    assignments : ndarray
        2D array of msmbuilder assignments. Its entries other than -1 are
        overwritten with the new state indices.

    Returns
    -------
    mapping : ndarray, int
        A mapping from the old numbering scheme to the new, such that
        mapping[new] = old

    Notes
    -----
    Useful if some states have 0 counts.

    Entries equal to -1 are placeholders: they are left untouched and do
    not appear in `mapping`. The new indices follow the sorted order of
    the old state labels. An array without any populated frame yields an
    empty mapping.
    """
    check_assignment_array_input(assignments)

    populated = assignments != -1
    # np.unique returns the sorted distinct labels, and for every frame the
    # position of its label in that sorted list, which is exactly the new
    # consecutive index.
    unique, new_index = np.unique(assignments[populated], return_inverse=True)
    assignments[populated] = new_index

    mapping = np.array(unique, dtype=int)
    return mapping