示例#1
0
    def default(dtrajs, n_states: int, lagtime: int, n_samples: int = 100,
                stride: Union[str, int] = 'effective',
                p0_prior: Optional[Union[str, float, np.ndarray]] = 'mixed',
                transition_matrix_prior: Union[str, np.ndarray] = 'mixed',
                separate: Optional[Union[int, List[int]]] = None,
                store_hidden: bool = False,
                reversible: bool = True,
                stationary: bool = False,
                dt_traj: str = '1 step'):
        """
        Creates a :class:`BayesianHMSM` estimator which is initialized with a default prior model.

        The prior is produced by fitting the estimator returned from
        :meth:`BayesianHMSM.default_prior_estimator` on ``dtrajs`` and restricting the fitted model
        to its largest submodel (``connectivity_threshold='1/n'``). The returned estimator is
        configured with that prior but has not been fitted yet.

        For a more detailed description of the arguments please
        refer to :class:`HMSM <sktime.markovprocess.hidden_markov_model.HMSM>` or
        :class:`BayesianHMSM <sktime.markovprocess.bayesian_hmsm.BayesianHMSM>`.

        Note that ``dt_traj`` and ``separate`` are only forwarded to the prior estimator, whereas
        ``n_samples``, ``p0_prior``, ``transition_matrix_prior`` and ``store_hidden`` are only
        forwarded to the returned Bayesian estimator.
        """
        dtrajs = ensure_dtraj_list(dtrajs)

        # step 1: fit the prior model on the data
        prior_estimator = BayesianHMSM.default_prior_estimator(
            n_states=n_states, lagtime=lagtime, stride=stride,
            reversible=reversible, stationary=stationary,
            separate=separate, dt_traj=dt_traj
        )
        fitted_prior = prior_estimator.fit(dtrajs).fetch_model()

        # step 2: restrict the prior to its largest submodel
        prior = fitted_prior.submodel_largest(connectivity_threshold='1/n', dtrajs=dtrajs)

        # step 3: hand the prior over to a fresh Bayesian estimator
        return BayesianHMSM(
            init_hmsm=prior, n_states=n_states, lagtime=lagtime, n_samples=n_samples,
            stride=stride, p0_prior=p0_prior, transition_matrix_prior=transition_matrix_prior,
            store_hidden=store_hidden, reversible=reversible,
            stationary=stationary
        )
示例#2
0
def count_states(dtrajs, ignore_negative: bool = False):
    r"""Counts how often each state is visited in one or several discretized trajectories.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories
    ignore_negative : bool, default=False
        If True, negative entries are dropped before counting. If False, negative entries
        are passed on to :func:`numpy.bincount` unchanged, which rejects them with an exception.

    Returns
    -------
    count : ndarray((n), dtype=int)
        the number of occurrences of each state. n=max+1 where max is the largest state index found.

    """
    dtrajs = ensure_dtraj_list(dtrajs)

    # one histogram per trajectory; these may differ in length
    per_trajectory = []
    for traj in dtrajs:
        if ignore_negative:
            traj = traj[traj >= 0]
        per_trajectory.append(np.bincount(traj))

    # the aggregated histogram must be as long as the longest individual one
    n_bins = max((hist.shape[0] for hist in per_trajectory), default=0)
    total = np.zeros(n_bins, dtype=int)
    for hist in per_trajectory:
        total[:hist.shape[0]] += hist
    return total
示例#3
0
def lag_observations(observations, lag, stride=1):
    r""" Splits each trajectory into shifted sub-trajectories that are subsampled at the lag time.

    Given a trajectory (s0, s1, s2, s3, s4, ...) and lag 3, this function will generate 3 trajectories
    (s0, s3, s6, ...), (s1, s4, s7, ...) and (s2, s5, s8, ...). Use this function in order to parametrize a MLE
    at lag times larger than 1 without discarding data. Do not use this function with ``stride=1`` for Bayesian
    estimators, where data must be given such that subsequent transitions are uncorrelated.

    Parameters
    ----------
    observations : array_like or list of array_like
        observation trajectories
    lag : int
        lag time
    stride : int, default=1
        Step between two consecutive shifts, i.e., the shifts used are ``0, stride, 2*stride, ... < lag``.
        A larger stride therefore yields fewer sub-trajectories. Use this for Bayesian analysis.

    Returns
    -------
    list of array_like
        The lagged sub-trajectories of all input trajectories. Sub-trajectories with fewer than
        two frames are left out.
    """
    # todo cppify
    observations = ensure_dtraj_list(observations)
    lagged = []
    for trajectory in observations:
        shifted = (trajectory[offset::lag] for offset in range(0, lag, stride))
        # a sub-trajectory needs at least two frames to be kept
        lagged.extend(sub for sub in shifted if len(sub) > 1)
    return lagged
def initial_guess_gaussian_from_data(dtrajs, n_hidden_states, reversible):
    r""" Builds an initial guess :class:`HMM <HiddenMarkovStateModel>` with a Gaussian output model.

    A Gaussian mixture model with one component per hidden state is fitted to all observations
    using `scikit-learn <https://scikit-learn.org/>`_. Its means and covariances parametrize the output model.
    The hidden transition matrix is then estimated from fractional transition counts obtained from the
    normalized per-frame state probabilities of that output model.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Trajectories which are used for making the initial guess.
    n_hidden_states : int
        Number of hidden states.
    reversible : bool
        Whether the hidden transition matrix is estimated so that it is reversible.

    Returns
    -------
    hmm_init : HiddenMarkovStateModel
        An initial guess for the HMM

    See Also
    --------
    GaussianOutputModel : The type of output model this heuristic uses.
    initial_guess_discrete_from_data : Initial guess with :class:`Discrete output model <sktime.markov.hmm.DiscreteOutputModel>`.
    initial_guess_discrete_from_msm : Initial guess from an already
                                      existing :class:`MSM <sktime.markov.msm.MarkovStateModel>`
                                      with discrete output model.
    """
    from sklearn.mixture import GaussianMixture
    dtrajs = ensure_dtraj_list(dtrajs)

    # Fit the mixture model on the pooled frames of all trajectories (as a single feature column).
    pooled = np.concatenate(dtrajs)
    mixture = GaussianMixture(n_components=n_hidden_states)
    mixture.fit(pooled[:, None])
    component_means = mixture.means_[:, 0]
    component_sigmas = np.sqrt(mixture.covariances_[:, 0])
    output_model = GaussianOutputModel(n_hidden_states,
                                       means=component_means,
                                       sigmas=component_sigmas)

    # Accumulate fractional transition counts over all trajectories.
    fractional_counts = np.zeros((n_hidden_states, n_hidden_states))
    for trajectory in dtrajs:
        n_frames = trajectory.shape[0]
        # per-frame state probabilities, normalized so that each row sums to one
        memberships = output_model.to_state_probability_trajectory(trajectory)
        memberships /= memberships.sum(axis=1)[:, None]
        # each pair of consecutive frames contributes the outer product of its memberships
        for t in range(n_frames - 1):
            fractional_counts += np.outer(memberships[t, :], memberships[t + 1, :])

    # Maximum likelihood estimate of the transition matrix and its stationary distribution.
    import msmtools.estimation as msmest
    import msmtools.analysis as msmana
    transition_matrix = msmest.transition_matrix(fractional_counts, reversible=reversible)
    stationary_distribution = msmana.stationary_distribution(transition_matrix)
    return HiddenMarkovStateModel(transition_model=transition_matrix,
                                  output_model=output_model,
                                  initial_distribution=stationary_distribution)
示例#5
0
 def nonempty_obs(self, dtrajs):
     """Returns the indices of all states which are visited in the effective (lagged and strided) trajectories.

     Raises a ValueError if ``dtrajs`` is None.
     """
     if dtrajs is None:
         raise ValueError("Needs nonempty dtrajs to evaluate nonempty obs.")
     dtraj_list = ensure_dtraj_list(dtrajs)
     count_model = self.count_model
     effective_dtrajs = compute_dtrajs_effective(
         dtraj_list, count_model.lagtime, count_model.n_states_full, self.stride)
     # a state counts as nonempty if it occurs at least once
     occurrences = count_states(effective_dtrajs)
     return np.where(occurrences > 0)[0]
示例#6
0
 def test_2state_rev_step(self):
     """A trajectory that steps once from state 0 to state 1 must be rejected by the Bayesian estimator."""
     # five frames in state 0 followed by four frames in state 1
     observations = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int)
     dtrajs = ensure_dtraj_list(observations)
     # positional arguments: two hidden states and a lagtime of one
     initial_hmm = initial_guess_discrete_from_data(dtrajs, 2, 1, regularize=False)
     mle_estimator = MaximumLikelihoodHMSM(initial_hmm, lagtime=1)
     hmm = mle_estimator.fit(dtrajs).fetch_model()
     # this will generate disconnected count matrices and should fail:
     with self.assertRaises(NotImplementedError):
         BayesianHMSM(hmm).fit(observations)
示例#7
0
def visited_set(dtrajs):
    r"""Determines which states occur at least once in the given trajectories.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories

    Returns
    -------
    vis : ndarray((n), dtype=int)
        the set of states that have at least one count.
    """
    dtrajs = ensure_dtraj_list(dtrajs)
    state_counts = count_states(dtrajs)
    is_visited = state_counts > 0
    # argwhere yields one row per hit; the first column holds the state index
    visited = np.argwhere(is_visited)
    return visited[:, 0]
示例#8
0
def compute_effective_stride(dtrajs, lagtime, n_states) -> int:
    r"""
    Estimates the stride that is required to obtain approximately uncorrelated samples.

    The fallback is the lagtime itself (lag sampling). To refine it, a nonreversible MSM is estimated on the
    largest connected set of the sliding-window counts. If that MSM has more states than ``n_states``, the
    stride becomes the minimum of the lagtime and two times the correlation time of the next neglected
    timescale, where the correlation time is bounded from below by one.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories
    lagtime : int
        Lagtime
    n_states : int
        Number of resolved states

    Returns
    -------
    stride : int
        Estimated effective stride to produce approximately uncorrelated samples
    """
    dtrajs = ensure_dtraj_list(dtrajs)

    # quick nonreversible MSM on the largest connected set of the sliding counts
    from sktime.markovprocess import TransitionCountEstimator
    counting = TransitionCountEstimator(lagtime=lagtime, count_mode="sliding")
    connected_counts = counting.fit(dtrajs).fetch_model().submodel_largest()
    from sktime.markovprocess import MaximumLikelihoodMSM
    nonreversible_msm = MaximumLikelihoodMSM(reversible=False, sparse=False).fit(connected_counts).fetch_model()

    if nonreversible_msm.n_states <= n_states:
        # no neglected timescale available: use lag as stride (=lag sampling), because we currently have
        # no better theory for deciding how many uncorrelated counts we can make
        return lagtime

    # the MSM resolves more than n_states states, so the next (neglected) timescale serves as
    # an estimate of the de-correlation time
    import warnings
    from msmtools.util.exceptions import ImaginaryEigenValueWarning
    with warnings.catch_warnings():
        # nonreversible models may have complex eigenvalues; silence the corresponding warning
        warnings.filterwarnings('ignore', category=ImaginaryEigenValueWarning,
                                module='msmtools.analysis.dense.decomposition')
        neglected_timescale = nonreversible_msm.timescales()[n_states - 1]
        correlation_time = max(1, neglected_timescale)

    # use the smaller of the two pessimistic estimates
    return int(min(lagtime, 2 * correlation_time))
示例#9
0
    def transform_discrete_trajectories_to_observed_symbols(self, dtrajs):
        r"""Maps discrete trajectories onto the set of observation symbols which is currently in use.

        Each frame is replaced by the index of its symbol within ``observation_state_symbols``. Frames whose
        symbol is not part of that set are set to -1. For example, if there has been a subselection of the
        model for connectivity='largest', the indices will be given within the connected set.

        Parameters
        ----------
        dtrajs : array_like or list of array_like
            discretized trajectories

        Returns
        -------
        list of np.ndarray
            Curated discretized trajectories so that unconsidered symbols are mapped to -1.
        """
        dtrajs = ensure_dtraj_list(dtrajs)

        # lookup table over the full symbol space, initialized with "not considered"
        lookup = -np.ones(self.n_observation_states_full, dtype=np.int32)
        # considered symbols are enumerated in the order in which they are stored
        lookup[self.observation_state_symbols] = np.arange(self.n_observation_states)

        curated = []
        for dtraj in dtrajs:
            curated.append(lookup[dtraj])
        return curated
示例#10
0
    def nonempty_obs(self, dtrajs) -> np.ndarray:
        r"""
        Determines which observable states are visited in the effective (lagged and strided) trajectories.

        Parameters
        ----------
        dtrajs : array_like
            observable trajectory

        Returns
        -------
        symbols : np.ndarray
            The observation symbols which are visited.

        Raises
        ------
        ValueError
            If ``dtrajs`` is None.
        """
        from sktime.markov.util import compute_dtrajs_effective, count_states
        if dtrajs is None:
            raise ValueError("Needs nonempty dtrajs to evaluate nonempty obs.")
        dtraj_list = ensure_dtraj_list(dtrajs)
        transition_model = self.transition_model
        effective_dtrajs = compute_dtrajs_effective(
            dtraj_list, transition_model.lagtime,
            transition_model.count_model.n_states_full, self.stride)
        # a symbol counts as visited if it occurs at least once
        occurrences = count_states(effective_dtrajs)
        return np.where(occurrences > 0)[0]
示例#11
0
def compute_index_states(dtrajs, subset=None) -> typing.List[np.ndarray]:
    """Collects, for each state, the trajectory and time indices at which it occurs.

    The actual work is delegated to ``sample.index_states`` of the compiled bindings module.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories. Negative elements will be ignored
    subset : ndarray((n)), optional, default = None
        array of states to be indexed. By default all states in dtrajs will be used

    Returns
    -------
    indices : list of ndarray( (N_i, 2) )
        For each state, all trajectory and time indices where this state occurs.
        Each matrix has a number of rows equal to the number of occurrences of the corresponding state,
        with rows consisting of a tuple (i, t), where i is the index of the trajectory and t is the time index
        within the trajectory.

    """
    from . import _markovprocess_bindings as bindings
    # the bindings expect the trajectories in list form
    trajectories = ensure_dtraj_list(dtrajs)
    indices = bindings.sample.index_states(trajectories, subset)
    return indices
示例#12
0
    def compute_viterbi_paths(self, observations) -> List[np.ndarray]:
        r"""
        Runs the Viterbi algorithm on each observation trajectory using this model's parameters.

        The observations are first converted into state probability trajectories by the output model. These are
        then passed to ``viterbi`` together with the transition matrix and the initial distribution.

        Parameters
        ----------
        observations : list of array_like or array_like
            observations

        Returns
        -------
        paths : list of np.ndarray
            the computed viterbi paths
        """
        observations = ensure_dtraj_list(observations)
        transition_matrix = self.transition_model.transition_matrix
        initial_distribution = self.initial_distribution

        # first pass: state probabilities for every trajectory
        probability_trajectories = []
        for trajectory in observations:
            probability_trajectories.append(self.output_model.to_state_probability_trajectory(trajectory))

        # second pass: one path per trajectory
        return [viterbi(transition_matrix, probabilities, initial_distribution)
                for probabilities in probability_trajectories]
示例#13
0
    def fit(self, data, *args, **kw):
        r""" Counts transitions at the configured lag time and stores them as a transition count model.

        Depending on ``count_mode`` the counts are collected with a sliding window ('sliding'), with a sliding
        window and subsequent division by the lag time ('sliding-effective'), at lag-sized steps ('sample'), or
        as effective counts ('effective').

        Parameters
        ----------
        data : array_like or list of array_like
            discretized trajectories
        *args, **kw
            Not used by this method.

        Returns
        -------
        self : the estimator, holding the new model.

        Raises
        ------
        ValueError
            If ``count_mode`` is none of the modes listed above.
        """
        dtrajs = ensure_dtraj_list(data)

        # state occurrences, negative entries are not counted
        state_histogram = count_states(dtrajs, ignore_negative=True)

        mode = self.count_mode
        lag = self.lagtime
        if mode in ('sliding', 'sliding-effective'):
            counts = msmest.count_matrix(dtrajs, lag, sliding=True, sparse_return=self.sparse)
            if mode == 'sliding-effective':
                counts /= lag
        elif mode == 'sample':
            counts = msmest.count_matrix(dtrajs, lag, sliding=False, sparse_return=self.sparse)
        elif mode == 'effective':
            counts = msmest.effective_count_matrix(dtrajs, lag)
            # densify if a dense result was requested
            if not self.sparse and issparse(counts):
                counts = counts.toarray()
        else:
            raise ValueError('Count mode {} is unknown.'.format(mode))

        # initially state symbols, full count matrix, and full histogram can be left None because they coincide
        # with the input arguments
        self._model = TransitionCountModel(
            count_matrix=counts, counting_mode=mode, lagtime=lag, state_histogram=state_histogram,
            physical_time=self.physical_time
        )

        return self
    def fit(self, dtrajs, **kwargs):
        r""" Estimates a hidden Markov state model from discrete trajectories by maximum likelihood.

        The trajectories are converted to effective (lagged and strided) trajectories, an initial discrete HMM
        is built according to ``self.msm_init``, and the HMM is then estimated on the effective trajectories.

        Parameters
        ----------
        dtrajs : array_like or list of array_like
            Discretized trajectories.
        **kwargs
            Not used by this method.

        Returns
        -------
        self : the estimator, holding the estimated model.

        Raises
        ------
        ValueError
            If the lag time is not smaller than the length of the longest trajectory.
        RuntimeError
            If ``self.msm_init`` is neither a string nor a MarkovStateModel.
        """
        dtrajs = ensure_dtraj_list(dtrajs)
        # Check the lag time against the trajectory lengths: it must be shorter than the longest trajectory,
        # and a warning is issued if it exceeds the mean trajectory length.
        trajlengths = [len(dtraj) for dtraj in dtrajs]
        if self.lagtime >= np.max(trajlengths):
            raise ValueError(
                f'Illegal lag time {self.lagtime} exceeds longest trajectory length'
            )
        if self.lagtime > np.mean(trajlengths):
            warnings.warn(
                f'Lag time {self.lagtime} is on the order of mean trajectory length '
                f'{np.mean(trajlengths)}. It is recommended to fit four lag times in each '
                'trajectory. HMM might be inaccurate.')

        # All subsequent estimation steps operate on the lagged and strided trajectories.
        dtrajs_lagged_strided = compute_dtrajs_effective(
            dtrajs,
            lagtime=self.lagtime,
            n_states=self.n_hidden_states,
            stride=self.stride)

        # Build the initial HMM, either from a named strategy (string) or from an existing MSM.
        if isinstance(self.msm_init, str):
            # lag=1 because the lag time has already been applied to the trajectories above.
            # The initialization always uses stationary=True, independent of self.stationary.
            args = dict(observations=dtrajs_lagged_strided,
                        n_states=self.n_hidden_states,
                        lag=1,
                        reversible=self.reversible,
                        stationary=True,
                        regularize=True,
                        separate=self.separate)
            # NOTE(review): any other string leaves 'method' unset, so init_discrete_hmm presumably falls back
            # to its own default - confirm that this is intended.
            if self.msm_init == 'largest-strong':
                args['method'] = 'lcs-spectral'
            elif self.msm_init == 'all':
                args['method'] = 'spectral'

            hmm_init = init_discrete_hmm(**args)
        elif isinstance(self.msm_init, MarkovStateModel):
            msm_count_model = self.msm_init.count_model
            # NOTE(review): toarray() assumes that the count matrix of the MSM is sparse - confirm.
            p0, P0, pobs0 = init_discrete_hmm_spectral(
                msm_count_model.count_matrix.toarray(),
                self.n_hidden_states,
                reversible=self.reversible,
                stationary=True,
                P=self.msm_init.transition_matrix,
                separate=self.separate)
            hmm_init = discrete_hmm(p0, P0, pobs0)
        else:
            # NOTE(review): the message names 'spectral', whereas the strings checked above are
            # 'largest-strong' and 'all' - verify which names are valid.
            raise RuntimeError(
                "msm init was neither a string (largest-strong or spectral) nor "
                "a MarkovStateModel: {}".format(self.msm_init))

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------
        from .bhmm.estimators.maximum_likelihood import MaximumLikelihoodHMM
        hmm_est = MaximumLikelihoodHMM(self.n_hidden_states,
                                       initial_model=hmm_init,
                                       output='discrete',
                                       reversible=self.reversible,
                                       stationary=self.stationary,
                                       accuracy=self.accuracy,
                                       maxit=self.maxit)
        hmm = hmm_est.fit(dtrajs_lagged_strided).fetch_model()
        # observation_state_symbols = np.unique(np.concatenate(dtrajs_lagged_strided))
        # update the count matrix from the counts obtained via the Viterbi paths.
        hmm_count_model = TransitionCountModel(
            count_matrix=hmm.transition_counts,
            lagtime=self.lagtime,
            physical_time=self.physical_time)
        # Wrap the estimated HMM parameters and the count model into the resulting model object;
        # the underlying HMM itself is kept as bhmm_model.
        self._model = HiddenMarkovStateModel(
            transition_matrix=hmm.transition_matrix,
            observation_probabilities=hmm.output_model.output_probabilities,
            stride=self.stride,
            stationary_distribution=hmm.stationary_distribution,
            initial_counts=hmm.initial_count,
            reversible=self.reversible,
            initial_distribution=hmm.initial_distribution,
            count_model=hmm_count_model,
            bhmm_model=hmm,
            observation_state_symbols=None)
        return self
示例#15
0
    def fit(self, dtrajs, callback=None):
        r""" Samples hidden Markov state models, starting from the initial model ``self.init_hmsm``.

        Parameters
        ----------
        dtrajs : array_like or list of array_like
            Discretized trajectories.
        callback : callable, optional, default=None
            Passed on unchanged to the sampler (``bayesian_hmm``).

        Returns
        -------
        self : the estimator, holding the sampled posterior as its model.

        Raises
        ------
        ValueError
            If lagtime or number of states of ``init_hmsm`` differ from those of this estimator, or if the
            stride differs from the one of ``init_hmsm`` and the symbols found in the lagged and strided
            trajectories do not match ``init_hmsm.observation_state_symbols``.
        NotImplementedError
            If a reversible sampler is requested but the count matrix of the prior is disconnected.

        Notes
        -----
        If ``self.stride`` is ``'effective'``, it is replaced by the computed effective stride, i.e., fitting
        modifies the estimator's ``stride`` attribute.
        """
        dtrajs = ensure_dtraj_list(dtrajs)

        model = BayesianHMMPosterior()

        # check if n_states and lag are compatible
        if self.lagtime != self.init_hmsm.lagtime:
            raise ValueError('BayesianHMSM cannot be initialized with init_hmsm with incompatible lagtime.')
        if self.n_states != self.init_hmsm.n_states:
            raise ValueError('BayesianHMSM cannot be initialized with init_hmsm with incompatible n_states.')

        # EVALUATE STRIDE
        init_stride = self.init_hmsm.stride
        if self.stride == 'effective':
            from sktime.markovprocess.util import compute_effective_stride
            self.stride = compute_effective_stride(dtrajs, self.lagtime, self.n_states)

        # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
        dtrajs_lagged_strided = compute_dtrajs_effective(
            dtrajs, lagtime=self.lagtime, n_states=self.n_states, stride=self.stride
        )
        if self.stride != init_stride:
            symbols = np.unique(np.concatenate(dtrajs_lagged_strided))
            # array_equal also handles symbol sets of different size; an elementwise '==' would broadcast
            # or fail on mismatching shapes instead of reporting the incompatibility below
            if not np.array_equal(self.init_hmsm.observation_state_symbols, symbols):
                raise ValueError('Choice of stride has excluded a different set of microstates than in '
                                 'init_hmsm. Set of observed microstates in time-lagged strided trajectories '
                                 'must match to the one used for init_hmsm estimation.')

        # as mentioned in the docstring, take init_hmsm observed set observation probabilities
        self.observe_nonempty = False

        # update HMM Model
        model.prior = self.init_hmsm.copy()

        prior = model.prior
        prior_count_model = prior.count_model
        # check if we have a valid initial model
        if self.reversible and not is_connected(prior_count_model.count_matrix):
            # report exactly the quantities which were checked: the count matrix of the prior and this
            # estimator's lagtime
            raise NotImplementedError(f'Encountered disconnected count matrix:\n{prior_count_model.count_matrix} '
                                      f'with reversible Bayesian HMM sampler using lag={self.lagtime}'
                                      f' and stride={self.stride}. Consider using shorter lag, '
                                      'or shorter stride (to use more of the data), '
                                      'or using a lower value for mincount_connectivity.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        n_states_full = number_of_states(dtrajs)

        if prior.n_observation_states < n_states_full:
            eps = 0.01 / n_states_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * np.ones((self.n_states, n_states_full), dtype=np.float64)
            # fill active states
            B_init[:, prior.observation_state_symbols] = np.maximum(eps, prior.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = prior.observation_probabilities

        # HMM sampler
        # NOTE(review): init_hmsm has already been dereferenced above, so it cannot be None at this point.
        # The else branch (and with it B_init) therefore appears to be unused - confirm which initial model
        # is intended before removing either.
        if self.init_hmsm is not None:
            hmm_mle = self.init_hmsm.bhmm_model
        else:
            hmm_mle = discrete_hmm(prior.initial_distribution, prior.transition_matrix, B_init)

        sampled_hmm = bayesian_hmm(dtrajs_lagged_strided, hmm_mle, nsample=self.n_samples,
                                   reversible=self.reversible, stationary=self.stationary,
                                   p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                                   store_hidden=self.store_hidden, callback=callback).fetch_model()

        # repackage samples as HMSM objects and re-normalize after restricting to observable set
        samples = []
        for sample in sampled_hmm:  # restrict to observable set if necessary
            P = sample.transition_matrix
            pi = sample.stationary_distribution
            pobs = sample.output_model.output_probabilities
            init_dist = sample.initial_distribution

            Bobs = pobs[:, prior.observation_state_symbols]
            pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(HiddenMarkovStateModel(P, pobs, stationary_distribution=pi,
                                                  count_model=prior_count_model, initial_counts=sample.initial_count,
                                                  reversible=self.reversible, initial_distribution=init_dist))

        # store results
        if self.store_hidden:
            model.hidden_state_trajectories_samples = [s.hidden_state_trajectories for s in sampled_hmm]
        model.samples = samples

        # set new model
        self._model = model

        return self
示例#16
0
 def submodel_populous(self, strong=True, connectivity_threshold='1/n', observe_nonempty=True, dtrajs=None):
     """Restricts the model to the populous states of the prior.

     If ``observe_nonempty`` is set, the observable states are additionally restricted to those which
     the prior reports as nonempty for ``dtrajs``; otherwise no observable set is passed on.
     """
     dtrajs = ensure_dtraj_list(dtrajs)
     populous_states = self.prior.states_populous(strong=strong, connectivity_threshold=connectivity_threshold)
     if observe_nonempty:
         observed = self.prior.nonempty_obs(dtrajs)
     else:
         observed = None
     return self.submodel(states=populous_states, obs=observed)
示例#17
0
def score_cv(estimator: _MSMBaseEstimator,
             dtrajs,
             lagtime,
             n=10,
             count_mode="sliding",
             score_method='VAMP2',
             score_k=10,
             random_state=None):
    r""" Scores the MSM using the variational approach for Markov processes [1]_ [2]_ and cross-validation [3]_ .

    In each repetition the data is divided into training and test data, an MSM is fitted on the largest
    connected set of the training counts using the given estimator, and the resulting model is scored on
    the test data.
    Currently only one way of splitting is implemented, where for each n,
    the data is randomly divided into two approximately equally large sets of
    discrete trajectory fragments with lengths of at least the lagtime.

    Currently only implemented using dense matrices - will be slow for large state spaces.

    Parameters
    ----------
    estimator : MSMBaseEstimator like
        estimator to produce models for CV.
    dtrajs : list of array_like
        Discrete trajectories, which are split into training and test data.
    lagtime : int
        lag time
    n : number of samples
        Number of repetitions of the cross-validation. Use large n to get solid
        means of the score.
    count_mode : str, optional, default='sliding'
        counting mode of count matrix estimator, if sliding the trajectory is split in a sliding window fashion.
        Supports 'sliding' and 'sample'.
    score_method : str, optional, default='VAMP2'
        Overwrite scoring method to be used if desired. If `None`, the estimators scoring
        method will be used.
        Available scores are based on the variational approach for Markov processes [1]_ [2]_ :

        *  'VAMP1'  Sum of singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the sum of transition
                    matrix eigenvalues, also called Rayleigh quotient [1]_ [3]_ .
        *  'VAMP2'  Sum of squared singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the kinetic variance [4]_ .

    score_k : int or None
        The maximum number of eigenvalues or singular values used in the
        score. If set to None, all available eigenvalues will be used.
    random_state : optional, default=None
        Passed on unchanged to the routines which split the data in every repetition.

    Returns
    -------
    scores : np.ndarray
        One score per repetition.

    Raises
    ------
    ValueError
        If ``count_mode`` is neither 'sliding' nor 'sample'.

    References
    ----------
    .. [1] Noe, F. and F. Nueske: A variational approach to modeling slow processes
        in stochastic dynamical systems. SIAM Multiscale Model. Simul. 11, 635-655 (2013).
    .. [2] Wu, H and F. Noe: Variational approach for learning Markov processes
        from time series data (in preparation).
    .. [3] McGibbon, R and V. S. Pande: Variational cross-validation of slow
        dynamical modes in molecular kinetics, J. Chem. Phys. 142, 124105 (2015).
    .. [4] Noe, F. and C. Clementi: Kinetic distance and kinetic maps from molecular
        dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015).

    """
    from sktime.markovprocess import TransitionCountEstimator
    from sktime.util import ensure_dtraj_list
    dtrajs = ensure_dtraj_list(dtrajs)  # ensure format
    if count_mode not in ('sliding', 'sample'):
        raise ValueError(
            'score_cv currently only supports count modes "sliding" and "sample"'
        )
    use_sliding_window = (count_mode == 'sliding')

    scores = []
    for _ in range(n):
        # cut the trajectories into fragments and distribute these over training and test set
        fragments = blocksplit_dtrajs(dtrajs, lag=lagtime, sliding=use_sliding_window,
                                      random_state=random_state)
        train_set, test_set = cvsplit_dtrajs(fragments, random_state=random_state)

        # fit on the largest connected set of the training counts
        count_estimator = TransitionCountEstimator(lagtime, count_mode)
        train_counts = count_estimator.fit(train_set).fetch_model().submodel_largest()
        fitted_model = estimator.fit(train_counts).fetch_model()

        # evaluate on the held-out fragments
        scores.append(fitted_model.score(test_set, score_method=score_method, score_k=score_k))
    return np.array(scores)
def initial_guess_discrete_from_data(
        dtrajs,
        n_hidden_states,
        lagtime,
        stride=1,
        mode='largest-regularized',
        reversible: bool = True,
        stationary: bool = False,
        separate_symbols=None,
        states: Optional[np.ndarray] = None,
        regularize: bool = True,
        connectivity_threshold: Union[str, float] = 0.):
    r"""Estimates an initial guess :class:`HMM <sktime.markov.hmm.HiddenMarkovStateModel>` from given
    discrete trajectories.

    Following the procedure described in [1]_: First a :class:`MSM <sktime.markov.msm.MarkovStateModel>` is
    estimated, which is then subsequently coarse-grained with PCCA+ [2]_. After estimation of the MSM, this method
    calls :meth:`initial_guess_discrete_from_msm`.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        A discrete trajectory or a list of discrete trajectories.
    n_hidden_states : int
        Number of hidden states.
    lagtime : int
        The lagtime at which transitions are counted.
    stride : int or str, optional, default=1
        stride between two lagged trajectories extracted from the input trajectories. Given trajectory :code:`s[t]`,
        stride and lag will result in trajectories

            :code:`s[0], s[lag], s[2 lag], ...`

            :code:`s[stride], s[stride + lag], s[stride + 2 lag], ...`

        Setting stride = 1 will result in using all data (useful for maximum likelihood estimator), while a Bayesian
        estimator requires a longer stride in order to have statistically uncorrelated trajectories. Setting
        :code:`stride='effective'` uses the largest neglected timescale as an estimate for the correlation time
        and sets the stride accordingly.
    mode : str, optional, default='largest-regularized'
        The mode at which the markov state model is estimated. Since the process is assumed to be reversible and
        finite statistics might lead to unconnected regions in state space, a subselection can automatically be made
        and the count matrix can be regularized. The following options are available:

        * 'all': all available states are taken into account
        * 'largest': the largest connected state set is selected, see
          :meth:`TransitionCountModel.submodel_largest <sktime.markov.TransitionCountModel.submodel_largest>`.
        * 'populous': the connected set with the largest population in the data, see
          :meth:`TransitionCountModel.submodel_largest <sktime.markov.TransitionCountModel.submodel_largest>`.

        For regularization, each of the options can be suffixed by a '-regularized', e.g., 'largest-regularized'.
        This means that the count matrix has no zero entries and everything is reversibly connected. In particular,
        a prior of the form

        .. math:: b_{ij}=\left \{ \begin{array}{rl}
                     \alpha & \text{, if }c_{ij}+c_{ji}>0, \\
                     0      & \text{, otherwise,}
                     \end{array} \right .

        with :math:`\alpha=10^{-3}` is added and all non-reversibly connected components are artificially connected
        by adding backward paths.
    reversible : bool, optional, default=True
        Whether the HMM transition matrix is estimated so that it is reversible.
    stationary : bool, optional, default=False
        If True, the initial distribution of hidden states is self-consistently computed as the stationary
        distribution of the transition matrix. If False, it will be estimated from the starting states.
        Only set this to true if you're sure that the observation trajectories are initiated from a global
        equilibrium distribution.
    separate_symbols : array_like, optional, default=None
        Force the given set of observed states to stay in a separate hidden state.
        The remaining nstates-1 states will be assigned by a metastable decomposition.
    states : (dtype=int) ndarray, optional, default=None
        Artificially restrict count model to selection of states, even before regularization.
    regularize : bool, optional, default=True
        If set to True, makes sure that the hidden initial distribution and transition matrix have nonzero probabilities
        by setting them to eps and then renormalizing. Avoids zeros that would cause estimation algorithms to crash or
        get stuck in suboptimal states.
    connectivity_threshold : float or '1/n', optional, default=0.
        Connectivity threshold. counts that are below the specified value are disregarded when finding connected
        sets. In case of '1/n', the threshold gets resolved to :math:`1 / \mathrm{n\_states\_full}`.

    Returns
    -------
    hmm_init : HiddenMarkovStateModel
        An initial guess for the HMM

    Raises
    ------
    ValueError
        If `mode` is neither one of the base modes nor one of their '-regularized' variants.

    See Also
    --------
    DiscreteOutputModel : The type of output model this heuristic uses.
    initial_guess_discrete_from_msm : Initial guess from an already existing :class:`MSM <sktime.markov.msm.MarkovStateModel>`.
    initial_guess_gaussian_from_data : Initial guess with :class:`Gaussian output model <sktime.markov.hmm.GaussianOutputModel>`.

    References
    ----------
    .. [1] F. Noe, H. Wu, J.-H. Prinz and N. Plattner: Projected and hidden Markov models for calculating kinetics and
       metastable states of complex molecules. J. Chem. Phys. 139, 184114 (2013)
    .. [2] S. Roeblitz and M. Weber, Fuzzy spectral clustering by PCCA+:
       application to Markov state models and data classification.
       Adv Data Anal Classif 7, 147-179 (2013).
    """
    # Every base mode is also accepted with a '-regularized' suffix; report the
    # complete list on failure so that the error message matches what is accepted.
    base_modes = list(initial_guess_discrete_from_data.VALID_MODES)
    accepted_modes = base_modes + [m + "-regularized" for m in base_modes]
    if mode not in accepted_modes:
        raise ValueError("mode can only be one of [{}]".format(
            ", ".join(accepted_modes)))

    dtrajs = ensure_dtraj_list(dtrajs)
    dtrajs = compute_dtrajs_effective(dtrajs,
                                      lagtime=lagtime,
                                      n_states=n_hidden_states,
                                      stride=stride)
    # The effective trajectories are counted with a lag of one step.
    counts = TransitionCountEstimator(1, 'sliding',
                                      sparse=False).fit(dtrajs).fetch_model()
    if states is not None:
        counts = counts.submodel(states)
    if '-regularized' in mode:
        import msmtools.estimation as memest
        # Add the neighbor prior (alpha = 1e-3) in place to the dense count matrix.
        counts.count_matrix[...] += memest.prior_neighbor(
            counts.count_matrix, 0.001)
        # States that have any incoming or outgoing counts.
        nonempty = np.where(
            counts.count_matrix.sum(axis=0) +
            counts.count_matrix.sum(axis=1) > 0)[0]
        # Indexing with the same index array twice addresses the diagonal entries
        # (i, i) of those states; they are raised to at least 1e-3.
        counts.count_matrix[nonempty, nonempty] = np.maximum(
            counts.count_matrix[nonempty, nonempty], 0.001)
    # Mode 'all' keeps the count model as it is, so there is nothing to select.
    if 'largest' in mode:
        counts = counts.submodel_largest(
            directed=True,
            connectivity_threshold=connectivity_threshold,
            sort_by_population=False)
    if 'populous' in mode:
        counts = counts.submodel_largest(
            directed=True,
            connectivity_threshold=connectivity_threshold,
            sort_by_population=True)
    # The intermediate MSM is always estimated reversibly; the `reversible`
    # argument is only forwarded to the HMM initial guess below.
    msm = MaximumLikelihoodMSM(reversible=True,
                               allow_disconnected=True,
                               maxerr=1e-3,
                               maxiter=10000).fit(counts).fetch_model()
    return initial_guess_discrete_from_msm(msm, n_hidden_states, reversible,
                                           stationary, separate_symbols,
                                           regularize)
    def fit(self, dtrajs, initial_model=None, **kwargs):
        r""" Estimates a :class:`HMM <HiddenMarkovStateModel>` from discrete trajectories.

        Starting from a copy of the initial model, the estimate is refined iteratively until the
        log-likelihood gain of one iteration drops below :attr:`accuracy` or :attr:`maxit`
        iterations have been performed. An iteration in which the nonzero pattern of the transition
        matrix changes is never accepted as converged.

        Parameters
        ----------
        dtrajs : array_like or list of array_like
            Timeseries data.
        initial_model : HiddenMarkovStateModel, optional, default=None
            Initial model to start from. If None, :attr:`initial_transition_model` is used instead.
        **kwargs
            Not used; accepted for scikit-learn compatibility.

        Returns
        -------
        self : MaximumLikelihoodHMSM
            Reference to self.

        Raises
        ------
        ValueError
            If no initial model is available or it is not a :class:`HiddenMarkovStateModel`.
        """
        start_model = self.initial_transition_model if initial_model is None else initial_model
        if start_model is None or not isinstance(start_model, HiddenMarkovStateModel):
            raise ValueError(
                "For estimation, an initial model of type "
                "`sktime.markov.hmm.HiddenMarkovStateModel` is required.")

        # Work on private copies so that the initial model stays untouched. A sparse matrix is
        # densified (toarray already yields a new array), a dense one is copied explicitly.
        source_tmatrix = start_model.transition_model.transition_matrix
        dense_tmatrix = source_tmatrix.toarray() if issparse(source_tmatrix) else np.copy(source_tmatrix)

        storage = MaximumLikelihoodHMSM._HMMModelStorage(
            transition_matrix=dense_tmatrix,
            output_model=start_model.output_model.copy(),
            initial_distribution=start_model.initial_distribution.copy())

        observations = compute_dtrajs_effective(
            ensure_dtraj_list(dtrajs),
            lagtime=self.lagtime,
            n_states=start_model.n_hidden_states,
            stride=self.stride)

        # Work buffers: alpha/beta are shared across trajectories and therefore sized for the
        # longest one, while gammas and count matrices are kept per trajectory.
        longest = max(len(traj) for traj in observations)
        n_hidden = start_model.n_hidden_states
        alpha = np.zeros((longest, n_hidden))
        beta = np.zeros_like(alpha)
        gammas = [np.zeros((len(traj), n_hidden)) for traj in observations]
        count_matrices = [np.zeros((n_hidden, n_hidden)) for _ in observations]

        likelihoods = np.empty(self.maxit)
        # Nonzero pattern of the transition matrix. If it changes (e.g. a state is lost), the
        # likelihood is discontinuous and must not be used as convergence criterion in that
        # iteration.
        nonzero_pattern = storage.transition_matrix.nonzero()
        iteration = 0
        converged = False

        while not converged and iteration < self.maxit:
            # accumulate the log-likelihood contributions of all trajectories
            loglik = 0.0
            for traj, gamma, count_matrix in zip(observations, gammas, count_matrices):
                contribution, _ = self._forward_backward(
                    storage, traj, alpha, beta, gamma, count_matrix)
                loglik += contribution
            assert np.isfinite(loglik), iteration

            # converged once the likelihood gain over the previous iteration is small enough
            if iteration > 0 and loglik - likelihoods[iteration - 1] < self.accuracy:
                converged = True

            # re-estimate the model parameters stored in `storage`
            self._update_model(storage,
                               observations,
                               gammas,
                               count_matrices,
                               maxiter=self.maxit_reversible)

            # a connectivity change revokes convergence for this iteration
            updated_pattern = storage.transition_matrix.nonzero()
            if not np.array_equal(nonzero_pattern, updated_pattern):
                converged = False
                nonzero_pattern = updated_pattern

            likelihoods[iteration] = loglik
            iteration += 1

        # keep only the likelihoods of the iterations that were actually carried out
        likelihoods = np.resize(likelihoods, iteration)

        transition_counts = self._reduce_transition_counts(count_matrices)
        count_model = TransitionCountModel(count_matrix=transition_counts,
                                           lagtime=self.lagtime,
                                           physical_time=self.physical_time)
        transition_model = MarkovStateModel(storage.transition_matrix,
                                            reversible=self.reversible,
                                            count_model=count_model)

        # one hidden state path per trajectory, computed with `viterbi` from the final parameters
        hidden_state_trajs = []
        for traj in observations:
            state_probabilities = storage.output_model.to_state_probability_trajectory(traj)
            hidden_state_trajs.append(
                viterbi(storage.transition_matrix, state_probabilities, storage.initial_distribution))

        self._model = HiddenMarkovStateModel(
            transition_model=transition_model,
            output_model=storage.output_model,
            initial_distribution=storage.initial_distribution,
            likelihoods=likelihoods,
            state_probabilities=gammas,
            initial_count=self._init_counts(gammas),
            hidden_state_trajectories=hidden_state_trajs,
            stride=self.stride)
        return self