Exemplo n.º 1
0
def oom(dtrajs, tau, order):
    """Assemble observable-operator-model components from discrete data.

    The state histogram, the sliding-window count matrix at lag ``tau`` and
    the two-step count tensor are symmetrized and normalized, then projected
    with the factor returned by ``pinv_cholcov``.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Passed through ``ensure_dtraj_list`` before use.
    tau : int
        Lag time handed to ``cmatrix`` and ``two_step_cmatrix``.
    order : int
        Requested order handed to ``pinv_cholcov``. The order actually used
        is the number of rows of the factor that call returns.

    Returns
    -------
    dict
        ``'omega'`` (row vector), ``'sigma'`` (the same values reshaped to a
        column) and ``'Xi_set'`` (one ``order x order`` matrix per state).
    """
    dtrajs = ensure_dtraj_list(dtrajs)

    # State histogram, floored at 1e-20 so that the inverse square root
    # taken further down stays finite, then normalized to sum to one.
    pii = np.maximum(count_states(dtrajs), 1e-20).reshape(-1)
    pii /= pii.sum()
    nstates = pii.shape[0]

    # Raw one-step and two-step counts, promoted to floating point.
    C = cmatrix(dtrajs, tau, sliding=True).toarray() + 0.0
    C_mem = two_step_cmatrix(dtrajs, tau) + 0.0

    # Symmetrize and normalize the one-step counts.
    C = C + C.T
    C /= C.sum()

    # Symmetrize every middle-state slice at once, then normalize the tensor.
    C_mem = C_mem + C_mem.transpose(0, 2, 1)
    C_mem /= C_mem.sum()

    # Scale by pii^(-1/2) on both sides before factorizing.
    D = np.diag(1 / np.sqrt(pii))
    pinv_R = pinv_cholcov(D.dot(C).dot(D), order)
    order = pinv_R.shape[0]

    # The left factor does not depend on the state, so build it once.
    left = pinv_R.dot(D)
    Xi_set = np.empty((nstates, order, order))
    for state, C_state in enumerate(C_mem):
        Xi_set[state] = left.dot(C_state).dot(D).dot(pinv_R.T)

    omega = pii.reshape(1, -1).dot(D).dot(pinv_R.T)
    return {'sigma': omega.reshape(-1, 1), 'omega': omega, 'Xi_set': Xi_set}
Exemplo n.º 2
0
def two_step_cmatrix(dtrajs, tau):
    """Count two-step transitions, grouped by the intermediate state.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Passed through ``ensure_dtraj_list`` before counting.
    tau : int
        Lag time. Each count reads a trajectory at ``t``, ``t + tau`` and
        ``t + 2 * tau``.

    Returns
    -------
    numpy.ndarray of shape (nstates, nstates, nstates)
        ``counts[j, i, k]`` is the number of windows that start in ``i``,
        are in ``j`` after ``tau`` steps and in ``k`` after ``2 * tau`` steps.
    """
    nstates = number_of_states(dtrajs)
    counts = np.zeros((nstates, nstates, nstates))

    for traj in ensure_dtraj_list(dtrajs):
        # Number of window start positions that still fit two lags.
        n_windows = traj.shape[0] - 2 * tau
        # For each middle state, accumulate a two-step count matrix.
        for start in range(n_windows):
            first = traj[start]
            middle = traj[start + tau]
            last = traj[start + 2 * tau]
            counts[middle, first, last] += 1

    return counts
Exemplo n.º 3
0
    def _estimate(self, dtrajs):
        """Run the wrapped estimator at every lag time and post-process the models.

        Parameters
        ----------
        dtrajs : discrete trajectory or list of discrete trajectories
            Converted with ``_types.ensure_dtraj_list``.

        Returns
        -------
        ``self`` when every requested lag time had already been estimated.
        Otherwise nothing is returned explicitly; the estimated models are
        handed to ``self._postprocess_results`` and the per-lag estimators
        are stored in ``self._estimators``.
        """
        ### PREPARE AND CHECK DATA
        # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # Hash the input once; it is needed on both branches below.
        dtrajs_hash = _hash_dtrajs(dtrajs)
        if self._estimated:
            # if dtrajs has now changed, unset the _estimated flag to re-set every derived quantity.
            assert hasattr(self, '_last_dtrajs_input_hash')
            if self._last_dtrajs_input_hash != dtrajs_hash:
                self.logger.warning(
                    "estimating from new data, discard all previously computed models."
                )
                self._estimated = False
                # Remember the new data. Without this the stale hash would
                # make every later call look like "new data" again.
                self._last_dtrajs_input_hash = dtrajs_hash
        else:
            self._last_dtrajs_input_hash = dtrajs_hash

        self._trajlengths = np.fromiter((len(traj) for traj in dtrajs),
                                        dtype=int,
                                        count=len(dtrajs))
        maxlength = np.max(self._trajlengths)

        # set lag times by data if not yet set: go up to half the mean
        # trajectory length
        if self._lags is None:
            maxlag = 0.5 * np.sum(self._trajlengths) / float(
                len(self._trajlengths))
            self._lags = _generate_lags(maxlag, 1.5)

        # drop lag times that do not fit into the longest trajectory
        if np.max(self._lags) >= maxlength:
            fits = self._lags < maxlength
            self.logger.warning(
                'Ignoring lag times that exceed the longest trajectory: %s',
                self._lags[~fits])
            self._lags = self._lags[fits]

        ### RUN ESTIMATION
        if self._estimated:
            # we already had run an estimation, determine which lag times we need to compute
            # TODO: this will re-evaluate problematic lag times, wont it?
            lags = sorted(set(self._lags).difference(self._last_lags))
            if not lags:
                self.logger.info("All lag times already estimated.")
                return self
            self.logger.info(
                "Running estimating for not yet estimated lags times: %s",
                lags)
        else:
            lags = self._lags

        # construct all parameter sets for the estimator
        param_sets = tuple(param_grid({'lag': lags}))

        # the scan reports progress through this object, so silence the
        # estimator's own progress reporting
        if isinstance(self.estimator, SampledModel):
            self.estimator.show_progress = False

        # run estimation on all lag times
        models, estimators = estimate_param_scan(self.estimator,
                                                 dtrajs,
                                                 param_sets,
                                                 failfast=False,
                                                 return_estimators=True,
                                                 n_jobs=self.n_jobs,
                                                 progress_reporter=self)
        self._estimators = estimators

        self._postprocess_results(models)
Exemplo n.º 4
0
    def _estimate(self, dtrajs):
        """Estimate a discrete hidden Markov state model from ``dtrajs``.

        Steps, in order: validate the lag against the trajectory lengths,
        resolve ``stride='effective'`` into an integer stride, build the
        lagged/strided trajectories, construct an initial HMM according to
        ``self.msm_init``, fit a ``MaximumLikelihoodHMM``, copy the fitted
        quantities onto ``self`` and finally restrict to the requested
        submodel.

        Parameters
        ----------
        dtrajs : discrete trajectory or list of discrete trajectories
            Converted with ``_types.ensure_dtraj_list``.

        Returns
        -------
        The result of ``self.submodel(..., inplace=True)``.

        Raises
        ------
        ValueError
            If ``self.lag`` is not smaller than the longest trajectory, or if
            ``self.msm_init`` is neither ``'largest-strong'``, ``'all'`` nor
            an instance of ``MaximumLikelihoodMSM`` / ``OOMReweightedMSM``.

        Notes
        -----
        ``self.stride`` is overwritten in place when it equals
        ``'effective'``.
        """
        # ensure right format
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # CHECK LAG
        trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
        if self.lag >= _np.max(trajlengths):
            raise ValueError('Illegal lag time ' + str(self.lag) +
                             ' exceeds longest trajectory length')
        # a lag above the mean trajectory length is allowed, but only warned about
        if self.lag > _np.mean(trajlengths):
            self.logger.warning(
                'Lag time ' + str(self.lag) +
                ' is on the order of mean trajectory length ' +
                str(_np.mean(trajlengths)) +
                '. It is recommended to fit four lag times in each ' +
                'trajectory. HMM might be inaccurate.')

        # EVALUATE STRIDE
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the non-reversible
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs,
                                           lag=self.lag,
                                           reversible=False,
                                           sparse=False,
                                           connectivity='largest',
                                           dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                # because we use non-reversible msm, we want to silence the ImaginaryEigenvalueWarning
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        category=ImaginaryEigenValueWarning,
                        module=
                        'deeptime.markov.tools.analysis.dense.decomposition')
                    # the correlation time estimate is floored at 1
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2 * corrtime))

        # LAG AND STRIDE DATA
        from deeptime.markov import compute_dtrajs_effective
        dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs,
                                                         self.lag,
                                                         n_states=-1,
                                                         stride=self.stride)

        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # INIT HMM
        # Three ways to obtain the initial HMM: two data-driven modes selected
        # by string, or a previously estimated MSM passed in as msm_init.
        from deeptime.markov.hmm import init
        from pyemma.msm.estimators import MaximumLikelihoodMSM
        from pyemma.msm.estimators import OOMReweightedMSM
        if self.msm_init == 'largest-strong':
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                mode='largest-regularized',
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
        elif self.msm_init == 'all':
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate,
                mode='all-regularized')
        elif isinstance(
                self.msm_init,
            (MaximumLikelihoodMSM, OOMReweightedMSM)):  # initial MSM given.
            # wrap the given MSM's transition matrix and active count matrix
            # in the model types that metastable_from_msm is called with
            msm = MarkovStateModel(transition_matrix=self.msm_init.P,
                                   count_model=TransitionCountModel(
                                       self.msm_init.count_matrix_active))
            hmm_init = init.discrete.metastable_from_msm(
                msm,
                n_hidden_states=self.nstates,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
            observe_subset = self.msm_init.active_set  # override observe_subset.
        else:
            raise ValueError('Unknown MSM initialization option: ' +
                             str(self.msm_init))

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------

        # run EM
        from deeptime.markov.hmm import MaximumLikelihoodHMM
        hmm_est = MaximumLikelihoodHMM(hmm_init,
                                       lagtime=self.lag,
                                       stride=self.stride,
                                       reversible=self.reversible,
                                       stationary=self.stationary,
                                       accuracy=self.accuracy,
                                       maxit=self.maxit)
        # run; note that the full (unlagged) dtrajs are passed, together with
        # lagtime and stride set on the estimator above
        hmm_est.fit(dtrajs)
        # package in discrete HMM
        self.hmm = hmm_est.fetch_model()

        # get model parameters
        self.initial_distribution = self.hmm.initial_distribution
        transition_matrix = self.hmm.transition_model.transition_matrix
        observation_probabilities = self.hmm.output_probabilities

        # get estimation parameters
        self.likelihoods = self.hmm.likelihoods  # Likelihood history
        self.likelihood = self.likelihoods[-1]  # last entry of the history
        self.hidden_state_probabilities = self.hmm.state_probabilities  # gamma variables
        self.hidden_state_trajectories = self.hmm.hidden_state_trajectories  # Viterbi path
        self.count_matrix = self.hmm.count_model.count_matrix  # hidden count matrix
        self.initial_count = self.hmm.initial_count  # hidden init count
        self._active_set = _np.arange(self.nstates)

        # TODO: it can happen that we lose states due to striding. Should we lift the output probabilities afterwards?
        # parametrize self
        self._dtrajs_full = dtrajs
        self._dtrajs_lagged = dtrajs_lagged_strided
        self._nstates_obs_full = number_of_states(dtrajs)
        self._nstates_obs = number_of_states(dtrajs_lagged_strided)
        self._observable_set = _np.arange(self._nstates_obs)
        self._dtrajs_obs = dtrajs
        self.set_model_params(P=transition_matrix,
                              pobs=observation_probabilities,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        # TODO: perhaps remove connectivity and just rely on .submodel()?
        # deal with connectivity; any other connectivity value leaves
        # states_subset as None
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity,
                             inplace=True)
Exemplo n.º 5
0
def timescales_hmsm(dtrajs,
                    nstates,
                    lags=None,
                    nits=None,
                    reversible=True,
                    connected=True,
                    errors=None,
                    nsamples=100,
                    n_jobs=1,
                    show_progress=True):
    r""" Calculate implied timescales from Hidden Markov state models estimated at a series of lag times.

    Warning: this can be slow!

    Parameters
    ----------
    dtrajs : array-like or list of array-likes
        discrete trajectories

    nstates : int
        number of hidden states

    lags : array-like of integers (optional)
        integer lag times at which the implied timescales will be calculated

    nits : int (optional)
        number of implied timescales to be computed. Will compute less if the
        number of states are smaller. None means the number of timescales will
        be determined automatically.

    connected : boolean (optional)
        If true compute the connected set before transition matrix
        estimation at each lag separately

    reversible : boolean (optional)
        Estimate transition matrix reversibly (True) or nonreversibly (False)

    errors : None | 'bayes'
        Specifies whether to compute statistical uncertainties (by default not),
        and which algorithm to use if yes. The only option is currently 'bayes'.
        This algorithm is much faster than MSM-based error calculation because
        the involved matrices are much smaller.

    nsamples : int
        Number of approximately independent HMSM samples generated for each lag
        time for uncertainty quantification. Only used if errors is not None.

    n_jobs : int, default=1
        how many subprocesses to start to estimate the models for each lag time.

    show_progress : bool, default=True
        Show progressbars for calculation?

    Returns
    -------
    itsobj : :class:`ImpliedTimescales <pyemma.msm.ImpliedTimescales>` object

    Raises
    ------
    NotImplementedError
        If ``errors`` is neither ``None`` nor ``'bayes'``.

    See also
    --------
    ImpliedTimescales
        The object returned by this function.
    pyemma.plots.plot_implied_timescales
        Plotting function for the :class:`ImpliedTimescales <pyemma.msm.ImpliedTimescales>` object

    Example
    -------
    >>> from pyemma import msm
    >>> import numpy as np
    >>> np.set_printoptions(precision=3)
    >>> dtraj = [0,1,1,2,2,2,1,2,2,2,1,0,0,1,1,1,2,2,1,1,2,1,1,0,0,0,1,1,2,2,1]   # mini-trajectory
    >>> ts = msm.timescales_hmsm(dtraj, 2, [1,2,3,4,5])
    >>> print(ts.timescales) # doctest: +ELLIPSIS
    [[  1.691]
     [  7.537]
     [  1.919]
     [ 40.962]
     [ 11.527]]


    .. autoclass:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
            :attributes:

    References
    ----------
    Implied timescales as a lagtime-selection and MSM-validation approach were
    suggested in [1]_. Hidden Markov state model estimation is done here as
    described in [2]_. For uncertainty quantification we employ the Bayesian
    sampling algorithm described in [3]_.

    .. [1] Swope, W. C. and J. W. Pitera and F. Suits: Describing protein
        folding kinetics by molecular dynamics simulations:  1. Theory.
        J. Phys. Chem. B 108: 6571-6581 (2004)

    .. [2] F. Noe, H. Wu, J.-H. Prinz and N. Plattner: Projected and hidden
        Markov models for calculating kinetics and metastable states of
        complex molecules. J. Chem. Phys. 139, 184114 (2013)

    .. [3] J. D. Chodera et al:
        Bayesian hidden Markov model analysis of single-molecule force
        spectroscopy: Characterizing kinetics under measurement uncertainty
        arXiv:1108.1430 (2011)

    """
    # format data
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # translate the boolean flag into the estimators' connectivity option
    connectivity = 'largest' if connected else 'none'

    # MLE or error estimation?
    if errors is None:
        estimator = _ML_HMSM(nstates=nstates,
                             reversible=reversible,
                             connectivity=connectivity)
    elif errors == 'bayes':
        estimator = _Bayes_HMSM(nstates=nstates,
                                reversible=reversible,
                                connectivity=connectivity,
                                show_progress=show_progress,
                                nsamples=nsamples)
    else:
        # Spaces around the offending value keep the message readable
        # (previously the pieces were glued together without separators).
        raise NotImplementedError('Error estimation method ' + str(errors) +
                                  ' currently not implemented')

    # go
    itsobj = _ImpliedTimescales(estimator,
                                lags=lags,
                                nits=nits,
                                n_jobs=n_jobs,
                                show_progress=show_progress)
    itsobj.estimate(dtrajs)
    return itsobj
Exemplo n.º 6
0
    def _estimate(self, dtrajs):
        """Estimate an MSM transition matrix through OOM components.

        The trajectories are shortened by ``self.lag`` frames for counting,
        a rank is chosen from bootstrapped statistics, OOM components are
        computed from the one-step and two-step counts, and the transition
        matrix derived from them is stored via ``set_model_params``.

        Parameters
        ----------
        dtrajs : discrete trajectory or list of discrete trajectories
            Converted with ``ensure_dtraj_list``.

        Returns
        -------
        ``self``

        Raises
        ------
        NotImplementedError
            If ``self.connectivity`` is not ``'largest'``.
        RuntimeError
            If the active set is empty.
        """
        dtrajs = ensure_dtraj_list(dtrajs)

        # Counting is done on trajectories with the last `lag` frames removed.
        truncated = [traj[:-self.lag] for traj in dtrajs]
        stats = _DiscreteTrajectoryStats(truncated)

        # A dense model of this size is probably a mistake; tell the user.
        if not self.sparse and stats.nstates > 4000:
            self.logger.warning('Building a dense MSM with ' + str(stats.nstates) + ' states. This can be '
                              'inefficient or unfeasible in terms of both runtime and memory consumption. '
                              'Consider using sparse=True.')

        stats.count_lagged(self.lag, count_mode=self.count_mode)

        # Count matrix on the full state space.
        self._C_full = stats.count_matrix()
        self._nstates_full = self._C_full.shape[0]

        # Only the largest connected set is supported. The active set doubles
        # as the mapping from active to full state indices.
        if self.connectivity != 'largest':
            raise NotImplementedError('OOM based MSM estimation is only implemented for connectivity=\'largest\'.')
        self.active_set = stats.largest_connected_set

        # FIXME: the flag is raised this early so the attributes just set can
        # already be used below; this is not clean.
        self._is_estimated = True

        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate MSM.')

        # Count matrix restricted to the active set.
        self._C_active = stats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # Reverse mapping full -> active; states outside the active set map to -1.
        self._full2active = -1 * _np.ones((stats.nstates), dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # Bootstrapped mean / deviation used for the rank decision.
        if self.rank_Ct == 'bootstrap_counts':
            eff_counts_full = msmest.effective_count_matrix(truncated, self.lag)
            from pyemma.util.linalg import submatrix
            eff_counts = submatrix(eff_counts_full, self.active_set)
            boot_mean, boot_dev = bootstrapping_count_matrix(eff_counts, nbs=self.nbs)
        else:
            boot_mean, boot_dev = bootstrapping_dtrajs(truncated, self.lag, self._nstates_full, nbs=self.nbs,
                                                       active_set=self._active_set)

        # Two-step counts are taken from the untruncated trajectories.
        two_step_counts = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
        rank_ind = rank_decision(boot_mean, boot_dev, tol=self.tol_rank)
        Xi, omega, sigma, oom_eigenvalues = oom_components(self._C_full.toarray(), two_step_counts,
                                                           rank_ind=rank_ind, lcc=self.active_set)
        P, lcc_new = equilibrium_transition_matrix(Xi, omega, sigma, reversible=self.reversible)

        # The equilibrium transition matrix may live on a smaller set; if so,
        # shrink the active set and recompute everything derived from it.
        if lcc_new.size < self._nstates:
            self._active_set = self._active_set[lcc_new]
            self._C_active = stats.count_matrix(subset=self.active_set)
            self._nstates = self._C_active.shape[0]
            self._full2active = -1 * _np.ones((stats.nstates), dtype=int)
            self._full2active[self.active_set] = _np.arange(len(self.active_set))
            warnings.warn("Caution: Re-estimation of count matrix resulted in reduction of the active set.")

        # In dense mode, convert the count matrices to arrays so that all
        # later computations use dense matrix algebra.
        if not self.sparse:
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()

        # The estimator is its own model: store the results on self.
        self._dtrajs_full = dtrajs
        self._connected_sets = msmest.connected_sets(self._C_full)
        self._Xi = Xi
        self._omega = omega
        self._sigma = sigma
        self._eigenvalues_OOM = oom_eigenvalues
        self._rank_ind = rank_ind
        self._oom_rank = self._sigma.size
        self._C2t = two_step_counts
        self.set_model_params(P=P, pi=None, reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Exemplo n.º 7
0
def wham(ttrajs,
         dtrajs,
         bias,
         maxiter=100000,
         maxerr=1.0E-15,
         save_convergence_info=0,
         dt_traj='1 step'):
    r"""
    Weighted histogram analysis method

    Parameters
    ----------
    ttrajs : ndarray(T) of int, or list of ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,K-1 enumerating the thermodynamic states the trajectory is in at any time.
    dtrajs : ndarray(T) of int, or list of ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,n-1 enumerating the n Markov states or the bins the trajectory is in at any time.
    bias : ndarray(K, n)
        bias[j,i] is the bias energy for each discrete state i at thermodynamic state j.
    maxiter : int, optional, default=100000
        The maximum number of WHAM iterations before the estimator exits unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit.  Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'

    Returns
    -------
    sm : StationaryModel
        A stationary model which consists of thermodynamic quantities at all
        temperatures/thermodynamic states.

    Raises
    ------
    AssertionError
        If ttrajs and dtrajs differ in the number of trajectories, or if a
        pair of corresponding trajectories differs in length.

    Example
    -------
    **Example: Umbrella sampling**. Suppose we simulate in K umbrellas, centered at
    positions :math:`y_1,...,y_K` with bias energies

    .. math::
        b_k(x) = 0.5 * c_k * (x - y_k)^2 / kT

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 1 to K.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

        [ (1, 2, 2, 3, 2, ...),  (2, 4, 5, 4, 4, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

        [ (0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth. bias is a :math:`K \times n` matrix with all reduced bias energies evaluated at
    all centers::

        [[b_0(y_0),     b_0(y_1),     ..., b_0(y_{n-1})],
         [b_1(y_0),     b_1(y_1),     ..., b_1(y_{n-1})],
         ...
         [b_{K-1}(y_0), b_{K-1}(y_1), ..., b_{K-1}(y_{n-1})]]

    """
    # prepare trajectories
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    assert len(ttrajs) == len(dtrajs), \
        'ttrajs and dtrajs must contain the same number of trajectories'
    # Each estimator input is a (T_i, 2) array whose columns are the
    # thermodynamic state index and the discrete state index per frame.
    X = []
    for ttraj, dtraj in zip(ttrajs, dtrajs):
        assert len(ttraj) == len(dtraj), \
            'corresponding ttraj and dtraj must have the same length'
        X.append(_np.ascontiguousarray(_np.array([ttraj, dtraj]).T))
    # build WHAM
    from pyemma.thermo import WHAM
    wham_estimator = WHAM(bias,
                          maxiter=maxiter,
                          maxerr=maxerr,
                          save_convergence_info=save_convergence_info,
                          dt_traj=dt_traj)
    # run estimation
    return wham_estimator.estimate(X)
Exemplo n.º 8
0
    def _estimate(self, dtrajs):
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
            # memorize the observation state for bhmm and reset
            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
            default_connectivity = self.connectivity
            default_mincount_connectivity = self.mincount_connectivity
            default_observe_nonempty = self.observe_nonempty
            self.connectivity = None
            self.observe_nonempty = False
            self.mincount_connectivity = 0
            self.accuracy = 1e-2  # this is sufficient for an initial guess
            super(BayesianHMSM, self)._estimate(dtrajs)
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
        else:  # if given another initialization, must copy its attributes
            copy_attributes = ['_nstates', '_reversible', '_pi', '_observable_set', 'likelihoods', 'likelihood',
                               'hidden_state_probabilities', 'hidden_state_trajectories', 'count_matrix',
                               'initial_count', 'initial_distribution', '_active_set']
            check_user_choices = ['lag', '_nstates']

            # check if nstates and lag are compatible
            for attr in check_user_choices:
                if not getattr(self, attr) == getattr(self.init_hmsm, attr):
                    raise UserWarning('BayesianHMSM cannot be initialized with init_hmsm with '
                                      'incompatible lag or nstates.')

            if (len(dtrajs) != len(self.init_hmsm.dtrajs_full) or
                    not all((_np.array_equal(d1, d2) for d1, d2 in zip(dtrajs, self.init_hmsm.dtrajs_full)))):
                raise NotImplementedError('Bayesian HMM estimation with init_hmsm is currently only implemented ' +
                                          'if applied to the same data.')

            # TODO: implement more elegant solution to copy-pasting effective stride evaluation from ML HMM.
            # EVALUATE STRIDE
            if self.stride == 'effective':
                # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
                # how many uncorrelated counts we can make
                self.stride = self.lag
                # get a quick estimate from the spectral radius of the nonreversible
                from pyemma.msm import estimate_markov_model
                msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                               connectivity='largest', dt_traj=self.timestep_traj)
                # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
                # estimate of the decorrelation time
                if msm_nr.nstates > self.nstates:
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                    # use the smaller of these two pessimistic estimates
                    self.stride = int(min(self.lag, 2 * corrtime))

            # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
            if self.stride != self.init_hmsm.stride:
                dtrajs_lagged_strided = _lag_observations(dtrajs, self.lag, stride=self.stride)
                _nstates_obs = _number_of_states(dtrajs_lagged_strided, only_used=True)
                _nstates_obs_full = _number_of_states(dtrajs)

                if _np.setxor1d(_np.concatenate(dtrajs_lagged_strided),
                                 _np.concatenate(self.init_hmsm._dtrajs_lagged)).size != 0:
                    raise UserWarning('Choice of stride has excluded a different set of microstates than in ' +
                                      'init_hmsm. Set of observed microstates in time-lagged strided trajectories ' +
                                      'must match to the one used for init_hmsm estimation.')

                self._dtrajs_full = dtrajs
                self._dtrajs_lagged = dtrajs_lagged_strided
                self._nstates_obs_full = _nstates_obs_full
                self._nstates_obs = _nstates_obs
                self._observable_set = _np.arange(self._nstates_obs)
                self._dtrajs_obs = dtrajs
            else:
                copy_attributes += ['_dtrajs_full', '_dtrajs_lagged', '_nstates_obs_full',
                                    '_nstates_obs', '_observable_set', '_dtrajs_obs']

            # update self with estimates from init_hmsm
            self.__dict__.update(
                {k: i for k, i in self.init_hmsm.__dict__.items() if k in copy_attributes})

            # as mentioned in the docstring, take init_hmsm observed set observation probabilities
            self.observe_nonempty = False

            # update HMM Model
            self.update_model_params(P=self.init_hmsm.transition_matrix, pobs=self.init_hmsm.observation_probabilities,
                                     dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

        # check if we have a valid initial model
        import msmtools.estimation as msmest
        if self.reversible and not msmest.is_connected(self.count_matrix):
            raise NotImplementedError('Encountered disconnected count matrix:\n ' + str(self.count_matrix)
                                      + 'with reversible Bayesian HMM sampler using lag=' + str(self.lag)
                                      + ' and stride=' + str(self.stride) + '. Consider using shorter lag, '
                                      + 'or shorter stride (to use more of the data), '
                                      + 'or using a lower value for mincount_connectivity.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        nstates_full = msmest.number_of_states(dtrajs)
        if self.nstates_obs < nstates_full:
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = self.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm

        if self.init_hmsm is not None:
            hmm_mle = self.init_hmsm.hmm
        else:
            hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

        sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                                   reversible=self.reversible, stationary=self.stationary,
                                   p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                                   store_hidden=self.store_hidden, call_back=call_back)

        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples
        sample_inp = [(m.transition_matrix, m.stationary_distribution, m.output_probabilities)
                      for m in sampled_hmm.sampled_hmms]

        samples = []
        for P, pi, pobs in sample_inp:  # restrict to observable set if necessary
            Bobs = pobs[:, self.observable_set]
            pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(_HMSM(P, pobs, pi=pi, dt_model=self.dt_model))

        # store results
        self.sampled_trajs = [sampled_hmm.sampled_hmms[i].hidden_state_trajectories for i in range(self.nsamples)]
        self.update_model_params(samples=samples)

        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'
        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset, obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity)
Exemplo n.º 9
0
    def _estimate(self, dtrajs):
        """Run the maximum-likelihood estimation, then sample transition matrices.

        The superclass estimator (``_MLMSM._estimate``) is executed first.
        Afterwards ``self.nsamples`` transition matrices together with their
        stationary distributions are drawn from a sampler built on the active
        count matrix, wrapped into ``_MSM`` objects and stored on the model
        via ``update_model_params(samples=...)``.

        Parameters
        ----------
        dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int)
            discrete trajectories, stored as integer ndarrays (arbitrary size)
            or a single ndarray for only one trajectory.

        Returns
        -------
        self
            This estimator object, now carrying the sampled models.

        """
        # bring the input into list form and let the superclass do the MLE
        dtrajs = ensure_dtraj_list(dtrajs)
        _MLMSM._estimate(self, dtrajs)

        from msmtools.estimation import tmatrix_sampler
        from math import sqrt

        # heuristic for the number of steps needed to decorrelate samples
        if self.nsteps is None:
            self.nsteps = int(sqrt(self.nstates))

        # The sampler uses the same count matrix as the MLE, which is why
        # 'effective' counting is the default.
        if self.statdist_constraint is not None:
            # The MLE cannot be used as T0 here; pass the stationary
            # distribution on the active set and rely on the initialization
            # of the reversible pi sampler.
            tsampler = tmatrix_sampler(self.count_matrix_active,
                                       reversible=self.reversible,
                                       mu=self.pi,
                                       nsteps=self.nsteps)
        else:
            tsampler = tmatrix_sampler(self.count_matrix_active,
                                       reversible=self.reversible,
                                       T0=self.transition_matrix,
                                       nsteps=self.nsteps)

        self._progress_register(self.nsamples,
                                description="Sampling MSMs",
                                stage=0)

        def _advance_progress():
            self._progress_update(1, stage=0)

        # only hand a callback to the sampler when progress output is wanted
        call_back = _advance_progress if self.show_progress else None

        sample_Ps, sample_mus = tsampler.sample(nsamples=self.nsamples,
                                                return_statdist=True,
                                                call_back=call_back)
        self._progress_force_finish(0)

        # one MSM per sampled (transition matrix, stationary distribution) pair
        samples = [
            _MSM(sample_Ps[k],
                 pi=sample_mus[k],
                 reversible=self.reversible,
                 dt_model=self.dt_model)
            for k in range(self.nsamples)
        ]

        # store the samples on the model
        self.update_model_params(samples=samples)

        return self
Exemplo n.º 10
0
def wham(ttrajs,
         dtrajs,
         bias,
         maxiter=100000,
         maxerr=1.0E-15,
         save_convergence_info=0,
         dt_traj='1 step'):
    r"""
    Weighted histogram analysis method

    Parameters
    ----------
    ttrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,num_therm_states-1 enumerating the thermodynamic states the trajectory is
        in at any time.
    dtrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,num_conf_states-1 enumerating the num_conf_states Markov states or the bins the
        trajectory is in at any time.
    bias : numpy.ndarray(shape=(num_therm_states, num_conf_states)) object
        bias[j, i] is the bias energy in units of kT for each discrete state i
        at thermodynamic state j.
    maxiter : int, optional, default=100000
        The maximum number of self-consistent iterations before the estimator exits
        unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit.  Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'

    Returns
    -------
    sm : StationaryModel
        A stationary model which consists of thermodynamic quantities at all
        temperatures/thermodynamic states.

    Raises
    ------
    AssertionError
        If ttrajs and dtrajs contain a different number of trajectories, or if any pair of
        corresponding trajectories differs in length.

    Example
    -------
    **Umbrella sampling**: Suppose we simulate in K umbrellas, centered at
    positions :math:`y_0,...,y_{K-1}` with bias energies

    .. math::
        b_k(x) = \frac{c_k}{2 \textrm{kT}} \cdot (x - y_k)^2

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 0 to K-1.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

        [ (0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...),  (0, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

        [ (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth. bias is a :math:`K \times n` matrix with all reduced bias energies evaluated at
    all centers:

    .. math::
        \left(\begin{array}{cccc}
            b_0(y_0) &  b_0(y_1) &  ... &  b_0(y_{n-1}) \\
            b_1(y_0) &  b_1(y_1) &  ... &  b_1(y_{n-1}) \\
            ... \\
            b_{K-1}(y_0) &  b_{K-1}(y_1) &  ... &  b_{K-1}(y_{n-1})
        \end{array}\right)

    Let us try the above example:

    >>> from pyemma.thermo import wham
    >>> import numpy as np
    >>> ttrajs = [np.array([0,0,0,0,0,0,0,0,0,0]), np.array([1,1,1,1,1,1,1,1,1,1])]
    >>> dtrajs = [np.array([0,0,0,0,1,1,1,0,0,0]), np.array([0,1,0,1,0,1,1,0,0,1])]
    >>> bias = np.array([[0.0, 0.0], [0.5, 1.0]])
    >>> wham_obj = wham(ttrajs, dtrajs, bias)
    >>> wham_obj.log_likelihood() # doctest: +ELLIPSIS
    -6.6...
    >>> wham_obj.state_counts # doctest: +SKIP
    array([[7, 3],
           [5, 5]])
    >>> wham_obj.stationary_distribution # doctest: +ELLIPSIS +REPORT_NDIFF
    array([ 0.5...,  0.4...])

    References
    ----------

    .. [1] Ferrenberg, A.M. and Swendsen, R.H. 1988.
        New Monte Carlo Technique for Studying Phase Transitions.
        Phys. Rev. Lett. 61, 2635--2638

    .. [2] Kumar, S. et al 1992.
        The Weighted Histogram Analysis Method for Free-Energy Calculations on Biomolecules. I. The Method.
        J. Comp. Chem. 13, 1011--1021

    """
    # check trajectories: same number of trajectories ...
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    assert len(ttrajs) == len(dtrajs)
    # ... and every thermodynamic-state trajectory must be as long as its
    # configuration-state counterpart (compare the pair, not the lists)
    for ttraj, dtraj in zip(ttrajs, dtrajs):
        assert len(ttraj) == len(dtraj)
    # build WHAM
    from pyemma.thermo import WHAM
    wham_estimator = WHAM(bias,
                          maxiter=maxiter,
                          maxerr=maxerr,
                          save_convergence_info=save_convergence_info,
                          dt_traj=dt_traj)
    # run estimation
    return wham_estimator.estimate((ttrajs, dtrajs))
Exemplo n.º 11
0
def dtram(ttrajs,
          dtrajs,
          bias,
          lag,
          unbiased_state=None,
          count_mode='sliding',
          connectivity='largest',
          maxiter=10000,
          maxerr=1.0E-15,
          save_convergence_info=0,
          dt_traj='1 step',
          init=None,
          init_maxiter=10000,
          init_maxerr=1.0E-8):
    r"""
    Discrete transition-based reweighting analysis method

    Parameters
    ----------
    ttrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,num_therm_states-1 enumerating the thermodynamic states the trajectory is
        in at any time.
    dtrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,num_conf_states-1 enumerating the num_conf_states Markov states or the bins the
        trajectory is in at any time.
    bias : numpy.ndarray(shape=(num_therm_states, num_conf_states)) object
        bias[j, i] is the bias energy in units of kT for each discrete state i
        at thermodynamic state j.
    lag : int or list of int
        Integer lag time at which transitions are counted. Providing a list of lag times will
        trigger one estimation per lag time.
    unbiased_state : optional, default=None
        Passed, together with the estimated models, to ``_assign_unbiased_state_label``.
        Presumably the index of the unbiased thermodynamic state (None if there is none);
        confirm against that helper.
    count_mode : str, optional, default='sliding'
        Mode to obtain count matrices from discrete trajectories. Should be one of:

        * 'sliding' : a trajectory of length T will have :math:`T-\tau` counts at time indexes
            .. math::
                 (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)
        * 'sample' : a trajectory of length T will have :math:`T/\tau` counts at time indexes
            .. math::
                    (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., ((T/\tau-1) \tau \rightarrow T)

        Currently only 'sliding' is supported.
    connectivity : str, optional, default='largest'
        Defines what should be considered a connected set in the joint space of conformations and
        thermodynamic ensembles. Currently only 'largest' is supported.
    maxiter : int, optional, default=10000
        The maximum number of dTRAM iterations before the estimator exits unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit.  Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'

    init : str, optional, default=None
        Use a specific initialization for self-consistent iteration:

        | None:    use a hard-coded guess for free energies and Lagrangian multipliers
        | 'wham':  perform a short WHAM estimate to initialize the free energies

    init_maxiter : int, optional, default=10000
        The maximum number of self-consistent iterations during the initialization.
    init_maxerr : float, optional, default=1.0E-8
        Convergence criterion for the initialization.

    Returns
    -------
    memm : MEMM or list of MEMMs
        A multi-ensemble Markov state model (for each given lag time) which consists of stationary
        and kinetic quantities at all temperatures/thermodynamic states. A single model is
        returned when exactly one lag time was given, otherwise a list with one model per lag.

    Example
    -------
    **Umbrella sampling**: Suppose we simulate in K umbrellas, centered at
    positions :math:`y_0,...,y_{K-1}` with bias energies

    .. math::
        b_k(x) = \frac{c_k}{2 \textrm{kT}} \cdot (x - y_k)^2

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 0 to K-1.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

        [ (0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...),  (0, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

        [ (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth. bias is a :math:`K \times n` matrix with all reduced bias energies evaluated at
    all centers:

    .. math::
        \left(\begin{array}{cccc}
            b_0(y_0) &  b_0(y_1) &  ... &  b_0(y_{n-1}) \\
            b_1(y_0) &  b_1(y_1) &  ... &  b_1(y_{n-1}) \\
            ... \\
            b_{K-1}(y_0) &  b_{K-1}(y_1) &  ... &  b_{K-1}(y_{n-1})
        \end{array}\right)

    Let us try the above example:

    >>> from pyemma.thermo import dtram
    >>> import numpy as np
    >>> ttrajs = [np.array([0,0,0,0,0,0,0,0,0,0]), np.array([1,1,1,1,1,1,1,1,1,1])]
    >>> dtrajs = [np.array([0,0,0,0,1,1,1,0,0,0]), np.array([0,1,0,1,0,1,1,0,0,1])]
    >>> bias = np.array([[0.0, 0.0], [0.5, 1.0]])
    >>> dtram_obj = dtram(ttrajs, dtrajs, bias, 1)
    >>> dtram_obj.log_likelihood() # doctest: +ELLIPSIS
    -9.805...
    >>> dtram_obj.count_matrices # doctest: +SKIP
    array([[[5, 1],
            [1, 2]],
           [[1, 4],
            [3, 1]]], dtype=int32)
    >>> dtram_obj.stationary_distribution # doctest: +ELLIPSIS
    array([ 0.38...,  0.61...])

    References
    ----------

    .. [1] Wu, H. et al 2014
        Statistically optimal analysis of state-discretized trajectory data from multiple thermodynamic states
        J. Chem. Phys. 141, 214106

    """
    # bring both trajectory collections into list form and make sure they pair up
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    assert len(ttrajs) == len(dtrajs)
    for therm_traj, conf_traj in zip(ttrajs, dtrajs):
        assert len(therm_traj) == len(conf_traj)
    # a scalar lag and a sequence of lags both end up as a flat list of ints
    lags = _np.asarray(lag, dtype=_np.intc).reshape(-1).tolist()
    # one DTRAM estimator per lag time, all sharing the same settings
    from pyemma.thermo import DTRAM
    shared_options = dict(count_mode=count_mode,
                          connectivity=connectivity,
                          maxiter=maxiter,
                          maxerr=maxerr,
                          save_convergence_info=save_convergence_info,
                          dt_traj=dt_traj,
                          init=init,
                          init_maxiter=init_maxiter,
                          init_maxerr=init_maxerr)
    dtram_estimators = []
    for _lag in lags:
        estimator = DTRAM(bias, _lag, **shared_options)
        dtram_estimators.append(estimator.estimate((ttrajs, dtrajs)))
    _assign_unbiased_state_label(dtram_estimators, unbiased_state)
    # unwrap the list when only a single lag time was requested
    if len(dtram_estimators) != 1:
        return dtram_estimators
    return dtram_estimators[0]
Exemplo n.º 12
0
def tram(ttrajs,
         dtrajs,
         bias,
         lag,
         unbiased_state=None,
         count_mode='sliding',
         connectivity='summed_count_matrix',
         maxiter=10000,
         maxerr=1.0E-15,
         save_convergence_info=0,
         dt_traj='1 step',
         connectivity_factor=1.0,
         nn=None,
         direct_space=False,
         N_dtram_accelerations=0,
         callback=None,
         init='mbar',
         init_maxiter=10000,
         init_maxerr=1e-8):
    r"""
    Transition-based reweighting analysis method

    Parameters
    ----------
    ttrajs : numpy.ndarray(T), or list of numpy.ndarray(T_i)
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,num_therm_states-1 enumerating the thermodynamic states the trajectory is
        in at any time.
    dtrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,num_conf_states-1 enumerating the num_conf_states Markov states or the bins the
        trajectory is in at any time.
    bias : list of numpy.ndarray(T_i, num_therm_states)
        Reduced bias energy trajectories, one per entry of ttrajs/dtrajs.
        For every simulation frame seen in trajectory i and time step t, bias[i][t, k] is the
        reduced bias energy of that frame evaluated in the k'th thermodynamic state (i.e. at
        the k'th umbrella/Hamiltonian/temperature)
    lag : int or list of int
        Integer lag time at which transitions are counted. Providing a list of lag times will
        trigger one estimation per lag time.
    unbiased_state : optional, default=None
        Passed, together with the estimated models, to ``_assign_unbiased_state_label``.
        Presumably the index of the unbiased thermodynamic state (None if there is none);
        confirm against that helper.
    count_mode : str, optional, default='sliding'
        Mode to obtain count matrices from discrete trajectories; forwarded unchanged to the
        TRAM estimator.
    maxiter : int, optional, default=10000
        The maximum number of self-consistent iterations before the estimator exits
        unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit.  Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'

    connectivity : str, optional, default='summed_count_matrix'
        One of 'summed_count_matrix', 'strong_in_every_ensemble',
        'neighbors', 'post_hoc_RE' or 'BAR_variance'.
        Defines what should be considered a connected set in the joint space
        of conformations and thermodynamic ensembles.
        For details see thermotools.cset.compute_csets_TRAM.
    nn : int, optional, default=None
        Only needed if connectivity='neighbors'
        See thermotools.cset.compute_csets_TRAM.
    connectivity_factor : float, optional, default=1.0
        Only needed if connectivity='post_hoc_RE' or 'BAR_variance'. Weakens the connectivity
        requirement, see thermotools.cset.compute_csets_TRAM.
    direct_space : bool, optional, default=False
        Whether to perform the self-consistent iteration with Boltzmann factors
        (direct space) or free energies (log-space). When analyzing data from
        multi-temperature simulations, direct-space is not recommended.
    N_dtram_accelerations : int, optional, default=0
        Convergence of TRAM can be sped up by interleaving the updates
        in the self-consistent iteration with a dTRAM-like update step.
        N_dtram_accelerations says how many times the dTRAM-like update
        step should be applied in every iteration of the TRAM equations.
        Currently this is only effective if direct_space=True.
    callback : optional, default=None
        Forwarded unchanged to the TRAM estimator.
    init : str, optional, default='mbar'
        Initialization used for the self-consistent iteration; forwarded unchanged to the
        TRAM estimator, which defines the accepted values:

        | None:    use a hard-coded guess for free energies and Lagrangian multipliers
        | 'mbar':  the default; presumably a short MBAR estimate to initialize the free
        |          energies (confirm against the TRAM estimator)

    init_maxiter : int, optional, default=10000
        The maximum number of self-consistent iterations during the initialization.
    init_maxerr : float, optional, default=1.0E-8
        Convergence criterion for the initialization.

    Returns
    -------
    memm : MEMM or list of MEMMs
        A multi-ensemble Markov state model (for each given lag time) which consists of stationary
        and kinetic quantities at all temperatures/thermodynamic states. A single model is
        returned when exactly one lag time was given, otherwise a list with one model per lag.

    Raises
    ------
    AssertionError
        If ttrajs, dtrajs and bias do not contain the same number of trajectories, or if the
        entries belonging to one trajectory differ in their number of frames.

    Example
    -------
    **Umbrella sampling**: Suppose we simulate in K umbrellas, centered at
    positions :math:`y_0,...,y_{K-1}` with bias energies

    .. math::
        b_k(x) = \frac{c_k}{2 \textrm{kT}} \cdot (x - y_k)^2

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 0 to K-1.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

        [ (0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...),  (0, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

        [ (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth.

    The bias would be a list of :math:`T \times K` arrays which specify each frame's bias energy in
    all thermodynamic states::

        [ ((0, 1.7, 2.3, 6.1, ...), ...), ((0, 2.4, 3.1, 9.5, ...), ...), ... ]

    Let us try the above example:

    >>> from pyemma.thermo import tram
    >>> import numpy as np
    >>> ttrajs = [np.array([0,0,0,0,0,0,0]), np.array([1,1,1,1,1,1,1])]
    >>> dtrajs = [np.array([0,0,0,0,1,1,1]), np.array([0,1,0,1,0,1,1])]
    >>> bias = [np.array([[1,0],[1,0],[0,0],[0,0],[0,0],[0,0],[0,0]],dtype=np.float64), np.array([[1,0],[0,0],[0,0],[1,0],[0,0],[1,0],[1,0]],dtype=np.float64)]
    >>> tram_obj = tram(ttrajs, dtrajs, bias, 1)
    >>> tram_obj.log_likelihood() # doctest: +ELLIPSIS
    -29.111...
    >>> tram_obj.count_matrices # doctest: +SKIP
    array([[[1 1]
            [0 4]]
           [[0 3]
            [2 1]]], dtype=int32)
    >>> tram_obj.stationary_distribution # doctest: +ELLIPSIS
    array([ 0.38...  0.61...])

    References
    ----------

    .. [1] Wu, H. et al 2016
        in press

    """
    # prepare trajectories: all three inputs must describe the same set of trajectories ...
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    assert len(ttrajs) == len(dtrajs)
    assert len(ttrajs) == len(bias)
    # ... and agree frame by frame within each trajectory
    for ttraj, dtraj, btraj in zip(ttrajs, dtrajs, bias):
        assert len(ttraj) == len(dtraj)
        assert len(ttraj) == btraj.shape[0]
    # check lag time(s): a scalar and a sequence both become a flat list of ints
    lags = _np.asarray(lag, dtype=_np.intc).reshape((-1, )).tolist()
    # build TRAM and run estimation, one estimator per lag time
    from pyemma.thermo import TRAM as _TRAM
    tram_estimators = [
        _TRAM(_lag,
              count_mode=count_mode,
              connectivity=connectivity,
              maxiter=maxiter,
              maxerr=maxerr,
              save_convergence_info=save_convergence_info,
              dt_traj=dt_traj,
              connectivity_factor=connectivity_factor,
              nn=nn,
              direct_space=direct_space,
              N_dtram_accelerations=N_dtram_accelerations,
              callback=callback,
              # honour the caller's choice; the default keeps the former 'mbar' behaviour
              init=init,
              init_maxiter=init_maxiter,
              init_maxerr=init_maxerr).estimate((ttrajs, dtrajs, bias))
        for _lag in lags
    ]
    _assign_unbiased_state_label(tram_estimators, unbiased_state)
    # return a bare model for a single lag time, the full list otherwise
    if len(tram_estimators) == 1:
        return tram_estimators[0]
    return tram_estimators
Exemplo n.º 13
0
    def _estimate(self, dtrajs):
        """Run the wrapped estimator for every lag time in ``self._lags``.

        Lag times are generated from the data if none were set, lag times that
        do not fit into the longest trajectory are dropped, and on a repeated
        call with unchanged data only lag times not in ``self._last_lags`` are
        estimated. The resulting models are handed to
        ``self._postprocess_results``.

        Parameters
        ----------
        dtrajs : discrete trajectory or list of discrete trajectories
            Input data; converted with ``_types.ensure_dtraj_list``.

        Returns
        -------
        self
        """
        ### PREPARE AND CHECK DATA
        # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # check trajectory lengths
        if self._estimated:
            # if dtrajs has now changed, unset the _estimated flag to re-set every derived quantity.
            assert hasattr(self, '_last_dtrajs_input_hash')
            # hash once and reuse it for both the comparison and the update below
            current_hash = _hash_dtrajs(dtrajs)
            if self._last_dtrajs_input_hash != current_hash:
                self.logger.warning(
                    "estimating from new data, discard all previously computed models."
                )
                self._estimated = False
                self._last_dtrajs_input_hash = current_hash
        else:
            # first estimation: remember which data the results belong to
            self._last_dtrajs_input_hash = _hash_dtrajs(dtrajs)

        # number of frames of every trajectory; the longest one bounds the usable lag times
        self._trajlengths = np.fromiter((len(traj) for traj in dtrajs),
                                        dtype=int,
                                        count=len(dtrajs))
        maxlength = np.max(self._trajlengths)

        # set lag times by data if not yet set
        if self._lags is None:
            # upper bound: half of the mean trajectory length
            maxlag = 0.5 * np.sum(self._trajlengths) / float(
                len(self._trajlengths))
            # NOTE(review): 1.5 is presumably the multiplicative spacing between
            # consecutive lag times - confirm against _generate_lags
            self._lags = _generate_lags(maxlag, 1.5)

        # check if some lag times are forbidden.
        if np.max(self._lags) >= maxlength:
            # indices of lag times that fit / do not fit into the longest trajectory
            Ifit = np.where(self._lags < maxlength)[0]
            Inofit = np.where(self._lags >= maxlength)[0]
            self.logger.warning(
                'Ignoring lag times that exceed the longest trajectory: %s',
                self._lags[Inofit])
            self._lags = self._lags[Ifit]

        ### RUN ESTIMATION
        if self._estimated:
            # we already had run an estimation, determine which lag times we need to compute
            # TODO: this will re-evaluate problematic lag times, wont it?
            lags = sorted(list(set(self._lags).difference(self._last_lags)))
            if len(lags) == 0:
                self.logger.info("All lag times already estimated.")
                return self
            # cannot fail here: the empty case has returned above
            assert lags
            self.logger.info(
                "Running estimating for not yet estimated lags times: %s",
                lags)
        else:
            lags = self._lags

        # construct all parameter sets for the estimator
        param_sets = tuple(param_grid({'lag': lags}))

        # run estimation on all lag times
        # switch off the wrapped estimator's own progress flag (if it has one);
        # progress of the scan is reported through pg below when enabled
        if hasattr(self.estimator, 'show_progress'):
            self.estimator.show_progress = False
        if self.show_progress:
            pg = ProgressReporter()
            ctx = pg.context()
        else:
            pg = None
            # TODO: replace with nullcontext from util once merged.
            from contextlib import contextmanager

            # do-nothing context manager so the with-statement below works in both cases
            @contextmanager
            def dummy():
                yield

            ctx = dummy()
        with ctx:
            if not self.only_timescales:
                # full scan: keep both the models and the estimators that produced them
                models, estimators = estimate_param_scan(
                    self.estimator,
                    dtrajs,
                    param_sets,
                    failfast=False,
                    return_estimators=True,
                    n_jobs=self.n_jobs,
                    progress_reporter=pg,
                    return_exceptions=True)
                self._estimators = estimators
            else:
                # timescales-only scan: evaluate 'timescales' with argument self.nits and,
                # for estimators producing samples, additionally 'sample_f' with argument
                # 'timescales' (entries of evaluate and evaluate_args correspond by position)
                evaluate = ['timescales']
                evaluate_args = [[self.nits]]
                if self._estimator_produces_samples():
                    evaluate.append('sample_f')
                    evaluate_args.append('timescales')
                results = estimate_param_scan(
                    self.estimator,
                    dtrajs,
                    param_sets,
                    failfast=False,
                    return_estimators=False,
                    n_jobs=self.n_jobs,
                    evaluate=evaluate,
                    evaluate_args=evaluate_args,
                    progress_reporter=pg,
                    return_exceptions=True,
                )

                # wrap the raw results into lightweight stand-in models, one per lag time
                if self._estimator_produces_samples():
                    models = [
                        _DummyModel(lag, ts, ts_sample)
                        for lag, (ts, ts_sample) in zip(lags, results)
                    ]
                else:
                    # no samples available: the third argument is None
                    models = [
                        _DummyModel(
                            lag,
                            ts,
                            None,
                        ) for lag, ts in zip(lags, results)
                    ]
            # still inside the (progress) context
            self._postprocess_results(models)

        return self
Exemplo n.º 14
0
    def _estimate(self, dtrajs):
        """Estimate a hidden Markov state model (HMSM) from discrete trajectories.

        The estimation proceeds in three stages:

        1. Obtain an initial MSM: ``self.msm_init`` if given, otherwise one
           estimated here from ``dtrajs`` at lag ``self.lag``.
        2. Coarse-grain that MSM with PCCA to ``self.nstates`` metastable states
           to build an initial hidden transition matrix and output matrix.
        3. Refine the initial HMM with ``bhmm.estimate_hmm`` on the lagged
           (and strided) observation trajectories.

        Side effects: ``self.reversible`` is overwritten when ``self.msm_init``
        is given, and ``self.stride`` is replaced by an integer when it was
        ``'effective'``.

        Parameters
        ----------
        dtrajs : discrete trajectory or list of discrete trajectories
            Input data; converted with ``_types.ensure_dtraj_list``.

        Return
        ------
        hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
            Estimated Hidden Markov state model (this method returns ``self``)

        """
        # ensure right format
        dtrajs = _types.ensure_dtraj_list(dtrajs)
        # if no initial MSM is given, estimate it now
        if self.msm_init is None:
            # estimate with sparse=False, because we need to do PCCA which is currently not implemented for sparse
            # estimate with store_data=True, because we need an EstimatedMSM
            msm_estimator = _MSMEstimator(lag=self.lag, reversible=self.reversible, sparse=False,
                                          connectivity=self.connectivity, dt_traj=self.timestep_traj)
            msm_init = msm_estimator.estimate(dtrajs)
        else:
            assert isinstance(self.msm_init, _EstimatedMSM), 'msm_init must be of type EstimatedMSM'
            msm_init = self.msm_init
            # the HMM inherits reversibility from the user-supplied MSM
            self.reversible = msm_init.is_reversible

        # generate lagged observations
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_init.nstates > self.nstates:
                corrtime = int(max(1, msm_init.timescales()[self.nstates-1]))
                # use the smaller of these two pessimistic estimates
                self.stride = min(self.stride, 2*corrtime)
        # TODO: Here we always use the full observation state space for the estimation.
        dtrajs_lagged = _lag_observations(dtrajs, self.lag, stride=self.stride)

        # check input
        assert _types.is_int(self.nstates) and self.nstates > 1 and self.nstates <= msm_init.nstates, \
            'nstates must be an int in [2,msmobj.nstates]'
        # if hmm.nstates = msm.nstates there is no problem. Otherwise, check spectral gap
        if msm_init.nstates > self.nstates:
            # fetch the timescales once; ratio k compares timescale k with timescale k+1
            timescales = msm_init.timescales()
            timescale_ratios = timescales[:-1] / timescales[1:]
            if timescale_ratios[self.nstates-2] < 2.0:
                # Logger.warn is a deprecated alias; use Logger.warning like the rest of the code base
                self.logger.warning('Requested coarse-grained model with ' + str(self.nstates) +
                                    ' metastable states at ' +
                                    'lag=' + str(self.lag) + '.' + 'The ratio of relaxation timescales between ' +
                                    str(self.nstates) + ' and ' + str(self.nstates+1) + ' states is only ' +
                                    str(timescale_ratios[self.nstates-2]) + ' while we recommend at least 2. ' +
                                    ' It is possible that the resulting HMM is inaccurate. Handle with caution.')

        # set things from MSM
        # TODO: dtrajs_obs is set here, but not used in estimation. Estimation is always done with
        # TODO: respect to full observation (see above). This is confusing. Define how we want to do this in gen.
        # TODO: observable set is also not used, it is just saved.
        nstates_obs_full = msm_init.nstates_full
        if self.observe_active:
            observable_set = msm_init.active_set
            dtrajs_obs = msm_init.discrete_trajectories_active
        else:
            observable_set = np.arange(nstates_obs_full)
            dtrajs_obs = msm_init.discrete_trajectories_full

        # TODO: this is redundant with BHMM code because that code is currently not easily accessible and
        # TODO: we don't want to re-estimate. Should be reengineered in bhmm.
        # ---------------------------------------------------------------------------------------
        # PCCA-based coarse-graining
        # ---------------------------------------------------------------------------------------
        # pcca- to number of metastable states
        pcca = msm_init.pcca(self.nstates)

        # HMM output matrix
        eps = 0.01 * (1.0/nstates_obs_full)  # default output probability, in order to avoid zero columns
        # Use PCCA distributions, but at least eps to avoid 100% assignment to any state (breaks convergence)
        B_conn = np.maximum(msm_init.metastable_distributions, eps)
        # full state space output matrix
        B = eps * np.ones((self.nstates, nstates_obs_full), dtype=np.float64)
        # expand B_conn to full state space
        # TODO: here we always select the active set, no matter if observe_active=True or False.
        B[:, msm_init.active_set] = B_conn[:, :]
        # TODO: at this point we will have zero observation probabilities for states that are not in the active
        # TODO: set. If these occur in the trajectory, that will mean zero columns in the output probabilities
        # TODO: and crash of forward-backward and sampling algorithms.
        # renormalize B to make it row-stochastic
        B /= B.sum(axis=1)[:, None]

        # coarse-grained transition matrix
        P_coarse = pcca.coarse_grained_transition_matrix
        # take care of unphysical values. First symmetrize
        X = np.dot(np.diag(pcca.coarse_grained_stationary_probability), P_coarse)
        X = 0.5*(X + X.T)
        # if there are values < 0, set to eps
        X = np.maximum(X, eps)
        # turn into coarse-grained transition matrix
        A = X / X.sum(axis=1)[:, None]

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------
        # lazy import bhmm here in order to avoid dependency loops
        import bhmm
        # initialize discrete HMM
        hmm_init = bhmm.discrete_hmm(A, B, stationary=True, reversible=self.reversible)
        # run EM; lag=1 because dtrajs_lagged has already been subsampled at self.lag
        hmm = bhmm.estimate_hmm(dtrajs_lagged, self.nstates, lag=1, initial_model=hmm_init,
                                accuracy=self.accuracy, maxit=self.maxit)
        self.hmm = bhmm.DiscreteHMM(hmm)

        # find observable set
        transition_matrix = self.hmm.transition_matrix
        observation_probabilities = self.hmm.output_probabilities
        # TODO: Cutting down... OK, this can be done
        if self.observe_active:  # cut down observation probabilities to active set
            observation_probabilities = observation_probabilities[:, msm_init.active_set]
            observation_probabilities /= observation_probabilities.sum(axis=1)[:, None]  # renormalize

        # parametrize self
        self._dtrajs_full = dtrajs
        self._dtrajs_lagged = dtrajs_lagged
        self._observable_set = observable_set
        self._dtrajs_obs = dtrajs_obs
        self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                              reversible=self.reversible, dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Exemplo n.º 15
0
    def score(self, dtrajs, score_method=None, score_k=None):
        """ Scores the MSM on test data using the variational approach for Markov processes [1]_ [2]_

        Currently only implemented using dense matrices - will be slow for large state spaces.

        Note that a non-``None`` `score_method` or `score_k` is stored on the
        estimator, and that ``self.score_k`` is capped at the number of MSM states.

        Parameters
        ----------
        dtrajs : list of arrays
            test data (discrete trajectories).
        score_method : str, optional, default='VAMP2'
            Overwrite scoring method to be used if desired. If `None`, the estimators scoring
            method will be used.
            Available scores are based on the variational approach for Markov processes [1]_ [2]_ :

            *  'VAMP1'  Sum of singular values of the symmetrized transition matrix [2]_ .
                        If the MSM is reversible, this is equal to the sum of transition
                        matrix eigenvalues, also called Rayleigh quotient [1]_ [3]_ .
            *  'VAMP2'  Sum of squared singular values of the symmetrized transition matrix [2]_ .
                        If the MSM is reversible, this is equal to the kinetic variance [4]_ .

        score_k : int or None
            The maximum number of eigenvalues or singular values used in the
            score. If set to None, all available eigenvalues will be used.

        Returns
        -------
        The value returned by ``pyemma.util.metrics.vamp_score`` for the
        training and test covariance matrices.

        References
        ----------
        .. [1] Noe, F. and F. Nueske: A variational approach to modeling slow processes
            in stochastic dynamical systems. SIAM Multiscale Model. Simul. 11, 635-655 (2013).
        .. [2] Wu, H and F. Noe: Variational approach for learning Markov processes
            from time series data (in preparation)
        .. [3] McGibbon, R and V. S. Pande: Variational cross-validation of slow
            dynamical modes in molecular kinetics, J. Chem. Phys. 142, 124105 (2015)
        .. [4] Noe, F. and C. Clementi: Kinetic distance and kinetic maps from molecular
            dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015)

        """
        test_trajs = ensure_dtraj_list(dtrajs)

        # explicit arguments replace the estimator's stored scoring settings
        if score_method is not None:
            self.score_method = score_method
        if score_k is not None:
            self.score_k = score_k

        # resolve the scoring rank: default to, and never exceed, the number of states
        if self.score_k is None:
            self.score_k = self.nstates
        if self.score_k > self.nstates:
            self.logger.warning('Requested scoring rank {rank} exceeds number of MSM states. '
                                'Reduced to score_k = {nstates}'.format(rank=self.score_k, nstates=self.nstates))
            self.score_k = self.nstates

        # --- training side: model and count matrix, both forced to dense arrays
        model = self.transition_matrix
        train_counts = self.count_matrix_active
        from scipy.sparse import issparse
        if issparse(model):
            model = model.toarray()
        if issparse(train_counts):
            train_counts = train_counts.toarray()
        # instantaneous / time-lagged empirical covariances are the row / column sums
        train_cov_00 = _np.diag(train_counts.sum(axis=1))
        train_cov_tt = _np.diag(train_counts.sum(axis=0))

        # --- test side: count on the raw state space, then project onto the active set
        raw_test_counts = count_matrix(test_trajs, self.lag, sparse_return=False)
        n_raw = raw_test_counts.shape[0]
        # active states that actually exist in the test count matrix
        source_states = self.active_set[_np.where(self.active_set < n_raw)[0]]
        target_states = _np.arange(len(source_states))
        test_counts = _np.zeros((self.nstates, self.nstates))
        test_counts[_np.ix_(target_states, target_states)] = \
            raw_test_counts[_np.ix_(source_states, source_states)]
        test_cov_00 = _np.diag(test_counts.sum(axis=1))
        test_cov_tt = _np.diag(test_counts.sum(axis=0))

        # --- evaluate the variational score
        from pyemma.util.metrics import vamp_score
        return vamp_score(model, train_cov_00, train_counts, train_cov_tt,
                          test_cov_00, test_counts, test_cov_tt,
                          k=self.score_k, score=self.score_method)
Exemplo n.º 16
0
    def score_cv(self, dtrajs, n=10, score_method=None, score_k=None):
        """ Cross-validated variational score of the MSM [1]_ [2]_ [3]_ .

        In each of the `n` repetitions the data is cut into trajectory blocks
        (``self._blocksplit_dtrajs``), the blocks are divided into a training
        and a test set (``cvsplit_trajs``), a clone of this estimator is fitted
        to the training set and then scored on the test set.

        Currently only implemented using dense matrices - will be slow for large state spaces.

        Parameters
        ----------
        dtrajs : list of arrays
            Discrete trajectories to be split into training and test data.
        n : int
            Number of repetitions of the cross-validation. Use large n to get solid
            means of the score.
        score_method : str, optional, default='VAMP2'
            Overwrite scoring method to be used if desired. If `None`, the estimators scoring
            method will be used.
            Available scores are based on the variational approach for Markov processes [1]_ [2]_ :

            *  'VAMP1'  Sum of singular values of the symmetrized transition matrix [2]_ .
                        If the MSM is reversible, this is equal to the sum of transition
                        matrix eigenvalues, also called Rayleigh quotient [1]_ [3]_ .
            *  'VAMP2'  Sum of squared singular values of the symmetrized transition matrix [2]_ .
                        If the MSM is reversible, this is equal to the kinetic variance [4]_ .

        score_k : int or None
            The maximum number of eigenvalues or singular values used in the
            score. If set to None, all available eigenvalues will be used.

        Returns
        -------
        scores : ndarray
            One score per repetition, in the order they were computed.

        Raises
        ------
        ValueError
            If ``self.count_mode`` is neither ``'sliding'`` nor ``'sample'``.

        References
        ----------
        .. [1] Noe, F. and F. Nueske: A variational approach to modeling slow processes
            in stochastic dynamical systems. SIAM Multiscale Model. Simul. 11, 635-655 (2013).
        .. [2] Wu, H and F. Noe: Variational approach for learning Markov processes
            from time series data (in preparation).
        .. [3] McGibbon, R and V. S. Pande: Variational cross-validation of slow
            dynamical modes in molecular kinetics, J. Chem. Phys. 142, 124105 (2015).
        .. [4] Noe, F. and C. Clementi: Kinetic distance and kinetic maps from molecular
            dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015).

        """
        from deeptime.decomposition import cvsplit_trajs
        dtrajs = ensure_dtraj_list(dtrajs)  # ensure format

        count_mode = self.count_mode
        if count_mode != 'sliding' and count_mode != 'sample':
            raise ValueError('score_cv currently only supports count modes "sliding" and "sample"')
        use_sliding = (count_mode == 'sliding')

        # work on a clone so that refitting does not touch this estimator
        from pyemma._ext.sklearn.base import clone
        cv_estimator = clone(self)

        fold_scores = []
        for _ in range(n):
            blocks = self._blocksplit_dtrajs(dtrajs, use_sliding)
            train_blocks, test_blocks = cvsplit_trajs(blocks)
            cv_estimator.fit(train_blocks)
            fold_scores.append(
                cv_estimator.score(test_blocks, score_method=score_method, score_k=score_k))
        return _np.array(fold_scores)
Exemplo n.º 17
0
    def _estimate(self, dtrajs):
        """Sample hidden Markov state models around a maximum-likelihood HMSM.

        If ``self.init_hmsm`` is None, the maximum-likelihood model is first
        estimated by the superclass ``_estimate`` (with connectivity and
        observation-set restrictions temporarily switched off); otherwise the
        parameters and results of ``self.init_hmsm`` are copied onto ``self``.
        The resulting model initializes ``bhmm.bayesian_hmm``, whose samples are
        restricted to the observable set and stored via ``update_model_params``.

        Parameters
        ----------
        dtrajs : discrete trajectory or list of discrete trajectories
            Input data; converted with ``ensure_dtraj_list``.

        Returns
        -------
        The result of ``self.submodel(...)`` for the configured connectivity
        and observation-set restrictions.

        Raises
        ------
        NotImplementedError
            If ``self.reversible`` is set and the hidden count matrix is
            disconnected.
        """
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
            # memorize the observation state for bhmm and reset
            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
            default_connectivity = self.connectivity
            default_mincount_connectivity = self.mincount_connectivity
            default_observe_nonempty = self.observe_nonempty
            self.connectivity = None
            self.observe_nonempty = False
            self.mincount_connectivity = 0
            # NOTE(review): accuracy is overwritten here and never restored - confirm this is intended
            self.accuracy = 1e-2  # this is sufficient for an initial guess
            try:
                super(BayesianHMSM, self)._estimate(dtrajs)
            finally:
                # restore the user's settings even if the initial estimation fails,
                # so that a failed call does not leave the estimator misconfigured
                self.connectivity = default_connectivity
                self.mincount_connectivity = default_mincount_connectivity
                self.observe_nonempty = default_observe_nonempty
        else:  # if given another initialization, must copy its attributes
            # TODO: this is too tedious - need to automatize parameter+result copying between estimators.
            self.nstates = self.init_hmsm.nstates
            self.reversible = self.init_hmsm.is_reversible
            self.stationary = self.init_hmsm.stationary
            # trajectories
            self._dtrajs_full = self.init_hmsm._dtrajs_full
            self._dtrajs_lagged = self.init_hmsm._dtrajs_lagged
            self._observable_set = self.init_hmsm._observable_set
            self._dtrajs_obs = self.init_hmsm._dtrajs_obs
            # MLE estimation results
            self.likelihoods = self.init_hmsm.likelihoods  # Likelihood history
            self.likelihood = self.init_hmsm.likelihood
            self.hidden_state_probabilities = self.init_hmsm.hidden_state_probabilities  # gamma variables
            self.hidden_state_trajectories = self.init_hmsm.hidden_state_trajectories  # Viterbi path
            self.count_matrix = self.init_hmsm.count_matrix  # hidden count matrix
            self.initial_count = self.init_hmsm.initial_count  # hidden init count
            self.initial_distribution = self.init_hmsm.initial_distribution
            self._active_set = self.init_hmsm._active_set
            # update HMM Model
            self.update_model_params(
                P=self.init_hmsm.transition_matrix,
                pobs=self.init_hmsm.observation_probabilities,
                dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

        # check if we have a valid initial model
        import msmtools.estimation as msmest
        if self.reversible and not msmest.is_connected(self.count_matrix):
            raise NotImplementedError(
                'Encountered disconnected count matrix:\n ' +
                str(self.count_matrix) +
                'with reversible Bayesian HMM sampler using lag=' +
                str(self.lag) + ' and stride=' + str(self.stride) +
                '. Consider using shorter lag, ' +
                'or shorter stride (to use more of the data), ' +
                'or using a lower value for mincount_connectivity.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        nstates_full = msmest.number_of_states(dtrajs)
        if self.nstates_obs < nstates_full:
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * _np.ones(
                (self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            B_init[:, self.observable_set] = _np.maximum(
                eps, self.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = self.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples,
                                    description='Sampling HMSMs',
                                    stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm
        hmm_mle = discrete_hmm(self.initial_distribution,
                               self.transition_matrix, B_init)

        sampled_hmm = bayesian_hmm(
            self.discrete_trajectories_lagged,
            hmm_mle,
            nsample=self.nsamples,
            reversible=self.reversible,
            stationary=self.stationary,
            p0_prior=self.p0_prior,
            transition_matrix_prior=self.transition_matrix_prior,
            store_hidden=self.store_hidden,
            call_back=call_back)

        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples: convert every sampled HMM into an HMSM on the observable set
        sampled_models = [
            sampled_hmm.sampled_hmms[i] for i in range(self.nsamples)
        ]
        samples = []
        for sampled_model in sampled_models:
            # restrict to observable set if necessary
            Bobs = sampled_model.output_model.output_probabilities[:, self.observable_set]
            pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(
                _HMSM(sampled_model.transition_matrix,
                      pobs,
                      pi=sampled_model.stationary_distribution,
                      dt_model=self.dt_model))

        # store results
        self.sampled_trajs = [
            sampled_model.hidden_state_trajectories
            for sampled_model in sampled_models
        ]
        self.update_model_params(samples=samples)

        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'
        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity)
Exemplo n.º 18
0
    def _estimate(self, dtrajs):
        """Sample hidden Markov state models around a maximum-likelihood HMSM.

        If ``self.init_hmsm`` is None, a maximum-likelihood HMSM is estimated
        from ``dtrajs`` first; otherwise ``self.init_hmsm`` is used and its
        ``nstates`` and reversibility are copied onto this estimator. The
        model initializes ``bhmm.bayesian_hmm``, which is run with a transition
        matrix prior selected by ``self.prior`` ('sparse', 'uniform' or 'mixed').

        Parameters
        ----------
        dtrajs : discrete trajectory or list of discrete trajectories
            Input data; converted with ``ensure_dtraj_list``.

        Return
        ------
        hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
            Estimated Hidden Markov state model (this method returns ``self``)

        Raises
        ------
        ValueError
            If ``self.prior`` is not one of the supported prior modes.

        """
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        # if no initial HMSM is given, estimate it now
        if self.init_hmsm is None:
            # estimate with store_data=True, because we need an EstimatedHMSM
            hmsm_estimator = _MaximumLikelihoodHMSM(
                lag=self.lag,
                stride=self.stride,
                nstates=self.nstates,
                reversible=self.reversible,
                connectivity=self.connectivity,
                observe_active=self.observe_active,
                dt_traj=self.dt_traj)
            init_hmsm = hmsm_estimator.estimate(
                dtrajs)  # estimate with lagged trajectories
        else:
            # check input
            assert isinstance(
                self.init_hmsm,
                _EstimatedHMSM), 'hmsm must be of type EstimatedHMSM'
            init_hmsm = self.init_hmsm
            self.nstates = init_hmsm.nstates
            self.reversible = init_hmsm.is_reversible

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler
        if self.observe_active:
            import msmtools.estimation as msmest
            nstates_full = msmest.number_of_states(dtrajs)
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            # (an all-zero initialization would produce zero columns, hence eps)
            pobs = eps * _np.ones(
                (self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            pobs[:, init_hmsm.observable_set] = _np.maximum(
                eps, init_hmsm.observation_probabilities)
            # renormalize pobs to make it row-stochastic
            pobs /= pobs.sum(axis=1)[:, None]
        else:
            pobs = init_hmsm.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples,
                                    description='Sampling HMSMs',
                                    stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm
        hmm_mle = discrete_hmm(init_hmsm.transition_matrix,
                               pobs,
                               stationary=True,
                               reversible=self.reversible)

        # define prior
        if self.prior == 'sparse':
            self.prior_count_matrix = _np.zeros((self.nstates, self.nstates),
                                                dtype=_np.float64)
        elif self.prior == 'uniform':
            self.prior_count_matrix = _np.ones((self.nstates, self.nstates),
                                               dtype=_np.float64)
        elif self.prior == 'mixed':
            # scale each row of the transition matrix by the inverse of its
            # off-diagonal probability mass
            # NOTE(review): a row without off-diagonal mass divides by zero here - confirm upstream guarantees
            P0 = init_hmsm.transition_matrix
            P0_offdiag = P0 - _np.diag(_np.diag(P0))
            scaling_factor = 1.0 / _np.sum(P0_offdiag, axis=1)
            self.prior_count_matrix = P0 * scaling_factor[:, None]
        else:
            # str() so that a non-string prior (e.g. None) still yields the
            # intended ValueError instead of a TypeError from concatenation
            raise ValueError('Unknown prior mode: ' + str(self.prior))

        sampled_hmm = bayesian_hmm(
            init_hmsm.discrete_trajectories_lagged,
            hmm_mle,
            nsample=self.nsamples,
            transition_matrix_prior=self.prior_count_matrix,
            call_back=call_back)

        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples
        sample_Ps = [
            sampled_hmm.sampled_hmms[i].transition_matrix
            for i in range(self.nsamples)
        ]
        sample_pis = [
            sampled_hmm.sampled_hmms[i].stationary_distribution
            for i in range(self.nsamples)
        ]
        sample_pobs = [
            sampled_hmm.sampled_hmms[i].output_model.output_probabilities
            for i in range(self.nsamples)
        ]
        samples = []
        for i in range(
                self.nsamples):  # restrict to observable set if necessary
            Bobs = sample_pobs[i][:, init_hmsm.observable_set]
            sample_pobs[i] = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(
                _HMSM(sample_Ps[i],
                      sample_pobs[i],
                      pi=sample_pis[i],
                      dt_model=init_hmsm.dt_model))

        # parametrize self
        self._dtrajs_full = dtrajs
        self._observable_set = init_hmsm._observable_set
        self._dtrajs_obs = init_hmsm._dtrajs_obs
        self.set_model_params(samples=samples,
                              P=init_hmsm.transition_matrix,
                              pobs=init_hmsm.observation_probabilities,
                              dt_model=init_hmsm.dt_model)

        return self
Exemplo n.º 19
0
    def _estimate(self, dtrajs):
        """Estimate a Markov state model from discrete trajectories.

        Counts lagged transitions, selects the active set according to
        ``self.connectivity``, estimates the transition matrix on the active
        count matrix and stores the result as this estimator's model parameters.

        Parameters
        ----------
        dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`pyemma.msm.util.dtraj_states.DiscreteTrajectoryStats`
            discrete trajectories, stored as integer ndarrays (arbitrary size)
            or a single ndarray for only one trajectory.

        Returns
        -------
        MSM : :class:`pyemma.msm.EstimatedMSM` or :class:`pyemma.msm.MSM`
            This estimator (``self``).

        Raises
        ------
        RuntimeError
            If the active set is empty.
        ValueError
            If reversible estimation is requested with connectivity 'none'
            but the active count matrix is not connected.
        NotImplementedError
            If ``self.connectivity`` is neither 'largest' nor 'none'.

        """
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        # reuse precomputed trajectory statistics when given, else compute them
        have_stats = isinstance(dtrajs, _DiscreteTrajectoryStats)
        stats = dtrajs if have_stats else _DiscreteTrajectoryStats(dtrajs)
        # warn if a freshly built model seems too large to be dense
        if not have_stats and stats.nstates > 4000 and not self.sparse:
            self.logger.warning(
                'Building a dense MSM with ' + str(stats.nstates) +
                ' states. This can be '
                'inefficient or unfeasible in terms of both runtime and memory consumption. '
                'Consider using sparse=True.')

        # lagged counts, then the count matrix on the full state space
        stats.count_lagged(self.lag, count_mode=self.count_mode)
        self._C_full = stats.count_matrix()
        self._nstates_full = self._C_full.shape[0]

        # choose the active set, which doubles as the active -> full mapping
        if self.connectivity != 'largest':
            # every other mode treats all visited states as active
            self.active_set = stats.visited_set
        elif self.statdist_constraint is None:
            # no stationary distribution given: largest connected set
            self.active_set = stats.largest_connected_set
        else:
            self.active_set = self._prepare_input_revpi(
                self._C_full, self.statdist_constraint)

        # FIXME: the flag is raised early so that the parameters set above are usable; this is not clean!
        self._is_estimated = True

        # nothing can be estimated on an empty active set
        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate MSM.')

        # count matrix restricted to the active set
        self._C_active = stats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # inverse mapping full -> active; states outside the active set map to -1
        self._full2active = _np.full(stats.nstates, -1, dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # stationary distribution constraint, restricted to the active set
        statdist_active = None
        if self.statdist_constraint is not None:
            statdist_active = self.statdist_constraint[self.active_set]
            statdist_active /= statdist_active.sum()  # renormalize

        # transition matrix: only 'largest' and 'none' are implemented
        mode = self.connectivity
        if mode != 'largest' and mode != 'none':
            raise NotImplementedError(
                'MSM estimation with connectivity=%s is currently not implemented.'
                % mode)
        # with 'none', reversible estimation needs a connected active set
        # (in which case the estimate coincides with mode 'largest')
        if mode == 'none' and self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError(
                'Reversible MSM estimation is not possible with connectivity mode "none", '
                'because the set of all visited states is not reversibly connected'
            )
        P = msmest.transition_matrix(self._C_active,
                                     reversible=self.reversible,
                                     mu=statdist_active,
                                     maxiter=self.maxiter,
                                     maxerr=self.maxerr)

        # dense mode: convert the count matrices and P to arrays so that all
        # derived quantities are computed with dense matrix algebra
        if not self.sparse:
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()
            P = P.toarray()

        # store the model parameters on this estimator
        self._dtrajs_full = dtrajs
        self._connected_sets = msmest.connected_sets(self._C_full)
        self.set_model_params(P=P,
                              pi=statdist_active,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Exemplo n.º 20
0
    def _estimate(self, dtrajs):
        """Estimate a discrete hidden Markov state model from discrete trajectories.

        The trajectories are lagged and strided, an initial HMM is built
        according to ``self.msm_init``, the model is refined with the bhmm
        maximum-likelihood estimator, and the results are stored on ``self``.

        Parameters
        ----------
        dtrajs : array-like or list of array-likes
            Discrete trajectories. They are normalized with
            ``ensure_dtraj_list`` before use.

        Return
        ------
        hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
            Estimated Hidden Markov state model, i.e. the result of
            ``self.submodel(...)`` restricted according to
            ``self.connectivity`` and the observation subset.

        Raises
        ------
        ValueError
            If ``self.lag`` is not smaller than the longest trajectory, or if
            ``self.msm_init`` is not a recognised initialization option.

        """
        import bhmm
        # ensure right format
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # CHECK LAG
        trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
        if self.lag >= _np.max(trajlengths):
            raise ValueError('Illegal lag time ' + str(self.lag) + ' exceeds longest trajectory length')
        mean_trajlength = _np.mean(trajlengths)
        if self.lag > mean_trajlength:
            # the mean is a float: it must be converted explicitly, otherwise the
            # string concatenation raises a TypeError instead of emitting the warning
            self.logger.warning('Lag time ' + str(self.lag) + ' is on the order of mean trajectory length '
                                + str(mean_trajlength) + '. It is recommended to fit four lag times in each '
                                + 'trajectory. HMM might be inaccurate.')

        # EVALUATE STRIDE
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the nonreversible
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                           connectivity='largest', dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                corrtime = max(1, msm_nr.timescales()[self.nstates-1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2*corrtime))

        # LAG AND STRIDE DATA
        dtrajs_lagged_strided = bhmm.lag_observations(dtrajs, self.lag, stride=self.stride)

        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # INIT HMM
        from bhmm import init_discrete_hmm
        from pyemma.msm.estimators import MaximumLikelihoodMSM
        if self.msm_init == 'largest-strong':
            hmm_init = init_discrete_hmm(dtrajs_lagged_strided, self.nstates, lag=1,
                                         reversible=self.reversible, stationary=True, regularize=True,
                                         method='lcs-spectral', separate=self.separate)
        elif self.msm_init == 'all':
            hmm_init = init_discrete_hmm(dtrajs_lagged_strided, self.nstates, lag=1,
                                         reversible=self.reversible, stationary=True, regularize=True,
                                         method='spectral', separate=self.separate)
        elif isinstance(self.msm_init, MaximumLikelihoodMSM):  # initial MSM given.
            from bhmm.init.discrete import init_discrete_hmm_spectral
            p0, P0, pobs0 = init_discrete_hmm_spectral(self.msm_init.count_matrix_full, self.nstates,
                                                       reversible=self.reversible, stationary=True,
                                                       active_set=self.msm_init.active_set,
                                                       P=self.msm_init.transition_matrix, separate=self.separate)
            hmm_init = bhmm.discrete_hmm(p0, P0, pobs0)
            observe_subset = self.msm_init.active_set  # override observe_subset.
        else:
            raise ValueError('Unknown MSM initialization option: ' + str(self.msm_init))

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------

        # run EM
        from bhmm.estimators.maximum_likelihood import MaximumLikelihoodEstimator as _MaximumLikelihoodEstimator
        hmm_est = _MaximumLikelihoodEstimator(dtrajs_lagged_strided, self.nstates, initial_model=hmm_init,
                                              output='discrete', reversible=self.reversible, stationary=self.stationary,
                                              accuracy=self.accuracy, maxit=self.maxit)
        # run
        hmm_est.fit()
        # package in discrete HMM
        self.hmm = bhmm.DiscreteHMM(hmm_est.hmm)

        # get model parameters
        self.initial_distribution = self.hmm.initial_distribution
        transition_matrix = self.hmm.transition_matrix
        observation_probabilities = self.hmm.output_probabilities

        # get estimation parameters
        self.likelihoods = hmm_est.likelihoods  # Likelihood history
        self.likelihood = self.likelihoods[-1]
        self.hidden_state_probabilities = hmm_est.hidden_state_probabilities  # gamma variables
        self.hidden_state_trajectories = hmm_est.hmm.hidden_state_trajectories  # Viterbi path
        self.count_matrix = hmm_est.count_matrix  # hidden count matrix
        self.initial_count = hmm_est.initial_count  # hidden init count
        self._active_set = _np.arange(self.nstates)

        # TODO: it can happen that we lose states due to striding. Should we lift the output probabilities afterwards?
        # parametrize self
        self._dtrajs_full = dtrajs
        self._dtrajs_lagged = dtrajs_lagged_strided
        self._nstates_obs_full = msmest.number_of_states(dtrajs)
        self._nstates_obs = msmest.number_of_states(dtrajs_lagged_strided)
        self._observable_set = _np.arange(self._nstates_obs)
        self._dtrajs_obs = dtrajs
        self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                              reversible=self.reversible, dt_model=self.timestep_traj.get_scaled(self.lag))

        # TODO: perhaps remove connectivity and just rely on .submodel()?
        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset, obs=observe_subset, mincount_connectivity=self.mincount_connectivity)
Exemplo n.º 21
0
    def _estimate(self, data):
        r"""Estimate implied timescales (ITS) at a set of lag times.

        One model is estimated per lag time with ``self.estimator``; the
        timescales of every successfully estimated model are collected into
        ``self._its`` (shape ``(n_lags, nits)``, NaN where no timescale is
        available). If the models are ``SampledModel`` instances, the sampled
        timescales are additionally stored in ``self._its_samples``.

        Parameters
        ----------
        data : array-like or list of array-likes
            Discrete trajectories. They are normalized with
            ``ensure_dtraj_list`` before use.

        Returns
        -------
        None
            Results are stored on ``self`` (``_trajlengths``, ``_lags``,
            ``_models``, ``_estimators``, ``nits``, ``_its``,
            ``_successful_lag_indexes`` and, if applicable, ``_its_samples``).

        Raises
        ------
        RuntimeError
            If the estimation failed at every lag time.

        """
        ### PREPARE AND CHECK DATA
        # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
        data = _types.ensure_dtraj_list(data)

        # check trajectory lengths
        self._trajlengths = np.array([len(traj) for traj in data])
        maxlength = np.max(self._trajlengths)

        # set lag times by data if not yet set
        if self._lags is None:
            # upper bound for the generated lags: half the mean trajectory length
            maxlag = 0.5 * np.sum(self._trajlengths) / float(len(self._trajlengths))
            self._lags = _generate_lags(maxlag, 1.5)

        # check if some lag times are forbidden.
        if np.max(self._lags) >= maxlength:
            Ifit = np.where(self._lags < maxlength)[0]
            Inofit = np.where(self._lags >= maxlength)[0]
            self.logger.warning(
                'Ignoring lag times that exceed the longest trajectory: ' +
                str(self._lags[Inofit]))
            self._lags = self._lags[Ifit]

        ### RUN ESTIMATION

        # construct all parameter sets for the estimator
        param_sets = tuple(param_grid({'lag': self._lags}))

        if isinstance(self.estimator, SampledModel):
            self.estimator.show_progress = False

        # run estimation on all lag times
        self._models, self._estimators = estimate_param_scan(
            self.estimator,
            data,
            param_sets,
            failfast=False,
            return_estimators=True,
            n_jobs=self.n_jobs,
            progress_reporter=self)

        ### PROCESS RESULTS
        # if some results are None, estimation has failed. Warn and truncate models and lag times
        good = np.array(
            [i for i, m in enumerate(self._models) if m is not None],
            dtype=int)
        bad = np.array(
            [i for i, m in enumerate(self._models) if m is None],
            dtype=int)
        if good.size == 0:
            raise RuntimeError(
                'Estimation has failed at ALL lagtimes. Check for errors.')
        if bad.size > 0:
            self.logger.warning(
                'Estimation has failed at lagtimes: ' + str(self._lags[bad]) +
                '. Run single-lag estimation at these lags to track down the error.'
            )
            # NOTE(review): self._estimators is not truncated here, so it may be
            # longer than self._lags / self._models after failures - confirm intended.
            self._lags = self._lags[good]
            self._models = list(np.array(self._models)[good])

        # timescales
        timescales = [m.timescales() for m in self._models]

        # how many finite timescales do we really have?
        maxnts = max([len(ts[np.isfinite(ts)]) for ts in timescales])
        if self.nits is None:
            self.nits = maxnts
        if maxnts < self.nits:
            self.nits = maxnts
            self.logger.warning(
                'Changed user setting nits to the number of available timescales nits='
                + str(self.nits))

        # sort timescales into matrix
        computed_all = True  # flag if we have found any problems
        self._its = np.empty((len(self._lags), self.nits))
        # initialize with NaN in order to point out timescales that were not computed
        # (np.nan, because the np.NAN alias no longer exists in NumPy 2.0)
        self._its[:] = np.nan
        self._successful_lag_indexes = []
        for i, ts in enumerate(timescales):
            if ts is not None:
                # if there are any finite timescales available, add them
                if np.any(np.isfinite(ts)):
                    # copy into array. Leave NaN if there is no timescale
                    self._its[i, :len(ts)] = ts[:self.nits]
                    self._successful_lag_indexes.append(i)

        if len(self._successful_lag_indexes) < len(self._lags):
            computed_all = False
        if np.any(np.isnan(self._its)):
            computed_all = False

        # timescales samples if available
        if isinstance(self._models[0], SampledModel):
            # samples
            timescales_samples = [
                m.sample_f('timescales') for m in self._models
            ]
            nsamples = np.shape(timescales_samples[0])[0]
            self._its_samples = np.empty(
                (nsamples, len(self._lags), self.nits))
            # initialize with NaN in order to point out timescales that were not computed
            self._its_samples[:] = np.nan

            for i, ts in enumerate(timescales_samples):
                if ts is not None:
                    ts = np.vstack(ts)
                    ts = ts[:, :self.nits]
                    # copy into array. Leave NaN if there is no timescales
                    self._its_samples[:, i, :ts.shape[1]] = ts

            if np.any(np.isnan(self._its_samples)):
                computed_all = False

        if not computed_all:
            self.logger.warning(
                'Some timescales could not be computed. Timescales array is smaller than '
                'expected or contains NaNs')
Exemplo n.º 22
0
def timescales_msm(dtrajs,
                   lags=None,
                   nits=None,
                   reversible=True,
                   connected=True,
                   errors=None,
                   nsamples=50,
                   n_jobs=1,
                   show_progress=True):
    r""" Implied timescales from Markov state models estimated at a series of lag times.

    Parameters
    ----------
    dtrajs : array-like or list of array-likes
        discrete trajectories

    lags : array-like of integers, optional
        integer lag times at which the implied timescales will be calculated

    nits : int, optional
        number of implied timescales to be computed. Will compute less
        if the number of states are smaller. If None, the number of timescales
        will be automatically determined.

    connected : boolean, optional
        If true compute the connected set before transition matrix estimation
        at each lag separately

    reversible : boolean, optional
        Estimate transition matrix reversibly (True) or nonreversibly (False)

    errors : None | 'bayes', optional
        Specifies whether to compute statistical uncertainties (by default
        not), and which algorithm to use if yes. Currently the only option is:

        * 'bayes' for Bayesian sampling of the posterior

        Attention: Computing errors can be *very* slow if the MSM has many
        states. Moreover there are still unsolved theoretical problems, and
        therefore the uncertainty interval and the maximum likelihood estimator
        can be inconsistent. Use this as a rough guess for statistical
        uncertainties.

    nsamples : int, optional
        The number of approximately independent transition matrix samples
        generated for each lag time for uncertainty quantification.
        Only used if errors is not None.

    n_jobs : int, optional
        how many subprocesses to start to estimate the models for each lag time.

    show_progress : boolean, optional
        Forwarded to the Bayesian estimator (if ``errors='bayes'``) and to the
        implied timescales object.

    Returns
    -------
    itsobj : :class:`ImpliedTimescales <pyemma.msm.estimators.implied_timescales.ImpliedTimescales>` object

    Raises
    ------
    NotImplementedError
        If ``errors`` is neither None nor 'bayes'.

    Example
    -------
    >>> from pyemma import msm
    >>> dtraj = [0,1,1,2,2,2,1,2,2,2,1,0,0,1,1,1,2,2,1,1,2,1,1,0,0,0,1,1,2,2,1]   # mini-trajectory
    >>> ts = msm.its(dtraj, [1,2,3,4,5])
    >>> print(ts.timescales)  # doctest: +ELLIPSIS
    [[ 1.5...  0.2...]
     [ 3.1...  1.0...]
     [ 2.03...  1.02...]
     [ 4.63...  3.42...]
     [ 5.13...  2.59...]]

    See also
    --------
    ImpliedTimescales
        The object returned by this function.
    pyemma.plots.plot_implied_timescales
        Implied timescales plotting function. Just call it with the :class:`ImpliedTimescales <pyemma.msm.estimators.ImpliedTimescales>`
        object produced by this function as an argument.


    .. autoclass:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
            :attributes:

    References
    ----------
    Implied timescales as a lagtime-selection and MSM-validation approach were
    suggested in [1]_. Error estimation is done either using moving block
    bootstrapping [2]_ or a Bayesian analysis using Metropolis-Hastings Monte
    Carlo sampling of the posterior. Nonreversible Bayesian sampling is done
    by independently sampling Dirichlet distributions of the transition matrix
    rows. A Monte Carlo method for sampling reversible MSMs was introduced
    in [3]_. Here we employ a much more efficient algorithm introduced in [4]_.

    .. [1] Swope, W. C. and J. W. Pitera and F. Suits: Describing protein
        folding kinetics by molecular dynamics simulations: 1. Theory.
        J. Phys. Chem. B 108: 6571-6581 (2004)
    .. [2] Kuensch, H. R.: The jackknife and the bootstrap for general
        stationary observations. Ann. Stat. 17, 1217-1241 (1989)
    .. [3] Noe, F.: Probability Distributions of Molecular Observables computed
        from Markov Models. J. Chem. Phys. 128, 244103 (2008)
    .. [4] Trendelkamp-Schroer, B, H. Wu, F. Paul and F. Noe:
        Estimation and uncertainty of reversible Markov models.
        http://arxiv.org/abs/1507.05990

    """
    # Reject unsupported error-estimation methods up front, before any data is
    # converted. str() keeps the message well-formed for non-string values too.
    if errors is not None and errors != 'bayes':
        raise NotImplementedError('Error estimation method ' + str(errors) +
                                  ' currently not implemented')

    # format data
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    connectivity = 'largest' if connected else 'none'

    # MLE or error estimation?
    if errors is None:
        estimator = _ML_MSM(reversible=reversible, connectivity=connectivity)
    else:  # errors == 'bayes'
        estimator = _Bayes_MSM(reversible=reversible,
                               connectivity=connectivity,
                               nsamples=nsamples,
                               show_progress=show_progress)

    # go
    itsobj = _ImpliedTimescales(estimator,
                                lags=lags,
                                nits=nits,
                                n_jobs=n_jobs,
                                show_progress=show_progress)
    itsobj.estimate(dtrajs)
    return itsobj
Exemplo n.º 23
0
def rewrite_dtrajs_to_core_sets(dtrajs, core_set, in_place=False):
    r""" Rewrite trajectories that contain unassigned states.

    The given discrete trajectories are rewritten such that states not in the core
    set are -1. Trajectories that begin with unassigned states will be truncated here.
    Index offsets are computed to keep assignment to original data.

    Examples
    --------
    Let's assume we want to restrict the core sets to 0, 1 and 3:

    >>> import numpy as np
    >>> dtrajs = [np.array([5, 4, 1, 3, 4, 4, 5, 3, 0, 1]),
    ...           np.array([4, 4, 4, 5]),
    ...           np.array([4, 4, 5, 1, 2, 3])]
    >>> dtraj_core, offsets, n_cores = rewrite_dtrajs_to_core_sets(dtrajs, core_set=[0, 1, 3])
    >>> print(dtraj_core)
    [array([ 1,  3, -1, -1, -1,  3,  0,  1]), array([ 1, -1,  3])]

    We reach the first milestone in the first trajectory after two steps and in
    the third trajectory after three steps:

    >>> print(offsets)
    [2, None, 3]

    Since the second trajectory never visited a core set, it will be removed and marked as such in the offsets
    lists by a 'None'. Each entry corresponds to one entry in the input list.

    Parameters
    ----------
    dtrajs: array_like or list of array_like
        Discretized trajectory or list of discretized trajectories.

    core_set: array -like of ints
        Pass an array of micro-states to define the core sets. A list or tuple
        of such arrays is concatenated into a single set of states.

    in_place: boolean, default=False
        if True, replace the current dtrajs
        if False, return a copy

    Returns
    -------
    dtrajs, offsets, n_cores: list of ndarray(dtype=int), list, int
        The rewritten trajectories (those that never visit a core are dropped),
        one offset per input trajectory (index of the first core state, or None
        if no core was visited), and the number of distinct core states.

    """
    import copy
    import warnings
    from pyemma.util import types

    dtrajs = types.ensure_dtraj_list(dtrajs)

    if isinstance(core_set, (list, tuple)):
        core_set = list(map(types.ensure_int_vector, core_set))
        core_set = np.unique(np.concatenate(core_set))
    else:
        core_set = np.unique(types.ensure_int_vector(core_set))

    n_cores = len(core_set)

    if not in_place:
        dtrajs = copy.deepcopy(dtrajs)

    # if we have no state definition at the beginning of a trajectory, we store the offset to the first milestone.
    offsets = [0] * len(dtrajs)

    for i, d in enumerate(dtrajs):
        # set non-core states to -1
        # (np.isin replaces np.in1d, which is deprecated since NumPy 2.0; for the
        # one-dimensional trajectories handled here both give the same mask)
        outside_core_set = ~np.isin(d, core_set)
        if not np.any(outside_core_set):
            continue
        d[outside_core_set] = -1

        where_positive = np.where(d >= 0)[0]
        # traj never reached a core set?
        if len(where_positive) == 0:
            offsets[i] = None
            warnings.warn(
                'The entire trajectory with index {i} never visited a core set!'
                .format(i=i))
            continue

        # np.where returns ascending indices, so the first entry is the minimum;
        # stored as a plain int so the offsets list contains only ints and None
        first_core = int(where_positive[0])
        offsets[i] = first_core
        if first_core > 0:
            warnings.warn(
                'The trajectory with index {i} had to be truncated for not starting in a core.'
                .format(i=i))
            dtrajs[i] = d[first_core:]

    # filter empty dtrajs
    dtrajs = [d for d, offset in zip(dtrajs, offsets) if offset is not None]

    return dtrajs, offsets, n_cores