Example #1
import numpy as np
import scipy.sparse


def bootstrap_counts(dtrajs, lagtime, corrlength=None):
    """
    Generates a randomly resampled count matrix from the input discrete trajectories.

    See API function for full documentation.
    """
    from scipy.stats import rv_discrete
    # if we have just one trajectory, put it into a one-element list:
    if not isinstance(dtrajs, list):
        dtrajs = [dtrajs]
    ntraj = len(dtrajs)

    # can we do the estimate?
    lengths = determine_lengths(dtrajs)
    Lmax = np.max(lengths)
    Ltot = np.sum(lengths)
    if lagtime >= Lmax:
        raise ValueError('Cannot estimate count matrix: lag time '
                         + str(lagtime) + ' is longer than the longest trajectory length ' + str(Lmax))

    # how many counts can we sample?
    if corrlength is None:
        corrlength = lagtime
    nsample = int(Ltot / corrlength)

    # determine number of states n
    from deeptime.markov import number_of_states
    n = number_of_states(dtrajs)

    # assigning trajectory sampling weights
    w_trajs = np.maximum(0.0, lengths - lagtime)
    w_trajs /= np.sum(w_trajs)  # normalize to sum 1.0
    distrib_trajs = rv_discrete(values=(list(range(ntraj)), w_trajs))
    # sample number of counts from each trajectory
    n_from_traj = np.bincount(distrib_trajs.rvs(size=nsample), minlength=ntraj)

    # for each trajectory, sample counts and stack them
    rows = np.zeros(nsample, dtype=int)
    cols = np.zeros(nsample, dtype=int)
    ones = np.ones(nsample)
    ncur = 0
    for i in range(len(n_from_traj)):
        if n_from_traj[i] > 0:
            (r, c) = bootstrap_counts_singletraj(dtrajs[i], lagtime, n_from_traj[i])
            rows[ncur:ncur + n_from_traj[i]] = r
            cols[ncur:ncur + n_from_traj[i]] = c
            ncur += n_from_traj[i]
    # sum over counts
    Csparse = scipy.sparse.coo_matrix((ones, (rows, cols)), shape=(n, n))

    return Csparse.tocsr()
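
A minimal usage sketch with toy data (the function relies on the module-level helpers determine_lengths and bootstrap_counts_singletraj, which are assumed to be importable alongside it; the data and values below are illustrative):

import numpy as np

# two short discrete trajectories over three states
dtrajs = [np.array([0, 1, 1, 2, 0, 0, 1, 2]),
          np.array([2, 2, 1, 0, 1, 0])]

# resample a count matrix at lag time 2; with corrlength=None,
# about sum(lengths) / lagtime transition counts are drawn
C = bootstrap_counts(dtrajs, lagtime=2)
print(C.toarray())  # 3x3 bootstrapped count matrix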
Example #2
def _split_sequences_multitraj(dtrajs, lag):
    """ splits the discrete trajectories into conditional sequences by starting state

    Parameters
    ----------
    dtrajs : list of int-iterables
        discrete trajectories
    lag : int
        lag time

    Returns
    -------
    res : list of lists
        one entry per state; entry ``i`` collects all conditional
        sequences that start in state ``i``
    """
    from deeptime.markov import number_of_states
    n = number_of_states(dtrajs)
    res = [[] for _ in range(n)]
    for dtraj in dtrajs:
        states, seqs = _split_sequences_singletraj(dtraj, n, lag)
        for state, seq in zip(states, seqs):
            res[state].append(seq)
    return res
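
For orientation, a hedged usage sketch (toy data; _split_sequences_multitraj is a private helper and delegates to the module's _split_sequences_singletraj, so the exact shape of the returned sequences depends on that helper):

import numpy as np

dtrajs = [np.array([0, 1, 0, 2, 1, 0, 1])]
res = _split_sequences_multitraj(dtrajs, lag=2)
# res[i] collects the conditional sequences starting in state i
for state, seqs in enumerate(res):
    print(state, seqs)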
Example #3
    def __init__(self, dtrajs):
        from pyemma.util.types import ensure_dtraj_list

        # discrete trajectories
        self._dtrajs = ensure_dtraj_list(dtrajs)

        # TODO: extensive input checking!
        if any(np.any(d < -1) for d in self._dtrajs):
            raise ValueError('Discrete trajectory contains elements < -1.')

        # basic count statistics
        # histogram of state visits
        self._hist = count_states(self._dtrajs, ignore_negative=True)
        # total counts
        self._total_count = np.sum(self._hist)
        # number of states (use the validated trajectory list)
        self._nstates = number_of_states(self._dtrajs)

        # not yet estimated
        self._counted_at_lag = False
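
The basic statistics collected in this constructor can be reproduced directly with deeptime's counting utilities; a minimal sketch with toy data:

import numpy as np
from deeptime.markov import count_states, number_of_states

dtrajs = [np.array([0, 0, 1, 2, 1]), np.array([2, 1, 1, 0])]
hist = count_states(dtrajs, ignore_negative=True)  # visit count per state
total_count = np.sum(hist)                         # total number of counted frames
nstates = number_of_states(dtrajs)                 # size of the state space
print(hist, total_count, nstates)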
Example #4
    def fit(self,
            data,
            n_burn_in: int = 0,
            n_thin: int = 1,
            progress=None,
            **kwargs):
        r""" Sample from the posterior.

        Parameters
        ----------
        data : array_like or list of array_like
            Input time series data.
        n_burn_in : int, optional, default=0
            The number of samples to discard as burn-in, after which :attr:`n_samples` samples will be generated.
        n_thin : int, optional, default=1
            The number of Gibbs sampling updates used to generate each returned sample.
        progress : iterable, optional, default=None
            Optional progressbar. Tested for tqdm.
        **kwargs
            Ignored kwargs for scikit-learn compatibility.

        Returns
        -------
        self : BayesianHMM
            Reference to self.
        """
        progress = handle_progress_bar(progress)
        dtrajs = ensure_dtraj_list(data)

        # fetch priors
        tmat = self.initial_hmm.transition_model.transition_matrix
        transition_matrix_prior = self._transition_matrix_prior_np

        initial_distribution_prior = self._initial_distribution_prior_np

        model = BayesianHMMPosterior()
        # update HMM Model
        model.prior = self.initial_hmm.copy()

        prior = model.prior

        # check if we are strongly connected in the reversible case (plus prior)
        if self.reversible and not is_connected(tmat + transition_matrix_prior,
                                                directed=True):
            raise NotImplementedError(
                'Trying to sample disconnected HMM with option reversible:\n '
                f'{tmat}\n Use prior to connect, select connected subset, '
                f'or use reversible=False.')

        # EVALUATE STRIDE
        dtrajs_lagged_strided = compute_dtrajs_effective(
            dtrajs,
            lagtime=prior.lagtime,
            n_states=prior.n_hidden_states,
            stride=self.stride)
        # if the stride differs from the one used for init_hmm, check that the microstates
        # in the lagged/strided trajectories are compatible
        if self.stride != self.initial_hmm.stride:
            symbols = np.unique(np.concatenate(dtrajs_lagged_strided))
            if len(np.intersect1d(self.initial_hmm.observation_symbols,
                                  symbols)) != len(symbols):
                raise ValueError(
                    'Choice of stride has excluded a different set of microstates than in '
                    'init_hmm. Set of observed microstates in time-lagged strided trajectories '
                    'must match the one used for init_hmm estimation.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        n_states_full = number_of_states(dtrajs_lagged_strided)

        if prior.n_observation_states < n_states_full:
            eps = 0.01 / n_states_full  # default output probability, to avoid zero columns
            # full state space output matrix; make sure there are no zero columns
            full_obs_probabilities = eps * np.ones((prior.n_hidden_states, n_states_full),
                                                   dtype=np.float64)
            # fill active states
            full_obs_probabilities[:, prior.observation_symbols] = np.maximum(
                eps, prior.output_probabilities)
            # renormalize B to make it row-stochastic
            full_obs_probabilities /= full_obs_probabilities.sum(axis=1)[:, None]
        else:
            full_obs_probabilities = prior.output_probabilities

        maxT = max(len(o) for o in dtrajs_lagged_strided)

        # pre-construct hidden variables
        temp_alpha = np.zeros((maxT, prior.n_hidden_states))

        has_all_obs_symbols = model.prior.n_observation_states == len(
            model.prior.observation_symbols_full)

        try:
            # sample model is basically copy of prior
            sample_model = BayesianHMM._SampleStorage(
                transition_matrix=prior.transition_model.transition_matrix.copy(),
                output_model=DiscreteOutputModel(full_obs_probabilities.copy()),
                initial_distribution=prior.initial_distribution.copy(),
                stationary_distribution=prior.transition_model.stationary_distribution.copy(),
                counts=prior.count_model.count_matrix.copy(),
                hidden_trajs=[])

            # Run burn-in.
            for _ in range(n_burn_in):
                self._update(sample_model, dtrajs_lagged_strided, temp_alpha,
                             transition_matrix_prior,
                             initial_distribution_prior)

            # Collect data.
            models = []
            for _ in progress(range(self.n_samples),
                              desc="Drawing samples",
                              leave=False):
                # Run a number of Gibbs sampling updates to generate each sample.
                for _ in range(n_thin):
                    self._update(sample_model, dtrajs_lagged_strided,
                                 temp_alpha, transition_matrix_prior,
                                 initial_distribution_prior)
                    sample_model.output_model.normalize()
                self._append_sample(models, prior, sample_model)

            if not has_all_obs_symbols:
                models = [
                    m.submodel(states=None,
                               obs=model.prior.observation_symbols)
                    for m in models
                ]

            model.samples = models
        finally:
            del temp_alpha

        # set new model
        self._model = model

        return self
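
A usage sketch through deeptime's public API (synthetic data and illustrative parameters; BayesianHMM.default builds the initial HMM whose posterior this fit then samples):

import numpy as np
from deeptime.markov.hmm import BayesianHMM

dtrajs = [np.random.randint(0, 3, size=500)]
estimator = BayesianHMM.default(dtrajs, n_hidden_states=2, lagtime=1,
                                n_samples=20)
posterior = estimator.fit(dtrajs, n_burn_in=10, n_thin=2).fetch_model()
print(len(posterior.samples))  # 20 sampled HMMs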
Example #5
    def _estimate(self, dtrajs):
        # ensure right format
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # CHECK LAG
        trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
        if self.lag >= _np.max(trajlengths):
            raise ValueError('Illegal lag time ' + str(self.lag) +
                             ' exceeds longest trajectory length')
        if self.lag > _np.mean(trajlengths):
            self.logger.warning(
                'Lag time ' + str(self.lag) +
                ' is on the order of the mean trajectory length ' +
                str(_np.mean(trajlengths)) +
                '. It is recommended that trajectories span at least four '
                'lag times. The HMM might be inaccurate.')

        # EVALUATE STRIDE
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the non-reversible MSM
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs,
                                           lag=self.lag,
                                           reversible=False,
                                           sparse=False,
                                           connectivity='largest',
                                           dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                # because we use non-reversible msm, we want to silence the ImaginaryEigenvalueWarning
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        category=ImaginaryEigenValueWarning,
                        module='deeptime.markov.tools.analysis.dense.decomposition')
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2 * corrtime))

        # LAG AND STRIDE DATA
        from deeptime.markov import compute_dtrajs_effective
        dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs,
                                                         self.lag,
                                                         n_states=-1,
                                                         stride=self.stride)

        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # INIT HMM
        from deeptime.markov.hmm import init
        from pyemma.msm.estimators import MaximumLikelihoodMSM
        from pyemma.msm.estimators import OOMReweightedMSM
        if self.msm_init == 'largest-strong':
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                mode='largest-regularized',
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
        elif self.msm_init == 'all':
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate,
                mode='all-regularized')
        elif isinstance(self.msm_init,
                        (MaximumLikelihoodMSM, OOMReweightedMSM)):  # initial MSM given
            msm = MarkovStateModel(transition_matrix=self.msm_init.P,
                                   count_model=TransitionCountModel(
                                       self.msm_init.count_matrix_active))
            hmm_init = init.discrete.metastable_from_msm(
                msm,
                n_hidden_states=self.nstates,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
            observe_subset = self.msm_init.active_set  # override observe_subset.
        else:
            raise ValueError('Unknown MSM initialization option: ' +
                             str(self.msm_init))

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------

        # run EM
        from deeptime.markov.hmm import MaximumLikelihoodHMM
        hmm_est = MaximumLikelihoodHMM(hmm_init,
                                       lagtime=self.lag,
                                       stride=self.stride,
                                       reversible=self.reversible,
                                       stationary=self.stationary,
                                       accuracy=self.accuracy,
                                       maxit=self.maxit)
        # run
        hmm_est.fit(dtrajs)
        # package in discrete HMM
        self.hmm = hmm_est.fetch_model()

        # get model parameters
        self.initial_distribution = self.hmm.initial_distribution
        transition_matrix = self.hmm.transition_model.transition_matrix
        observation_probabilities = self.hmm.output_probabilities

        # get estimation parameters
        self.likelihoods = self.hmm.likelihoods  # Likelihood history
        self.likelihood = self.likelihoods[-1]
        self.hidden_state_probabilities = self.hmm.state_probabilities  # gamma variables
        self.hidden_state_trajectories = self.hmm.hidden_state_trajectories  # Viterbi path
        self.count_matrix = self.hmm.count_model.count_matrix  # hidden count matrix
        self.initial_count = self.hmm.initial_count  # hidden init count
        self._active_set = _np.arange(self.nstates)

        # TODO: it can happen that we lose states due to striding. Should we lift the output probabilities afterwards?
        # parametrize self
        self._dtrajs_full = dtrajs
        self._dtrajs_lagged = dtrajs_lagged_strided
        self._nstates_obs_full = number_of_states(dtrajs)
        self._nstates_obs = number_of_states(dtrajs_lagged_strided)
        self._observable_set = _np.arange(self._nstates_obs)
        self._dtrajs_obs = dtrajs
        self.set_model_params(P=transition_matrix,
                              pobs=observation_probabilities,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        # TODO: perhaps remove connectivity and just rely on .submodel()?
        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity,
                             inplace=True)
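
In practice this _estimate is reached through PyEMMA's user-facing API; a minimal sketch with synthetic data (parameters are illustrative):

import numpy as np
import pyemma

dtrajs = [np.random.randint(0, 4, size=1000)]
hmm = pyemma.msm.estimate_hidden_markov_model(dtrajs, nstates=2, lag=10)
print(hmm.transition_matrix)                # hidden transition matrix
print(hmm.observation_probabilities.shape)  # (nstates, n_observed_states)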
Example #6
    def _estimate(self, dtrajs):
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
            # memorize the observation state for bhmm and reset
            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
            default_connectivity = self.connectivity
            default_mincount_connectivity = self.mincount_connectivity
            default_observe_nonempty = self.observe_nonempty
            self.connectivity = None
            self.observe_nonempty = False
            self.mincount_connectivity = 0
            self.accuracy = 1e-2  # this is sufficient for an initial guess
            super(BayesianHMSM, self)._estimate(dtrajs)
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
        else:  # if given another initialization, must copy its attributes
            copy_attributes = [
                '_nstates', '_reversible', '_pi', '_observable_set',
                'likelihoods', 'likelihood', 'hidden_state_probabilities',
                'hidden_state_trajectories', 'count_matrix', 'initial_count',
                'initial_distribution', '_active_set'
            ]
            check_user_choices = ['lag', '_nstates']

            # check if nstates and lag are compatible
            for attr in check_user_choices:
                if getattr(self, attr) != getattr(self.init_hmsm, attr):
                    raise UserWarning(
                        'BayesianHMSM cannot be initialized with an init_hmsm that has '
                        'an incompatible lag or nstates.')

            if (len(dtrajs) != len(self.init_hmsm.dtrajs_full)
                    or not all(_np.array_equal(d1, d2) for d1, d2
                               in zip(dtrajs, self.init_hmsm.dtrajs_full))):
                raise NotImplementedError(
                    'Bayesian HMM estimation with init_hmsm is currently only implemented '
                    'if applied to the same data.')

            # TODO: implement more elegant solution to copy-pasting effective stride evaluation from ML HMM.
            # EVALUATE STRIDE
            if self.stride == 'effective':
                # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
                # how many uncorrelated counts we can make
                self.stride = self.lag
                # get a quick estimate from the spectral radius of the non-reversible MSM
                from pyemma.msm import estimate_markov_model
                msm_nr = estimate_markov_model(dtrajs,
                                               lag=self.lag,
                                               reversible=False,
                                               sparse=False,
                                               connectivity='largest',
                                               dt_traj=self.timestep_traj)
                # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
                # estimate of the decorrelation time
                if msm_nr.nstates > self.nstates:
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                    # use the smaller of these two pessimistic estimates
                    self.stride = int(min(self.lag, 2 * corrtime))

            # if the stride differs from the one used for init_hmsm, check that the microstates in the lagged-strided trajs are compatible
            if self.stride != self.init_hmsm.stride:
                from deeptime.markov import compute_dtrajs_effective
                dtrajs_lagged_strided = compute_dtrajs_effective(
                    dtrajs, lagtime=self.lag, n_states=-1, stride=self.stride)
                _nstates_obs = number_of_states(dtrajs_lagged_strided,
                                                only_used=True)
                _nstates_obs_full = number_of_states(dtrajs)

                if _np.setxor1d(_np.concatenate(dtrajs_lagged_strided),
                                _np.concatenate(
                                    self.init_hmsm._dtrajs_lagged)).size != 0:
                    raise UserWarning(
                        'Choice of stride has excluded a different set of microstates than in '
                        'init_hmsm. Set of observed microstates in time-lagged strided trajectories '
                        'must match the one used for init_hmsm estimation.')

                self._dtrajs_full = dtrajs
                self._dtrajs_lagged = dtrajs_lagged_strided
                self._nstates_obs_full = _nstates_obs_full
                self._nstates_obs = _nstates_obs
                self._observable_set = _np.arange(self._nstates_obs)
                self._dtrajs_obs = dtrajs
            else:
                copy_attributes += [
                    '_dtrajs_full', '_dtrajs_lagged', '_nstates_obs_full',
                    '_nstates_obs', '_observable_set', '_dtrajs_obs'
                ]

            # update self with estimates from init_hmsm
            self.__dict__.update({
                k: i
                for k, i in self.init_hmsm.__dict__.items()
                if k in copy_attributes
            })

            # as mentioned in the docstring, take init_hmsm observed set observation probabilities
            self.observe_nonempty = False

            # update HMM Model
            self.update_model_params(
                P=self.init_hmsm.transition_matrix,
                pobs=self.init_hmsm.observation_probabilities,
                dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

        # check if we have a valid initial model
        if self.reversible and not is_connected(self.count_matrix):
            raise NotImplementedError(
                'Encountered disconnected count matrix:\n{count_matrix} '
                'with reversible Bayesian HMM sampler using lag={lag}'
                ' and stride={stride}. Consider using shorter lag, '
                'or shorter stride (to use more of the data), '
                'or using a lower value for mincount_connectivity.'.format(
                    count_matrix=self.count_matrix,
                    lag=self.lag,
                    stride=self.stride))

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        nstates_full = number_of_states(dtrajs)
        if self.nstates_obs < nstates_full:
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * _np.ones(
                (self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            B_init[:, self.observable_set] = _np.maximum(
                eps, self.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = self.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples,
                                    description='Sampling HMSMs',
                                    stage=0)

            from deeptime.util.callbacks import ProgressCallback
            outer_self = self

            class BHMMCallback(ProgressCallback):
                def __call__(self, inc=1, *args, **kw):
                    super().__call__(inc, *args, **kw)
                    outer_self._progress_update(1, stage=0)

            progress = BHMMCallback
        else:
            progress = None

        from deeptime.markov.hmm import BayesianHMM

        if self.init_hmsm is not None:
            hmm_mle = self.init_hmsm.hmm
            estimator = BayesianHMM(
                hmm_mle,
                n_samples=self.nsamples,
                stride=self.stride,
                initial_distribution_prior=self.p0_prior,
                transition_matrix_prior=self.transition_matrix_prior,
                store_hidden=self.store_hidden,
                reversible=self.reversible,
                stationary=self.stationary)
        else:
            estimator = BayesianHMM.default(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                n_samples=self.nsamples,
                stride=self.stride,
                initial_distribution_prior=self.p0_prior,
                transition_matrix_prior=self.transition_matrix_prior,
                store_hidden=self.store_hidden,
                reversible=self.reversible,
                stationary=self.stationary,
                prior_submodel=True,
                separate=self.separate)

        estimator.fit(dtrajs, n_burn_in=0, n_thin=1, progress=progress)
        model = estimator.fetch_model()
        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples
        sample_inp = [(m.transition_model.transition_matrix,
                       m.transition_model.stationary_distribution,
                       m.output_probabilities) for m in model.samples]

        samples = []
        for P, pi, pobs in sample_inp:  # restrict to observable set if necessary
            Bobs = pobs[:, self.observable_set]
            pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(_HMSM(P, pobs, pi=pi, dt_model=self.dt_model))

        # store results
        self.sampled_trajs = [
            model.samples[i].hidden_state_trajectories
            for i in range(self.nsamples)
        ]
        self.update_model_params(samples=samples)

        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'
        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity,
                             inplace=True)
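
Again, the user-facing entry point is simpler than the estimator internals; a hedged sketch with synthetic data and illustrative parameters:

import numpy as np
import pyemma

dtrajs = [np.random.randint(0, 4, size=1000)]
bhmm = pyemma.msm.bayesian_hidden_markov_model(dtrajs, nstates=2, lag=10,
                                               nsamples=50)
# posterior mean and confidence interval of the hidden transition matrix
print(bhmm.sample_mean('transition_matrix'))
print(bhmm.sample_conf('transition_matrix'))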
Example #7
import numpy as np
import scipy.sparse


def count_matrix_coo2_mult(dtrajs,
                           lag,
                           sliding=True,
                           sparse=True,
                           nstates=None):
    r"""Generate a count matrix from a given list discrete trajectories.

    The generated count matrix is a sparse matrix in compressed
    sparse row (CSR) or numpy ndarray format.

    Parameters
    ----------
    dtrajs : list of ndarrays
        discrete trajectories
    lag : int
        Lagtime in trajectory steps
    sliding : bool, optional
        If true the sliding window approach
        is used for transition counting
    sparse : bool, optional
        Whether to return a sparse (True) or a dense (False) matrix
    nstates : int, optional
        Enforce a count-matrix with shape=(nstates, nstates). If there are
        more states in the data, this will lead to an exception.

    Returns
    -------
    C : scipy.sparse.csr_matrix or numpy.ndarray
        The count matrix at the given lag, in scipy compressed sparse row
        or numpy ndarray format.

    """
    # Determine number of states
    if nstates is None:
        from deeptime.markov import number_of_states
        nstates = number_of_states(dtrajs)
    rows = []
    cols = []
    # collect transition index pairs
    for dtraj in dtrajs:
        if dtraj.size > lag:
            if sliding:
                # sliding window: count every transition (t, t + lag)
                rows.append(dtraj[0:-lag])
                cols.append(dtraj[lag:])
            else:
                # lag-spaced sampling: count transitions (0, lag), (lag, 2*lag), ...
                rows.append(dtraj[0:-lag:lag])
                cols.append(dtraj[lag::lag])
    # is there anything?
    if len(rows) == 0:
        raise ValueError('No counts found - lag ' + str(lag) +
                         ' may exceed all trajectory lengths.')
    # feed into one COO matrix
    row = np.concatenate(rows)
    col = np.concatenate(cols)
    data = np.ones(row.size)
    C = scipy.sparse.coo_matrix((data, (row, col)), shape=(nstates, nstates))
    # export to output format
    if sparse:
        return C.tocsr()
    else:
        return C.toarray()
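
A small worked example with a toy trajectory, contrasting the two counting modes (numpy is assumed imported as above):

dtrajs = [np.array([0, 1, 1, 0, 2, 2, 0])]

C_sliding = count_matrix_coo2_mult(dtrajs, lag=2, sliding=True, sparse=False)
C_spaced = count_matrix_coo2_mult(dtrajs, lag=2, sliding=False, sparse=False)
# sliding counts every pair (t, t+2): 5 counts here;
# lag-spaced counts only (0,2), (2,4), (4,6): 3 counts
print(C_sliding.sum(), C_spaced.sum())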