Exemplo n.º 1
0
 def test_2state_nonrev_step(self):
     obs = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int)
     mle = bhmm.estimate_hmm([obs], nstates=2, lag=1)
     sampled = bhmm.bayesian_hmm([obs], mle, reversible=False, nsample=2000,
                                 p0_prior='mixed', transition_matrix_prior='mixed')
     assert np.all(sampled.transition_matrix_std[0] > 0)
     assert np.max(np.abs(sampled.transition_matrix_std[1])) < 1e-3
Exemplo n.º 2
0
 def test_no_except(self):
     obs = [
         np.array([0, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0],
                  dtype=int),
         np.array([0, 0, 0, 0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 0, 0],
                  dtype=int)
     ]
     lag = 1
     nstates = 2
     nsamples = 2
     hmm_lag10 = bhmm.estimate_hmm(obs, nstates, lag=lag, output='discrete')
     # BHMM
     sampled_hmm_lag10 = bhmm.bayesian_hmm(obs[::lag],
                                           hmm_lag10,
                                           nsample=nsamples)
Exemplo n.º 3
0
    def setUpClass(cls):
        # load observations
        testfile = abspath(join(abspath(__file__), pardir))
        testfile = join(testfile, 'data')
        testfile = join(testfile, '2well_traj_100K.dat')
        obs = np.loadtxt(testfile, dtype=int)

        # don't print
        bhmm.config.verbose = False
        # hidden states
        cls.nstates = 2
        # samples
        cls.nsamples = 100

        # EM with lag 10
        lag = 10
        cls.hmm_lag10 = bhmm.estimate_hmm([obs], cls.nstates, lag=lag, output='discrete')
        # BHMM
        cls.sampled_hmm_lag10 = bhmm.bayesian_hmm([obs[::lag]], cls.hmm_lag10, nsample=cls.nsamples)
Exemplo n.º 4
0
    def _estimate(self, dtrajs):
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
            # memorize the observation state for bhmm and reset
            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
            default_connectivity = self.connectivity
            default_mincount_connectivity = self.mincount_connectivity
            default_observe_nonempty = self.observe_nonempty
            self.connectivity = None
            self.observe_nonempty = False
            self.mincount_connectivity = 0
            self.accuracy = 1e-2  # this is sufficient for an initial guess
            super(BayesianHMSM, self)._estimate(dtrajs)
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
        else:  # if given another initialization, must copy its attributes
            # TODO: this is too tedious - need to automatize parameter+result copying between estimators.
            self.nstates = self.init_hmsm.nstates
            self.reversible = self.init_hmsm.is_reversible
            self.stationary = self.init_hmsm.stationary
            # trajectories
            self._dtrajs_full = self.init_hmsm._dtrajs_full
            self._dtrajs_lagged = self.init_hmsm._dtrajs_lagged
            self._observable_set = self.init_hmsm._observable_set
            self._dtrajs_obs = self.init_hmsm._dtrajs_obs
            # MLE estimation results
            self.likelihoods = self.init_hmsm.likelihoods  # Likelihood history
            self.likelihood = self.init_hmsm.likelihood
            self.hidden_state_probabilities = self.init_hmsm.hidden_state_probabilities  # gamma variables
            self.hidden_state_trajectories = self.init_hmsm.hidden_state_trajectories  # Viterbi path
            self.count_matrix = self.init_hmsm.count_matrix  # hidden count matrix
            self.initial_count = self.init_hmsm.initial_count  # hidden init count
            self.initial_distribution = self.init_hmsm.initial_distribution
            self._active_set = self.init_hmsm._active_set
            # update HMM Model
            self.update_model_params(
                P=self.init_hmsm.transition_matrix,
                pobs=self.init_hmsm.observation_probabilities,
                dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

        # check if we have a valid initial model
        import msmtools.estimation as msmest
        if self.reversible and not msmest.is_connected(self.count_matrix):
            raise NotImplementedError(
                'Encountered disconnected count matrix:\n ' +
                str(self.count_matrix) +
                'with reversible Bayesian HMM sampler using lag=' +
                str(self.lag) + ' and stride=' + str(self.stride) +
                '. Consider using shorter lag, ' +
                'or shorter stride (to use more of the data), ' +
                'or using a lower value for mincount_connectivity.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        import msmtools.estimation as msmest
        nstates_full = msmest.number_of_states(dtrajs)
        if self.nstates_obs < nstates_full:
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * _np.ones(
                (self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            B_init[:, self.observable_set] = _np.maximum(
                eps, self.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = self.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples,
                                    description='Sampling HMSMs',
                                    stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm
        hmm_mle = discrete_hmm(self.initial_distribution,
                               self.transition_matrix, B_init)

        sampled_hmm = bayesian_hmm(
            self.discrete_trajectories_lagged,
            hmm_mle,
            nsample=self.nsamples,
            reversible=self.reversible,
            stationary=self.stationary,
            p0_prior=self.p0_prior,
            transition_matrix_prior=self.transition_matrix_prior,
            store_hidden=self.store_hidden,
            call_back=call_back)

        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples
        sample_Ps = [
            sampled_hmm.sampled_hmms[i].transition_matrix
            for i in range(self.nsamples)
        ]
        sample_pis = [
            sampled_hmm.sampled_hmms[i].stationary_distribution
            for i in range(self.nsamples)
        ]
        sample_pobs = [
            sampled_hmm.sampled_hmms[i].output_model.output_probabilities
            for i in range(self.nsamples)
        ]
        samples = []
        for i in range(
                self.nsamples):  # restrict to observable set if necessary
            Bobs = sample_pobs[i][:, self.observable_set]
            sample_pobs[i] = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(
                _HMSM(sample_Ps[i],
                      sample_pobs[i],
                      pi=sample_pis[i],
                      dt_model=self.dt_model))

        # store results
        self.sampled_trajs = [
            sampled_hmm.sampled_hmms[i].hidden_state_trajectories
            for i in range(self.nsamples)
        ]
        self.update_model_params(samples=samples)

        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'
        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity)
Exemplo n.º 5
0
    def _estimate(self, dtrajs):
        """

        Return
        ------
        hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
            Estimated Hidden Markov state model

        """
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        # if no initial MSM is given, estimate it now
        if self.init_hmsm is None:
            # estimate with store_data=True, because we need an EstimatedHMSM
            hmsm_estimator = _MaximumLikelihoodHMSM(
                lag=self.lag,
                stride=self.stride,
                nstates=self.nstates,
                reversible=self.reversible,
                connectivity=self.connectivity,
                observe_active=self.observe_active,
                dt_traj=self.dt_traj)
            init_hmsm = hmsm_estimator.estimate(
                dtrajs)  # estimate with lagged trajectories
        else:
            # check input
            assert isinstance(
                self.init_hmsm,
                _EstimatedHMSM), 'hmsm must be of type EstimatedHMSM'
            init_hmsm = self.init_hmsm
            self.nstates = init_hmsm.nstates
            self.reversible = init_hmsm.is_reversible

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler
        if self.observe_active:
            import msmtools.estimation as msmest
            nstates_full = msmest.number_of_states(dtrajs)
            # pobs = _np.zeros((init_hmsm.nstates, nstates_full))  # currently unused because that produces zero cols
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            pobs = eps * _np.ones(
                (self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            pobs[:, init_hmsm.observable_set] = _np.maximum(
                eps, init_hmsm.observation_probabilities)
            # renormalize B to make it row-stochastic
            pobs /= pobs.sum(axis=1)[:, None]
        else:
            pobs = init_hmsm.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples,
                                    description='Sampling HMSMs',
                                    stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm
        hmm_mle = discrete_hmm(init_hmsm.transition_matrix,
                               pobs,
                               stationary=True,
                               reversible=self.reversible)

        # define prior
        if self.prior == 'sparse':
            self.prior_count_matrix = _np.zeros((self.nstates, self.nstates),
                                                dtype=_np.float64)
        elif self.prior == 'uniform':
            self.prior_count_matrix = _np.ones((self.nstates, self.nstates),
                                               dtype=_np.float64)
        elif self.prior == 'mixed':
            # C0 = _np.dot(_np.diag(init_hmsm.stationary_distribution), init_hmsm.transition_matrix)
            P0 = init_hmsm.transition_matrix
            P0_offdiag = P0 - _np.diag(_np.diag(P0))
            scaling_factor = 1.0 / _np.sum(P0_offdiag, axis=1)
            self.prior_count_matrix = P0 * scaling_factor[:, None]
        else:
            raise ValueError('Unknown prior mode: ' + self.prior)

        sampled_hmm = bayesian_hmm(
            init_hmsm.discrete_trajectories_lagged,
            hmm_mle,
            nsample=self.nsamples,
            transition_matrix_prior=self.prior_count_matrix,
            call_back=call_back)

        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples
        sample_Ps = [
            sampled_hmm.sampled_hmms[i].transition_matrix
            for i in range(self.nsamples)
        ]
        sample_pis = [
            sampled_hmm.sampled_hmms[i].stationary_distribution
            for i in range(self.nsamples)
        ]
        sample_pobs = [
            sampled_hmm.sampled_hmms[i].output_model.output_probabilities
            for i in range(self.nsamples)
        ]
        samples = []
        for i in range(
                self.nsamples):  # restrict to observable set if necessary
            Bobs = sample_pobs[i][:, init_hmsm.observable_set]
            sample_pobs[i] = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(
                _HMSM(sample_Ps[i],
                      sample_pobs[i],
                      pi=sample_pis[i],
                      dt_model=init_hmsm.dt_model))

        # parametrize self
        self._dtrajs_full = dtrajs
        self._observable_set = init_hmsm._observable_set
        self._dtrajs_obs = init_hmsm._dtrajs_obs
        self.set_model_params(samples=samples,
                              P=init_hmsm.transition_matrix,
                              pobs=init_hmsm.observation_probabilities,
                              dt_model=init_hmsm.dt_model)

        return self
Exemplo n.º 6
0
 def test_2state_rev_2step(self):
     obs = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0], dtype=int)
     mle = bhmm.estimate_hmm([obs], nstates=2, lag=1)
     sampled = bhmm.bayesian_hmm([obs], mle, reversible=False, nsample=100,
                                 p0_prior='mixed', transition_matrix_prior='mixed')
     assert np.all(sampled.transition_matrix_std > 0)
Exemplo n.º 7
0
 def test_2state_rev_step(self):
     obs = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1], dtype=int)
     mle = bhmm.estimate_hmm([obs], nstates=2, lag=1)
     # this will generate disconnected count matrices and should fail:
     with self.assertRaises(NotImplementedError):
         bhmm.bayesian_hmm([obs], mle, reversible=True, p0_prior=None, transition_matrix_prior=None)
Exemplo n.º 8
0
    def _estimate(self, dtrajs):
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
            # memorize the observation state for bhmm and reset
            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
            default_connectivity = self.connectivity
            default_mincount_connectivity = self.mincount_connectivity
            default_observe_nonempty = self.observe_nonempty
            self.connectivity = None
            self.observe_nonempty = False
            self.mincount_connectivity = 0
            self.accuracy = 1e-2  # this is sufficient for an initial guess
            super(BayesianHMSM, self)._estimate(dtrajs)
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
        else:  # if given another initialization, must copy its attributes
            copy_attributes = ['_nstates', '_reversible', '_pi', '_observable_set', 'likelihoods', 'likelihood',
                               'hidden_state_probabilities', 'hidden_state_trajectories', 'count_matrix',
                               'initial_count', 'initial_distribution', '_active_set']
            check_user_choices = ['lag', '_nstates']

            # check if nstates and lag are compatible
            for attr in check_user_choices:
                if not getattr(self, attr) == getattr(self.init_hmsm, attr):
                    raise UserWarning('BayesianHMSM cannot be initialized with init_hmsm with '
                                      'incompatible lag or nstates.')

            if (len(dtrajs) != len(self.init_hmsm.dtrajs_full) or
                    not all((_np.array_equal(d1, d2) for d1, d2 in zip(dtrajs, self.init_hmsm.dtrajs_full)))):
                raise NotImplementedError('Bayesian HMM estimation with init_hmsm is currently only implemented ' +
                                          'if applied to the same data.')

            # TODO: implement more elegant solution to copy-pasting effective stride evaluation from ML HMM.
            # EVALUATE STRIDE
            if self.stride == 'effective':
                # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
                # how many uncorrelated counts we can make
                self.stride = self.lag
                # get a quick estimate from the spectral radius of the nonreversible
                from pyemma.msm import estimate_markov_model
                msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                               connectivity='largest', dt_traj=self.timestep_traj)
                # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
                # estimate of the decorrelation time
                if msm_nr.nstates > self.nstates:
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                    # use the smaller of these two pessimistic estimates
                    self.stride = int(min(self.lag, 2 * corrtime))

            # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
            if self.stride != self.init_hmsm.stride:
                dtrajs_lagged_strided = _lag_observations(dtrajs, self.lag, stride=self.stride)
                _nstates_obs = _number_of_states(dtrajs_lagged_strided, only_used=True)
                _nstates_obs_full = _number_of_states(dtrajs)

                if _np.setxor1d(_np.concatenate(dtrajs_lagged_strided),
                                 _np.concatenate(self.init_hmsm._dtrajs_lagged)).size != 0:
                    raise UserWarning('Choice of stride has excluded a different set of microstates than in ' +
                                      'init_hmsm. Set of observed microstates in time-lagged strided trajectories ' +
                                      'must match to the one used for init_hmsm estimation.')

                self._dtrajs_full = dtrajs
                self._dtrajs_lagged = dtrajs_lagged_strided
                self._nstates_obs_full = _nstates_obs_full
                self._nstates_obs = _nstates_obs
                self._observable_set = _np.arange(self._nstates_obs)
                self._dtrajs_obs = dtrajs
            else:
                copy_attributes += ['_dtrajs_full', '_dtrajs_lagged', '_nstates_obs_full',
                                    '_nstates_obs', '_observable_set', '_dtrajs_obs']

            # update self with estimates from init_hmsm
            self.__dict__.update(
                {k: i for k, i in self.init_hmsm.__dict__.items() if k in copy_attributes})

            # as mentioned in the docstring, take init_hmsm observed set observation probabilities
            self.observe_nonempty = False

            # update HMM Model
            self.update_model_params(P=self.init_hmsm.transition_matrix, pobs=self.init_hmsm.observation_probabilities,
                                     dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

        # check if we have a valid initial model
        import msmtools.estimation as msmest
        if self.reversible and not msmest.is_connected(self.count_matrix):
            raise NotImplementedError('Encountered disconnected count matrix:\n ' + str(self.count_matrix)
                                      + 'with reversible Bayesian HMM sampler using lag=' + str(self.lag)
                                      + ' and stride=' + str(self.stride) + '. Consider using shorter lag, '
                                      + 'or shorter stride (to use more of the data), '
                                      + 'or using a lower value for mincount_connectivity.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        nstates_full = msmest.number_of_states(dtrajs)
        if self.nstates_obs < nstates_full:
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = self.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm

        if self.init_hmsm is not None:
            hmm_mle = self.init_hmsm.hmm
        else:
            hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

        sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                                   reversible=self.reversible, stationary=self.stationary,
                                   p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                                   store_hidden=self.store_hidden, call_back=call_back)

        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples
        sample_inp = [(m.transition_matrix, m.stationary_distribution, m.output_probabilities)
                      for m in sampled_hmm.sampled_hmms]

        samples = []
        for P, pi, pobs in sample_inp:  # restrict to observable set if necessary
            Bobs = pobs[:, self.observable_set]
            pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(_HMSM(P, pobs, pi=pi, dt_model=self.dt_model))

        # store results
        self.sampled_trajs = [sampled_hmm.sampled_hmms[i].hidden_state_trajectories for i in range(self.nsamples)]
        self.update_model_params(samples=samples)

        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'
        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset, obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity)