Exemplo n.º 1
0
 def test_connected_count_matrix(self):
     """Check is_connected on the count-matrix fixtures, directed and undirected."""
     # Directed case: is_connected is called without the ``directed`` keyword.
     self.assertFalse(is_connected(self.C_not_connected))
     self.assertTrue(is_connected(self.C_connected))

     # Undirected case: the fixture that fails the check above must pass here.
     self.assertTrue(is_connected(self.C_not_connected, directed=False))
Exemplo n.º 2
0
    def __init__(self, n_states, initial_model=None, reversible=True, stationary=False,
                 transition_matrix_sampling_steps=1000, p0_prior='mixed', transition_matrix_prior='mixed',
                 output='gaussian', nsamples=100, ):
        """Store the sampling options and build the priors for the initial
        distribution and the transition counts.

        Parameters
        ----------
        n_states : int
            Number of hidden states; fixes the size of both priors.
        initial_model : object, optional
            Initial HMM. A copy (``initial_model.copy()``) is stored when given.
        reversible : bool
            Stored as ``self.reversible``. When True and an initial model is
            given, the model's transition matrix plus the count prior must be
            connected.
        stationary : bool
            Stored as ``self.stationary``.
        transition_matrix_sampling_steps : int
            Stored unchanged.
        p0_prior : None, str or ndarray
            ``None`` or ``'sparse'``: zero prior. ndarray of shape
            ``(n_states,)``: used as given. ``'mixed'``: initial distribution
            of the initial model, or ``None`` without an initial model.
            ``'uniform'``: vector of ones.
        transition_matrix_prior : None, str or ndarray
            Same modes as ``p0_prior`` but for an ``(n_states, n_states)``
            count prior; ``'mixed'`` uses the initial model's transition
            matrix.
        output : str
            Stored unchanged as ``self.output``.
        nsamples : int
            Stored unchanged as ``self.nsamples``.

        Raises
        ------
        ValueError
            If a prior array has the wrong shape or a prior mode is unknown.
        NotImplementedError
            If ``reversible`` is requested for a disconnected initial model.
        """
        super(BayesianHMMSampler, self).__init__()
        self.reversible = reversible
        self.stationary = stationary

        self.n_states = n_states

        # Use user-specified initial model, if provided.
        if initial_model is not None:
            self.initial_model = initial_model.copy()
        else:
            self.initial_model = None

        # prior initial vector
        # The ndarray case is tested first so that an array prior is never
        # compared against a string (an elementwise comparison whose truth
        # value is ambiguous).
        if isinstance(p0_prior, np.ndarray):
            if len(p0_prior.shape) == 1 and p0_prior.shape[0] == self.n_states:
                self.prior_n0 = np.array(p0_prior)
            else:
                raise ValueError(f'initial distribution prior must have dimension {n_states}')
        elif p0_prior is None or p0_prior == 'sparse':
            self.prior_n0 = np.zeros(self.n_states)
        elif p0_prior == 'mixed':
            if initial_model is not None:
                self.prior_n0 = np.array(self.initial_model.initial_distribution)
            else:
                self.prior_n0 = None
        elif p0_prior == 'uniform':
            self.prior_n0 = np.ones(n_states)
        else:
            raise ValueError(f'initial distribution prior mode undefined: {p0_prior}')

        # prior count matrix
        # Every branch below is selected by transition_matrix_prior; the
        # 'sparse' and 'uniform' branches previously tested p0_prior by mistake.
        if isinstance(transition_matrix_prior, np.ndarray):
            if np.array_equal(transition_matrix_prior.shape, (self.n_states, self.n_states)):
                self.prior_C = np.array(transition_matrix_prior)
            else:
                # Without this, a wrongly shaped array left prior_C undefined.
                raise ValueError(f'transition matrix prior must have shape ({n_states}, {n_states})')
        elif transition_matrix_prior is None or transition_matrix_prior == 'sparse':
            self.prior_C = np.zeros((self.n_states, self.n_states))
        elif transition_matrix_prior == 'mixed':
            if initial_model is not None:
                self.prior_C = np.array(self.initial_model.transition_matrix)
            else:
                self.prior_C = None
        elif transition_matrix_prior == 'uniform':
            self.prior_C = np.ones((n_states, n_states))
        else:
            raise ValueError(f'transition matrix prior mode undefined: {transition_matrix_prior}')

        # check if we work with these options
        if (reversible and self.initial_model is not None
                and not msmest.is_connected(self.initial_model.transition_matrix + self.prior_C, directed=True)):
            raise NotImplementedError('Trying to sample disconnected HMM with option reversible:\n '
                                      f'{self.initial_model.transition_matrix}\n'
                                      'Use prior to connect, select connected subset, or use reversible=False.')

        # sampling options
        self.transition_matrix_sampling_steps = transition_matrix_sampling_steps
        self.nsamples = nsamples
        self.output = output
Exemplo n.º 3
0
    def _update_transition_matrix(self, model):
        """Sample a new transition matrix and initial distribution and store both in ``model``.

        The posterior count matrix is the model's count matrix plus
        ``self.prior_C``. With ``self.reversible`` set, a disconnected
        posterior count matrix raises NotImplementedError.
        """
        posterior_counts = model.count_matrix() + self.prior_C

        # Reversible sampling is refused for a disconnected count matrix.
        if self.reversible:
            if not msmest.is_connected(posterior_counts, directed=True):
                raise NotImplementedError('Encountered disconnected count matrix with sampling option reversible:\n '
                                          f'{posterior_counts}\n'
                                          'Use prior to ensure connectivity or use reversible=False.')

        # Keep the sparsity pattern consistent: entries where the estimate and
        # its transpose are both zero are zeroed in the counts as well (the
        # original comment attributes the extra zeros to underflows and calls
        # this a workaround for an msmtools bug).
        estimate = msmest.transition_matrix(posterior_counts, reversible=self.reversible,
                                            maxiter=10000, warn_not_converged=False)
        structural_zeros = np.where(estimate + estimate.T == 0)
        posterior_counts[structural_zeros] = 0

        # Draw one transition matrix sample.
        sampled_P = msmest.sample_tmatrix(posterior_counts, nsample=1,
                                          nsteps=self.transition_matrix_sampling_steps,
                                          reversible=self.reversible)

        # Initial distribution.
        if self.stationary:
            # p0 is taken consistent with the sampled transition matrix.
            p0 = _tmatrix_disconnected.stationary_distribution(sampled_P, C=posterior_counts)
        else:
            init_counts = model.count_init().astype(float)
            posterior_init = init_counts + self.prior_n0
            support = posterior_init > 0
            p0 = np.zeros_like(init_counts)
            # Dirichlet sample restricted to entries with positive posterior count.
            p0[support] = np.random.dirichlet(posterior_init[support])

        # Store the new sample in the model.
        model.update(p0, sampled_P)
Exemplo n.º 4
0
    def _estimate(self, dtrajs):
        """Estimate the MSM from discrete trajectories and store the result on ``self``.

        Counts lagged transitions, selects the active set according to
        ``self.connectivity``, estimates the transition matrix on it and
        passes the result to ``set_model_params``. Returns ``self``.

        Raises RuntimeError for an empty active set, ValueError for reversible
        estimation on a disconnected set in mode ``'none'``, and
        NotImplementedError for any other connectivity mode.
        """
        dtrajs = ensure_dtraj_list(dtrajs)

        # Reuse ready-made statistics, otherwise compute them from the trajectories.
        if isinstance(dtrajs, _DiscreteTrajectoryStats):
            stats = dtrajs
        else:
            stats = _DiscreteTrajectoryStats(dtrajs)
            # Warn when a dense MSM is requested for a large state space.
            if stats.nstates > 4000 and not self.sparse:
                self.logger.warning('Building a dense MSM with ' + str(stats.nstates) + ' states. This can be '
                                    'inefficient or unfeasible in terms of both runtime and memory consumption. '
                                    'Consider using sparse=True.')

        stats.count_lagged(self.lag, count_mode=self.count_mode)

        # Full count matrix and its dimension.
        self._C_full = stats.count_matrix()
        self._nstates_full = self._C_full.shape[0]

        # The active set doubles as the mapping from active to full indices.
        if self.connectivity != 'largest':
            # every other mode: all visited states are active
            self.active_set = stats.visited_set
        elif self.statdist_constraint is None:
            self.active_set = stats.largest_connected_set
        else:
            self.active_set = self._prepare_input_revpi(self._C_full, self.statdist_constraint)

        # FIXME (from the original): flagged as estimated early so that the
        # parameters set above can already be used; acknowledged as unclean.
        self._is_estimated = True

        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate MSM.')

        # Count matrix restricted to the active set.
        self._C_active = stats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # Back-mapping from full to active indices; -1 marks inactive states.
        self._full2active = _np.ones((stats.nstates), dtype=int) * -1
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # Stationary distribution constraint restricted to the active set.
        statdist_active = None
        if self.statdist_constraint is not None:
            statdist_active = self.statdist_constraint[self.active_set]
            statdist_active /= statdist_active.sum()  # renormalize

        # Validate the connectivity mode; both supported modes use the same estimator call.
        if self.connectivity == 'largest':
            pass
        elif self.connectivity == 'none':
            # Reversible estimation needs a connected active set.
            if self.reversible and not msmest.is_connected(self._C_active):
                raise ValueError('Reversible MSM estimation is not possible with connectivity mode "none", '
                                 'because the set of all visited states is not reversibly connected')
        else:
            raise NotImplementedError(
                'MSM estimation with connectivity=%s is currently not implemented.' % self.connectivity)
        P = msmest.transition_matrix(self._C_active, reversible=self.reversible,
                                     mu=statdist_active, maxiter=self.maxiter,
                                     maxerr=self.maxerr)

        # In dense mode all matrices are converted to arrays.
        if not self.sparse:
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()
            P = P.toarray()

        # Store results; this estimator is its own model.
        self._dtrajs_full = dtrajs
        self._connected_sets = msmest.connected_sets(self._C_full)
        self.set_model_params(P=P, pi=statdist_active, reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Exemplo n.º 5
0
    def _estimate(self, dtrajs):
        """Estimate a Bayesian HMSM by sampling around an initial HMM.

        Without ``self.init_hmsm`` the initial model comes from the superclass
        ``_estimate``, run with temporarily relaxed connectivity settings.
        Otherwise the relevant attributes are copied from ``self.init_hmsm``.
        ``bhmm.bayesian_hmm`` then draws ``self.nsamples`` models, which are
        restricted to the observable set and stored through
        ``update_model_params``.

        Returns
        -------
        The result of ``self.submodel(...)`` for the requested connectivity
        and observation subsets.

        Raises
        ------
        NotImplementedError
            If ``self.reversible`` is set and the count matrix is disconnected.
        """
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
            # memorize the observation state for bhmm and reset
            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
            default_connectivity = self.connectivity
            default_mincount_connectivity = self.mincount_connectivity
            default_observe_nonempty = self.observe_nonempty
            self.connectivity = None
            self.observe_nonempty = False
            self.mincount_connectivity = 0
            # NOTE(review): unlike the three settings above, accuracy is never
            # restored. Kept as-is to preserve behavior; confirm it is intended.
            self.accuracy = 1e-2  # this is sufficient for an initial guess
            try:
                super(BayesianHMSM, self)._estimate(dtrajs)
            finally:
                # Restore the user's settings even when the ML estimation fails,
                # so the estimator is not left with the temporary overrides.
                self.connectivity = default_connectivity
                self.mincount_connectivity = default_mincount_connectivity
                self.observe_nonempty = default_observe_nonempty
        else:  # if given another initialization, must copy its attributes
            # TODO: this is too tedious - need to automatize parameter+result copying between estimators.
            self.nstates = self.init_hmsm.nstates
            self.reversible = self.init_hmsm.is_reversible
            self.stationary = self.init_hmsm.stationary
            # trajectories
            self._dtrajs_full = self.init_hmsm._dtrajs_full
            self._dtrajs_lagged = self.init_hmsm._dtrajs_lagged
            self._observable_set = self.init_hmsm._observable_set
            self._dtrajs_obs = self.init_hmsm._dtrajs_obs
            # MLE estimation results
            self.likelihoods = self.init_hmsm.likelihoods  # Likelihood history
            self.likelihood = self.init_hmsm.likelihood
            self.hidden_state_probabilities = self.init_hmsm.hidden_state_probabilities  # gamma variables
            self.hidden_state_trajectories = self.init_hmsm.hidden_state_trajectories  # Viterbi path
            self.count_matrix = self.init_hmsm.count_matrix  # hidden count matrix
            self.initial_count = self.init_hmsm.initial_count  # hidden init count
            self.initial_distribution = self.init_hmsm.initial_distribution
            self._active_set = self.init_hmsm._active_set
            # update HMM Model
            self.update_model_params(
                P=self.init_hmsm.transition_matrix,
                pobs=self.init_hmsm.observation_probabilities,
                dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

        # check if we have a valid initial model
        import msmtools.estimation as msmest
        if self.reversible and not msmest.is_connected(self.count_matrix):
            raise NotImplementedError(
                'Encountered disconnected count matrix:\n ' +
                str(self.count_matrix) +
                'with reversible Bayesian HMM sampler using lag=' +
                str(self.lag) + ' and stride=' + str(self.stride) +
                '. Consider using shorter lag, ' +
                'or shorter stride (to use more of the data), ' +
                'or using a lower value for mincount_connectivity.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        nstates_full = msmest.number_of_states(dtrajs)
        if self.nstates_obs < nstates_full:
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * _np.ones(
                (self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            B_init[:, self.observable_set] = _np.maximum(
                eps, self.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = self.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples,
                                    description='Sampling HMSMs',
                                    stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm
        hmm_mle = discrete_hmm(self.initial_distribution,
                               self.transition_matrix, B_init)

        try:
            sampled_hmm = bayesian_hmm(
                self.discrete_trajectories_lagged,
                hmm_mle,
                nsample=self.nsamples,
                reversible=self.reversible,
                stationary=self.stationary,
                p0_prior=self.p0_prior,
                transition_matrix_prior=self.transition_matrix_prior,
                store_hidden=self.store_hidden,
                call_back=call_back)
        finally:
            # Finish the registered progress stage even if sampling fails.
            if self.show_progress:
                self._progress_force_finish(stage=0)

        # Samples: fetch each sampled HMM once and reuse it below.
        sampled_models = [sampled_hmm.sampled_hmms[i]
                          for i in range(self.nsamples)]
        samples = []
        for hmm in sampled_models:
            # restrict to observable set if necessary
            Bobs = hmm.output_model.output_probabilities[:, self.observable_set]
            pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(
                _HMSM(hmm.transition_matrix,
                      pobs,
                      pi=hmm.stationary_distribution,
                      dt_model=self.dt_model))

        # store results
        self.sampled_trajs = [hmm.hidden_state_trajectories
                              for hmm in sampled_models]
        self.update_model_params(samples=samples)

        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'
        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity)
Exemplo n.º 6
0
    def _estimate(self, dtrajs):
        """Estimate the MSM and store the result on ``self``; returns ``self``.

        Builds the full and active count matrices from the trajectory
        statistics, optionally expands a core-set stationary distribution
        constraint to the full state space, estimates the transition matrix
        and hands it to ``set_model_params``.

        Raises ValueError when core set and constraint lengths differ or when
        reversible estimation is requested on a disconnected set in mode
        ``'none'``, RuntimeError for an empty active set, and
        NotImplementedError for unsupported connectivity modes.
        """
        # Trajectory statistics provide the full count matrix.
        stats = self._get_dtraj_stats(dtrajs)
        self._C_full = stats.count_matrix()
        self._nstates_full = self._C_full.shape[0]

        # A constraint given on the core set is rewritten onto the full state
        # space (the original notes this is done for compatibility reasons).
        if self.core_set is not None and self.statdist_constraint is not None:
            if len(self.core_set) != len(self.statdist_constraint):
                raise ValueError('Number of core sets and stationary distribution '
                                 'constraints do not match.')

            # TODO (from the original): find a more consistent way of dealing with this
            import copy
            core_constraint = copy.deepcopy(self.statdist_constraint)
            self.statdist_constraint = _np.zeros(self._nstates_full)
            self.statdist_constraint[self.core_set] = core_constraint

        # The active set doubles as the mapping from active to full indices.
        if self.connectivity != 'largest':
            # every other mode: all visited states are active
            self.active_set = stats.visited_set
        elif self.statdist_constraint is None:
            self.active_set = stats.largest_connected_set
        else:
            self.active_set = self._prepare_input_revpi(self._C_full, self.statdist_constraint)

        # FIXME (from the original): flagged as estimated early so that the
        # parameters set above can already be used; acknowledged as unclean.
        self._is_estimated = True

        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate MSM.')

        self._C_active = stats.count_matrix(subset=self.active_set)

        # In dense mode the count matrices are converted to arrays before estimation.
        if not self.sparse:
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()

        self._nstates = self._C_active.shape[0]

        # Back-mapping from full to active indices; -1 marks inactive states.
        self._full2active = _np.ones(stats.nstates, dtype=int) * -1
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # Stationary distribution constraint restricted to the active set.
        statdist_active = None
        if self.statdist_constraint is not None:
            statdist_active = self.statdist_constraint[self.active_set]
            statdist_active /= statdist_active.sum()  # renormalize

        # Ask for the stationary distribution only for unconstrained reversible
        # estimation (the original TODO says the non-reversible estimator does
        # not comply with its own API).
        opt_args = {'return_statdist': True} if statdist_active is None and self.reversible else {}

        # Validate the connectivity mode; both supported modes use the same estimator call.
        if self.connectivity == 'largest':
            pass
        elif self.connectivity == 'none':
            # Reversible estimation needs a connected active set.
            if self.reversible and not msmest.is_connected(self._C_active):
                raise ValueError('Reversible MSM estimation is not possible with connectivity mode "none", '
                                 'because the set of all visited states is not reversibly connected')
        else:
            raise NotImplementedError(
                'MSM estimation with connectivity=%s is currently not implemented.' % self.connectivity)
        estimate = msmest.transition_matrix(self._C_active, reversible=self.reversible,
                                            mu=statdist_active, maxiter=self.maxiter,
                                            maxerr=self.maxerr, **opt_args)

        # A tuple result carries the stationary distribution alongside P.
        if isinstance(estimate, tuple):
            P, statdist_active = estimate
        else:
            P = estimate

        # Store results; this estimator is its own model.
        self._connected_sets = stats.connected_sets
        self.set_model_params(P=P, pi=statdist_active, reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
def _require(condition, message):
    """Raise AssertionError with ``message`` when an internal invariant fails."""
    if not condition:
        raise AssertionError(message)


def calculate_transition_matrix(transitions, topic_labels):
    """Build a coarse-grained topic transition matrix from observed transitions.

    Parameters
    ----------
    transitions : sequence of sequences
        One entry per interview. Each entry holds elements whose first two
        items are the source and target state labels (strings such as
        ``'topic_9_26'``). The sequence is iterated more than once.
    topic_labels : sized container
        Its length fixes the number of columns of the state-to-topic map.
        NOTE(review): entries are deleted from it in place when nodes are
        removed from the coarse-grained matrix; callers see that mutation.

    Returns
    -------
    tuple
        ``(transition, topic_labels)``: the row-normalized coarse-grained
        transition matrix and the (possibly shortened) labels.

    Raises
    ------
    AssertionError
        If an internal consistency check fails (size mismatch, rows not
        summing to one, matrix not connected or not a transition matrix).
    ValueError
        If a state label cannot be mapped onto a topic column.
    """
    # Sorted list of every distinct state label occurring in any transition,
    # collected in a single pass over the data.
    topic_list = sorted({
        state
        for interview in transitions
        for element in interview
        for state in element
    })
    state_index = {state: idx for idx, state in enumerate(topic_list)}

    # Count observed transitions between states.
    transition_matrix = np.zeros([len(topic_list),
                                  len(topic_list)]).astype(float)
    for interview in transitions:
        for element in interview:
            transition_matrix[state_index[element[0]],
                              state_index[element[1]]] += 1

    # Row-normalize to probabilities; rows without counts give 0/0 = NaN,
    # which is replaced by 0 right after.
    with np.errstate(divide='ignore', invalid='ignore'):
        transition_matrix_scaled = (transition_matrix.T /
                                    transition_matrix.sum(axis=1)).T
    transition_matrix_scaled[np.isnan(transition_matrix_scaled)] = 0

    transition_matrix_scaled, removed_nodes = transform_transition_matrix_connected(
        transition_matrix_scaled)

    # NOTE(review): deletion happens in the order returned by the helper. If
    # those are indices into the original matrix in ascending order, earlier
    # deletions shift later ones. Confirm against the helper's contract.
    for element in removed_nodes:
        del topic_list[element]

    _require(len(topic_list) == transition_matrix_scaled.shape[0],
             'state list and transition matrix sizes differ after removing nodes')

    transition_matrix_scaled = transition_matrix_scaled.astype(float)
    transition_matrix_scaled = (
        transition_matrix_scaled /
        transition_matrix_scaled.sum(axis=1, keepdims=1))

    _require(np.allclose(transition_matrix_scaled.sum(axis=1), 1),
             'rows of the scaled transition matrix do not sum to 1')

    # Second pass of the connectivity transform, after renormalization.
    transition_matrix_scaled, removed_nodes = transform_transition_matrix_connected(
        transition_matrix_scaled)
    for element in removed_nodes:
        del topic_list[element]

    _require(len(topic_list) == transition_matrix_scaled.shape[0],
             'state list and transition matrix sizes differ after removing nodes')
    _require(msmtools.analysis.is_connected(transition_matrix_scaled),
             'scaled transition matrix is not connected')
    _require(np.allclose(transition_matrix_scaled.sum(axis=1), 1),
             'rows of the scaled transition matrix do not sum to 1')

    # Map each state onto the topics named in its label: 'topic_a_b' gets
    # weight 1/2 in columns a and b.
    binary_map = np.zeros([len(transition_matrix_scaled), len(topic_labels)])
    for i, label in enumerate(topic_list):
        topic_numbers = label.split('_')[1:]

        for topic_number in topic_numbers:
            try:
                binary_map[i, int(topic_number)] = 1 / len(topic_numbers)
            except (ValueError, IndexError) as err:
                raise ValueError(
                    'cannot map state label {!r} onto {} topic columns'.format(
                        label, len(topic_labels))) from err

    transition = cg_transition_matrix(transition_matrix_scaled, binary_map)

    transition[np.isnan(transition)] = 0

    if not is_connected(transition):
        transition, removed_nodes = transform_transition_matrix_connected(
            transition)
        transition = softmax(transition, axis=1)
        for element in removed_nodes:
            del topic_labels[element]

    transition = (transition / transition.sum(axis=1, keepdims=1))

    _require(np.allclose(transition.sum(axis=1), 1),
             'rows of the coarse-grained transition matrix do not sum to 1')
    _require(msmtools.analysis.is_transition_matrix(transition),
             'coarse-grained matrix is not a valid transition matrix')

    return (transition, topic_labels)
Exemplo n.º 8
0
    def _estimate(self, dtrajs):
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)

        if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
            # memorize the observation state for bhmm and reset
            # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
            default_connectivity = self.connectivity
            default_mincount_connectivity = self.mincount_connectivity
            default_observe_nonempty = self.observe_nonempty
            self.connectivity = None
            self.observe_nonempty = False
            self.mincount_connectivity = 0
            self.accuracy = 1e-2  # this is sufficient for an initial guess
            super(BayesianHMSM, self)._estimate(dtrajs)
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
        else:  # if given another initialization, must copy its attributes
            copy_attributes = ['_nstates', '_reversible', '_pi', '_observable_set', 'likelihoods', 'likelihood',
                               'hidden_state_probabilities', 'hidden_state_trajectories', 'count_matrix',
                               'initial_count', 'initial_distribution', '_active_set']
            check_user_choices = ['lag', '_nstates']

            # check if nstates and lag are compatible
            for attr in check_user_choices:
                if not getattr(self, attr) == getattr(self.init_hmsm, attr):
                    raise UserWarning('BayesianHMSM cannot be initialized with init_hmsm with '
                                      'incompatible lag or nstates.')

            if (len(dtrajs) != len(self.init_hmsm.dtrajs_full) or
                    not all((_np.array_equal(d1, d2) for d1, d2 in zip(dtrajs, self.init_hmsm.dtrajs_full)))):
                raise NotImplementedError('Bayesian HMM estimation with init_hmsm is currently only implemented ' +
                                          'if applied to the same data.')

            # TODO: implement more elegant solution to copy-pasting effective stride evaluation from ML HMM.
            # EVALUATE STRIDE
            if self.stride == 'effective':
                # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
                # how many uncorrelated counts we can make
                self.stride = self.lag
                # get a quick estimate from the spectral radius of the nonreversible
                from pyemma.msm import estimate_markov_model
                msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                               connectivity='largest', dt_traj=self.timestep_traj)
                # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
                # estimate of the decorrelation time
                if msm_nr.nstates > self.nstates:
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                    # use the smaller of these two pessimistic estimates
                    self.stride = int(min(self.lag, 2 * corrtime))

            # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
            if self.stride != self.init_hmsm.stride:
                dtrajs_lagged_strided = _lag_observations(dtrajs, self.lag, stride=self.stride)
                _nstates_obs = _number_of_states(dtrajs_lagged_strided, only_used=True)
                _nstates_obs_full = _number_of_states(dtrajs)

                if _np.setxor1d(_np.concatenate(dtrajs_lagged_strided),
                                 _np.concatenate(self.init_hmsm._dtrajs_lagged)).size != 0:
                    raise UserWarning('Choice of stride has excluded a different set of microstates than in ' +
                                      'init_hmsm. Set of observed microstates in time-lagged strided trajectories ' +
                                      'must match to the one used for init_hmsm estimation.')

                self._dtrajs_full = dtrajs
                self._dtrajs_lagged = dtrajs_lagged_strided
                self._nstates_obs_full = _nstates_obs_full
                self._nstates_obs = _nstates_obs
                self._observable_set = _np.arange(self._nstates_obs)
                self._dtrajs_obs = dtrajs
            else:
                copy_attributes += ['_dtrajs_full', '_dtrajs_lagged', '_nstates_obs_full',
                                    '_nstates_obs', '_observable_set', '_dtrajs_obs']

            # update self with estimates from init_hmsm
            self.__dict__.update(
                {k: i for k, i in self.init_hmsm.__dict__.items() if k in copy_attributes})

            # as mentioned in the docstring, take init_hmsm observed set observation probabilities
            self.observe_nonempty = False

            # update HMM Model
            self.update_model_params(P=self.init_hmsm.transition_matrix, pobs=self.init_hmsm.observation_probabilities,
                                     dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

        # check if we have a valid initial model
        import msmtools.estimation as msmest
        if self.reversible and not msmest.is_connected(self.count_matrix):
            raise NotImplementedError('Encountered disconnected count matrix:\n ' + str(self.count_matrix)
                                      + 'with reversible Bayesian HMM sampler using lag=' + str(self.lag)
                                      + ' and stride=' + str(self.stride) + '. Consider using shorter lag, '
                                      + 'or shorter stride (to use more of the data), '
                                      + 'or using a lower value for mincount_connectivity.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        nstates_full = msmest.number_of_states(dtrajs)
        if self.nstates_obs < nstates_full:
            eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
            # fill active states
            B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
            # renormalize B to make it row-stochastic
            B_init /= B_init.sum(axis=1)[:, None]
        else:
            B_init = self.observation_probabilities

        # HMM sampler
        if self.show_progress:
            self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

            def call_back():
                self._progress_update(1, stage=0)
        else:
            call_back = None

        from bhmm import discrete_hmm, bayesian_hmm

        if self.init_hmsm is not None:
            hmm_mle = self.init_hmsm.hmm
        else:
            hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

        sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                                   reversible=self.reversible, stationary=self.stationary,
                                   p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                                   store_hidden=self.store_hidden, call_back=call_back)

        if self.show_progress:
            self._progress_force_finish(stage=0)

        # Samples
        sample_inp = [(m.transition_matrix, m.stationary_distribution, m.output_probabilities)
                      for m in sampled_hmm.sampled_hmms]

        samples = []
        for P, pi, pobs in sample_inp:  # restrict to observable set if necessary
            Bobs = pobs[:, self.observable_set]
            pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
            samples.append(_HMSM(P, pobs, pi=pi, dt_model=self.dt_model))

        # store results
        self.sampled_trajs = [sampled_hmm.sampled_hmms[i].hidden_state_trajectories for i in range(self.nsamples)]
        self.update_model_params(samples=samples)

        # deal with connectivity
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'
        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset, obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity)
Exemplo n.º 9
0
    def set_model_params(self,
                         P=None,
                         pi=None,
                         reversible=None,
                         dt_model='1 step',
                         neig=None):
        """Validate and store the basic parameters of the model.

        The argument list is meant to name every essential (independent)
        model parameter; derived parameters may be passed as well so they
        do not have to be recomputed.

        Parameters
        ----------
        P : ndarray(n,n)
            Transition matrix. When given it is checked to be a valid,
            connected transition matrix before anything is stored.

        pi : ndarray(n), optional, default=None
            Stationary distribution, if it is already known (for example
            because the estimator computed it).

        reversible : bool, optional, default=None
            Whether P is reversible with respect to its stationary
            distribution. If ``self.reversible`` is still None after the
            parameters were stored, it is determined from P.

        dt_model : str, optional, default='1 step'
            Physical time represented by one model step, written as a
            number, whitespace and a unit. The default '1 step' means that
            no physical unit is attached. Permitted units are (* is an
            arbitrary string):

            |  'fs',  'femtosecond*'
            |  'ps',  'picosecond*'
            |  'ns',  'nanosecond*'
            |  'us',  'microsecond*'
            |  'ms',  'millisecond*'
            |  's',   'second*'

        neig : int or None
            Number of eigenvalues / eigenvectors to keep. With None, a
            sparse P gets 10 and a dense P gets all of them.

        Raises
        ------
        ValueError
            If P is given and is not a transition matrix.
        NotImplementedError
            If P is given and is disconnected.

        """
        import msmtools.analysis as msmana

        have_matrix = P is not None

        # input validation (only possible when a matrix was supplied)
        if have_matrix:
            import msmtools.estimation as msmest
            if not msmana.is_transition_matrix(P):
                raise ValueError('T is not a transition matrix.')
            # TODO: abusing C-connectivity test for T. Either provide separate T-connectivity test or move to a central
            # TODO: location because it's the same code.
            if not msmest.is_connected(P):
                raise NotImplementedError(
                    'Transition matrix T is disconnected. ' +
                    'This is currently not supported in the MSM object.')

        # hand everything over to the generic parameter store
        self.update_model_params(P=P,
                                 pi=pi,
                                 reversible=reversible,
                                 dt_model=dt_model,
                                 neig=neig)

        # make sure the ncv attribute exists, for consistency
        if not hasattr(self, 'ncv'):
            self.ncv = None

        # quantities derived from the stored parameters
        from pyemma.util.units import TimeUnit
        self._timeunit_model = TimeUnit(self.dt_model)

        # everything below is derived from P
        if not have_matrix:
            return

        from scipy.sparse import issparse
        self._nstates = np.shape(P)[0]
        if self.reversible is None:
            self.reversible = msmana.is_reversible(P)
        self.sparse = issparse(P)

        # fill in the default number of eigenvalues when none was requested
        if neig is None:
            self.neig = 10 if self.sparse else self._nstates
        tr = [el for el in window(trajectories)]
        count_matrix = np.zeros(
            (unique.shape[0], unique.shape[0])).astype(float)

        for element in tr:
            count_matrix[element[0],
                         element[1]] = count_matrix[element[0],
                                                    element[1]] + float(1)

        count_matrix = count_matrix + 1e-12
        transition_matrix = (count_matrix /
                             count_matrix.sum(axis=1, keepdims=1))
        assert np.allclose(transition_matrix.sum(axis=1), 1)
        assert msmtools.analysis.is_transition_matrix(transition_matrix)
        assert is_connected(transition_matrix)

        binary_map = (unique / unique.sum(axis=1, keepdims=1))
        new_tra = cg_transition_matrix(transition_matrix, binary_map)
        new_tra[np.isnan(new_tra)] = 0
        new_tra = new_tra + 1e-12
        new_tra = (new_tra / new_tra.sum(axis=1, keepdims=1))
        np.savetxt('transition_matrix' + str(d + 1), new_tra, fmt='%.8f')

        assert np.allclose(new_tra.sum(axis=1), 1)
        mm = train_markov_chain(new_tra)
        stationary_prob = print_stationary_distributions(
            mm, features_df.KeywordLabel.to_list())
        stationary_probs.append(stationary_prob)

    pdb.set_trace()
def estimate_transition_matrix_reversible(C,
                                          Xinit=None,
                                          maxiter=1000000,
                                          maxerr=1e-8,
                                          return_statdist=False,
                                          return_conv=False,
                                          warn_not_converged=True):
    """
    Iteratively estimate a maximum likelihood reversible transition matrix.

    The iteration equation implemented here is:
        x_ij = (c_ij + c_ji) / ((c_i / x_i) + (c_j / x_j))
    followed by a renormalization of X; the transition matrix is then
    T_ij = x_ij / x_i. Please note that there is a better (=faster)
    iteration that has been described in Prinz et al, J. Chem. Phys. 134,
    p. 174105 (2011). We should implement that too.

    Parameters
    ----------
    C : ndarray (n,n)
        count matrix. Must be connected, otherwise a ValueError is raised.
    Xinit : ndarray (n,n), default=None
        initial value for the matrix of absolute transition probabilities.
        If None, the initial X is obtained from C by the module's
        initialization helper. If given, the array is updated in place.
    maxiter : int, default=1000000
        upper bound on the iteration counter; the loop stops once the
        counter reaches maxiter - 1 even if convergence was not reached.
    maxerr : float, default=1e-8
        convergence tolerance. The iteration stops as soon as the change
        between two successive row-sum vectors of X (x_i = sum_k x_ik), as
        measured by the module's relative-error helper, drops below this
        value.
    return_statdist : bool, default=False
        If set to true, the stationary distribution is also returned
    return_conv : bool, default=False
        If set to true, the likelihood history and the pi_change history
        are returned.
    warn_not_converged : bool, default=True
        Issue a NotConvergedWarning if the iteration did not converge.

    Returns
    -------
    T or (T,pi) or (T,lhist,pi_changes) or (T,pi,lhist,pi_changes)
    T : ndarray (n,n)
        transition matrix. This is the only return for
        return_statdist = False, return_conv = False
    (pi) : ndarray (n)
        stationary distribution (row sums of the final X). Only returned
        if return_statdist = True
    (lhist) : ndarray (k)
        likelihood history. Has the length of the number of iterations
        needed. Only returned if return_conv = True
    (pi_changes) : ndarray (k)
        history of the change in pi between successive iterations. Has the
        length of the number of iterations needed.
        Only returned if return_conv = True

    Raises
    ------
    ValueError
        If the count matrix C is not connected.
    """
    from msmtools.estimation import is_connected
    from msmtools.estimation import log_likelihood
    # check input. The exception has to be raised, not merely constructed,
    # otherwise a disconnected C silently enters the iteration.
    if not is_connected(C):
        raise ValueError('Count matrix is not fully connected. ' +
                         'Need fully connected count matrix for ' +
                         'reversible transition matrix estimation.')
    converged = False
    n = np.shape(C)[0]
    # initialization
    C2 = C + C.T  # reversibly counted matrix
    nz = np.nonzero(C2)  # only these entries of X are ever updated
    csum = np.sum(C, axis=1)  # row sums C
    X = Xinit
    if X is None:
        X = __initX(C)  # initial X
    xsum = np.sum(X, axis=1)  # row sums x
    D = np.zeros((n, n))  # helper matrix
    T = np.zeros((n, n))  # transition matrix
    # if convergence history requested, initialize variables
    if return_conv:
        diffs = np.zeros(maxiter)
        # likelihood
        lhist = np.zeros(maxiter)
        T = X / xsum[:, np.newaxis]
        lhist[0] = log_likelihood(C, T)
    # iteration
    i = 1
    while (i < maxiter - 1) and (not converged):
        # c_i / x_i
        c_over_x = csum / xsum
        # d_ij = (c_i/x_i) + (c_j/x_j)
        D[:] = c_over_x[:, np.newaxis]
        D += c_over_x
        # update estimate
        X[nz] = C2[nz] / D[nz]
        X[nz] /= np.sum(X[nz])  # renormalize
        xsumnew = np.sum(X, axis=1)
        # compute difference in pi
        diff = __relative_error(xsum, xsumnew)
        # update pi
        xsum = xsumnew
        # any convergence history wanted?
        if return_conv:
            # update T and likelihood
            T = X / xsum[:, np.newaxis]
            lhist[i] = log_likelihood(C, T)
            diffs[i] = diff
        # converged?
        converged = (diff < maxerr)
        i += 1
    # finalize and return
    T = X / xsum[:, np.newaxis]
    if warn_not_converged and not converged:
        warnings.warn(
            "Reversible transition matrix estimation didn't converge.",
            msmtools.util.exceptions.NotConvergedWarning)
    if return_statdist and return_conv:
        return (T, xsum, lhist[0:i], diffs[0:i])
    if return_statdist:
        return (T, xsum)
    if return_conv:
        return (T, lhist[0:i], diffs[0:i])
    return T  # else just return T
Exemplo n.º 12
0
    def _estimate(self, dtrajs):
        """Estimate the MSM from discrete trajectories and store it on self.

        Parameters
        ----------
        dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`pyemma.msm.util.dtraj_states.DiscreteTrajectoryStats`
            discrete trajectories, stored as integer ndarrays (arbitrary size)
            or a single ndarray for only one trajectory.

        Returns
        -------
        MSM : :class:`pyemma.msm.EstimatedMSM` or :class:`pyemma.msm.MSM`
            This estimator itself, with the model parameters set.

        Raises
        ------
        ValueError
            If connectivity is 'none', a reversible estimate is requested
            and the active count matrix is not connected.
        NotImplementedError
            If the connectivity mode is neither 'largest' nor 'none'.

        """
        # ensure right format
        dtrajs = ensure_dtraj_list(dtrajs)
        # harvest discrete statistics
        if isinstance(dtrajs, _DiscreteTrajectoryStats):
            dtrajstats = dtrajs
        else:
            # compute and store discrete trajectory statistics
            dtrajstats = _DiscreteTrajectoryStats(dtrajs)
            # check if this MSM seems too large to be dense
            if dtrajstats.nstates > 4000 and not self.sparse:
                self.logger.warn(
                    'Building a dense MSM with ' + str(dtrajstats.nstates) +
                    ' states. This can be '
                    'inefficient or unfeasible in terms of both runtime and memory consumption. '
                    'Consider using sparse=True.')

        # count lagged
        dtrajstats.count_lagged(self.lag, count_mode=self.count_mode)

        # full count matrix and number of states
        self._C_full = dtrajstats.count_matrix()
        self._nstates_full = self._C_full.shape[0]

        # set active set. This is at the same time a mapping from active to full
        if self.connectivity == 'largest':
            if self.statdist_constraint is None:
                # statdist not given - full connectivity on all states
                self.active_set = dtrajstats.largest_connected_set
            else:
                # statdist given - simple connectivity on all nonzero probability states
                nz = _np.nonzero(self.statdist_constraint)[0]
                Cnz = dtrajstats.count_matrix(subset=nz)
                self.active_set = nz[msmest.largest_connected_set(
                    Cnz, directed=False)]
        else:
            # for 'None' and 'all' all visited states are active
            self.active_set = dtrajstats.visited_set

        # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
        # is estimated
        self._is_estimated = True

        # active count matrix and number of states
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # computed derived quantities
        # back-mapping from full to active set; -1 marks states outside of it
        self._full2active = _np.full(dtrajstats.nstates, -1, dtype=int)
        self._full2active[self.active_set] = _np.arange(
            len(self.active_set), dtype=int)

        # restrict stationary distribution to active set
        if self.statdist_constraint is None:
            statdist_active = None
        else:
            statdist_active = self.statdist_constraint[self.active_set]
            statdist_active /= statdist_active.sum()  # renormalize

        # Estimate transition matrix
        if self.connectivity not in ('largest', 'none'):
            # report the offending value itself, not the attribute name
            raise NotImplementedError(
                'MSM estimation with connectivity=\'{}\' is currently not implemented.'
                .format(self.connectivity))
        if (self.connectivity == 'none' and self.reversible
                and not msmest.is_connected(self._C_active)):
            # reversible mode only possible if active set is connected
            # - in this case all visited states are connected and thus
            # this mode is identical to 'largest'
            raise ValueError(
                'Reversible MSM estimation is not possible with connectivity mode \'none\', '
                +
                'because the set of all visited states is not reversibly connected'
            )
        P = msmest.transition_matrix(self._C_active,
                                     reversible=self.reversible,
                                     mu=statdist_active,
                                     maxiter=self.maxiter,
                                     maxerr=self.maxerr)

        # continue sparse or dense?
        if not self.sparse:
            # converting count matrices to arrays. As a result the
            # transition matrix and all subsequent properties will be
            # computed using dense arrays and dense matrix algebra.
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()
            P = P.toarray()

        # Done. We set our own model parameters, so this estimator is
        # equal to the estimated model.
        self._dtrajs_full = dtrajs
        self._connected_sets = msmest.connected_sets(self._C_full)
        self.set_model_params(P=P,
                              pi=statdist_active,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Exemplo n.º 13
0
 def is_weakly_connected(self):
     """ Tell whether the HMM transition matrix is weakly connected.

     The stored transition matrix ``self._Tij`` is passed to the
     connectivity test with ``directed=False``.
     """
     transition_matrix = self._Tij
     weakly_connected = msmest.is_connected(transition_matrix, directed=False)
     return weakly_connected
Exemplo n.º 14
0
 def is_strongly_connected(self):
     """ Tell whether the HMM transition matrix is strongly connected.

     The stored transition matrix ``self._Tij`` is passed to the
     connectivity test with ``directed=True``.
     """
     transition_matrix = self._Tij
     strongly_connected = msmest.is_connected(transition_matrix, directed=True)
     return strongly_connected