def test_njobs_speedup(self):
    """Check that parallel effective counting matches the serial result and scales.

    Ten random trajectories of 10000 frames each (states drawn from
    ``[0, 100)``) are counted once with the default settings and then with
    ``n_jobs`` in ``(2, 3, 4)``. Each parallel run must finish within
    ``serial_time / n_jobs + 0.5`` seconds and must reproduce the serial
    count matrix up to an absolute tolerance of ``1e-14``.
    """
    artificial_dtraj = [
        np.random.randint(0, 100, size=10000) for _ in range(10)
    ]
    import time

    class timing(object):
        """Context manager storing the elapsed time of its body in ``diff``."""

        def __enter__(self):
            # perf_counter is monotonic and high-resolution, so the measured
            # interval cannot be distorted by system clock adjustments.
            self.start = time.perf_counter()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            self.stop = time.perf_counter()
            self.diff = self.stop - self.start

    lag = 100
    with timing() as serial:
        ceff = effective_count_matrix(artificial_dtraj, lag=lag)
    for n_jobs in (2, 3, 4):
        with timing() as parallel:
            ceff_parallel = effective_count_matrix(artificial_dtraj,
                                                   lag=lag,
                                                   n_jobs=n_jobs)
        # Ideal scaling plus half a second of slack for worker start-up.
        self.assertLess(parallel.diff,
                        serial.diff / n_jobs + 0.5,
                        msg='does not scale for njobs=%s' % n_jobs)
        np.testing.assert_allclose(
            ceff_parallel.toarray(),
            ceff.toarray(),
            atol=1e-14,
            err_msg='different result for njobs=%s' % n_jobs)
 def test_singletraj(self):
     """Compare effective and plain counts for a single long trajectory.

     For lag times 1 and 100 the effective count matrix must have the same
     shape and the same nonzero pattern as the matrix from ``count_matrix``,
     and no entry may exceed the corresponding plain count.
     """
     for lag in (1, 100):
         plain = count_matrix(self.dtraj_long, lag)
         effective = effective_count_matrix(self.dtraj_long, lag)
         assert np.array_equal(effective.shape, plain.shape)
         assert np.array_equal(plain.nonzero(), effective.nonzero())
         assert np.all(effective.toarray() <= plain.toarray())
 def test_multitraj(self):
     """Compare effective and plain counts for several short trajectories.

     For lag times 1 and 2 the effective count matrix must have the same
     shape and the same nonzero pattern as the matrix from ``count_matrix``,
     and no entry may exceed the corresponding plain count.
     """
     dtrajs = [[1, 0, 1, 0, 1, 1, 0, 0, 0, 1], [2], [0, 1, 0, 1]]
     for lag in (1, 2):
         plain = count_matrix(dtrajs, lag)
         effective = effective_count_matrix(dtrajs, lag)
         assert np.array_equal(effective.shape, plain.shape)
         assert np.array_equal(plain.nonzero(), effective.nonzero())
         assert np.all(effective.toarray() <= plain.toarray())
예제 #4
0
    def count(count_mode: str,
              dtrajs: List[np.ndarray],
              lagtime: int,
              sparse: bool = False):
        r""" Builds the transition count matrix of some discrete trajectories for a given counting mode and
        lagtime, returned either dense or sparse.

        Parameters
        ----------
        count_mode : str
            Counting mode, one of "sample", "sliding", "sliding-effective", and "effective".
            See :meth:`__init__` for a more detailed description.
        dtrajs : array_like or list of array_like
            Discrete trajectories, i.e., a list of arrays which contain non-negative integer values. A single ndarray
            can also be passed, which is then treated as if it was a list with that one ndarray in it.
        lagtime : int
            Frame distance at which a potential change of state between two frames is counted as a transition.
        sparse : bool, default=False
            Whether a sparse matrix should be returned instead of a dense one. Sparse matrices can make sense when
            dealing with a lot of states.

        Returns
        -------
        count_matrix : (N, N) ndarray or sparse array
            The computed count matrix. Can be ndarray or sparse depending on whether sparse was set to true or false.
            N is the number of encountered states, i.e., :code:`np.max(dtrajs)+1`.

        Raises
        ------
        ValueError
            If `count_mode` is not one of the supported modes.

        Example
        -------
        >>> dtrajs = [np.array([0,0,1,1]), np.array([0,0,1])]
        >>> count_matrix = TransitionCountEstimator.count(
        ...     count_mode="sliding", dtrajs=dtrajs, lagtime=1, sparse=False
        ... )
        >>> np.testing.assert_equal(count_matrix, np.array([[2, 2], [0, 1]]))
        """
        if count_mode in ('sliding', 'sliding-effective'):
            counts = msmest.count_matrix(dtrajs, lagtime, sliding=True, sparse_return=sparse)
            if count_mode == 'sliding-effective':
                # sliding-window counts are scaled down by the lagtime, in place
                counts /= lagtime
            return counts

        if count_mode == 'sample':
            return msmest.count_matrix(dtrajs, lagtime, sliding=False, sparse_return=sparse)

        if count_mode == 'effective':
            counts = msmest.effective_count_matrix(dtrajs, lagtime)
            # densify only when a dense result was requested and a sparse one came back
            if not sparse and issparse(counts):
                return counts.toarray()
            return counts

        raise ValueError('Count mode {} is unknown.'.format(count_mode))
    def test_compare_with_old_impl(self):
        """Compare effective counts against stored reference values.

        The serial run and the ``n_jobs=2`` run on the double-well test
        trajectory (lag 10, row averaging, ``mact=1.0``) must both match the
        stored reference matrix.
        """
        # reference generated with v1.1@ from
        # pyemma.datasets.load_2well_discrete().dtraj_T100K_dt10_n6good
        expected = np.array([
            [2.21353316e+04, 2.13659736e+03, 4.63558176e+02,
             1.56043628e+02, 3.88680098e+01, 1.14317676e+01],
            [1.84456322e+03, 3.74107190e+02, 1.79811199e+02,
             9.29024530e+01, 5.59412620e+01, 2.59727288e+01],
            [3.45678646e+02, 1.42148228e+02, 8.19775293e+01,
             7.75353971e+01, 5.73438875e+01, 8.19775293e+01],
            [9.08206988e+01, 6.53466003e+01, 7.82682445e+01,
             7.71606750e+01, 8.38060919e+01, 2.84276171e+02],
            [3.56219388e+01, 3.43186971e+01, 7.64568442e+01,
             1.13816439e+02, 2.51960055e+02, 1.33451946e+03],
            [1.57044024e+01, 3.26168358e+01, 1.12346879e+02,
             4.34287128e+02, 1.88573632e+03, 2.35837843e+04],
        ])
        dtraj = deeptime.data.double_well_discrete().dtraj_n6good
        settings = dict(lag=10, average='row', mact=1.0)

        # Compute both variants first, then compare each to the reference.
        serial = effective_count_matrix(dtraj, **settings).toarray()
        parallel = effective_count_matrix(dtraj, n_jobs=2,
                                          **settings).toarray()

        for computed in (serial, parallel):
            np.testing.assert_allclose(computed, expected,
                                       atol=1e-15, rtol=1e-8)
    def test_multitraj_njobs(self):
        """Check multi-trajectory effective counts for different ``n_jobs`` settings.

        At lag 1 the ``n_jobs=1`` and ``n_jobs=2`` results must be equal to
        each other and consistent with the plain count matrix. At lag 2 the
        result for the default ``n_jobs`` is checked against the plain counts.
        """
        dtrajs = [[1, 0, 1, 0, 1, 1, 0, 0, 0, 1], [2], [0, 1, 0, 1]]

        def check_against_plain(effective, plain):
            # same shape, same nonzero pattern, never larger than plain counts
            assert np.array_equal(effective.shape, plain.shape)
            assert np.array_equal(plain.nonzero(), effective.nonzero())
            assert np.all(effective.toarray() <= plain.toarray())

        # lag 1
        plain = count_matrix(dtrajs, 1)
        single_job = effective_count_matrix(dtrajs, 1, n_jobs=1)
        check_against_plain(single_job, plain)

        two_jobs = effective_count_matrix(dtrajs, 1, n_jobs=2)
        np.testing.assert_equal(two_jobs.toarray(), single_job.toarray())
        np.testing.assert_allclose(two_jobs.toarray(), single_job.toarray())
        check_against_plain(two_jobs, plain)

        # lag 2
        plain = count_matrix(dtrajs, 2)
        default_jobs = effective_count_matrix(dtrajs, 2)
        check_against_plain(default_jobs, plain)
예제 #7
0
    def effective_count_matrix(self):
        """Effective (statistically uncorrelated) transition counts on the active set

        The effective count matrix is computed from the stored full discrete
        trajectories at this estimator's lag time and then restricted to the
        active set. It can be used for Bayesian estimation or error
        perturbation.

        References
        ----------
        [1] Noe, F. (2015) Statistical inefficiency of Markov model count matrices
            http://publications.mi.fu-berlin.de/1699/1/autocorrelation_counts.pdf

        """
        self._check_is_estimated()
        full_counts = effective_count_matrix(self._dtrajs_full, self.lag)
        from pyemma.util.linalg import submatrix
        return submatrix(full_counts, self.active_set)
예제 #8
0
    def fit(self, dtrajs, **kw):
        r""" Estimates a Koopman-reweighted MSM from discrete trajectories.

        Parameters
        ----------
        dtrajs : array_like or list of array_like
            Discrete trajectories.
        **kw
            Ignored; only accepted for API compatibility with sklearn.

        Returns
        -------
        self
            This estimator, with the fitted model stored in ``self._model``.
        """
        trajs = ensure_dtraj_list(dtrajs)
        lag = self.lagtime
        # every trajectory loses its final `lag` frames
        truncated = [traj[:-lag] for traj in trajs]

        # state statistics come from the complete trajectories ...
        state_histogram = count_states(trajs, ignore_negative=True)
        # ... while transitions are counted on the truncated ones only
        # (because of double counting)
        counts = TransitionCountEstimator.count(count_mode=self.count_mode,
                                                dtrajs=truncated,
                                                lagtime=lag,
                                                sparse=self.sparse)
        count_model = TransitionCountModel(
            counts,
            counting_mode=self.count_mode,
            lagtime=lag,
            state_histogram=state_histogram,
        ).submodel_largest(
            connectivity_threshold=self.connectivity_threshold,
            directed=True,
        )

        # Bootstrap estimates (mean and deviation) that feed the rank decision
        if self.rank_mode == 'bootstrap_counts':
            effective_full = effective_count_matrix(truncated, lag)
            effective_active = submatrix(effective_full,
                                         count_model.state_symbols)
            smean, sdev = _impl.bootstrapping_count_matrix(effective_active,
                                                           nbs=self.nbs)
        else:
            smean, sdev = _impl.bootstrapping_dtrajs(
                truncated,
                lag,
                count_model.n_states_full,
                nbs=self.nbs,
                active_set=count_model.state_symbols)

        # Two-step count matrices are estimated on the complete trajectories
        twostep_count_matrices = _impl.twostep_count_matrix(
            trajs, lag, count_model.n_states_full)

        # Rank decision
        rank_ind = _impl.rank_decision(smean, sdev, tol=self.tol_rank)

        # OOM components are computed from a dense full count matrix
        full_counts = count_model.count_matrix_full
        cmat = full_counts.toarray() if issparse(full_counts) else full_counts
        oom_components, omega, sigma, eigenvalues = _impl.oom_components(
            cmat,
            twostep_count_matrices,
            rank_ind=rank_ind,
            lcc=count_model.state_symbols)

        # Transition matrix and the connected set it is defined on
        P, lcc_new = _impl.equilibrium_transition_matrix(
            oom_components, omega, sigma, reversible=self.reversible)

        # Restrict the count model if the re-estimation dropped states
        if lcc_new.size < count_model.n_states:
            assert isinstance(count_model, TransitionCountModel)
            count_model = count_model.submodel(
                count_model.symbols_to_states(lcc_new))
            warnings.warn(
                "Caution: Re-estimation of count matrix resulted in reduction of the active set."
            )

        self._model = KoopmanReweightedMSM(
            transition_matrix=P,
            oom_eigenvalues=eigenvalues,
            oom_evaluator=sigma,
            oom_information_state_vector=omega,
            count_model=count_model,
            oom_components=oom_components,
            twostep_count_matrices=twostep_count_matrices)

        return self
예제 #9
0
    def _estimate(self, dtrajs):
        """ Estimate the OOM-based MSM from discrete trajectories.

        Count statistics are collected on trajectories truncated by ``self.lag``
        frames, the active set is taken as the largest connected set, the rank
        is decided from bootstrapped quantities, and the resulting transition
        matrix is stored via ``set_model_params``.

        Parameters
        ----------
        dtrajs : iterable of sliceable sequences
            Discrete trajectories. NOTE(review): each element is sliced with
            ``traj[:-self.lag]``, so presumably a list of 1-D integer arrays
            - confirm against the caller.

        Returns
        -------
        self
            This estimator, with the estimation results stored as attributes.

        Raises
        ------
        NotImplementedError
            If ``self.core_set`` is not None, or if ``self.connectivity`` is
            not ``'largest'``.
        RuntimeError
            If the active set is empty.
        """

        if self.core_set is not None:
            raise NotImplementedError(
                'Core set MSMs currently not compatible with {}.'.format(
                    self.__class__.__name__))

        # remove last lag steps from dtrajs:
        # NOTE(review): for self.lag == 0 this slice would be empty; assumes lag >= 1 - confirm.
        dtrajs_lag = [traj[:-self.lag] for traj in dtrajs]

        # get trajectory counts. This sets _C_full and _nstates_full
        dtrajstats = self._get_dtraj_stats(dtrajs_lag)
        self._C_full = dtrajstats.count_matrix()  # full count matrix
        self._nstates_full = self._C_full.shape[0]  # number of states

        # set active set. This is at the same time a mapping from active to full
        if self.connectivity == 'largest':
            self.active_set = dtrajstats.largest_connected_set
        else:
            raise NotImplementedError(
                'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
            )

        # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
        # is estimated
        self._is_estimated = True

        # if active set is empty, we can't do anything.
        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate MSM.')

        # active count matrix and number of states
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # computed derived quantities
        # back-mapping from full to lcs; states outside the active set map to -1
        self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # Estimate transition matrix
        # NOTE(review): this repeats the connectivity check above; any other value has
        # already raised there, so the else branch below looks unreachable.
        if self.connectivity == 'largest':
            # Re-sampling:
            if self.rank_Ct == 'bootstrap_counts':
                # bootstrap from the effective count matrix restricted to the active set
                Ceff_full = effective_count_matrix(dtrajs_lag, self.lag)
                from pyemma.util.linalg import submatrix
                Ceff = submatrix(Ceff_full, self.active_set)
                smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs)
            else:
                # NOTE(review): `_active_set` is used here while the surrounding code reads
                # `active_set`; presumably the same data behind a property - confirm.
                smean, sdev = bootstrapping_dtrajs(dtrajs_lag,
                                                   self.lag,
                                                   self._nstates_full,
                                                   nbs=self.nbs,
                                                   active_set=self._active_set)
            # Estimate two step count matrices (uses the untruncated trajectories):
            C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
            # Rank decision:
            rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)
            # Estimate OOM components (needs the full count matrix as a dense array):
            Xi, omega, sigma, l = oom_components(self._C_full.toarray(),
                                                 C2t,
                                                 rank_ind=rank_ind,
                                                 lcc=self.active_set)
            # Compute transition matrix:
            P, lcc_new = equilibrium_transition_matrix(
                Xi, omega, sigma, reversible=self.reversible)
        else:
            raise NotImplementedError(
                'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
            )

        # Update active set and derived quantities:
        # if fewer states remain than before, recompute everything that depends on the active set
        if lcc_new.size < self._nstates:
            self._active_set = self._active_set[lcc_new]
            self._C_active = dtrajstats.count_matrix(subset=self.active_set)
            self._nstates = self._C_active.shape[0]
            self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
            self._full2active[self.active_set] = _np.arange(
                len(self.active_set))
            warnings.warn(
                "Caution: Re-estimation of count matrix resulted in reduction of the active set."
            )

        # continue sparse or dense?
        if not self.sparse:
            # converting count matrices to arrays. As a result the
            # transition matrix and all subsequent properties will be
            # computed using dense arrays and dense matrix algebra.
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()

        # Done. We set our own model parameters, so this estimator is
        # equal to the estimated model.
        self._dtrajs_full = dtrajs  # the untruncated input trajectories
        self._connected_sets = connected_sets(self._C_full)
        self._Xi = Xi
        self._omega = omega
        self._sigma = sigma
        self._eigenvalues_OOM = l
        self._rank_ind = rank_ind
        self._oom_rank = self._sigma.size
        self._C2t = C2t
        self.set_model_params(P=P,
                              pi=None,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self