示例#1
0
    def _estimate(self, iterable, **kw):
        """Estimate instantaneous and time-lagged covariances and update the model.

        Parameters
        ----------
        iterable : data source
            Object providing ``dimension()``; it is handed unchanged to
            ``LaggedCovariance.estimate``.
        **kw
            Extra keyword arguments forwarded to ``LaggedCovariance.estimate``.

        Returns
        -------
        ``self.model`` after its parameters were updated and
        ``self._diagonalize()`` was called.

        Raises
        ------
        RuntimeError
            If ``self.dim <= iterable.dimension()`` does not hold.
        """
        covar_options = dict(
            c00=True,
            c0t=True,
            ctt=False,
            remove_data_mean=True,
            reversible=self.reversible,
            lag=self.lag,
            bessel=False,
            stride=self.stride,
            skip=self.skip,
            weights=self.weights,
            ncov_max=self.ncov_max,
        )
        estimator = LaggedCovariance(**covar_options)
        n_input = iterable.dimension()

        if not (self.dim <= n_input):
            message = ("requested more output dimensions (%i) than dimension"
                       " of input data (%i)" % (self.dim, n_input))
            raise RuntimeError(message)

        debug_enabled = self._logger_is_active(self._loglevel_DEBUG)
        if debug_enabled:
            self.logger.debug(
                "Running TICA with tau=%i; Estimating two covariance matrices"
                " with dimension (%i, %i)", self._lag, n_input, n_input)

        estimator.estimate(iterable, chunksize=self.chunksize, **kw)
        self.model.update_model_params(
            mean=estimator.mean,
            cov=estimator.C00_,
            cov_tau=estimator.C0t_)
        self._diagonalize()

        return self.model
示例#2
0
 def _init_covar(self, partial=False):
     """Return the covariance estimator to use for the next estimation.

     Parameters
     ----------
     partial : bool, default=False
         If true, the running estimator stored in ``self._covar`` is created
         on first use, reused afterwards, and registered for serialization.
         If false, a fresh estimator is returned and ``'_covar'`` is removed
         from the serialized fields.

     Returns
     -------
     LaggedCovariance
         Estimator configured for C00, C0t and Ctt with mean removal.
     """
     # in case of partial, we need to store the state of running covar in this estimator.
     args = dict(c00=True,
                 c0t=True,
                 ctt=True,
                 remove_data_mean=True,
                 reversible=False,
                 lag=self.lag,
                 bessel=False,
                 stride=self.stride,
                 skip=self.skip,
                 weights=None,
                 ncov_max=self.ncov_max)
     if partial:
         if getattr(self, '_covar', None) is None:
             self._covar = LaggedCovariance(**args)
         # Remember the running covar for serialization. Registration is
         # idempotent instead of asserted: the field may already be listed
         # (the original assert then crashed), and a reused running covar
         # must be re-registered after a full estimation removed the field.
         if '_covar' not in self.__serialize_fields:
             self.__serialize_fields.append('_covar')
         return self._covar
     else:
         # if the previous estimation was a partial_fit, we might have a running covar object,
         # which we can safely omit now.
         if '_covar' in self.__serialize_fields:
             self.__serialize_fields.remove('_covar')
         return LaggedCovariance(**args)
示例#3
0
    def __init__(self,
                 lag,
                 epsilon=1e-6,
                 stride=1,
                 skip=0,
                 chunksize=None,
                 ncov_max=float('inf')):
        """Set up the covariance estimator and store the estimation parameters.

        Parameters
        ----------
        lag : lag time handed to ``LaggedCovariance`` and ``set_params``
        epsilon : stored via ``set_params``, default 1e-6
        stride : stride handed to ``LaggedCovariance`` and ``set_params``
        skip : skip handed to ``LaggedCovariance`` and ``set_params``
        chunksize : forwarded to the base class constructor
        ncov_max : handed to ``LaggedCovariance`` and ``set_params``
        """
        super(_KoopmanEstimator, self).__init__(chunksize=chunksize)

        # Non-reversible C00/C0t estimation with the data mean removed.
        covar_options = dict(
            c00=True,
            c0t=True,
            remove_data_mean=True,
            reversible=False,
            lag=lag,
            bessel=False,
            stride=stride,
            skip=skip,
            ncov_max=ncov_max,
        )
        self._covar = LaggedCovariance(**covar_options)

        own_params = dict(
            lag=lag,
            epsilon=epsilon,
            stride=stride,
            skip=skip,
            ncov_max=ncov_max,
        )
        self.set_params(**own_params)
        self._estimation_finished = False
示例#4
0
def koopman_est(x, y, t):
	'''
	Description:
		Estimate two Koopman matrices from the time series x and y.

		K_xx is computed from the covariances of chi(x) as C00^{-1} C0t,
		i.e. it approximately satisfies
			chi(x_{s+t}) = (K_xx.T)(chi(x_s))
		K_xy is the least-squares solution of
			gamma(y_{s+t}) = (K_xy.T)(chi(x_s))

	Parameters:
		x,y: the matrices we want estimate the operator for
		t: timelag
	Returns:
		Tuple (K_xx, K_xy) of the two Koopman matrices described above
	'''

	# NOTE(review): the whitened data returned here has never been used; the
	# call is kept only in case `whitening` validates its inputs. Confirm
	# whether the estimates below were meant to use the whitened data.
	whitening(x, y)

	# Evaluate the feature map once and reuse it for both estimates.
	chi_data = chi(x)
	n = len(x)

	chi_0 = chi_data[:n-t]
	gamma_t = gamma(y)[t:]

	K_xy = np.linalg.lstsq(chi_0, gamma_t)[0]

	obj_x = LC(c0t = True, lag = t)
	obj_x.fit(chi_data)

	# Solve C00 K = C0t directly instead of forming the explicit inverse,
	# which is less accurate for ill-conditioned C00.
	K_xx = np.linalg.solve(obj_x.C00_, obj_x.C0t_)

	return (K_xx, K_xy)
示例#5
0
    def partial_fit(self, X):
        """ Incrementally update the running means and covariances with `X`.

        Parameters
        ----------
        X: array, list of arrays, PyEMMA reader
            input data.

        Returns
        -------
        self

        Raises
        ------
        RuntimeError
            If ``self.dim`` is an int and ``self.dim <= dimension of X``
            does not hold.

        Notes
        -----
        The projection matrix is first being calculated upon its first access.
        """
        from pyemma.coordinates import source
        data = source(X)

        if isinstance(self.dim, int):
            n_input = data.dimension()
            if not (self.dim <= n_input):
                message = ("requested more output dimensions (%i) than dimension"
                           " of input data (%i)" % (self.dim, n_input))
                raise RuntimeError(message)

        # Create the running covariance estimator on the first call only.
        if self._covar is None:
            covar_options = dict(
                c00=True,
                c0t=True,
                ctt=True,
                remove_data_mean=True,
                reversible=False,
                lag=self.lag,
                bessel=False,
                stride=self.stride,
                skip=self.skip,
                weights=None,
                ncov_max=self.ncov_max,
            )
            self._covar = LaggedCovariance(**covar_options)
        self._covar.partial_fit(data)

        # TODO: inefficient, fixme
        statistics = dict(
            mean_0=self._covar.mean,
            mean_t=self._covar.mean_tau,
            C00=self._covar.C00_,
            C0t=self._covar.C0t_,
            Ctt=self._covar.Ctt_,
        )
        self._model.update_model_params(**statistics)

        self._estimated = False
        # Keep the running covar in the serialized state.
        if '_covar' not in self.__serialize_fields:
            self.__serialize_fields.append('_covar')
        return self
示例#6
0
    def _estimate(self, iterable, **kw):
        """Estimate means and covariances from `iterable` and diagonalize the model.

        Parameters
        ----------
        iterable : data source
            Object providing ``dimension()``; handed to
            ``LaggedCovariance.estimate``.
        **kw
            Extra keyword arguments forwarded to ``LaggedCovariance.estimate``.

        Returns
        -------
        ``self._model`` after the update.

        Raises
        ------
        RuntimeError
            If ``self.dim`` is an int and ``self.dim <= iterable.dimension()``
            does not hold.
        """
        covar_options = dict(
            c00=True,
            c0t=True,
            ctt=True,
            remove_data_mean=True,
            reversible=False,
            lag=self.lag,
            bessel=False,
            stride=self.stride,
            skip=self.skip,
            weights=None,
            ncov_max=self.ncov_max,
        )
        self._covar = LaggedCovariance(**covar_options)
        n_input = iterable.dimension()

        if isinstance(self.dim, int) and not (self.dim <= n_input):
            message = ("requested more output dimensions (%i) than dimension"
                       " of input data (%i)" % (self.dim, n_input))
            raise RuntimeError(message)

        if self._logger_is_active(self._loglevel_DEBUG):
            debug_message = (
                "Running VAMP with tau=%i; Estimating two covariance matrices"
                " with dimension (%i, %i)" % (self._lag, n_input, n_input))
            self.logger.debug(debug_message)

        self._covar.estimate(iterable, **kw)
        statistics = dict(
            mean_0=self._covar.mean,
            mean_t=self._covar.mean_tau,
            C00=self._covar.C00_,
            C0t=self._covar.C0t_,
            Ctt=self._covar.Ctt_,
        )
        self._model.update_model_params(**statistics)
        self.model._diagonalize()

        # A running covar left over from a previous partial_fit no longer
        # needs to be serialized.
        if '_covar' in self.__serialize_fields:
            self.__serialize_fields.remove('_covar')

        return self._model
示例#7
0
class VAMP(StreamingEstimationTransformer, SerializableMixIn):
    r"""Variational approach for Markov processes (VAMP)"""

    __serialize_version = 0
    __serialize_fields = []

    def describe(self):
        """Return a one-line description containing the lag time and ``dim``."""
        lag_time = self._lag
        max_dim = str(self.dim)
        return "[VAMP, lag = %i; max. output dim. = %s]" % (lag_time, max_dim)

    def __init__(self,
                 lag,
                 dim=None,
                 scaling=None,
                 right=True,
                 epsilon=1e-6,
                 stride=1,
                 skip=0,
                 ncov_max=float('inf')):
        r""" Variational approach for Markov processes (VAMP) [1]_.

          Parameters
          ----------
          lag : int
              lag time
          dim : float or int, default=None
              Number of dimensions to keep:

              * if dim is not set (None) all available ranks are kept:
                  `n_components == min(n_samples, n_uncorrelated_features)`
              * if dim is an integer >= 1, this number specifies the number
                of dimensions to keep.
              * if dim is a float with ``0 < dim < 1``, select the number
                of dimensions such that the amount of kinetic variance
                that needs to be explained is greater than the percentage
                specified by dim.
          scaling : None or string
              Scaling to be applied to the VAMP order parameters upon transformation

              * None: no scaling will be applied, variance of the order parameters is 1
              * 'kinetic map' or 'km': order parameters are scaled by singular value.
                Only the left singular functions induce a kinetic map.
                Therefore scaling='km' is only effective if `right` is False.
          right : boolean
              Whether to compute the right singular functions.
              If `right==True`, `get_output()` will return the right singular
              functions. Otherwise, `get_output()` will return the left singular
              functions.
              Beware that only `frames[tau:, :]` of each trajectory returned
              by `get_output()` contain valid values of the right singular
              functions. Conversely, only `frames[0:-tau, :]` of each
              trajectory returned by `get_output()` contain valid values of
              the left singular functions. The remaining frames might
              possibly be interpreted as some extrapolation.
          epsilon : float
              eigenvalue cutoff. Eigenvalues of :math:`C_{00}` and :math:`C_{11}`
              with norms <= epsilon will be cut off. The remaining number of
              eigenvalues together with the value of `dim` define the size of the output.
          stride: int, optional, default = 1
              Use only every stride-th time step. By default, every time step is used.
          skip : int, default=0
              skip the first initial n frames per trajectory.
          ncov_max : int, default=infinity
              limit the memory usage of the algorithm from [3]_ to an amount that corresponds
              to ncov_max additional copies of each correlation matrix

          Notes
          -----
          VAMP is a method for dimensionality reduction of Markov processes.

          The Koopman operator :math:`\mathcal{K}` is an integral operator
          that describes conditional future expectation values. Let
          :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability
          density of visiting an infinitesimal phase space volume around
          point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase
          space point :math:`\mathbf{x}` was visited at the earlier time
          :math:`t`. Then the action of the Koopman operator on a function
          :math:`f` can be written as follows:

          .. math::

              \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{d}\mathbf{y}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau})\mid\mathbf{x}_{t}=\mathbf{x}\right]

          The Koopman operator is defined without any reference to an
          equilibrium distribution. Therefore it is well-defined in
          situations where the dynamics is irreversible or/and non-stationary
          such that no equilibrium distribution exists.

          If we approximate :math:`f` by a linear superposition of ansatz
          functions :math:`\boldsymbol{\chi}` of the conformational
          degrees of freedom (features), the operator :math:`\mathcal{K}`
          can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`.

          The approximation is computed as follows: From the time-dependent
          input features :math:`\boldsymbol{\chi}(t)`, we compute the mean
          :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from
          all data excluding the last (first) :math:`\tau` steps of every
          trajectory as follows:

          .. math::

              \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t)

              \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t)

          Next, we compute the instantaneous covariance matrices
          :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the
          time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows:

          .. math::

              \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top}

              \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top}

              \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top}

          The Koopman matrix is then computed as follows:

          .. math::

              \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01}

          It can be shown [1]_ that the leading singular functions of the
          half-weighted Koopman matrix

          .. math::

              \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}}

          encode the best reduced dynamical model for the time series.

          The singular functions can be computed by first performing the
          singular value decomposition

          .. math::

              \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime\top}

          and then mapping the input conformation to the left singular
          functions :math:`\boldsymbol{\psi}` and right singular
          functions :math:`\boldsymbol{\phi}` as follows:

          .. math::

              \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]

              \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]


          References
          ----------
          .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
              arXiv:1707.04659v1
          .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
              J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
          .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and pairwise algorithms for
             computing sample variances. Technical Report STAN-CS-79-773, Department of Computer Science, Stanford University.
          """
        StreamingEstimationTransformer.__init__(self)
        # All constructor arguments are stored through set_params.
        params = dict(
            lag=lag,
            dim=dim,
            scaling=scaling,
            right=right,
            epsilon=epsilon,
            stride=stride,
            skip=skip,
            ncov_max=ncov_max,
        )
        self.set_params(**params)

    @property
    def dim(self):
        """ Number of dimensions to keep

        The value is stored on, and read from, ``self.model``.

        * if dim is not set (None) all available ranks are kept:
          `n_components == min(n_samples, n_features)`
        * if dim is an integer >= 1, this number specifies the number
          of dimensions to keep.
        * if dim is a float with ``0 < dim < 1``, select the number
          of dimensions such that the amount of kinetic variance
          that needs to be explained is greater than the percentage
          specified by dim.
        """
        return self.model.dim

    @dim.setter
    def dim(self, value):
        # Forwarded to the model without validation in this class.
        self.model.dim = value

    @property
    def epsilon(self):
        """singular value cutoff.

        Singular values of :math:`C0` with norms <= epsilon will be cut off. The remaining number of
        singular values define the size of the output.

        The value is stored on, and read from, ``self.model``.

        NOTE(review): the constructor docstring describes this parameter as an
        eigenvalue cutoff applied to :math:`C_{00}` and :math:`C_{11}` --
        confirm which wording matches the model implementation.
        """
        return self.model.epsilon

    @epsilon.setter
    def epsilon(self, value):
        # Forwarded to the model without validation in this class.
        self.model.epsilon = value

    @property
    def scaling(self):
        """Scaling to be applied to the VAMP order parameters upon transformation

        The value is stored on, and read from, ``self.model``.

        * None: no scaling will be applied, variance of the order parameters is 1
        * 'kinetic map' or 'km': order parameters are scaled by singular value.
          Only the left singular functions induce a kinetic map.
          Therefore scaling='km' is only effective if `right` is False.
        """
        return self.model.scaling

    @scaling.setter
    def scaling(self, value):
        # Forwarded to the model without validation in this class.
        self.model.scaling = value

    def _init_covar(self, partial=False):
        """Return the covariance estimator to use for the next estimation.

        Parameters
        ----------
        partial : bool, default=False
            If true, the running estimator stored in ``self._covar`` is created
            on first use, reused afterwards, and registered for serialization.
            If false, a fresh estimator is returned and ``'_covar'`` is removed
            from the serialized fields.

        Returns
        -------
        LaggedCovariance
            Estimator configured for C00, C0t and Ctt with mean removal.
        """
        # in case of partial, we need to store the state of running covar in this estimator.
        args = dict(c00=True,
                    c0t=True,
                    ctt=True,
                    remove_data_mean=True,
                    reversible=False,
                    lag=self.lag,
                    bessel=False,
                    stride=self.stride,
                    skip=self.skip,
                    weights=None,
                    ncov_max=self.ncov_max)
        if partial:
            if getattr(self, '_covar', None) is None:
                self._covar = LaggedCovariance(**args)
            # Remember the running covar for serialization. The field list is
            # a class-level list, so another instance may already have
            # registered '_covar'; asserting its absence crashed in that case.
            # The idempotent check also re-registers a reused running covar
            # after a full estimation removed the field.
            if '_covar' not in self.__serialize_fields:
                self.__serialize_fields.append('_covar')
            return self._covar
        else:
            # if the previous estimation was a partial_fit, we might have a running covar object,
            # which we can safely omit now.
            if '_covar' in self.__serialize_fields:
                self.__serialize_fields.remove('_covar')
            return LaggedCovariance(**args)

    @property
    def model(self):
        """Return the ``VAMPModel`` of this estimator, creating it on first access."""
        if hasattr(self, '_model'):
            return self._model
        # Guarantees a model exists even when the estimator is used in
        # sklearn-like fashion.
        self._model = VAMPModel()
        return self._model

    def _estimate(self, iterable, **kw):
        """Estimate means and covariances from `iterable` and diagonalize the model.

        Parameters
        ----------
        iterable : data source
            Object providing ``dimension()``; handed to the covariance
            estimator's ``estimate`` method.
        **kw
            Extra keyword arguments forwarded to that ``estimate`` call.

        Returns
        -------
        ``self.model`` after the update.

        Raises
        ------
        RuntimeError
            If ``self.dim`` is an int and ``self.dim <= iterable.dimension()``
            does not hold.
        """
        estimator = self._init_covar()
        n_input = iterable.dimension()

        if isinstance(self.dim, int) and not (self.dim <= n_input):
            message = ("requested more output dimensions (%i) than dimension"
                       " of input data (%i)" % (self.dim, n_input))
            raise RuntimeError(message)

        if self._logger_is_active(self._loglevel_DEBUG):
            debug_message = (
                "Running VAMP with tau=%i; Estimating two covariance matrices"
                " with dimension (%i, %i)" % (self._lag, n_input, n_input))
            self.logger.debug(debug_message)

        estimator.estimate(iterable, **kw)
        model = self.model
        model.update_model_params(
            mean_0=estimator.mean,
            mean_t=estimator.mean_tau,
            C00=estimator.C00_,
            C0t=estimator.C0t_,
            Ctt=estimator.Ctt_)
        model._diagonalize()
        return model

    def partial_fit(self, X):
        """ Incrementally update the running means and covariances with `X`.

        Parameters
        ----------
        X: array, list of arrays, PyEMMA reader
            input data.

        Returns
        -------
        ``self.model`` with the updated means and covariances.

        Raises
        ------
        RuntimeError
            If ``self.dim`` is an int and ``self.dim <= dimension of X``
            does not hold.

        Notes
        -----
        The projection matrix is first being calculated upon its first access.
        """
        from pyemma.coordinates import source
        data = source(X)

        if isinstance(self.dim, int):
            n_input = data.dimension()
            if not (self.dim <= n_input):
                message = ("requested more output dimensions (%i) than dimension"
                           " of input data (%i)" % (self.dim, n_input))
                raise RuntimeError(message)

        running = self._init_covar(partial=True)
        self._covar = running
        running.partial_fit(data)

        # TODO: inefficient, fixme
        statistics = dict(
            mean_0=running.mean,
            mean_t=running.mean_tau,
            C00=running.C00_,
            C0t=running.C0t_,
            Ctt=running.Ctt_,
        )
        self.model.update_model_params(**statistics)

        self._estimated = False
        return self.model

    def dimension(self):
        """real output dimension after low-rank approximation.

        Delegates to ``self.model.dimension()``.
        """
        return self.model.dimension()

    def _transform_array(self, X):
        r"""Projects the data onto the dominant singular functions.

        Parameters
        ----------
        X : ndarray(n, m)
            the input data

        Returns
        -------
        Y : ndarray(n, d)
            the projected data, cast to ``self.output_type()``; the columns are
            the first ``self.dimension()`` columns of the singular vector matrix.
            If `self.right` is True, projection will be on the right singular
            functions. Otherwise, projection will be on the left singular
            functions.
        """
        # TODO: in principle get_output should not return data for *all* frames!
        # TODO: implement our own iterators? This would also include random access to be complete...
        model = self._model
        if self.right:
            # right singular functions: remove the time-lagged mean
            centered = X - model.mean_t
            basis = model.V
        else:
            # left singular functions: remove the instantaneous mean
            centered = X - model.mean_0
            basis = model.U

        projected = np.dot(centered, basis[:, 0:self.dimension()])
        return projected.astype(self.output_type())

    @property
    def singular_values(self):
        r"""Singular values of the half-weighted Koopman matrix (usually denoted :math:`\sigma`)

        Read from the underlying model (``self.model.singular_values``).

        Returns
        -------
        singular values: 1-D np.array
        """
        return self.model.singular_values

    @property
    def singular_vectors_right(self):
        r"""Transformation matrix that represents the linear map from feature space to the space of right singular functions.

        Read from the underlying model (``self.model.V``).

        Notes
        -----
        Right "singular vectors" V of the VAMP problem (equation 13 in [1]_), columnwise

        Returns
        -------
        vectors: 2-D ndarray
            Coefficients that express the right singular functions in the
            basis of mean-free input features.

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
            arXiv:1707.04659v1
        """
        return self.model.V

    @property
    def singular_vectors_left(self):
        r"""Transformation matrix that represents the linear map from feature space to the space of left singular functions.

        Read from the underlying model (``self.model.U``).

        Notes
        -----
        Left "singular vectors" U of the VAMP problem (equation 13 in [1]_), columnwise

        Returns
        -------
        vectors: 2-D ndarray
            Coefficients that express the left singular functions in the
            basis of mean-free input features.

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
            arXiv:1707.04659v1
        """
        return self.model.U

    @property
    def cumvar(self):
        r"""Cumulative sum of the squared and normalized singular values

        Read from the underlying model (``self.model.cumvar``).

        Returns
        -------
        cumvar: 1D np.array
        """
        return self.model.cumvar

    @property
    def show_progress(self):
        """Whether the running covariance estimator shows progress bars.

        Returns False while no running covariance estimator exists. The
        ``_covar`` attribute is only created by ``partial_fit``, so it is
        looked up with a default instead of being accessed directly, which
        raised ``AttributeError`` when the attribute was missing.
        """
        covar = getattr(self, '_covar', None)
        if covar is None:
            return False
        return covar.show_progress

    @show_progress.setter
    def show_progress(self, value):
        # Silently ignored while no running covariance estimator exists.
        covar = getattr(self, '_covar', None)
        if covar is not None:
            covar.show_progress = value

    def expectation(self,
                    observables,
                    statistics,
                    lag_multiple=1,
                    observables_mean_free=False,
                    statistics_mean_free=False):
        r"""Compute future expectation of observable or covariance using the approximated Koopman operator.

        All arguments are forwarded to ``self.model.expectation``.

        Parameters
        ----------
        observables : np.ndarray((input_dimension, n_observables))
            Coefficients that express one or multiple observables in
            the basis of the input features.

        statistics : np.ndarray((input_dimension, n_statistics)) or None
            Coefficients that express one or multiple statistics in
            the basis of the input features.
            This parameter can be None. In that case, this method
            returns the future expectation value of the observable(s).
            The argument has no default and must be passed explicitly.

        lag_multiple : int
            If > 1, extrapolate to a multiple of the estimator's lag
            time by assuming Markovianity of the approximated Koopman
            operator.

        observables_mean_free : bool, default=False
            If true, coefficients in `observables` refer to the input
            features with feature means removed.
            If false, coefficients in `observables` refer to the
            unmodified input features.

        statistics_mean_free : bool, default=False
            If true, coefficients in `statistics` refer to the input
            features with feature means removed.
            If false, coefficients in `statistics` refer to the
            unmodified input features.

        Returns
        -------
        The value returned by ``self.model.expectation`` for these arguments.

        Notes
        -----
        A "future expectation" of an observable g is the average of g computed
        over a time window that has the same total length as the input data
        from which the Koopman operator was estimated but is shifted
        by lag_multiple*tau time steps into the future (where tau is the lag
        time).

        It is computed with the equation:

        .. math::

            \mathbb{E}[g]_{\rho_{n}}=\mathbf{q}^{T}\mathbf{P}^{n-1}\mathbf{e}_{1}

        where

        .. math::

            P_{ij}=\sigma_{i}\langle\psi_{i},\phi_{j}\rangle_{\rho_{1}}

        and

        .. math::

            q_{i}=\langle g,\phi_{i}\rangle_{\rho_{1}}

        and :math:`\mathbf{e}_{1}` is the first canonical unit vector.


        A model prediction of time-lagged covariances between the
        observable f and the statistic g at a lag-time of lag_multiple*tau
        is computed with the equation:

        .. math::

            \mathrm{cov}[g,\,f;n\tau]=\mathbf{q}^{T}\mathbf{P}^{n-1}\boldsymbol{\Sigma}\mathbf{r}

        where :math:`r_{i}=\langle\psi_{i},f\rangle_{\rho_{0}}` and
        :math:`\boldsymbol{\Sigma}=\mathrm{diag}(\boldsymbol{\sigma})` .
        """
        return self.model.expectation(
            observables,
            statistics,
            lag_multiple=lag_multiple,
            statistics_mean_free=statistics_mean_free,
            observables_mean_free=observables_mean_free)

    def cktest(self,
               n_observables=None,
               observables='phi',
               statistics='psi',
               mlags=10,
               n_jobs=1,
               show_progress=True,
               iterable=None):
        r"""Do the Chapman-Kolmogorov test by computing predictions for higher lag times and by performing estimations at higher lag times.

        Notes
        -----

        This method computes two sets of time-lagged covariance matrices

        * estimates at higher lag times :

          .. math::

              \left\langle \mathbf{K}(n\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}

          where :math:`\rho_{0}` is the empirical distribution implicitly defined
          by all data points from time steps 0 to T-tau in all trajectories,
          :math:`\mathbf{K}(n\tau)` is a rank-reduced Koopman matrix estimated
          at the lag-time n*tau and g and f are some functions of the data.
          Rank-reduction of the Koopman matrix is controlled by the `dim`
          parameter of :func:`vamp <pyemma.coordinates.vamp>`.

        * predictions at higher lag times :

          .. math::

              \left\langle \mathbf{K}^{n}(\tau)g_{i},f_{j}\right\rangle_{\rho_{0}}

          where :math:`\mathbf{K}^{n}` is the n'th power of the rank-reduced
          Koopman matrix contained in self.


        The Chapman-Kolmogorov test is to compare the predictions to the
        estimates.

        Parameters
        ----------
        n_observables : int, optional, default=None
            Limit the number of default observables (and of default statistics)
            to this number. Values larger than ``self.dimension()`` are
            reduced to ``self.dimension()`` with a warning.
            Only used if `observables` is 'phi' or `statistics` is 'psi'.

        observables : np.ndarray((input_dimension, n_observables)) or 'phi'
            Coefficients that express one or multiple observables :math:`g`
            in the basis of the input features.
            This parameter can be 'phi'. In that case, the dominant
            right singular functions of the Koopman operator estimated
            at the smallest lag time are used as default observables.

        statistics : np.ndarray((input_dimension, n_statistics)) or 'psi'
            Coefficients that express one or multiple statistics :math:`f`
            in the basis of the input features.
            This parameter can be 'psi'. In that case, the dominant
            left singular functions of the Koopman operator estimated
            at the smallest lag time are used as default statistics.

        mlags : int or int-array, default=10
            multiples of lag times for testing the Model, e.g. range(10).
            A single int will trigger a range, i.e. mlags=10 maps to
            mlags=range(10).
            Note that you need to be able to do a model prediction for each
            of these lag time multiples, e.g. the value 0 only make sense
            if model.expectation(lag_multiple=0) will work.

        n_jobs : int, default=1
            how many jobs to use during calculation

        show_progress : bool, default=True
            Show progressbars for calculation?

        iterable : any data format that `pyemma.coordinates.vamp()` accepts as input, optional
            If `iterable` is None, the same data source with which VAMP
            was initialized will be used for all estimation.
            Otherwise, all estimates (not predictions) from data will be computed
            from the data contained in `iterable`.

        Returns
        -------
        vckv : :class:`VAMPChapmanKolmogorovValidator <pyemma.coordinates.transform.VAMPChapmanKolmogorovValidator>`
            Contains the estimated and the predicted covariance matrices.
            The object can be plotted with :func:`plot_cktest <pyemma.plots.plot_cktest>` with the option `y01=False`.
        """
        # Never use more default observables than the model has dimensions.
        if n_observables is None:
            n_observables = self.dimension()
        elif n_observables > self.dimension():
            warnings.warn(
                'Selected singular functions as observables but dimension '
                'is lower than requested number of observables.')
            n_observables = self.dimension()

        # Default observables: leading right singular vectors (mean-free basis).
        default_observables = isinstance(observables, str) and observables == 'phi'
        if default_observables:
            observables = self.singular_vectors_right[:, 0:n_observables]
        else:
            ensure_ndarray(observables, ndim=2)
        observables_mean_free = default_observables

        # Default statistics: leading left singular vectors (mean-free basis).
        default_statistics = isinstance(statistics, str) and statistics == 'psi'
        if default_statistics:
            statistics = self.singular_vectors_left[:, 0:n_observables]
        else:
            ensure_ndarray_or_None(statistics, ndim=2)
        statistics_mean_free = default_statistics

        validator = VAMPChapmanKolmogorovValidator(
            self.model,
            self,
            observables,
            statistics,
            observables_mean_free,
            statistics_mean_free,
            mlags=mlags,
            n_jobs=n_jobs,
            show_progress=show_progress)

        data = self.data_producer if iterable is None else iterable
        validator.estimate(data)
        return validator

    def score(self, test_data=None, score_method='VAMP2'):
        """Compute the VAMP score for this model or the cross-validation score between self and a second model estimated from different data.

        Parameters
        ----------
        test_data : any data format that `pyemma.coordinates.vamp()` accepts as input

            If `test_data` is not None, this method computes the cross-validation score
            between self and a VAMP model estimated from `test_data`. It is assumed that
            self was estimated from the "training" data and `test_data` is the test data.
            The score is computed for one realization of self and `test_data`. Estimation
            of the average cross-validation score and partitioning of data into test and
            training part is not performed by this method.

            If `test_data` is None, this method computes the VAMP score for the model
            contained in self. No second estimator is created in that case.

            The model that is estimated from `test_data` will inherit all hyperparameters
            from self.

        score_method : str, optional, default='VAMP2'
            Available scores are based on the variational approach for Markov processes [1]_:

            *  'VAMP1'  Sum of singular values of the half-weighted Koopman matrix [1]_ .
                        If the model is reversible, this is equal to the sum of
                        Koopman matrix eigenvalues, also called Rayleigh quotient [1]_.
            *  'VAMP2'  Sum of squared singular values of the half-weighted Koopman matrix [1]_ .
                        If the model is reversible, this is equal to the kinetic variance [2]_ .
            *  'VAMPE'  Approximation error of the estimated Koopman operator with respect to
                        the true Koopman operator up to an additive constant [1]_ .

        Returns
        -------
        score : float
            If `test_data` is not None, returns the cross-validation VAMP score between
            self and the model estimated from `test_data`. Otherwise return the selected
            VAMP-score of self.

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov processes from time series data.
            arXiv:1707.04659v1
        .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
            J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
        """
        if test_data is None:
            # Self-scoring only needs our own model; cloning the estimator here
            # would be wasted work, so it is deferred to the branch below.
            return self.model.score(None, score_method=score_method)

        from pyemma._ext.sklearn.base import clone as clone_estimator
        est = clone_estimator(self)
        # clone does not invoke our constructor, so we have to explicitly create a new model instance.
        est._model = VAMPModel()
        est.estimate(test_data)
        return self.model.score(est.model, score_method=score_method)
示例#8
0
文件: tica.py 项目: zzmjohn/PyEMMA
    def __init__(self,
                 lag,
                 dim=-1,
                 var_cutoff=0.95,
                 kinetic_map=True,
                 commute_map=False,
                 epsilon=1e-6,
                 stride=1,
                 skip=0,
                 reversible=True,
                 weights=None,
                 ncov_max=float('inf')):
        r"""Set up a time-lagged independent component analysis (TICA) estimator [1]_, [2]_, [3]_.

        Parameters
        ----------
        lag : int
            Lag time.
        dim : int, optional, default -1
            Maximum number of significant independent components used to reduce the dimension of the input
            data. -1 means that all numerically available dimensions (see epsilon) are used unless reduced by
            var_cutoff. Setting dim to a positive value is exclusive with var_cutoff.
        var_cutoff : float in the range [0,1], optional, default 0.95
            Determines the number of output dimensions by including dimensions until their cumulative kinetic
            variance exceeds this fraction. var_cutoff=1.0 means that all numerically available dimensions
            (see epsilon) are used, unless set by dim. Setting var_cutoff smaller than 1.0 is exclusive with dim.
        kinetic_map : bool, optional, default True
            Eigenvectors are scaled by eigenvalues. As a result, Euclidean distances in the transformed data
            approximate kinetic distances [4]_. This is a good choice when the data is further processed by
            clustering.
        commute_map : bool, optional, default False
            Eigenvector_i is scaled by sqrt(timescale_i / 2). As a result, Euclidean distances in the
            transformed data approximate commute distances [5]_.
        epsilon : float
            Eigenvalue norm cutoff. Eigenvalues of C0 with norms <= epsilon are cut off. The remaining number
            of eigenvalues defines the size of the output.
        stride : int, optional, default = 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, default=0
            Skip the first n frames of each trajectory.
        reversible : bool, default=True
            Symmetrize the correlation matrices C_0, C_{\tau}. At the moment, reversible=False is not
            implemented.
        weights : object, optional, default = None
            An object that allows to compute re-weighting factors to estimate equilibrium means and
            correlations from off-equilibrium data. The only requirement is that weights possesses a method
            weights(X) that accepts a trajectory X (np.ndarray(T, n)) and returns a vector of re-weighting
            factors (np.ndarray(T,)).
        ncov_max : int or float, optional, default inf
            Forwarded unchanged to the internal LaggedCovariance estimator.

        Raises
        ------
        ValueError
            If dim is not -1 while var_cutoff differs from its default value, or if kinetic_map and
            commute_map are both requested.
        NotImplementedError
            If reversible is False.

        Notes
        -----
        Given a sequence of multivariate data :math:`X_t`, computes the mean-free
        covariance and time-lagged covariance matrix:

        .. math::

            C_0 &=      (X_t - \mu)^T (X_t - \mu) \\
            C_{\tau} &= (X_t - \mu)^T (X_{t + \tau} - \mu)

        and solves the eigenvalue problem

        .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i,

        where :math:`r_i` are the independent components and :math:`\lambda_i(\tau)` are
        their respective normalized time-autocorrelations. The eigenvalues are
        related to the relaxation timescale by

        .. math:: t_i(\tau) = -\tau / \ln |\lambda_i|.

        When used as a dimension reduction method, the input data is projected
        onto the dominant independent components.

        References
        ----------
        .. [1] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
           Identification of slow molecular order parameters for Markov model construction
           J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
        .. [2] Schwantes C, V S Pande. 2013.
           Improvements in Markov State Model Construction Reveal Many Non-Native Interactions in the Folding of NTL9
           J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
        .. [3] L. Molgedey and H. G. Schuster. 1994.
           Separation of a mixture of independent signals using time delayed correlations
           Phys. Rev. Lett. 72, 3634.
        .. [4] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
            J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
        .. [5] Noe, F., Banisch, R., Clementi, C. 2016. Commute maps: separating slowly-mixing molecular configurations
           for kinetic modeling. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.6b00762

        """
        # Argument checks; their order decides which error is reported first.
        signature_defaults = get_default_args(self.__init__)
        if dim != -1 and var_cutoff != signature_defaults['var_cutoff']:
            raise ValueError(
                'Trying to set both the number of dimension and the subspace variance. Use either or.'
            )
        if kinetic_map and commute_map:
            raise ValueError(
                'Trying to use both kinetic_map and commute_map. Use either or.'
            )
        if not reversible:
            raise NotImplementedError(
                "Reversible=False is currently not implemented.")
        super(TICA, self).__init__()

        # An explicitly requested dimension switches the variance criterion off.
        effective_cutoff = 1.0 if dim > -1 else var_cutoff

        # Running covariance estimator (keyword order kept as in the call it replaces).
        covar_settings = {
            'c00': True,
            'c0t': True,
            'ctt': False,
            'remove_data_mean': True,
            'reversible': reversible,
            'lag': lag,
            'bessel': False,
            'stride': stride,
            'skip': skip,
            'weights': weights,
            'ncov_max': ncov_max,
        }
        self._covar = LaggedCovariance(**covar_settings)

        # empty dummy model instance
        self._model = TICAModel()

        own_params = {
            'lag': lag,
            'dim': dim,
            'var_cutoff': effective_cutoff,
            'kinetic_map': kinetic_map,
            'commute_map': commute_map,
            'epsilon': epsilon,
            'reversible': reversible,
            'stride': stride,
            'skip': skip,
            'weights': weights,
            'ncov_max': ncov_max,
        }
        self.set_params(**own_params)
示例#9
0
文件: tica.py 项目: zzmjohn/PyEMMA
class TICA(StreamingEstimationTransformer):
    r""" Time-lagged independent component analysis (TICA)"""
    def __init__(self,
                 lag,
                 dim=-1,
                 var_cutoff=0.95,
                 kinetic_map=True,
                 commute_map=False,
                 epsilon=1e-6,
                 stride=1,
                 skip=0,
                 reversible=True,
                 weights=None,
                 ncov_max=float('inf')):
        r"""Set up a time-lagged independent component analysis (TICA) estimator [1]_, [2]_, [3]_.

        Parameters
        ----------
        lag : int
            Lag time.
        dim : int, optional, default -1
            Maximum number of significant independent components used to reduce the dimension of the input
            data. -1 means that all numerically available dimensions (see epsilon) are used unless reduced by
            var_cutoff. Setting dim to a positive value is exclusive with var_cutoff.
        var_cutoff : float in the range [0,1], optional, default 0.95
            Determines the number of output dimensions by including dimensions until their cumulative kinetic
            variance exceeds this fraction. var_cutoff=1.0 means that all numerically available dimensions
            (see epsilon) are used, unless set by dim. Setting var_cutoff smaller than 1.0 is exclusive with dim.
        kinetic_map : bool, optional, default True
            Eigenvectors are scaled by eigenvalues. As a result, Euclidean distances in the transformed data
            approximate kinetic distances [4]_. This is a good choice when the data is further processed by
            clustering.
        commute_map : bool, optional, default False
            Eigenvector_i is scaled by sqrt(timescale_i / 2). As a result, Euclidean distances in the
            transformed data approximate commute distances [5]_.
        epsilon : float
            Eigenvalue norm cutoff. Eigenvalues of C0 with norms <= epsilon are cut off. The remaining number
            of eigenvalues defines the size of the output.
        stride : int, optional, default = 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, default=0
            Skip the first n frames of each trajectory.
        reversible : bool, default=True
            Symmetrize the correlation matrices C_0, C_{\tau}. At the moment, reversible=False is not
            implemented.
        weights : object, optional, default = None
            An object that allows to compute re-weighting factors to estimate equilibrium means and
            correlations from off-equilibrium data. The only requirement is that weights possesses a method
            weights(X) that accepts a trajectory X (np.ndarray(T, n)) and returns a vector of re-weighting
            factors (np.ndarray(T,)).
        ncov_max : int or float, optional, default inf
            Forwarded unchanged to the internal LaggedCovariance estimator.

        Raises
        ------
        ValueError
            If dim is not -1 while var_cutoff differs from its default value, or if kinetic_map and
            commute_map are both requested.
        NotImplementedError
            If reversible is False.

        Notes
        -----
        Given a sequence of multivariate data :math:`X_t`, computes the mean-free
        covariance and time-lagged covariance matrix:

        .. math::

            C_0 &=      (X_t - \mu)^T (X_t - \mu) \\
            C_{\tau} &= (X_t - \mu)^T (X_{t + \tau} - \mu)

        and solves the eigenvalue problem

        .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i,

        where :math:`r_i` are the independent components and :math:`\lambda_i(\tau)` are
        their respective normalized time-autocorrelations. The eigenvalues are
        related to the relaxation timescale by

        .. math:: t_i(\tau) = -\tau / \ln |\lambda_i|.

        When used as a dimension reduction method, the input data is projected
        onto the dominant independent components.

        References
        ----------
        .. [1] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
           Identification of slow molecular order parameters for Markov model construction
           J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
        .. [2] Schwantes C, V S Pande. 2013.
           Improvements in Markov State Model Construction Reveal Many Non-Native Interactions in the Folding of NTL9
           J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
        .. [3] L. Molgedey and H. G. Schuster. 1994.
           Separation of a mixture of independent signals using time delayed correlations
           Phys. Rev. Lett. 72, 3634.
        .. [4] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
            J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
        .. [5] Noe, F., Banisch, R., Clementi, C. 2016. Commute maps: separating slowly-mixing molecular configurations
           for kinetic modeling. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.6b00762

        """
        # Argument checks; their order decides which error is reported first.
        signature_defaults = get_default_args(self.__init__)
        if dim != -1 and var_cutoff != signature_defaults['var_cutoff']:
            raise ValueError(
                'Trying to set both the number of dimension and the subspace variance. Use either or.'
            )
        if kinetic_map and commute_map:
            raise ValueError(
                'Trying to use both kinetic_map and commute_map. Use either or.'
            )
        if not reversible:
            raise NotImplementedError(
                "Reversible=False is currently not implemented.")
        super(TICA, self).__init__()

        # An explicitly requested dimension switches the variance criterion off.
        effective_cutoff = 1.0 if dim > -1 else var_cutoff

        # Running covariance estimator (keyword order kept as in the call it replaces).
        covar_settings = {
            'c00': True,
            'c0t': True,
            'ctt': False,
            'remove_data_mean': True,
            'reversible': reversible,
            'lag': lag,
            'bessel': False,
            'stride': stride,
            'skip': skip,
            'weights': weights,
            'ncov_max': ncov_max,
        }
        self._covar = LaggedCovariance(**covar_settings)

        # empty dummy model instance
        self._model = TICAModel()

        own_params = {
            'lag': lag,
            'dim': dim,
            'var_cutoff': effective_cutoff,
            'kinetic_map': kinetic_map,
            'commute_map': commute_map,
            'epsilon': epsilon,
            'reversible': reversible,
            'stride': stride,
            'skip': skip,
            'weights': weights,
            'ncov_max': ncov_max,
        }
        self.set_params(**own_params)

    @property
    def lag(self):
        """ lag time of correlation matrix :math:`C_{\tau}` """
        return self._lag

    @lag.setter
    def lag(self, new_tau):
        self._lag = new_tau

    def describe(self):
        try:
            dim = self.dimension()
        except AttributeError:
            dim = self.dim
        return "[TICA, lag = %i; max. output dim. = %i]" % (self._lag, dim)

    def dimension(self):
        """ output dimension """
        if self.dim > -1:
            return self.dim
        d = None
        if self.dim != -1 and not self._estimated:  # fixed parametrization
            d = self.dim
        elif self._estimated:  # parametrization finished. Dimension is known
            dim = len(self.eigenvalues)
            if self.var_cutoff < 1.0:  # if subspace_variance, reduce the output dimension if needed
                dim = min(dim,
                          np.searchsorted(self.cumvar, self.var_cutoff) + 1)
            d = dim
        elif self.var_cutoff == 1.0:  # We only know that all dimensions are wanted, so return input dim
            d = self.data_producer.dimension()
        else:  # We know nothing. Give up
            raise RuntimeError(
                'Requested dimension, but the dimension depends on the cumulative variance and the '
                'transformer has not yet been estimated. Call estimate() before.'
            )
        return d

    @property
    def mean(self):
        """Mean of the input features, read from the underlying model."""
        model = self._model
        return model.mean

    @property
    @deprecated('please use the "mean" property')
    def mu(self):
        """DEPRECATED: please use the "mean" property"""
        return self.mean

    @mean.setter
    def mean(self, value):
        # The value is stored on the model, not on the estimator itself.
        model = self._model
        model.mean = value

    def estimate(self, X, **kwargs):
        r"""
        Chunk-based parameterization of TICA.

        Iterates over all data and estimates the mean, covariance and time
        lagged covariance. Finally, the generalized eigenvalue problem is
        solved to determine the independent components. The work itself is
        delegated to the parent class; its return value is passed through.
        """
        parent = super(TICA, self)
        return parent.estimate(X, **kwargs)

    def partial_fit(self, X):
        """Incrementally update the covariances and the mean with new data.

        Parameters
        ----------
        X: array, list of arrays, PyEMMA reader
            input data.

        Returns
        -------
        self

        Raises
        ------
        RuntimeError
            If ``dim`` exceeds the dimension of the input data.

        Notes
        -----
        The projection matrix is first being calculated upon its first access.
        """
        from pyemma.coordinates import source
        reader = source(X)

        n_features = reader.dimension()
        fits_input = self.dim <= n_features
        if not fits_input:
            raise RuntimeError(
                "requested more output dimensions (%i) than dimension"
                " of input data (%i)" % (self.dim, n_features))

        covar = self._covar
        covar.partial_fit(reader)
        # TODO: inefficient, fixme (the mean is handed over on every call)
        self._model.update_model_params(mean=covar.mean,
                                        cov=covar.cov,
                                        cov_tau=covar.cov_tau)

        self._used_data = covar._used_data
        # New data arrived, so the transformer is flagged as not estimated.
        self._estimated = False

        return self

    def _estimate(self, iterable, **kw):
        indim = iterable.dimension()

        if not self.dim <= indim:
            raise RuntimeError(
                "requested more output dimensions (%i) than dimension"
                " of input data (%i)" % (self.dim, indim))

        if self._logger_is_active(self._loglevel_DEBUG):
            self._logger.debug(
                "Running TICA with tau=%i; Estimating two covariance matrices"
                " with dimension (%i, %i)" % (self._lag, indim, indim))

        self._covar.estimate(iterable, **kw)
        self._model.update_model_params(mean=self._covar.mean,
                                        cov=self._covar.cov,
                                        cov_tau=self._covar.cov_tau)
        self._diagonalize()

        return self._model

    def _transform_array(self, X):
        r"""Project data onto the dominant independent components.

        Parameters
        ----------
        X : ndarray(n, m)
            the input data

        Returns
        -------
        Y : ndarray(n, d)
            the projected data, where d is ``self.dimension()``; cast to
            ``self.output_type()``
        """
        centered = X - self.mean
        components = self.eigenvectors[:, 0:self.dimension()]
        projected = np.dot(centered, components)
        return projected.astype(self.output_type())

    def _diagonalize(self):
        """Diagonalize the estimated covariances and store the result on the model.

        Calls ``eig_corr`` on the instantaneous and the time-lagged covariance of
        the running covariance estimator, optionally rescales the eigenvectors
        (kinetic map or commute map), computes the normalized cumulative sum of
        the squared absolute eigenvalues and writes ``cumvar``, ``eigenvalues``
        and ``eigenvectors`` into the model. Finally marks the transformer as
        estimated.

        Raises
        ------
        ZeroRankError
            Re-raised with an explanatory message when ``eig_corr`` raises it.
        ValueError
            If both ``kinetic_map`` and ``commute_map`` are set.
        """
        # diagonalize with low rank approximation
        self._logger.debug("diagonalize Cov and Cov_tau.")
        try:
            eigenvalues, eigenvectors = eig_corr(self._covar.cov,
                                                 self._covar.cov_tau,
                                                 self.epsilon,
                                                 sign_maxelement=True)
        except ZeroRankError:
            # replace the low-level error by a message phrased in terms of the input features
            raise ZeroRankError(
                'All input features are constant in all time steps. No dimension would be left after dimension reduction.'
            )
        # the two scalings are mutually exclusive; re-checked here because the
        # attributes may have been changed after construction
        if self.kinetic_map and self.commute_map:
            raise ValueError(
                'Trying to use both kinetic_map and commute_map. Use either or.'
            )
        # NOTE: both scalings below modify the array returned by eig_corr in place.
        if self.kinetic_map:  # scale by eigenvalues
            eigenvectors *= eigenvalues[None, :]
        if self.commute_map:  # scale by (regularized) timescales
            # NOTE(review): this is 1 - lag/log|lambda|, i.e. offset by one compared to the
            # `timescales` property (-lag/log|lambda|) -- confirm that the offset is intended.
            timescales = 1 - self.lag / np.log(np.abs(eigenvalues))
            # dampen timescales smaller than the lag time, as in section 2.5 of ref. [5]
            regularized_timescales = 0.5 * timescales * np.maximum(
                np.tanh(np.pi * ((timescales - self.lag) / self.lag) + 1), 0)

            eigenvectors *= np.sqrt(regularized_timescales / 2)
        self._logger.debug("finished diagonalisation.")

        # compute cumulative variance
        cumvar = np.cumsum(np.abs(eigenvalues)**2)
        # normalize so that the last entry equals 1
        cumvar /= cumvar[-1]

        self._model.update_model_params(cumvar=cumvar,
                                        eigenvalues=eigenvalues,
                                        eigenvectors=eigenvectors)

        self._estimated = True

    @property
    @_lazy_estimation
    def timescales(self):
        r"""Implied timescales of the TICA transformation

        For each :math:`i`-th eigenvalue, this returns

        .. math::

            t_i = -\frac{\tau}{\log(|\lambda_i|)}

        where :math:`\tau` is the :py:obj:`lag` of the TICA object and :math:`\lambda_i` is the `i`-th
        :py:obj:`eigenvalue <eigenvalues>` of the TICA object.

        Returns
        -------
        timescales: 1D np.array
            numpy array with the implied timescales. In principle, one should expect as many timescales as
            input coordinates were available. However, fewer eigenvalues will be returned if the TICA matrices
            were not full rank or :py:obj:`var_cutoff` was passed
        """
        tau = self.lag
        log_magnitudes = np.log(np.abs(self.eigenvalues))
        return -tau / log_magnitudes

    @property
    @_lazy_estimation
    def feature_TIC_correlation(self):
        r"""Instantaneous correlation matrix between mean-free input features and TICs

        Denoting the input features as :math:`X_i` and the TICs as :math:`\theta_j`, the instantaneous, linear
        correlation between them can be written as

        .. math::

            \mathbf{Corr}(X_i - \mu_i, \mathbf{\theta}_j) = \frac{1}{\sigma_{X_i - \mu_i}}\sum_l \sigma_{(X_i - \mu_i)(X_l - \mu_l)} \mathbf{U}_{lj}

        The matrix :math:`\mathbf{U}` is the matrix containing, as column vectors, the eigenvectors of the TICA
        generalized eigenvalue problem.

        Returns
        -------
        feature_TIC_correlation : ndarray(n,m)
            correlation matrix between input features and TICs. There is a row for each feature and a column
            for each TIC.
        """
        covariance = self.cov
        feature_sigma = np.sqrt(np.diag(covariance))
        kept_vectors = self.eigenvectors[:, :self.dimension()]
        projected = np.dot(covariance, kept_vectors)
        # divide every row i by the standard deviation of feature i
        return projected / feature_sigma[:, np.newaxis]

    @property
    def cov(self):
        """ covariance matrix of input data. """
        return self._model.cov

    @cov.setter
    def cov(self, value):
        self._model.cov = value

    @property
    def cov_tau(self):
        """ covariance matrix of time-lagged input data. """
        return self._model.cov_tau

    @cov_tau.setter
    def cov_tau(self, value):
        self._model.cov_tau = value

    @property
    @_lazy_estimation
    def eigenvalues(self):
        r"""Eigenvalues of the TICA problem (usually denoted :math:`\lambda`)

        Returns
        -------
        eigenvalues: 1D np.array
        """
        model = self._model
        return model.eigenvalues

    @property
    @_lazy_estimation
    def eigenvectors(self):
        r"""Eigenvectors of the TICA problem, stored column-wise

        Returns
        -------
        eigenvectors: (N,M) ndarray
        """
        model = self._model
        return model.eigenvectors

    @property
    @_lazy_estimation
    def cumvar(self):
        r"""Cumulative sum of the TICA eigenvalues, as stored on the model

        Returns
        -------
        cumvar: 1D np.array
        """
        model = self._model
        return model.cumvar

    def output_type(self):
        """Return the dtype of the transformed data.

        The parent's output type is used when the retained eigenvectors are
        real (or have a numerically vanishing imaginary part); otherwise
        ``np.complex64`` is returned.
        """
        # TODO: handle the case of conjugate pairs
        kept_vectors = self.eigenvectors[:, 0:self.dimension()]
        effectively_real = (np.all(np.isreal(kept_vectors))
                            or np.allclose(np.imag(kept_vectors), 0))
        if not effectively_real:
            return np.complex64
        return super(TICA, self).output_type()
示例#10
0
class _KoopmanEstimator(StreamingEstimator):
    '''only for computing u
       The user-accessible way for computing K is TICA()
    '''
    def __init__(self,
                 lag,
                 epsilon=1e-6,
                 stride=1,
                 skip=0,
                 chunksize=None,
                 ncov_max=float('inf')):
        """Create the running covariance estimator and store the parameters.

        ``chunksize`` is handed to the parent constructor. ``lag``, ``stride``,
        ``skip`` and ``ncov_max`` are forwarded to a non-reversible
        LaggedCovariance; ``epsilon`` is stored for the later call of
        ``_finish_estimation``.
        """
        super(_KoopmanEstimator, self).__init__(chunksize=chunksize)

        # keyword order kept as in the call it replaces
        covar_settings = {
            'c00': True,
            'c0t': True,
            'remove_data_mean': True,
            'reversible': False,
            'lag': lag,
            'bessel': False,
            'stride': stride,
            'skip': skip,
            'ncov_max': ncov_max,
        }
        self._covar = LaggedCovariance(**covar_settings)

        own_params = {
            'lag': lag,
            'epsilon': epsilon,
            'stride': stride,
            'skip': skip,
            'ncov_max': ncov_max,
        }
        self.set_params(**own_params)
        # K and R are only computed on demand, see _finish_estimation
        self._estimation_finished = False

    def partial_fit(self, X):
        """Feed more data into the running covariance estimator and return self.

        The derived matrices are flagged as stale; they are recomputed when
        ``K_pc_1`` or ``R`` is accessed next.
        """
        from pyemma.coordinates import source
        reader = source(X)
        self._covar.partial_fit(reader)
        self._estimation_finished = False
        self._estimated = True
        return self

    def _finish_estimation(self):
        """Build ``_K`` and ``_R`` from the accumulated covariances.

        ``_R`` is the result of ``spd_inv_split`` applied to the instantaneous
        covariance. ``_K`` is ``R.T @ cov_tau @ R``, extended by one row
        ``(mean_tau - mean) @ R`` and one column that is zero except for a
        trailing 1.
        """
        covar = self._covar
        R = spd_inv_split(covar.cov,
                          epsilon=self.epsilon,
                          canonical_signs=True)
        n_components = R.shape[1]

        # Set the new correlation matrix:
        core = np.dot(R.T, np.dot(covar.cov_tau, R))
        mean_shift_row = np.dot(covar.mean_tau - covar.mean, R)
        with_extra_row = np.vstack((core, mean_shift_row))

        unit_column = np.zeros((n_components + 1, 1))
        unit_column[n_components, 0] = 1.0

        self._K = np.hstack((with_extra_row, unit_column))
        self._R = R

        self._estimation_finished = True
        self._estimated = True

    def _estimate(self, iterable, **kwargs):
        self._covar.estimate(iterable, **kwargs)
        self._finish_estimation()
        return self

    @property
    def K_pc_1(self):
        'Koopman operator on the modified basis (PC|1)'
        self._check_estimated()
        if not self._estimation_finished:
            self._finish_estimation()
        return self._K

    @property
    def u_pc_1(self):
        """weights in the modified basis"""
        self._check_estimated()
        koopman = self.K_pc_1
        return _compute_u(koopman)

    @property
    def u(self):
        'weights in the input basis'
        self._check_estimated()
        u_mod = self.u_pc_1
        N = self._R.shape[0]
        u_input = np.zeros(N + 1)
        u_input[0:N] = self._R.dot(u_mod[0:-1])  # in input basis
        u_input[N] = u_mod[-1] - self.mean.dot(self._R.dot(u_mod[0:-1]))
        return u_input

    @property
    def weights(self):
        """weights in the input basis (encapsulated in an object)"""
        self._check_estimated()
        u_input = self.u
        head, last = u_input[0:-1], u_input[-1]
        return _KoopmanWeights(head, last)

    @property
    def R(self):
        'weightening transformation'
        self._check_estimated()
        if not self._estimation_finished:
            self._finish_estimation()
        return self._R

    @property
    def mean(self):
        self._check_estimated()
        return self._covar.mean
示例#11
0
class TICA(TICABase, SerializableMixIn):
    r""" Time-lagged independent component analysis (TICA)"""
    __serialize_version = 0

    def __init__(self,
                 lag,
                 dim=-1,
                 var_cutoff=0.95,
                 kinetic_map=True,
                 commute_map=False,
                 epsilon=1e-6,
                 stride=1,
                 skip=0,
                 reversible=True,
                 weights=None,
                 ncov_max=float('inf')):
        r""" Time-lagged independent component analysis (TICA) [1]_, [2]_, [3]_.

        Parameters
        ----------
        lag : int
            lag time
        dim : int, optional, default -1
            Maximum number of significant independent components to use to reduce dimension of input data. -1 means
            all numerically available dimensions (see epsilon) will be used unless reduced by var_cutoff.
            Setting dim to a positive value is exclusive with var_cutoff.
        var_cutoff : float in the range [0,1], optional, default 0.95
            Determines the number of output dimensions by including dimensions until their cumulative kinetic variance
            exceeds the fraction var_cutoff. var_cutoff=1.0 means all numerically available dimensions
            (see epsilon) will be used, unless set by dim. Setting var_cutoff smaller than 1.0 is exclusive with dim.
        kinetic_map : bool, optional, default True
            Eigenvectors will be scaled by eigenvalues. As a result, Euclidean distances in the transformed data
            approximate kinetic distances [4]_. This is a good choice when the data is further processed by clustering.
            Reset to False (with a warning) if reversible is False.
        commute_map : bool, optional, default False
            Eigenvector_i will be scaled by sqrt(timescale_i / 2). As a result, Euclidean distances in the transformed
            data will approximate commute distances [5]_.
            Reset to False (with a warning) if reversible is False.
        epsilon : float
            eigenvalue norm cutoff. Eigenvalues of C0 with norms <= epsilon will be
            cut off. The remaining number of eigenvalues define the size
            of the output.
        stride: int, optional, default = 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, default=0
            skip the first initial n frames per trajectory.
        reversible: bool, default=True
            symmetrize correlation matrices C_0, C_{\tau}.
        weights: object or list of ndarrays, optional, default = None
            * An object that allows to compute re-weighting factors to estimate equilibrium means and correlations from
              off-equilibrium data. The only requirement is that weights possesses a method weights(X), that accepts a
              trajectory X (np.ndarray(T, n)) and returns a vector of re-weighting factors (np.ndarray(T,)).
            * A list of ndarrays (ndim=1) specifies the weights for each frame of each trajectory.
        ncov_max : int or float, optional, default infinity
            Forwarded unchanged to the underlying LaggedCovariance estimator.

        Raises
        ------
        ValueError
            If both kinetic_map and commute_map are requested.

        Notes
        -----
        Given a sequence of multivariate data :math:`X_t`, computes the mean-free
        covariance and time-lagged covariance matrix:

        .. math::

            C_0 &=      (X_t - \mu)^T (X_t - \mu) \\
            C_{\tau} &= (X_t - \mu)^T (X_{t + \tau} - \mu)

        and solves the eigenvalue problem

        .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i,

        where :math:`r_i` are the independent components and :math:`\lambda_i(\tau)` are
        their respective normalized time-autocorrelations. The eigenvalues are
        related to the relaxation timescale by

        .. math:: t_i(\tau) = -\tau / \ln |\lambda_i|.

        When used as a dimension reduction method, the input data is projected
        onto the dominant independent components.

        References
        ----------
        .. [1] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
           Identification of slow molecular order parameters for Markov model construction
           J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
        .. [2] Schwantes C, V S Pande. 2013.
           Improvements in Markov State Model Construction Reveal Many Non-Native Interactions in the Folding of NTL9
           J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
        .. [3] L. Molgedey and H. G. Schuster. 1994.
           Separation of a mixture of independent signals using time delayed correlations
           Phys. Rev. Lett. 72, 3634.
        .. [4] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular dynamics simulation.
            J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
        .. [5] Noe, F., Banisch, R., Clementi, C. 2016. Commute maps: separating slowly-mixing molecular configurations
           for kinetic modeling. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.6b00762

        """
        super(TICA, self).__init__()
        if kinetic_map and commute_map:
            raise ValueError(
                'Trying to use both kinetic_map and commute_map. Use either or.'
            )
        if (kinetic_map or commute_map) and not reversible:
            kinetic_map = False
            commute_map = False
            warnings.warn(
                "Cannot use kinetic_map or commute_map for non-reversible processes, both will be set to "
                "False.")

        # this instance will be set by partial fit.
        self._covar = None

        self.dim = dim
        self.var_cutoff = var_cutoff

        self.set_params(lag=lag,
                        dim=dim,
                        var_cutoff=var_cutoff,
                        kinetic_map=kinetic_map,
                        commute_map=commute_map,
                        epsilon=epsilon,
                        reversible=reversible,
                        stride=stride,
                        skip=skip,
                        weights=weights,
                        ncov_max=ncov_max)

    @property
    def model(self):
        """ The model holding the estimation results; created on first access. """
        if not hasattr(self, '_model') or self._model is None:
            self._model = TICAModelBase()
        return self._model

    def describe(self):
        """ Return a short description containing the lag time and the output dimension.

        Falls back to the requested ``dim`` if ``dimension()`` raises a RuntimeError.
        """
        try:
            dim = self.dimension()
        except RuntimeError:
            dim = self.dim
        return "[TICA, lag = %i; max. output dim. = %i]" % (self._lag, dim)

    def estimate(self, X, **kwargs):
        r"""
        Chunk-based parameterization of TICA. Iterates over all data and estimates
        the mean, covariance and time lagged covariance. Finally, the
        generalized eigenvalue problem is solved to determine
        the independent components.
        """
        return super(TICA, self).estimate(X, **kwargs)

    def _new_lagged_covariance(self):
        # Single place configuring the covariance estimator, shared by partial_fit and _estimate.
        return LaggedCovariance(c00=True,
                                c0t=True,
                                ctt=False,
                                remove_data_mean=True,
                                reversible=self.reversible,
                                lag=self.lag,
                                bessel=False,
                                stride=self.stride,
                                skip=self.skip,
                                weights=self.weights,
                                ncov_max=self.ncov_max)

    def _require_dim_not_exceeding(self, indim):
        # Reject a requested output dimension that is larger than the input dimension.
        if not self.dim <= indim:
            raise RuntimeError(
                "requested more output dimensions (%i) than dimension"
                " of input data (%i)" % (self.dim, indim))

    def partial_fit(self, X):
        """ incrementally update the covariances and mean.

        Parameters
        ----------
        X: array, list of arrays, PyEMMA reader
            input data.

        Returns
        -------
        self : this estimator

        Raises
        ------
        RuntimeError
            If more output dimensions are requested than the input data provides.

        Notes
        -----
        The projection matrix is first being calculated upon its first access.
        """
        from pyemma.coordinates import source
        iterable = source(X, chunksize=self.chunksize)

        self._require_dim_not_exceeding(iterable.dimension())
        if self._covar is None:
            # the running estimator is created once and reused by later calls
            self._covar = self._new_lagged_covariance()
        self._covar.partial_fit(iterable)
        self.model.update_model_params(
            mean=self._covar.mean,  # TODO: inefficient, fixme
            cov=self._covar.C00_,
            cov_tau=self._covar.C0t_)

        # the model parameters changed, so a previous diagonalization is no longer marked as valid
        self._estimated = False

        return self

    def _estimate(self, iterable, **kw):
        """ Estimate mean and covariances from ``iterable`` in one go and diagonalize.

        Keyword arguments are forwarded to ``LaggedCovariance.estimate``.
        Returns the updated model.

        Raises
        ------
        RuntimeError
            If more output dimensions are requested than the input data provides.
        """
        covar = self._new_lagged_covariance()
        indim = iterable.dimension()
        self._require_dim_not_exceeding(indim)

        if self._logger_is_active(self._loglevel_DEBUG):
            self.logger.debug(
                "Running TICA with tau=%i; Estimating two covariance matrices"
                " with dimension (%i, %i)", self._lag, indim, indim)
        covar.estimate(iterable, chunksize=self.chunksize, **kw)
        self.model.update_model_params(mean=covar.mean,
                                       cov=covar.C00_,
                                       cov_tau=covar.C0t_)
        self._diagonalize()

        return self.model

    def _diagonalize(self):
        """ Solve the eigenvalue problem for cov and cov_tau and store the result in the model.

        Updates ``cumvar``, ``eigenvalues`` and ``eigenvectors`` of the model and
        marks the estimator as estimated.

        Raises
        ------
        ZeroRankError
            If ``eig_corr`` reports that no dimension is left.
        ValueError
            If both kinetic_map and commute_map are set.
        """
        # diagonalize with low rank approximation
        self.logger.debug("diagonalize Cov and Cov_tau.")
        try:
            eigenvalues, eigenvectors = eig_corr(self.cov,
                                                 self.cov_tau,
                                                 self.epsilon,
                                                 sign_maxelement=True)
        except ZeroRankError:
            raise ZeroRankError(
                'All input features are constant in all time steps. No dimension would be left after dimension reduction.'
            )
        if self.kinetic_map and self.commute_map:
            raise ValueError(
                'Trying to use both kinetic_map and commute_map. Use either or.'
            )
        if self.kinetic_map:  # scale by eigenvalues
            eigenvectors *= eigenvalues[None, :]
        if self.commute_map:  # scale by (regularized) timescales
            # NOTE(review): the constructor docs define t_i = -tau / ln|lambda_i|; the expression
            # below evaluates to 1 + t_i. Left unchanged - confirm that the offset is intended.
            timescales = 1 - self.lag / np.log(np.abs(eigenvalues))
            # dampen timescales smaller than the lag time, as in section 2.5 of ref. [5]
            regularized_timescales = 0.5 * timescales * np.maximum(
                np.tanh(np.pi * ((timescales - self.lag) / self.lag) + 1), 0)

            eigenvectors *= np.sqrt(regularized_timescales / 2)
        self.logger.debug("finished diagonalisation.")

        # compute cumulative variance
        cumvar = np.cumsum(np.abs(eigenvalues)**2)
        cumvar /= cumvar[-1]

        self.model.update_model_params(cumvar=cumvar,
                                       eigenvalues=eigenvalues,
                                       eigenvectors=eigenvectors)

        self._estimated = True
示例#12
0
    def __init__(self, lag, max_columns,
                 dim=-1, var_cutoff=TICABase._DEFAULT_VARIANCE_CUTOFF, epsilon=1e-6,
                 stride=1, skip=0, reversible=True, ncov_max=float('inf'),
                 initial_columns=None, nsel=1, selection_strategy='spectral-oasis', neig=None):
        r""" Sparse sampling implementation [1]_ of time-lagged independent component analysis (TICA) [2]_, [3]_, [4]_.

        Parameters
        ----------
        lag : int
            Lag time.
        max_columns : int
            Maximum number of columns (features) to use in the approximation.
        dim : int, optional, default -1
            Maximum number of significant independent components used to reduce the dimension of the input data.
            -1 means that all numerically available dimensions (see epsilon) are used unless reduced by var_cutoff.
            Setting dim to a positive value is exclusive with var_cutoff.
        var_cutoff : float in the range [0,1], optional, default 0.95
            Determines the number of output dimensions by including dimensions until their cumulative kinetic
            variance exceeds the fraction var_cutoff. var_cutoff=1.0 means that all numerically available
            dimensions (see epsilon) are used, unless set by dim. Setting var_cutoff smaller than 1.0 is
            exclusive with dim.
        epsilon : float, optional, default 1e-6
            Eigenvalue norm cutoff. Eigenvalues of :math:`C_0` with norms <= epsilon will be
            cut off. The remaining number of eigenvalues define the size
            of the output.
        stride: int, optional, default 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, optional, default 0
            Skip the first initial n frames per trajectory.
        reversible: bool, optional, default True
            Symmetrize correlation matrices :math:`C_0`, :math:`C_{\tau}`.
        ncov_max : int or float, optional, default infinity
            Forwarded unchanged to both LaggedCovariance estimators.
        initial_columns : list, ndarray(k, dtype=int), int, or None, optional, default None
            Columns used for an initial approximation. If a list or an 1-d ndarray
            of integers is given, use these column indices. If an integer is given,
            use that number of randomly selected indices. If None is given, use
            one randomly selected column.
        nsel : int, optional, default 1
            Number of columns to select and add per iteration and pass through the data.
            Larger values provide for better pass-efficiency.
        selection_strategy : str, optional, default 'spectral-oasis'
            Strategy to use for selecting new columns for the approximation.
            Can be 'random', 'oasis' or 'spectral-oasis'.
        neig : int or None, optional, default None
            Number of eigenvalues to be optimized by the selection process.
            If None, use the whole available eigenspace.

        Notes
        -----
        Perform a sparse approximation of time-lagged independent component analysis (TICA)
        :class:`TICA <pyemma.coordinates.transform.TICA>`. The starting point is the
        generalized eigenvalue problem

        .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i.

        Instead of computing the full matrices involved in this problem, we conduct
        a Nyström approximation [5]_ of the matrix :math:`C_0` by means of the
        accelerated sequential incoherence selection (oASIS) algorithm [6]_ and,
        in particular, its extension called spectral oASIS [1]_.

        Iteratively, we select a small number of columns such that the resulting
        Nyström approximation is sufficiently accurate. This selection represents
        in turn a subset of important features, for which we obtain a generalized
        eigenvalue problem similar to the one above, but much smaller in size.
        Its generalized eigenvalues and eigenvectors provide an approximation
        to those of the full TICA solution [1]_.

        References
        ----------
        .. [1] F. Litzinger, L. Boninsegna, H. Wu, F. Nüske, R. Patel, R. Baraniuk, F. Noé, and C. Clementi.
           Rapid calculation of molecular kinetics using compressed sensing (2018). (submitted)
        .. [2] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
           Identification of slow molecular order parameters for Markov model construction
           J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
        .. [3] Schwantes C, V S Pande. 2013.
           Improvements in Markov State Model Construction Reveal Many Non-Native Interactions in the Folding of NTL9
           J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
        .. [4] L. Molgedey and H. G. Schuster. 1994.
           Separation of a mixture of independent signals using time delayed correlations
           Phys. Rev. Lett. 72, 3634.
        .. [5] P. Drineas and M. W. Mahoney.
           On the Nystrom method for approximating a Gram matrix for improved kernel-based learning.
           Journal of Machine Learning Research, 6:2153-2175 (2005).
        .. [6] Raajen Patel, Thomas A. Goldstein, Eva L. Dyer, Azalia Mirhoseini, Richard G. Baraniuk.
           oASIS: Adaptive Column Sampling for Kernel Matrix Approximation.
           arXiv: 1505.05208 [stat.ML].

        """
        super(NystroemTICA, self).__init__()

        # both estimators share one configuration; only the second is restricted to the diagonal
        shared_covar_args = dict(c00=True, c0t=True, ctt=False, remove_data_mean=True,
                                 reversible=reversible, lag=lag, bessel=False,
                                 stride=stride, skip=skip, ncov_max=ncov_max)
        self._covar = LaggedCovariance(**shared_covar_args)
        self._diag = LaggedCovariance(diag_only=True, **shared_covar_args)
        # no column selection has been carried out yet
        self._oasis = None

        self.dim = dim
        self.var_cutoff = var_cutoff

        estimator_params = dict(lag=lag, max_columns=max_columns,
                                epsilon=epsilon, reversible=reversible, stride=stride, skip=skip,
                                ncov_max=ncov_max,
                                initial_columns=initial_columns, nsel=nsel,
                                selection_strategy=selection_strategy, neig=neig)
        self.set_params(**estimator_params)
示例#13
0
class NystroemTICA(TICABase, SerializableMixIn):
    r""" Sparse sampling implementation of time-lagged independent component analysis (TICA)"""
    __serialize_version = 0
    __serialize_fields = ()

    def __init__(self, lag, max_columns,
                 dim=-1, var_cutoff=TICABase._DEFAULT_VARIANCE_CUTOFF, epsilon=1e-6,
                 stride=1, skip=0, reversible=True, ncov_max=float('inf'),
                 initial_columns=None, nsel=1, selection_strategy='spectral-oasis', neig=None):
        r""" Sparse sampling implementation [1]_ of time-lagged independent component analysis (TICA) [2]_, [3]_, [4]_.

        Parameters
        ----------
        lag : int
            Lag time.
        max_columns : int
            Maximum number of columns (features) to use in the approximation.
        dim : int, optional, default -1
            Maximum number of significant independent components used to reduce the dimension of the input data.
            -1 means that all numerically available dimensions (see epsilon) are used unless reduced by var_cutoff.
            Setting dim to a positive value is exclusive with var_cutoff.
        var_cutoff : float in the range [0,1], optional, default 0.95
            Determines the number of output dimensions by including dimensions until their cumulative kinetic
            variance exceeds the fraction var_cutoff. var_cutoff=1.0 means that all numerically available
            dimensions (see epsilon) are used, unless set by dim. Setting var_cutoff smaller than 1.0 is
            exclusive with dim.
        epsilon : float, optional, default 1e-6
            Eigenvalue norm cutoff. Eigenvalues of :math:`C_0` with norms <= epsilon will be
            cut off. The remaining number of eigenvalues define the size
            of the output.
        stride: int, optional, default 1
            Use only every stride-th time step. By default, every time step is used.
        skip : int, optional, default 0
            Skip the first initial n frames per trajectory.
        reversible: bool, optional, default True
            Symmetrize correlation matrices :math:`C_0`, :math:`C_{\tau}`.
        ncov_max : int or float, optional, default infinity
            Forwarded unchanged to both LaggedCovariance estimators.
        initial_columns : list, ndarray(k, dtype=int), int, or None, optional, default None
            Columns used for an initial approximation. If a list or an 1-d ndarray
            of integers is given, use these column indices. If an integer is given,
            use that number of randomly selected indices. If None is given, use
            one randomly selected column.
        nsel : int, optional, default 1
            Number of columns to select and add per iteration and pass through the data.
            Larger values provide for better pass-efficiency.
        selection_strategy : str, optional, default 'spectral-oasis'
            Strategy to use for selecting new columns for the approximation.
            Can be 'random', 'oasis' or 'spectral-oasis'.
        neig : int or None, optional, default None
            Number of eigenvalues to be optimized by the selection process.
            If None, use the whole available eigenspace.

        Notes
        -----
        Perform a sparse approximation of time-lagged independent component analysis (TICA)
        :class:`TICA <pyemma.coordinates.transform.TICA>`. The starting point is the
        generalized eigenvalue problem

        .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i.

        Instead of computing the full matrices involved in this problem, we conduct
        a Nyström approximation [5]_ of the matrix :math:`C_0` by means of the
        accelerated sequential incoherence selection (oASIS) algorithm [6]_ and,
        in particular, its extension called spectral oASIS [1]_.

        Iteratively, we select a small number of columns such that the resulting
        Nyström approximation is sufficiently accurate. This selection represents
        in turn a subset of important features, for which we obtain a generalized
        eigenvalue problem similar to the one above, but much smaller in size.
        Its generalized eigenvalues and eigenvectors provide an approximation
        to those of the full TICA solution [1]_.

        References
        ----------
        .. [1] F. Litzinger, L. Boninsegna, H. Wu, F. Nüske, R. Patel, R. Baraniuk, F. Noé, and C. Clementi.
           Rapid calculation of molecular kinetics using compressed sensing (2018). (submitted)
        .. [2] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
           Identification of slow molecular order parameters for Markov model construction
           J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
        .. [3] Schwantes C, V S Pande. 2013.
           Improvements in Markov State Model Construction Reveal Many Non-Native Interactions in the Folding of NTL9
           J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
        .. [4] L. Molgedey and H. G. Schuster. 1994.
           Separation of a mixture of independent signals using time delayed correlations
           Phys. Rev. Lett. 72, 3634.
        .. [5] P. Drineas and M. W. Mahoney.
           On the Nystrom method for approximating a Gram matrix for improved kernel-based learning.
           Journal of Machine Learning Research, 6:2153-2175 (2005).
        .. [6] Raajen Patel, Thomas A. Goldstein, Eva L. Dyer, Azalia Mirhoseini, Richard G. Baraniuk.
           oASIS: Adaptive Column Sampling for Kernel Matrix Approximation.
           arXiv: 1505.05208 [stat.ML].

        """
        super(NystroemTICA, self).__init__()

        # both estimators share one configuration; only the second is restricted to the diagonal
        shared_covar_args = dict(c00=True, c0t=True, ctt=False, remove_data_mean=True,
                                 reversible=reversible, lag=lag, bessel=False,
                                 stride=stride, skip=skip, ncov_max=ncov_max)
        self._covar = LaggedCovariance(**shared_covar_args)
        self._diag = LaggedCovariance(diag_only=True, **shared_covar_args)
        # no column selection has been carried out yet
        self._oasis = None

        self.dim = dim
        self.var_cutoff = var_cutoff

        estimator_params = dict(lag=lag, max_columns=max_columns,
                                epsilon=epsilon, reversible=reversible, stride=stride, skip=skip,
                                ncov_max=ncov_max,
                                initial_columns=initial_columns, nsel=nsel,
                                selection_strategy=selection_strategy, neig=neig)
        self.set_params(**estimator_params)

    @property
    def model(self):
        """ The model holding the estimation results; created on first access. """
        if getattr(self, '_model', None) is None:
            self._model = NystroemTICAModel()
        return self._model

    @property
    def initial_columns(self):
        """ The initial column selection in the form stored by the setter. """
        return self._initial_columns

    @initial_columns.setter
    def initial_columns(self, initial_columns):
        """ Validate and normalize the initial column selection.

        Parameters
        ----------
        initial_columns : None, int, function, list, tuple or ndarray
            * None is treated like the integer 1.
            * An integer k is replaced by a function ``f(N)`` that draws k distinct
              indices out of ``N`` with ``np.random.choice``.
            * A function is stored unchanged.
            * A list, tuple or ndarray is converted by ``ensure_int_vector``.

        Raises
        ------
        ValueError
            If the value has none of the supported types.
        """
        if initial_columns is None:
            initial_columns = 1
        if isinstance(initial_columns, (list, tuple)):
            # index lists are advertised by the constructor docs and by the error message
            # in _estimate, so treat them exactly like an index array
            initial_columns = np.asarray(initial_columns)

        if isinstance(initial_columns, (int, np.integer)):
            # NumPy integer scalars are accepted as well as builtin ints
            n_random = int(initial_columns)
            initial_columns = lambda N: np.random.choice(N, n_random, replace=False)
        elif isinstance(initial_columns, np.ndarray):
            initial_columns = ensure_int_vector(initial_columns)
        elif not isinstance(initial_columns, FunctionType):
            raise ValueError('initial_columns has to be one of these types '
                             '(None, int, function, list, ndarray), '
                             'but was {}'.format(type(initial_columns)))
        self._initial_columns = initial_columns

    def describe(self):
        """ Return a short description with lag time, column limit and output dimension.

        Falls back to the requested ``dim`` if ``dimension()`` raises a RuntimeError.
        """
        try:
            shown_dim = self.dimension()
        except RuntimeError:
            shown_dim = self.dim
        template = "[NystroemTICA, lag = %i; max. columns = %i; max. output dim. = %i]"
        return template % (self.lag, self.max_columns, shown_dim)

    def estimate(self, X, **kwargs):
        r"""
        Chunk-based parameterization of NystroemTICA.
        Iterates over all data several times to select important columns and
        estimate the mean, covariance and time-lagged covariance. Finally, the
        small-scale generalized eigenvalue problem is solved to determine
        the approximate independent components.

        ``X`` and all keyword arguments are handed to the parent class'
        ``estimate`` unchanged, and its return value is returned.
        """
        return super(NystroemTICA, self).estimate(X, **kwargs)

    def _estimate(self, iterable, **kw):
        """ Select columns iteratively, estimate the covariances and diagonalize.

        Keyword arguments are forwarded to every ``LaggedCovariance.estimate`` call.
        Returns the updated model.

        Raises
        ------
        RuntimeError
            If more output dimensions are requested than the input data provides.
        ValueError
            If the initial column selection is not one-dimensional.
        """
        from pyemma.coordinates.data import DataInMemory
        if not isinstance(iterable, DataInMemory):
            self.logger.warning('Every iteration of the selection process involves streaming of all data and featurization. '
                                'Depending on your setup, this might be inefficient.')

        indim = iterable.dimension()
        if not self.dim <= indim:
            raise RuntimeError("requested more output dimensions (%i) than dimension"
                               " of input data (%i)" % (self.dim, indim))

        # a stored function is resolved into concrete column indices for this input dimension
        if callable(self.initial_columns):
            self.initial_columns = self.initial_columns(indim)
        if np.array(self.initial_columns).ndim != 1:
            raise ValueError('initial_columns must be either None, an integer, a list, or a 1-d numpy array.')

        diag_estimator = self._diag
        column_estimator = self._covar

        diag_estimator.estimate(iterable, **kw)

        # covariance columns belonging to the initial selection
        column_estimator.column_selection = self.initial_columns
        column_estimator.estimate(iterable, **kw)
        self.model.update_model_params(cov_tau=column_estimator.C0t_)

        oasis = oASIS_Nystroem(diag_estimator.C00_, column_estimator.C00_, self.initial_columns)
        self._oasis = oasis
        oasis.set_selection_strategy(strategy=self.selection_strategy, nsel=self.nsel, neig=self.neig)

        # every round costs one further pass over the data for the newly selected columns
        while oasis.k < np.min((self.max_columns, oasis.n)):
            cols = oasis.select_columns()
            if cols is None or len(cols) == 0 or np.all(np.in1d(cols, oasis.column_indices)):
                self.logger.warning("Iteration ended prematurely: No more columns to select.")
                break
            column_estimator.column_selection = cols
            column_estimator.estimate(iterable, **kw)
            accepted = np.in1d(cols, oasis.add_columns(column_estimator.C00_, cols))
            if np.any(accepted):
                # append the time-lagged columns of those candidates that were actually added
                widened = np.concatenate((self._model.cov_tau, column_estimator.C0t_[:, accepted]), axis=1)
                self.model.update_model_params(cov_tau=widened)

        self.model.update_model_params(mean=column_estimator.mean,
                                       diag=diag_estimator.C00_,
                                       cov=oasis.Ck,
                                       column_indices=oasis.column_indices)
        self._diagonalize()

        return self.model

    def _diagonalize(self):
        """ Solve the small eigenvalue problem on the selected columns and store the result.

        Updates ``cumvar``, ``eigenvalues`` and ``eigenvectors`` of the model and
        marks the estimator as estimated.

        Raises
        ------
        ZeroRankError
            If ``eig_corr`` reports that no dimension is left.
        """
        # diagonalize with low rank approximation
        self.logger.debug("Diagonalize Cov and Cov_tau.")
        model = self._model
        # rows of the time-lagged columns that belong to the selected columns
        lagged_block = model.cov_tau[model.column_indices, :]
        try:
            eigenvalues, eigenvectors = eig_corr(self._oasis.Wk, lagged_block, self.epsilon, sign_maxelement=True)
        except ZeroRankError:
            raise ZeroRankError('All input features are constant in all time steps. '
                                'No dimension would be left after dimension reduction.')
        self.logger.debug("Finished diagonalization.")

        # cumulative variance, normalized by its total
        squared_moduli = np.abs(eigenvalues) ** 2
        cumvar = np.cumsum(squared_moduli)
        cumvar /= cumvar[-1]

        model.update_model_params(cumvar=cumvar,
                                  eigenvalues=eigenvalues,
                                  eigenvectors=eigenvectors)

        self._estimated = True

    @property
    def column_indices(self):
        """ Indices of columns used in the approximation.

        Read from the model (``self.model.column_indices``).
        """
        return self.model.column_indices