예제 #1
0
파일: tica.py 프로젝트: prokia/htmd
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object or Metric object
        The object whose data we wish to project onto the top TICA dimensions.
        If a Metric object is given, the trajectories are projected on the fly
        and fed to TICA one at a time instead of being held in memory.
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.
        Only 'frames' is accepted when `data` is a Metric object.
    dimensions : list
        A list of dimensions of the original data on which to apply TICA. All other dimensions will stay unaltered.
        If None is given, it will apply on all dimensions.

    Raises
    ------
    RuntimeError
        If `data` is a Metric object and `units` is not 'frames', or if the
        lag time converts to 0 frames for in-memory data.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> data = metr.project()
    >>> tica = TICA(data, 20)
    >>> datatica = tica.project(3)
    Alternatively you can pass a Metric object to TICA. Uses less memory but is slower.
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> slowtica = TICA(metr, 20)
    >>> datatica = slowtica.project(3)


    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """
    def __init__(self, data, lag, units='frames', dimensions=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma
        from tqdm import tqdm

        self.data = data
        self.dimensions = dimensions

        if isinstance(data, Metric):
            # Memory efficient TICA projecting trajectories on the fly
            if units != 'frames':
                raise RuntimeError(
                    'Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.'
                )
            self.tic = TICApyemma(lag)
            metr = data

            # The context manager closes the progress bar even if a projection fails
            with tqdm(total=len(metr.simulations)) as pbar:
                for proj in _projectionGenerator(metr, _getNcpus()):
                    for pro in proj:
                        if pro is None:  # Nothing to fit for this simulation
                            continue
                        if self.dimensions is None:
                            self.tic.partial_fit(pro[0])
                        else:  # Sub-select dimensions for fitting
                            self.tic.partial_fit(pro[0][:, self.dimensions])
                    pbar.update(len(proj))
        else:
            # In-memory TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError(
                    'Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.'
                )

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data.
            If TICA was restricted to a subset of dimensions, the untouched dimensions come first in each
            trajectory, followed by the TICA dimensions.

        Raises
        ------
        RuntimeError
            If the constructor was given in-memory data and `ndim` is larger than its number of dimensions.

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        from tqdm import tqdm
        if ndim is not None:
            self.tic.set_params(dim=ndim)

        keepdata = []  # Per-trajectory columns which were excluded from TICA
        keepdim = None  # Indexes of the columns excluded from TICA
        keepdimdesc = None  # Description rows of the columns excluded from TICA
        if isinstance(self.data, Metric):
            # Memory efficient TICA projecting trajectories on the fly
            proj = []
            refs = []
            fstep = None

            metr = self.data
            simidx = -1  # Running index of the simulation being processed
            droppedsims = []
            # The context manager closes the progress bar even if a projection fails
            with tqdm(total=len(metr.simulations)) as pbar:
                for projecteddata in _projectionGenerator(metr, _getNcpus()):
                    for pro in projecteddata:
                        simidx += 1
                        if pro is None:
                            # Remember which simulations gave no projection so that
                            # they can be removed from the simlist further down
                            droppedsims.append(simidx)
                            continue
                        if self.dimensions is not None:
                            numDimensions = pro[0].shape[1]
                            keepdim = np.setdiff1d(range(numDimensions),
                                                   self.dimensions)
                            keepdata.append(pro[0][:, keepdim])
                            # Sub-select dimensions for projecting
                            proj.append(
                                self.tic.transform(
                                    pro[0][:, self.dimensions]).astype(np.float32))
                        else:
                            proj.append(
                                self.tic.transform(pro[0]).astype(np.float32))
                        refs.append(pro[1])
                        if fstep is None:
                            fstep = pro[2]
                    pbar.update(len(projecteddata))

            simlist = self.data.simulations
            simlist = np.delete(simlist, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
            if self.dimensions is not None:
                from htmd.projections.metric import _singleMolfile
                from htmd.molecule.molecule import Molecule
                (single, molfile) = _singleMolfile(metr.simulations)
                if single:
                    keepdimdesc = metr.getMapping(Molecule(molfile))
                    keepdimdesc = keepdimdesc.iloc[keepdim]
        else:
            if ndim is not None and self.data.numDimensions < ndim:
                raise RuntimeError(
                    'TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'
                    .format(self.data.numDimensions, ndim))

            if self.dimensions is not None:
                keepdim = np.setdiff1d(range(self.data.numDimensions),
                                       self.dimensions)
                keepdata = [x[:, keepdim] for x in self.data.dat]
                if self.data.description is not None:
                    keepdimdesc = self.data.description.iloc[keepdim]
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        # If TICA is done on a subset of dimensions, combine non-projected data with projected data
        if self.dimensions is not None:
            proj = [
                np.hstack((kept, projected))
                for kept, projected in zip(keepdata, proj)
            ]

        if ndim is None:
            ndim = self.tic.dimension()
            logger.info(
                'Kept {} dimension(s) to cover 95% of kinetic variance.'.
                format(ndim))

        from htmd.metricdata import MetricData
        # NOTE(review): np.array(proj) on trajectories with differing numbers of frames
        # relies on NumPy building a ragged object array, which NumPy >= 1.24 refuses
        # without dtype=object. Confirm what MetricData expects before changing it.
        datatica = MetricData(dat=np.array(proj),
                              simlist=simlist,
                              ref=ref,
                              fstep=fstep,
                              parent=parent)
        from pandas import DataFrame, concat
        # One description row per TICA dimension
        datatica.description = DataFrame({
            'type': ['tica' for _ in range(ndim)],
            'atomIndexes': [-1 for _ in range(ndim)],
            'description':
            ['TICA dimension {}'.format(i + 1) for i in range(ndim)],
        })

        if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
            # DataFrame.append was removed in pandas 2.0; concat gives the same result on all versions
            datatica.description = concat(
                [keepdimdesc, datatica.description], ignore_index=True)

        return datatica
예제 #2
0
파일: tica.py 프로젝트: jeiros/htmd
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object or Metric object
        The object whose data we wish to project onto the top TICA dimensions.
        If a Metric object is given, the trajectories are projected on the fly
        and fed to TICA one at a time instead of being held in memory.
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.
        Only 'frames' is accepted when `data` is a Metric object.
    dimensions : list
        A list of dimensions of the original data on which to apply TICA. All other dimensions will stay unaltered.
        If None is given, it will apply on all dimensions.

    Raises
    ------
    RuntimeError
        If `data` is a Metric object and `units` is not 'frames', or if the
        lag time converts to 0 frames for in-memory data.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> data = metr.project()
    >>> tica = TICA(data, 20)
    >>> datatica = tica.project(3)
    Alternatively you can pass a Metric object to TICA. Uses less memory but is slower.
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> slowtica = TICA(metr, 20)
    >>> datatica = slowtica.project(3)


    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag, units='frames', dimensions=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma

        self.data = data
        self.dimensions = dimensions

        if isinstance(data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            if units != 'frames':
                raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
            self.tic = TICApyemma(lag)
            metr = data

            p = ProgressBar(len(metr.simulations))
            try:
                for proj in _projectionGenerator(metr, _getNcpus()):
                    for pro in proj:
                        if pro is None:  # Nothing to fit for this simulation
                            continue
                        if self.dimensions is None:
                            self.tic.partial_fit(pro[0])
                        else:  # Sub-select dimensions for fitting
                            self.tic.partial_fit(pro[0][:, self.dimensions])
                    p.progress(len(proj))
            finally:
                # Always stop the progress bar, even if a projection fails
                p.stop()
        else:  # In-memory TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data.
            If TICA was restricted to a subset of dimensions, the untouched dimensions come first in each
            trajectory, followed by the TICA dimensions.

        Raises
        ------
        RuntimeError
            If the constructor was given in-memory data and `ndim` is larger than its number of dimensions.

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            self.tic.set_params(dim=ndim)

        keepdata = []  # Per-trajectory columns which were excluded from TICA
        keepdim = None  # Indexes of the columns excluded from TICA
        keepdimdesc = None  # Description rows of the columns excluded from TICA
        if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            proj = []
            refs = []
            fstep = None

            metr = self.data
            p = ProgressBar(len(metr.simulations))
            simidx = -1  # Running index of the simulation being processed
            droppedsims = []
            try:
                for projecteddata in _projectionGenerator(metr, _getNcpus()):
                    for pro in projecteddata:
                        simidx += 1
                        if pro is None:
                            # Remember which simulations gave no projection so that
                            # they can be removed from the simlist further down
                            droppedsims.append(simidx)
                            continue
                        if self.dimensions is not None:
                            numDimensions = pro[0].shape[1]
                            keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                            keepdata.append(pro[0][:, keepdim])
                            # Sub-select dimensions for projecting
                            proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                        else:
                            proj.append(self.tic.transform(pro[0]).astype(np.float32))
                        refs.append(pro[1])
                        if fstep is None:
                            fstep = pro[2]
                    p.progress(len(projecteddata))
            finally:
                # Always stop the progress bar, even if a projection fails
                p.stop()

            simlist = self.data.simulations
            simlist = np.delete(simlist, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
            if self.dimensions is not None:
                from htmd.projections.metric import _singleMolfile
                from htmd.molecule.molecule import Molecule
                (single, molfile) = _singleMolfile(metr.simulations)
                if single:
                    keepdimdesc = metr.getMapping(Molecule(molfile))
                    keepdimdesc = keepdimdesc.iloc[keepdim]
        else:
            if ndim is not None and self.data.numDimensions < ndim:
                raise RuntimeError('TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'.format(self.data.numDimensions, ndim))

            if self.dimensions is not None:
                keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
                keepdata = [x[:, keepdim] for x in self.data.dat]
                if self.data.description is not None:
                    keepdimdesc = self.data.description.iloc[keepdim]
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        # If TICA is done on a subset of dimensions, combine non-projected data with projected data
        if self.dimensions is not None:
            proj = [np.hstack((kept, projected)) for kept, projected in zip(keepdata, proj)]

        if ndim is None:
            ndim = self.tic.dimension()
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

        from htmd.metricdata import MetricData
        # NOTE(review): np.array(proj) on trajectories with differing numbers of frames
        # relies on NumPy building a ragged object array, which NumPy >= 1.24 refuses
        # without dtype=object. Confirm what MetricData expects before changing it.
        datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)
        from pandas import DataFrame, concat
        # One description row per TICA dimension
        datatica.description = DataFrame({
            'type': ['tica' for _ in range(ndim)],
            'atomIndexes': [-1 for _ in range(ndim)],
            'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)],
        })

        if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
            # DataFrame.append was removed in pandas 2.0; concat gives the same result on all versions
            datatica.description = concat([keepdimdesc, datatica.description], ignore_index=True)

        return datatica