Exemplo n.º 1
0
Arquivo: tica.py Projeto: jhprinz/htmd
    def __init__(self, data, lag, units='frames'):
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data
        if isinstance(data, Metric):
            if units != 'frames':
                raise RuntimeError(
                    'Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.'
                )
            metr = data
            from pyemma.coordinates.transform.tica import TICA
            self.tic = TICA(lag)

            p = ProgressBar(len(metr.simulations))
            for proj in _projectionGenerator(metr, _getNcpus()):
                for pro in proj:
                    self.tic.partial_fit(pro[0])
                p.progress(len(proj))
            p.stop()
        else:
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError(
                    'Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.'
                )
            self.tic = tica(data.dat.tolist(), lag=lag)
Exemplo n.º 2
0
    def testChunksizeResultsTica(self):
        chunk = 40
        lag = 100
        np.random.seed(0)
        X = np.random.randn(23000, 3)

        # un-chunked
        d = DataInMemory(X)

        tica = TICA(lag=lag, output_dimension=1)
        tica.data_producer = d
        tica.parametrize()

        cov = tica.cov.copy()
        mean = tica.mu.copy()

        # ------- run again with new chunksize -------
        d = DataInMemory(X)
        d.chunksize = chunk
        tica = TICA(lag=lag, output_dimension=1)
        tica.data_producer = d

        tica.parametrize()

        np.testing.assert_allclose(tica.mu, mean)
        np.testing.assert_allclose(tica.cov, cov)
Exemplo n.º 3
0
    def __init__(self, data, lag, units='frames', dimensions=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma

        self.data = data
        self.dimensions = dimensions

        if isinstance(data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            if units != 'frames':
                raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
            self.tic = TICApyemma(lag)
            metr = data

            p = ProgressBar(len(metr.simulations))
            for proj in _projectionGenerator(metr, _getNcpus()):
                for pro in proj:
                    if pro is None:
                        continue
                    if self.dimensions is None:
                        self.tic.partial_fit(pro[0])
                    else:  # Sub-select dimensions for fitting
                        self.tic.partial_fit(pro[0][:, self.dimensions])
                p.progress(len(proj))
            p.stop()
        else:  # In-memory TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)
Exemplo n.º 4
0
    def test_singular_zeros(self):
        tica = TICA(lag=1, output_dimension=1)

        # make some data that has one column of all zeros
        X = np.random.randn(100, 2)
        X = np.hstack((X, np.zeros((100, 1))))

        d = DataInMemory(X)

        tica.data_producer = d
        tica.parametrize()

        assert tica.eigenvectors.dtype == np.float64
        assert tica.eigenvalues.dtype == np.float64
Exemplo n.º 5
0
    def test_duplicated_data(self):
        tica = TICA(lag=1, output_dimension=1)

        # make some data that has one column repeated twice
        X = np.random.randn(100, 2)
        X = np.hstack((X, X[:, 0, np.newaxis]))

        d = DataInMemory(X)

        tica.data_producer = d
        tica.parametrize()

        assert tica.eigenvectors.dtype == np.float64
        assert tica.eigenvalues.dtype == np.float64
Exemplo n.º 6
0
    def __init__(self, data, lag, units="frames", dimensions=None, njobs=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma
        from tqdm import tqdm
        from htmd.util import _getNjobs

        self.data = data
        self.dimensions = dimensions
        self.njobs = njobs if njobs is not None else _getNjobs()

        if isinstance(
                data, Metric
        ):  # Memory efficient TICA projecting trajectories on the fly
            if units != "frames":
                raise RuntimeError(
                    "Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues."
                )
            self.tic = TICApyemma(lag)
            metr = data

            pbar = tqdm(total=len(metr.simulations))
            for proj in _projectionGenerator(metr, self.njobs):
                for pro in proj:
                    if pro is None:
                        continue
                    if self.dimensions is None:
                        self.tic.partial_fit(pro[0])
                    else:  # Sub-select dimensions for fitting
                        self.tic.partial_fit(pro[0][:, self.dimensions])
                pbar.update(len(proj))
            pbar.close()
        else:  # In-memory TICA
            lag = unitconvert(units, "frames", lag, data.fstep)
            if lag == 0:
                raise RuntimeError(
                    "Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA."
                )

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)
Exemplo n.º 7
0
    def __init__(self, data, lag, units='frames'):
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data
        if isinstance(data, Metric):
            from pyemma.coordinates.transform.tica import TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            self.tic = TICA(lag)

            p = ProgressBar(len(data.simulations))
            for i in range(len(data.simulations)):
                # Fix for pyemma bug. Remove eventually:
                d, _, _ = data._projectSingle(i)
                if d is None or d.shape[0] < lag:
                    continue
                self.tic.partial_fit(d)
                p.progress()
            p.stop()
        else:
            self.tic = tica(data.dat.tolist(), lag=lag)
Exemplo n.º 8
0
    def test(self):
        np.random.seed(0)

        tica = TICA(lag=50, output_dimension=1)
        data = np.random.randn(100, 10)
        ds = DataInMemory(data)
        tica.data_producer = ds

        tica.parametrize()

        Y = tica.map(data)
Exemplo n.º 9
0
    def __init__(self, data, lag):
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data
        if isinstance(data, Metric):
            from pyemma.coordinates.transform.tica import TICA
            self.tic = TICA(lag)

            p = ProgressBar(len(data.simulations))
            for i in range(len(data.simulations)):
                # Fix for pyemma bug. Remove eventually:
                d, _, _ = data._projectSingle(i)
                if d is None or d.shape[0] < lag:
                    continue
                self.tic.partial_fit(d)
                p.progress()
            p.stop()
        else:
            self.tic = tica(data.dat.tolist(), lag=lag)
Exemplo n.º 10
0
Arquivo: tica.py Projeto: jhprinz/htmd
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData  object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """
    def __init__(self, data, lag, units='frames'):
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data
        if isinstance(data, Metric):
            if units != 'frames':
                raise RuntimeError(
                    'Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.'
                )
            metr = data
            from pyemma.coordinates.transform.tica import TICA
            self.tic = TICA(lag)

            p = ProgressBar(len(metr.simulations))
            for proj in _projectionGenerator(metr, _getNcpus()):
                for pro in proj:
                    self.tic.partial_fit(pro[0])
                p.progress(len(proj))
            p.stop()
        else:
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError(
                    'Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.'
                )
            self.tic = tica(data.dat.tolist(), lag=lag)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will use choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            # self.tic._dim = ndim  # Old way of doing it. Deprecated since pyEMMA 2.1
            self.tic.set_params(
                dim=ndim)  # Change to this in 2.1 pyEMMA version

        if isinstance(
                self.data,
                Metric):  # Doesn't project on correct number of dimensions
            proj = []
            refs = []
            fstep = None

            metr = self.data
            p = ProgressBar(len(metr.simulations))
            k = -1
            droppedsims = []
            for projecteddata in _projectionGenerator(metr, _getNcpus()):
                for pro in projecteddata:
                    k += 1
                    if pro is None:
                        droppedsims.append(k)
                        continue
                    proj.append(self.tic.transform(pro[0]))
                    refs.append(pro[1])
                    if fstep is None:
                        fstep = pro[2]
                p.progress(len(projecteddata))
            p.stop()

            simlist = self.data.simulations
            simlist = np.delete(simlist, droppedsims)
            ref = np.array(refs, dtype=object)
            #fstep = 0
            parent = None
        else:
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        if ndim is None:
            logger.info(
                'Kept {} dimension(s) to cover 95% of kinetic variance.'.
                format(self.tic.dimension()))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj, dtype=object),
                              simlist=simlist,
                              ref=ref,
                              fstep=fstep,
                              parent=parent)
        from pandas import DataFrame
        types = []
        indexes = []
        description = []
        for i in range(ndim):
            types += ['tica']
            indexes += [-1]
            description += ['TICA dimension {}'.format(i + 1)]
        datatica.map = DataFrame({
            'type': types,
            'indexes': indexes,
            'description': description
        })

        return datatica
Exemplo n.º 11
0
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData  object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """
    def __init__(self, data, lag, units='frames'):
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data
        if isinstance(data, Metric):
            from pyemma.coordinates.transform.tica import TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            self.tic = TICA(lag)

            p = ProgressBar(len(data.simulations))
            for i in range(len(data.simulations)):
                # Fix for pyemma bug. Remove eventually:
                d, _, _ = data._projectSingle(i)
                if d is None or d.shape[0] < lag:
                    continue
                self.tic.partial_fit(d)
                p.progress()
            p.stop()
        else:
            self.tic = tica(data.dat.tolist(), lag=lag)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will use choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            # self.tic._dim = ndim  # Old way of doing it. Deprecated since pyEMMA 2.1
            self.tic.set_params(
                dim=ndim)  # Change to this in 2.1 pyEMMA version

        if isinstance(
                self.data,
                Metric):  # Doesn't project on correct number of dimensions
            proj = []
            refs = []
            fstep = None
            '''from htmd.config import _config
            from joblib import Parallel, delayed
            results = Parallel(n_jobs=_config['ncpus'], verbose=11)(
                delayed(_test)(self.data, self.tic, i) for i in range(len(self.data.simulations)))

            for i in range(len(results)):
                proj.append(results[i][0])
                refs.append(results[i][1])
                fstep.append(results[i][2])'''

            droppedsims = []
            p = ProgressBar(len(self.data.simulations))
            for i in range(len(self.data.simulations)):
                d, r, f = self.data._projectSingle(i)
                if d is None:
                    droppedsims.append(i)
                    continue
                if fstep is None:
                    fstep = f
                refs.append(r)
                proj.append(self.tic.transform(d))
                p.progress()
            p.stop()
            simlist = self.data.simulations
            simlist = np.delete(simlist, droppedsims)
            ref = np.array(refs, dtype=object)
            #fstep = 0
            parent = None
        else:
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        if ndim is None:
            logger.info(
                'Kept {} dimension(s) to cover 95% of kinetic variance.'.
                format(self.tic.dimension()))
        #print(np.shape(proj))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj, dtype=object),
                              simlist=simlist,
                              ref=ref,
                              fstep=fstep,
                              parent=parent)
        '''datatica = self.data.copy()
        #datatica.dat = self.data.deconcatenate(np.squeeze(proj))
        datatica.dat = np.array(proj, dtype=object)
        datatica.parent = self.data
        datatica.St = None
        datatica.Centers = None
        datatica.N = None
        datatica.K = None
        datatica._dataid = random.random()
        datatica._clusterid = None'''
        return datatica
Exemplo n.º 12
0
Arquivo: tica.py Projeto: prokia/htmd
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData  object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.
    dimensions : list
        A list of dimensions of the original data on which to apply TICA. All other dimensions will stay unaltered.
        If None is given, it will apply on all dimensions.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> data = metr.project()
    >>> tica = TICA(data, 20)
    >>> datatica = tica.project(3)
    Alternatively you can pass a Metric object to TICA. Uses less memory but is slower.
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> slowtica = TICA(metr, 20)
    >>> datatica = slowtica.project(3)


    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """
    def __init__(self, data, lag, units='frames', dimensions=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma
        from tqdm import tqdm

        self.data = data
        self.dimensions = dimensions

        if isinstance(
                data, Metric
        ):  # Memory efficient TICA projecting trajectories on the fly
            if units != 'frames':
                raise RuntimeError(
                    'Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.'
                )
            self.tic = TICApyemma(lag)
            metr = data

            pbar = tqdm(total=len(metr.simulations))
            for proj in _projectionGenerator(metr, _getNcpus()):
                for pro in proj:
                    if pro is None:
                        continue
                    if self.dimensions is None:
                        self.tic.partial_fit(pro[0])
                    else:  # Sub-select dimensions for fitting
                        self.tic.partial_fit(pro[0][:, self.dimensions])
                pbar.update(len(proj))
            pbar.close()
        else:  # In-memory TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError(
                    'Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.'
                )

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will use choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        from tqdm import tqdm
        if ndim is not None:
            self.tic.set_params(dim=ndim)

        keepdata = []
        keepdim = None
        keepdimdesc = None
        if isinstance(
                self.data, Metric
        ):  # Memory efficient TICA projecting trajectories on the fly
            proj = []
            refs = []
            fstep = None

            metr = self.data
            k = -1
            droppedsims = []
            pbar = tqdm(total=len(metr.simulations))
            for projecteddata in _projectionGenerator(metr, _getNcpus()):
                for pro in projecteddata:
                    k += 1
                    if pro is None:
                        droppedsims.append(k)
                        continue
                    if self.dimensions is not None:
                        numDimensions = pro[0].shape[1]
                        keepdim = np.setdiff1d(range(numDimensions),
                                               self.dimensions)
                        keepdata.append(pro[0][:, keepdim])
                        proj.append(
                            self.tic.transform(
                                pro[0][:, self.dimensions]).astype(np.float32)
                        )  # Sub-select dimensions for projecting
                    else:
                        proj.append(
                            self.tic.transform(pro[0]).astype(np.float32))
                    refs.append(pro[1])
                    if fstep is None:
                        fstep = pro[2]
                pbar.update(len(projecteddata))
            pbar.close()

            simlist = self.data.simulations
            simlist = np.delete(simlist, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
            if self.dimensions is not None:
                from htmd.projections.metric import _singleMolfile
                from htmd.molecule.molecule import Molecule
                (single, molfile) = _singleMolfile(metr.simulations)
                if single:
                    keepdimdesc = metr.getMapping(Molecule(molfile))
                    keepdimdesc = keepdimdesc.iloc[keepdim]
        else:
            if ndim is not None and self.data.numDimensions < ndim:
                raise RuntimeError(
                    'TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'
                    .format(self.data.numDimensions, ndim))

            if self.dimensions is not None:
                keepdim = np.setdiff1d(range(self.data.numDimensions),
                                       self.dimensions)
                keepdata = [x[:, keepdim] for x in self.data.dat]
                if self.data.description is not None:
                    keepdimdesc = self.data.description.iloc[keepdim]
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        # If TICA is done on a subset of dimensions, combine non-projected data with projected data
        if self.dimensions is not None:
            newproj = []
            for k, t in zip(keepdata, proj):
                newproj.append(np.hstack((k, t)))
            proj = newproj

        if ndim is None:
            ndim = self.tic.dimension()
            logger.info(
                'Kept {} dimension(s) to cover 95% of kinetic variance.'.
                format(ndim))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj),
                              simlist=simlist,
                              ref=ref,
                              fstep=fstep,
                              parent=parent)
        from pandas import DataFrame
        # TODO: Make this messy pandas creation cleaner. I'm sure I can append rows to DataFrame
        types = []
        indexes = []
        description = []
        for i in range(ndim):
            types += ['tica']
            indexes += [-1]
            description += ['TICA dimension {}'.format(i + 1)]
        datatica.description = DataFrame({
            'type': types,
            'atomIndexes': indexes,
            'description': description
        })

        if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
            datatica.description = keepdimdesc.append(datatica.description,
                                                      ignore_index=True)

        return datatica
Exemplo n.º 13
0
Arquivo: tica.py Projeto: jeiros/htmd
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData  object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.
    dimensions : list
        A list of dimensions of the original data on which to apply TICA. All other dimensions will stay unaltered.
        If None is given, it will apply on all dimensions.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> data = metr.project()
    >>> tica = TICA(data, 20)
    >>> datatica = tica.project(3)
    Alternatively you can pass a Metric object to TICA. Uses less memory but is slower.
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> slowtica = TICA(metr, 20)
    >>> datatica = slowtica.project(3)


    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag, units='frames', dimensions=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma

        self.data = data
        self.dimensions = dimensions

        if isinstance(data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            if units != 'frames':
                raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
            self.tic = TICApyemma(lag)
            metr = data

            p = ProgressBar(len(metr.simulations))
            for proj in _projectionGenerator(metr, _getNcpus()):
                for pro in proj:
                    if pro is None:
                        continue
                    if self.dimensions is None:
                        self.tic.partial_fit(pro[0])
                    else:  # Sub-select dimensions for fitting
                        self.tic.partial_fit(pro[0][:, self.dimensions])
                p.progress(len(proj))
            p.stop()
        else:  # In-memory TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will use choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            self.tic.set_params(dim=ndim)

        keepdata = []
        keepdim = None
        keepdimdesc = None
        if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            proj = []
            refs = []
            fstep = None

            metr = self.data
            p = ProgressBar(len(metr.simulations))
            k = -1
            droppedsims = []
            for projecteddata in _projectionGenerator(metr, _getNcpus()):
                for pro in projecteddata:
                    k += 1
                    if pro is None:
                        droppedsims.append(k)
                        continue
                    if self.dimensions is not None:
                        numDimensions = pro[0].shape[1]
                        keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                        keepdata.append(pro[0][:, keepdim])
                        proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))  # Sub-select dimensions for projecting
                    else:
                        proj.append(self.tic.transform(pro[0]).astype(np.float32))
                    refs.append(pro[1])
                    if fstep is None:
                        fstep = pro[2]
                p.progress(len(projecteddata))
            p.stop()

            simlist = self.data.simulations
            simlist = np.delete(simlist, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
            if self.dimensions is not None:
                from htmd.projections.metric import _singleMolfile
                from htmd.molecule.molecule import Molecule
                (single, molfile) = _singleMolfile(metr.simulations)
                if single:
                    keepdimdesc = metr.getMapping(Molecule(molfile))
                    keepdimdesc = keepdimdesc.iloc[keepdim]
        else:
            if ndim is not None and self.data.numDimensions < ndim:
                raise RuntimeError('TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'.format(self.data.numDimensions, ndim))

            if self.dimensions is not None:
                keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
                keepdata = [x[:, keepdim] for x in self.data.dat]
                if self.data.description is not None:
                    keepdimdesc = self.data.description.iloc[keepdim]
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        # If TICA is done on a subset of dimensions, combine non-projected data with projected data
        if self.dimensions is not None:
            newproj = []
            for k, t in zip(keepdata, proj):
                newproj.append(np.hstack((k, t)))
            proj = newproj

        if ndim is None:
            ndim = self.tic.dimension()
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)
        from pandas import DataFrame
        # TODO: Make this messy pandas creation cleaner. I'm sure I can append rows to DataFrame
        types = []
        indexes = []
        description = []
        for i in range(ndim):
            types += ['tica']
            indexes += [-1]
            description += ['TICA dimension {}'.format(i+1)]
        datatica.description = DataFrame({'type': types, 'atomIndexes': indexes, 'description': description})

        if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
            datatica.description = keepdimdesc.append(datatica.description, ignore_index=True)

        return datatica
Exemplo n.º 14
0
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData  object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions
    lag : int
        The correlation lagtime to use for TICA

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag):
        from pyemma.coordinates import tica
        # data.dat.tolist() might be better?
        self.data = data
        if isinstance(data, Metric):
            from pyemma.coordinates.transform.tica import TICA
            self.tic = TICA(lag)

            p = ProgressBar(len(data.simulations))
            for i in range(len(data.simulations)):
                # Fix for pyemma bug. Remove eventually:
                d, _, _ = data._projectSingle(i)
                if d is None or d.shape[0] < lag:
                    continue
                self.tic.partial_fit(d)
                p.progress()
            p.stop()
        else:
            self.tic = tica(data.dat.tolist(), lag=lag)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will use choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            # self.tic._dim = ndim  # Old way of doing it. Deprecated since pyEMMA 2.1
            self.tic.set_params(dim=ndim)  # Change to this in 2.1 pyEMMA version

        if isinstance(self.data, Metric):  # Doesn't project on correct number of dimensions
            proj = []
            refs = []
            fstep = None

            '''from htmd.config import _config
            from joblib import Parallel, delayed
            results = Parallel(n_jobs=_config['ncpus'], verbose=11)(
                delayed(_test)(self.data, self.tic, i) for i in range(len(self.data.simulations)))

            for i in range(len(results)):
                proj.append(results[i][0])
                refs.append(results[i][1])
                fstep.append(results[i][2])'''

            droppedsims = []
            p = ProgressBar(len(self.data.simulations))
            for i in range(len(self.data.simulations)):
                d, r, f = self.data._projectSingle(i)
                if d is None:
                    droppedsims.append(i)
                    continue
                if fstep is None:
                    fstep = f
                refs.append(r)
                proj.append(self.tic.transform(d))
                p.progress()
            p.stop()
            simlist = self.data.simulations
            simlist = np.delete(simlist, droppedsims)
            ref = np.array(refs, dtype=object)
            #fstep = 0
            parent = None
        else:
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        if ndim is None:
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(self.tic.dimension()))
        #print(np.shape(proj))


        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

        '''datatica = self.data.copy()
        #datatica.dat = self.data.deconcatenate(np.squeeze(proj))
        datatica.dat = np.array(proj, dtype=object)
        datatica.parent = self.data
        datatica.St = None
        datatica.Centers = None
        datatica.N = None
        datatica.K = None
        datatica._dataid = random.random()
        datatica._clusterid = None'''
        return datatica