def __init__(self, data, lag, units='frames'):
    """Fit a TICA estimator on in-memory data or on a lazily projected Metric.

    Parameters
    ----------
    data : MetricData or Metric
        Either already projected data (its ``dat`` trajectories are fitted
        in one call), or a ``Metric`` whose simulations are projected batch
        by batch and fed to the estimator with ``partial_fit``.
    lag : int
        The correlation lagtime, expressed in `units`.
    units : str
        The units of `lag`. Only 'frames' is accepted when `data` is a Metric.

    Raises
    ------
    RuntimeError
        If `data` is a Metric and `units` is not 'frames', or if the lag
        converts to 0 frames for in-memory data.
    """
    from pyemma.coordinates import tica
    self.data = data
    if isinstance(data, Metric):
        if units != 'frames':
            raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)
        p = ProgressBar(len(data.simulations))
        for proj in _projectionGenerator(data, _getNcpus()):
            for pro in proj:
                if pro is None:
                    # No projection available for this simulation: skip it
                    # instead of failing on ``pro[0]``.
                    continue
                self.tic.partial_fit(pro[0])
            # The bar advances by the whole batch, skipped entries included.
            p.progress(len(proj))
        p.stop()
    else:
        lag = unitconvert(units, 'frames', lag, data.fstep)
        if lag == 0:
            raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')
        self.tic = tica(data.dat.tolist(), lag=lag)
def testChunksizeResultsTica(self):
    """Check that the estimated mean and covariance do not depend on chunk size."""
    small_chunk = 40
    lagtime = 100
    np.random.seed(0)
    samples = np.random.randn(23000, 3)

    # First pass: reader used as constructed, chunksize left untouched.
    reader = DataInMemory(samples)
    reference = TICA(lag=lagtime, output_dimension=1)
    reference.data_producer = reader
    reference.parametrize()
    expected_cov = reference.cov.copy()
    expected_mean = reference.mu.copy()

    # Second pass: same data, but read in chunks of `small_chunk` frames.
    chunked_reader = DataInMemory(samples)
    chunked_reader.chunksize = small_chunk
    chunked = TICA(lag=lagtime, output_dimension=1)
    chunked.data_producer = chunked_reader
    chunked.parametrize()

    np.testing.assert_allclose(chunked.mu, expected_mean)
    np.testing.assert_allclose(chunked.cov, expected_cov)
def __init__(self, data, lag, units='frames', dimensions=None):
    """Fit a TICA estimator, optionally restricted to a subset of dimensions.

    Parameters
    ----------
    data : MetricData or Metric
        In-memory projected data, or a ``Metric`` that is projected on the
        fly and fitted incrementally.
    lag : int
        The correlation lagtime, expressed in `units`.
    units : str
        The units of `lag`. Must be 'frames' when `data` is a Metric.
    dimensions : list
        Columns of the data used for fitting. None means all columns.

    Raises
    ------
    RuntimeError
        If `data` is a Metric and `units` is not 'frames', or if the lag
        converts to 0 frames for in-memory data.
    """
    from pyemma.coordinates.transform.tica import TICA as TICApyemma

    self.data = data
    self.dimensions = dimensions

    if not isinstance(data, Metric):
        # In-memory TICA: every trajectory is already available in data.dat
        lag = unitconvert(units, 'frames', lag, data.fstep)
        if lag == 0:
            raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')
        self.tic = TICApyemma(lag)
        if self.dimensions is not None:
            # Fit only on the requested columns
            trajectories = [traj[:, self.dimensions].copy() for traj in data.dat]
        else:
            trajectories = data.dat.tolist()
        self.tic.fit(trajectories)
        return

    # Memory efficient TICA: project trajectories on the fly
    if units != 'frames':
        raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
    self.tic = TICApyemma(lag)
    progress = ProgressBar(len(data.simulations))
    for batch in _projectionGenerator(data, _getNcpus()):
        for item in batch:
            if item is None:
                continue
            frames = item[0]
            if self.dimensions is not None:
                # Fit only on the requested columns
                frames = frames[:, self.dimensions]
            self.tic.partial_fit(frames)
        progress.progress(len(batch))
    progress.stop()
def test_singular_zeros(self):
    """Fit on data with an all-zero column and check the result dtypes."""
    estimator = TICA(lag=1, output_dimension=1)

    # Two random columns followed by a third column that is identically zero.
    random_part = np.random.randn(100, 2)
    zero_column = np.zeros((100, 1))
    samples = np.hstack((random_part, zero_column))

    estimator.data_producer = DataInMemory(samples)
    estimator.parametrize()

    assert estimator.eigenvectors.dtype == np.float64
    assert estimator.eigenvalues.dtype == np.float64
def test_duplicated_data(self):
    """Fit on data with a repeated column and check the result dtypes."""
    estimator = TICA(lag=1, output_dimension=1)

    # Two random columns; the first one is appended again as a third column.
    random_part = np.random.randn(100, 2)
    repeated_column = random_part[:, :1]
    samples = np.hstack((random_part, repeated_column))

    estimator.data_producer = DataInMemory(samples)
    estimator.parametrize()

    assert estimator.eigenvectors.dtype == np.float64
    assert estimator.eigenvalues.dtype == np.float64
def __init__(self, data, lag, units="frames", dimensions=None, njobs=None):
    """Fit a TICA estimator, optionally on a subset of dimensions.

    Parameters
    ----------
    data : MetricData or Metric
        In-memory projected data, or a ``Metric`` that is projected on the
        fly and fitted incrementally.
    lag : int
        The correlation lagtime, expressed in `units`.
    units : str
        The units of `lag`. Must be "frames" when `data` is a Metric.
    dimensions : list
        Columns of the data used for fitting. None means all columns.
    njobs : int
        Value handed to the projection generator. When None, the value
        returned by ``htmd.util._getNjobs()`` is used.

    Raises
    ------
    RuntimeError
        If `data` is a Metric and `units` is not "frames", or if the lag
        converts to 0 frames for in-memory data.
    """
    from pyemma.coordinates.transform.tica import TICA as TICApyemma
    from tqdm import tqdm
    from htmd.util import _getNjobs

    self.data = data
    self.dimensions = dimensions
    if njobs is None:
        self.njobs = _getNjobs()
    else:
        self.njobs = njobs

    if not isinstance(data, Metric):
        # In-memory TICA: every trajectory is already available in data.dat
        lag = unitconvert(units, "frames", lag, data.fstep)
        if lag == 0:
            raise RuntimeError(
                "Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA."
            )
        self.tic = TICApyemma(lag)
        if self.dimensions is not None:
            # Fit only on the requested columns
            trajectories = [traj[:, self.dimensions].copy() for traj in data.dat]
        else:
            trajectories = data.dat.tolist()
        self.tic.fit(trajectories)
        return

    # Memory efficient TICA: project trajectories on the fly
    if units != "frames":
        raise RuntimeError(
            "Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues."
        )
    self.tic = TICApyemma(lag)
    bar = tqdm(total=len(data.simulations))
    for batch in _projectionGenerator(data, self.njobs):
        for item in batch:
            if item is None:
                continue
            frames = item[0]
            if self.dimensions is not None:
                # Fit only on the requested columns
                frames = frames[:, self.dimensions]
            self.tic.partial_fit(frames)
        bar.update(len(batch))
    bar.close()
def __init__(self, data, lag, units='frames'):
    """Fit a TICA estimator on in-memory data or on a Metric projected one
    simulation at a time.

    Parameters
    ----------
    data : MetricData or Metric
        In-memory projected data, or a ``Metric`` whose simulations are
        projected individually and fitted with ``partial_fit``.
    lag : int
        The correlation lagtime, expressed in `units`.
    units : str
        The units of `lag`. The lag is converted to frames using
        ``data.fstep`` for both kinds of input.
    """
    from pyemma.coordinates import tica
    self.data = data
    # Convert once for both branches. Previously the in-memory branch passed
    # the raw lag to pyemma and silently ignored `units`.
    lag = unitconvert(units, 'frames', lag, data.fstep)
    if isinstance(data, Metric):
        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)
        p = ProgressBar(len(data.simulations))
        for i in range(len(data.simulations)):
            d, _, _ = data._projectSingle(i)
            # Advance before the skip check so the bar reaches its total
            # even when simulations are skipped.
            p.progress()
            # Fix for pyemma bug. Remove eventually:
            # trajectories shorter than the lag are not passed to partial_fit.
            if d is None or d.shape[0] < lag:
                continue
            self.tic.partial_fit(d)
        p.stop()
    else:
        self.tic = tica(data.dat.tolist(), lag=lag)
def test(self):
    """Smoke test: fit TICA on random data and map that same data."""
    np.random.seed(0)
    estimator = TICA(lag=50, output_dimension=1)
    samples = np.random.randn(100, 10)
    estimator.data_producer = DataInMemory(samples)
    estimator.parametrize()
    # The mapped output is not inspected; the call only has to succeed.
    projected = estimator.map(samples)
def __init__(self, data, lag):
    """Fit a TICA estimator on in-memory data or on a Metric projected one
    simulation at a time.

    Parameters
    ----------
    data : MetricData or Metric
        In-memory projected data, or a ``Metric`` whose simulations are
        projected individually and fitted with ``partial_fit``.
    lag : int
        The correlation lagtime, in frames.
    """
    from pyemma.coordinates import tica
    self.data = data
    if isinstance(data, Metric):
        from pyemma.coordinates.transform.tica import TICA
        self.tic = TICA(lag)
        p = ProgressBar(len(data.simulations))
        for i in range(len(data.simulations)):
            d, _, _ = data._projectSingle(i)
            # Advance before the skip check so the bar reaches its total
            # even when simulations are skipped.
            p.progress()
            # Fix for pyemma bug. Remove eventually:
            # trajectories shorter than the lag are not passed to partial_fit.
            if d is None or d.shape[0] < lag:
                continue
            self.tic.partial_fit(d)
        p.stop()
    else:
        self.tic = tica(data.dat.tolist(), lag=lag)
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions.
        A ``Metric`` object is also accepted, in which case simulations are
        projected on the fly.
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag, units='frames'):
        from pyemma.coordinates import tica
        self.data = data
        if isinstance(data, Metric):
            if units != 'frames':
                raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
            from pyemma.coordinates.transform.tica import TICA as TICApyemma
            self.tic = TICApyemma(lag)
            p = ProgressBar(len(data.simulations))
            for proj in _projectionGenerator(data, _getNcpus()):
                for pro in proj:
                    if pro is None:
                        # project() treats None entries as dropped
                        # simulations; skip them here too instead of
                        # failing on ``pro[0]``.
                        continue
                    self.tic.partial_fit(pro[0])
                p.progress(len(proj))
            p.stop()
        else:
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')
            self.tic = tica(data.dat.tolist(), lag=lag)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            # set_params replaces direct assignment to tic._dim (pyEMMA >= 2.1)
            self.tic.set_params(dim=ndim)

        if isinstance(self.data, Metric):
            # NOTE (original author): this path was flagged as
            # "Doesn't project on correct number of dimensions".
            proj = []
            refs = []
            fstep = None

            metr = self.data
            p = ProgressBar(len(metr.simulations))
            k = -1  # running index of the simulation across all batches
            droppedsims = []
            for projecteddata in _projectionGenerator(metr, _getNcpus()):
                for pro in projecteddata:
                    k += 1
                    if pro is None:
                        droppedsims.append(k)
                        continue
                    proj.append(self.tic.transform(pro[0]))
                    refs.append(pro[1])
                    if fstep is None:
                        fstep = pro[2]
                p.progress(len(projecteddata))
            p.stop()

            # Remove the simulations which produced no projection
            simlist = np.delete(self.data.simulations, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
        else:
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        if ndim is None:
            # Resolve the automatically chosen dimensionality so that the
            # description table below can be built; range(None) would raise.
            ndim = self.tic.dimension()
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

        from pandas import DataFrame
        # One descriptive row per TICA dimension
        datatica.map = DataFrame({
            'type': ['tica'] * ndim,
            'indexes': [-1] * ndim,
            'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
        })
        return datatica
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions.
        A ``Metric`` object is also accepted, in which case simulations are
        projected one at a time.
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag, units='frames'):
        from pyemma.coordinates import tica
        self.data = data
        # Convert once for both branches. Previously the in-memory branch
        # passed the raw lag to pyemma and silently ignored `units`.
        lag = unitconvert(units, 'frames', lag, data.fstep)
        if isinstance(data, Metric):
            from pyemma.coordinates.transform.tica import TICA as TICApyemma
            self.tic = TICApyemma(lag)
            p = ProgressBar(len(data.simulations))
            for i in range(len(data.simulations)):
                d, _, _ = data._projectSingle(i)
                # Advance before the skip check so the bar reaches its total
                # even when simulations are skipped.
                p.progress()
                # Fix for pyemma bug. Remove eventually:
                # trajectories shorter than the lag are not passed to partial_fit.
                if d is None or d.shape[0] < lag:
                    continue
                self.tic.partial_fit(d)
            p.stop()
        else:
            self.tic = tica(data.dat.tolist(), lag=lag)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            # set_params replaces direct assignment to tic._dim (pyEMMA >= 2.1)
            self.tic.set_params(dim=ndim)

        if isinstance(self.data, Metric):
            # NOTE (original author): this path was flagged as
            # "Doesn't project on correct number of dimensions".
            proj = []
            refs = []
            fstep = None

            droppedsims = []
            p = ProgressBar(len(self.data.simulations))
            for i in range(len(self.data.simulations)):
                d, r, f = self.data._projectSingle(i)
                # Count dropped simulations as processed as well
                p.progress()
                if d is None:
                    droppedsims.append(i)
                    continue
                if fstep is None:
                    fstep = f
                refs.append(r)
                proj.append(self.tic.transform(d))
            p.stop()

            # Remove the simulations which produced no projection
            simlist = np.delete(self.data.simulations, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
        else:
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        if ndim is None:
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(self.tic.dimension()))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)
        return datatica
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.
    dimensions : list
        A list of dimensions of the original data on which to apply TICA. All other dimensions will stay unaltered.
        If None is given, it will apply on all dimensions.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> data = metr.project()
    >>> tica = TICA(data, 20)
    >>> datatica = tica.project(3)
    Alternatively you can pass a Metric object to TICA. Uses less memory but is slower.
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> slowtica = TICA(metr, 20)
    >>> datatica = slowtica.project(3)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag, units='frames', dimensions=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma
        from tqdm import tqdm

        self.data = data
        self.dimensions = dimensions

        if isinstance(data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            if units != 'frames':
                raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
            self.tic = TICApyemma(lag)
            metr = data

            pbar = tqdm(total=len(metr.simulations))
            for proj in _projectionGenerator(metr, _getNcpus()):
                for pro in proj:
                    if pro is None:
                        continue
                    if self.dimensions is None:
                        self.tic.partial_fit(pro[0])
                    else:  # Sub-select dimensions for fitting
                        self.tic.partial_fit(pro[0][:, self.dimensions])
                pbar.update(len(proj))
            pbar.close()
        else:  # In-memory TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Raises
        ------
        RuntimeError
            If the constructor was given in-memory data with fewer dimensions than `ndim`.

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        from tqdm import tqdm
        if ndim is not None:
            self.tic.set_params(dim=ndim)

        keepdata = []       # untouched columns of every trajectory (subset mode only)
        keepdim = None      # indexes of the columns TICA is not applied to
        keepdimdesc = None  # description rows belonging to the untouched columns
        if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            proj = []
            refs = []
            fstep = None

            metr = self.data
            k = -1  # running index of the simulation across all batches
            droppedsims = []
            pbar = tqdm(total=len(metr.simulations))
            for projecteddata in _projectionGenerator(metr, _getNcpus()):
                for pro in projecteddata:
                    k += 1
                    if pro is None:
                        droppedsims.append(k)
                        continue
                    if self.dimensions is not None:
                        numDimensions = pro[0].shape[1]
                        keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                        keepdata.append(pro[0][:, keepdim])
                        # Sub-select dimensions for projecting
                        proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                    else:
                        proj.append(self.tic.transform(pro[0]).astype(np.float32))
                    refs.append(pro[1])
                    if fstep is None:
                        fstep = pro[2]
                pbar.update(len(projecteddata))
            pbar.close()

            # Remove the simulations which produced no projection
            simlist = np.delete(self.data.simulations, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
            if self.dimensions is not None:
                from htmd.projections.metric import _singleMolfile
                from htmd.molecule.molecule import Molecule
                (single, molfile) = _singleMolfile(metr.simulations)
                if single:
                    keepdimdesc = metr.getMapping(Molecule(molfile))
                    keepdimdesc = keepdimdesc.iloc[keepdim]
        else:
            if ndim is not None and self.data.numDimensions < ndim:
                raise RuntimeError('TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'.format(self.data.numDimensions, ndim))

            if self.dimensions is not None:
                keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
                keepdata = [x[:, keepdim] for x in self.data.dat]
                if self.data.description is not None:
                    keepdimdesc = self.data.description.iloc[keepdim]
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        # If TICA is done on a subset of dimensions, combine non-projected data with projected data
        if self.dimensions is not None:
            proj = [np.hstack((kept, transformed)) for kept, transformed in zip(keepdata, proj)]

        if ndim is None:
            ndim = self.tic.dimension()
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

        from pandas import DataFrame, concat
        # One descriptive row per TICA dimension
        datatica.description = DataFrame({
            'type': ['tica'] * ndim,
            'atomIndexes': [-1] * ndim,
            'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
        })

        if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
            # DataFrame.append was removed in pandas 2.0; concat gives the
            # same row-wise stacking with a fresh index.
            datatica.description = concat([keepdimdesc, datatica.description], ignore_index=True)

        return datatica
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions
    lag : int
        The correlation lagtime to use for TICA
    units : str
        The units of lag. Can be 'frames' or any time unit given as a string.
    dimensions : list
        A list of dimensions of the original data on which to apply TICA. All other dimensions will stay unaltered.
        If None is given, it will apply on all dimensions.

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> data = metr.project()
    >>> tica = TICA(data, 20)
    >>> datatica = tica.project(3)
    Alternatively you can pass a Metric object to TICA. Uses less memory but is slower.
    >>> metr = Metric(sims)
    >>> metr.set(MetricSelfDistance('protein and name CA'))
    >>> slowtica = TICA(metr, 20)
    >>> datatica = slowtica.project(3)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag, units='frames', dimensions=None):
        from pyemma.coordinates.transform.tica import TICA as TICApyemma

        self.data = data
        self.dimensions = dimensions

        if isinstance(data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            if units != 'frames':
                raise RuntimeError('Cannot use delayed projection TICA with units other than frames for now. Report this to HTMD issues.')
            self.tic = TICApyemma(lag)
            metr = data

            p = ProgressBar(len(metr.simulations))
            for proj in _projectionGenerator(metr, _getNcpus()):
                for pro in proj:
                    if pro is None:
                        continue
                    if self.dimensions is None:
                        self.tic.partial_fit(pro[0])
                    else:  # Sub-select dimensions for fitting
                        self.tic.partial_fit(pro[0][:, self.dimensions])
                p.progress(len(proj))
            p.stop()
        else:  # In-memory TICA
            lag = unitconvert(units, 'frames', lag, data.fstep)
            if lag == 0:
                raise RuntimeError('Lag time conversion resulted in 0 frames. Please use a larger lag-time for TICA.')

            self.tic = TICApyemma(lag)
            if self.dimensions is None:
                datalist = data.dat.tolist()
            else:  # Sub-select dimensions for fitting
                datalist = [x[:, self.dimensions].copy() for x in data.dat]
            self.tic.fit(datalist)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Raises
        ------
        RuntimeError
            If the constructor was given in-memory data with fewer dimensions than `ndim`.

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            self.tic.set_params(dim=ndim)

        keepdata = []       # untouched columns of every trajectory (subset mode only)
        keepdim = None      # indexes of the columns TICA is not applied to
        keepdimdesc = None  # description rows belonging to the untouched columns
        if isinstance(self.data, Metric):  # Memory efficient TICA projecting trajectories on the fly
            proj = []
            refs = []
            fstep = None

            metr = self.data
            p = ProgressBar(len(metr.simulations))
            k = -1  # running index of the simulation across all batches
            droppedsims = []
            for projecteddata in _projectionGenerator(metr, _getNcpus()):
                for pro in projecteddata:
                    k += 1
                    if pro is None:
                        droppedsims.append(k)
                        continue
                    if self.dimensions is not None:
                        numDimensions = pro[0].shape[1]
                        keepdim = np.setdiff1d(range(numDimensions), self.dimensions)
                        keepdata.append(pro[0][:, keepdim])
                        # Sub-select dimensions for projecting
                        proj.append(self.tic.transform(pro[0][:, self.dimensions]).astype(np.float32))
                    else:
                        proj.append(self.tic.transform(pro[0]).astype(np.float32))
                    refs.append(pro[1])
                    if fstep is None:
                        fstep = pro[2]
                p.progress(len(projecteddata))
            p.stop()

            # Remove the simulations which produced no projection
            simlist = np.delete(self.data.simulations, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
            if self.dimensions is not None:
                from htmd.projections.metric import _singleMolfile
                from htmd.molecule.molecule import Molecule
                (single, molfile) = _singleMolfile(metr.simulations)
                if single:
                    keepdimdesc = metr.getMapping(Molecule(molfile))
                    keepdimdesc = keepdimdesc.iloc[keepdim]
        else:
            if ndim is not None and self.data.numDimensions < ndim:
                raise RuntimeError('TICA cannot increase the dimensionality of your data. Your data has {} dimensions and you requested {} TICA dimensions'.format(self.data.numDimensions, ndim))

            if self.dimensions is not None:
                keepdim = np.setdiff1d(range(self.data.numDimensions), self.dimensions)
                keepdata = [x[:, keepdim] for x in self.data.dat]
                if self.data.description is not None:
                    keepdimdesc = self.data.description.iloc[keepdim]
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        # If TICA is done on a subset of dimensions, combine non-projected data with projected data
        if self.dimensions is not None:
            proj = [np.hstack((kept, transformed)) for kept, transformed in zip(keepdata, proj)]

        if ndim is None:
            ndim = self.tic.dimension()
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(ndim))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj), simlist=simlist, ref=ref, fstep=fstep, parent=parent)

        from pandas import DataFrame, concat
        # One descriptive row per TICA dimension
        datatica.description = DataFrame({
            'type': ['tica'] * ndim,
            'atomIndexes': [-1] * ndim,
            'description': ['TICA dimension {}'.format(i + 1) for i in range(ndim)]
        })

        if self.dimensions is not None and keepdimdesc is not None:  # If TICA is done on a subset of dims
            # DataFrame.append was removed in pandas 2.0; concat gives the
            # same row-wise stacking with a fresh index.
            datatica.description = concat([keepdimdesc, datatica.description], ignore_index=True)

        return datatica
class TICA(object):
    """ Class for calculating the TICA projections of a MetricData object

    Time-based Independent Component Analysis
    Projects your data on the slowest coordinates identified for a
    given lagtime.

    Parameters
    ----------
    data : :class:`MetricData <htmd.metricdata.MetricData>` object
        The object whose data we wish to project onto the top TICA dimensions.
        A ``Metric`` object is also accepted, in which case simulations are
        projected one at a time.
    lag : int
        The correlation lagtime to use for TICA

    Example
    -------
    >>> from htmd.projections.tica import TICA
    >>> tica = TICA(data,20)

    References
    ----------
    Perez-Hernandez, G. and Paul, F. and Giorgino, T. and de Fabritiis, G.
    and Noe, F. (2013) Identification of slow molecular order parameters
    for Markov model construction. J. Chem. Phys., 139 . 015102.
    """

    def __init__(self, data, lag):
        from pyemma.coordinates import tica
        self.data = data
        if isinstance(data, Metric):
            from pyemma.coordinates.transform.tica import TICA as TICApyemma
            self.tic = TICApyemma(lag)
            p = ProgressBar(len(data.simulations))
            for i in range(len(data.simulations)):
                d, _, _ = data._projectSingle(i)
                # Advance before the skip check so the bar reaches its total
                # even when simulations are skipped.
                p.progress()
                # Fix for pyemma bug. Remove eventually:
                # trajectories shorter than the lag are not passed to partial_fit.
                if d is None or d.shape[0] < lag:
                    continue
                self.tic.partial_fit(d)
            p.stop()
        else:
            self.tic = tica(data.dat.tolist(), lag=lag)

    def project(self, ndim=None):
        """ Projects the data object given to the constructor onto the top `ndim` TICA dimensions

        Parameters
        ----------
        ndim : int
            The number of TICA dimensions we want to project the data on. If None is given it will choose a number
            of dimensions to cover 95% of the kinetic variance.

        Returns
        -------
        dataTica : :class:`MetricData <htmd.metricdata.MetricData>` object
            A new :class:`MetricData <htmd.metricdata.MetricData>` object containing the TICA projected data

        Example
        -------
        >>> from htmd.projections.tica import TICA
        >>> tica = TICA(data,20)
        >>> dataTica = tica.project(5)
        """
        if ndim is not None:
            # set_params replaces direct assignment to tic._dim (pyEMMA >= 2.1)
            self.tic.set_params(dim=ndim)

        if isinstance(self.data, Metric):
            # NOTE (original author): this path was flagged as
            # "Doesn't project on correct number of dimensions".
            proj = []
            refs = []
            fstep = None

            droppedsims = []
            p = ProgressBar(len(self.data.simulations))
            for i in range(len(self.data.simulations)):
                d, r, f = self.data._projectSingle(i)
                # Count dropped simulations as processed as well
                p.progress()
                if d is None:
                    droppedsims.append(i)
                    continue
                if fstep is None:
                    fstep = f
                refs.append(r)
                proj.append(self.tic.transform(d))
            p.stop()

            # Remove the simulations which produced no projection
            simlist = np.delete(self.data.simulations, droppedsims)
            ref = np.array(refs, dtype=object)
            parent = None
        else:
            proj = self.tic.get_output()
            simlist = self.data.simlist
            ref = self.data.ref
            fstep = self.data.fstep
            parent = self.data

        if ndim is None:
            logger.info('Kept {} dimension(s) to cover 95% of kinetic variance.'.format(self.tic.dimension()))

        from htmd.metricdata import MetricData
        datatica = MetricData(dat=np.array(proj, dtype=object), simlist=simlist, ref=ref, fstep=fstep, parent=parent)
        return datatica