Example #1
    def _estimate(self, iterable, **kw):
        partial_fit = 'partial' in kw
        it = iterable.iterator(return_trajindex=False,
                               chunk=self.chunksize,
                               stride=self.stride,
                               skip=self.skip)
        from pyemma._base.progress import ProgressReporter
        pg = ProgressReporter()
        pg.register(it.n_chunks, "calc mean+cov", 0)

        with it, pg.context():
            self._init_covar(partial_fit, it.n_chunks)

            for chunk in it:
                self._covar.add(chunk)
                pg.update(1)

        self.cov = self._covar.cov_XX(bessel=True)
        self.mu = self._covar.mean_X()

        self._model.update_model_params(mean=self._covar.mean_X())
        if not partial_fit:
            self._diagonalize()

        return self._model
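The pattern in this example reduces to a few lines. Below is a minimal, self-contained sketch (assuming pyemma is installed); a plain numpy loop stands in for the data-source iterator, and the ProgressReporter calls mirror the ones above.

import numpy as np
from pyemma._base.progress import ProgressReporter

data = np.random.random((1000, 3))
chunksize = 100
n_chunks = int(np.ceil(len(data) / chunksize))

pg = ProgressReporter()
pg.register(n_chunks, "calc mean+cov", 0)   # amount of work, description, stage
with pg.context():                          # cleans up the stage on exit, even on errors
    mean = np.zeros(data.shape[1])
    for start in range(0, len(data), chunksize):
        chunk = data[start:start + chunksize]
        mean += chunk.sum(axis=0)           # accumulate per chunk
        pg.update(1)                        # one unit of work per processed chunk
mean /= len(data)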
Example #2
 def test_ctx4(self):
     pg = ProgressReporter()
     pg.register(100, 'test')
     pg.register(40, 'test2')
     try:
         with pg.context():
             pg.update(50, stage='all')
             raise Exception()
     except Exception:
         assert pg.num_registered == 0
Example #3
 def test_ctx2(self):
     pg = ProgressReporter()
     assert pg.show_progress
     pg.register(100, stage='test')
     pg.register(40, stage='test2')
     try:
         with pg.context(stage='test'):
             pg.update(50, stage='test')
             raise Exception()
     except Exception:
         assert pg.num_registered == 1
         assert 'test2' in pg.registered_stages
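A hedged, stand-alone sketch of the stage-scoped cleanup shown by the two tests above, runnable outside the unittest class (assuming pyemma is installed; the stage names are made up).

from pyemma._base.progress import ProgressReporter

pg = ProgressReporter()
pg.register(100, stage='io')
pg.register(40, stage='post')
try:
    with pg.context(stage='io'):       # only the 'io' stage is tied to this context
        pg.update(50, stage='io')
        raise RuntimeError("simulated failure")
except RuntimeError:
    print(pg.num_registered)               # expected: 1, only 'io' was unregistered
    print('post' in pg.registered_stages)  # expected: True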
Example #4
    def filenames(self, filename_list):

        if isinstance(filename_list, str):
            filename_list = [filename_list]

        uniq = set(filename_list)
        if len(uniq) != len(filename_list):
            self.logger.warning("duplicate files/arrays detected")
            filename_list = list(uniq)

        from pyemma.coordinates.data.data_in_memory import DataInMemory

        if self._is_reader:
            if isinstance(self, DataInMemory):
                import warnings
                warnings.warn('filenames are not being used for DataInMemory')
                return

            self._ntraj = len(filename_list)
            if self._ntraj == 0:
                raise ValueError("empty file list")

            # validate files
            for f in filename_list:
                try:
                    stat = os.stat(f)
                except EnvironmentError:
                    self.logger.exception('Error during access of file "%s"' %
                                          f)
                    raise ValueError('could not read file "%s"' % f)

                if not os.path.isfile(f):  # can be true for symlinks to directories
                    raise ValueError('"%s" is not a valid file' % f)

                if stat.st_size == 0:
                    raise ValueError('file "%s" is empty' % f)

            # number of trajectories/data sets
            self._filenames = filename_list
            # determine len and dim via cache lookup,
            lengths = []
            offsets = []
            ndims = []
            # avoid cyclic imports
            from pyemma.coordinates.data.util.traj_info_cache import TrajectoryInfoCache
            from pyemma._base.progress import ProgressReporter
            pg = ProgressReporter()
            pg.register(len(filename_list), 'Obtaining file info')
            with pg.context():
                for filename in filename_list:
                    if config.use_trajectory_lengths_cache:
                        info = TrajectoryInfoCache.instance()[filename, self]
                    else:
                        info = self._get_traj_info(filename)
                    # nested data set support.
                    if hasattr(info, 'children'):
                        lengths.append(info.length)
                        offsets.append(info.offsets)
                        ndims.append(info.ndim)
                        for c in info.children:
                            lengths.append(c.length)
                            offsets.append(c.offsets)
                            ndims.append(c.ndim)
                    else:
                        lengths.append(info.length)
                        offsets.append(info.offsets)
                        ndims.append(info.ndim)
                    if len(filename_list) > 3:
                        pg.update(1)

            # ensure all trajs have same dim
            if not np.unique(ndims).size == 1:
                # group files by their dimensions to give user indicator
                ndims = np.array(ndims)
                filename_list = np.asarray(filename_list)
                sort_inds = np.argsort(ndims)
                import itertools, operator
                res = {}
                for dim, files in itertools.groupby(
                        zip(ndims[sort_inds], filename_list[sort_inds]),
                        operator.itemgetter(0)):
                    res[dim] = list(f[1] for f in files)

                raise ValueError(
                    "Input data has different dimensions ({dims})!"
                    " Files grouped by dimensions: {groups}".format(
                        dims=res.keys(), groups=res))

            self._ndim = ndims[0]
            self._lengths = lengths
            self._offsets = offsets

        else:
            # propagate this until we finally have a reader
            self.data_producer.filenames = filename_list
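The file-validation part of this example can be used on its own. Below is a minimal sketch of the same pattern (register one unit of work per file, tick inside the context); the helper name validate_files is hypothetical and pyemma is assumed to be installed.

import os
from pyemma._base.progress import ProgressReporter

def validate_files(filename_list):
    pg = ProgressReporter()
    pg.register(len(filename_list), 'Obtaining file info')
    with pg.context():
        for f in filename_list:
            st = os.stat(f)                       # raises OSError if inaccessible
            if not os.path.isfile(f):             # e.g. a symlink to a directory
                raise ValueError('"%s" is not a valid file' % f)
            if st.st_size == 0:
                raise ValueError('file "%s" is empty' % f)
            pg.update(1)                          # one tick per validated file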
Example #5
    def write_to_csv(self,
                     filename=None,
                     extension='.dat',
                     overwrite=False,
                     stride=1,
                     chunksize=None,
                     **kw):
        """ write all data to csv with numpy.savetxt

        Parameters
        ----------
        filename : str, optional
            filename string, which may contain the placeholders {itraj} and {stride}:

            * {itraj} will be replaced by the trajectory index
            * {stride} will be replaced by the stride argument of this method

            If filename is not given, the filenames are obtained from the data
            source of this iterator.
        extension : str, optional, default='.dat'
            filename extension of created files
        overwrite : bool, optional, default=False
            shall existing files be overwritten? If False and a file already
            exists, this method raises.
        stride : int
            only write every n'th frame
        chunksize: int, default=None
            how many frames to process at once
        kw : dict, optional
            named arguments passed into numpy.savetxt (e.g. header, delimiter, fmt)

        Example
        -------
        Assume you want to save features calculated by some FeatureReader to ASCII:

        >>> import numpy as np, pyemma
        >>> import os
        >>> from pyemma.util.files import TemporaryDirectory
        >>> from pyemma.util.contexts import settings
        >>> data = [np.random.random((10,3))] * 3
        >>> reader = pyemma.coordinates.source(data)
        >>> filename = "distances_{itraj}.dat"
        >>> with TemporaryDirectory() as td, settings(show_progress_bars=False):
        ...    out = os.path.join(td, filename)
        ...    reader.write_to_csv(out, header='', delimiter=';')
        ...    print(sorted(os.listdir(td)))
        ['distances_0.dat', 'distances_1.dat', 'distances_2.dat']
        """
        import os
        if not filename:
            if not hasattr(self, 'filenames'):
                raise RuntimeError("could not determine filenames")
            filenames = []
            for f in self.filenames:
                base, _ = os.path.splitext(f)
                filenames.append(base + extension)
        elif isinstance(filename, str):
            filename = filename.replace('{stride}', str(stride))
            filenames = [
                filename.replace('{itraj}', str(itraj))
                for itraj in range(self.number_of_trajectories())
            ]
        else:
            raise TypeError("filename should be str or None")
        self.logger.debug("write_to_csv, filenames=%s" % filenames)
        # check files before starting to write
        import errno
        for f in filenames:
            try:
                os.stat(f)
                # file exists: raise an OSError carrying EEXIST so the handler below can decide
                raise OSError(errno.EEXIST, os.strerror(errno.EEXIST), f)
            except OSError as e:
                if e.errno == errno.EEXIST:
                    if overwrite:
                        continue
                elif e.errno == errno.ENOENT:
                    continue
                raise
        f = None
        from pyemma._base.progress import ProgressReporter
        pg = ProgressReporter()
        it = self.iterator(stride, chunk=chunksize, return_trajindex=False)
        pg.register(it.n_chunks, "saving to csv")
        with it, pg.context():
            oldtraj = -1
            for X in it:
                if oldtraj != it.current_trajindex:
                    if f is not None:
                        f.close()
                    fn = filenames[it.current_trajindex]
                    self.logger.debug("opening file %s for writing csv." % fn)
                    f = open(fn, 'wb')
                    oldtraj = it.current_trajindex
                np.savetxt(f, X, **kw)
                f.flush()
                pg.update(1, 0)
        if f is not None:
            f.close()
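A quick, hedged illustration of how the {itraj} and {stride} placeholders are expanded by the method above; the numbers and the filename pattern are made up.

stride = 2
n_traj = 3
filename = "distances_{itraj}_stride_{stride}.dat"
filename = filename.replace('{stride}', str(stride))
filenames = [filename.replace('{itraj}', str(itraj)) for itraj in range(n_traj)]
print(filenames)
# ['distances_0_stride_2.dat', 'distances_1_stride_2.dat', 'distances_2_stride_2.dat']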
Example #6
    def write_to_hdf5(self,
                      filename,
                      group='/',
                      data_set_prefix='',
                      overwrite=False,
                      stride=1,
                      chunksize=None,
                      h5_opt=None):
        """ writes all data of this Iterable to a given HDF5 file.
        This is equivalent to writing the result of :func:`pyemma.coordinates.data._base.DataSource.get_output` to a file.

        Parameters
        ----------
        filename: str
            file name of output HDF5 file
        group: str, default='/'
            write all trajectories to this HDF5 group. The group must not already
            exist in the file, unless it is the root group '/' or overwrite=True.
        data_set_prefix: str, default=''
            data set name prefix; it will be suffixed with the index of the trajectory.
        overwrite: bool, default=False
            if group and data sets already exist, shall we overwrite data?
        stride: int, default=1
            stride argument to iterator
        chunksize: int, default=None
            how many frames to process at once
        h5_opt: dict
            optional parameters for h5py.create_dataset

        Notes
        -----
        You can pass the following via h5_opt to enable compression/filters/shuffling etc:

        chunks
            (Tuple) Chunk shape, or True to enable auto-chunking.
        maxshape
            (Tuple) Make the dataset resizable up to this shape.  Use None for
            axes you want to be unlimited.
        compression
            (String or int) Compression strategy.  Legal values are 'gzip',
            'szip', 'lzf'.  If an integer in range(10), this indicates gzip
            compression level. Otherwise, an integer indicates the number of a
            dynamically loaded compression filter.
        compression_opts
            Compression settings.  This is an integer for gzip, 2-tuple for
            szip, etc. If specifying a dynamically loaded compression filter
            number, this must be a tuple of values.
        scaleoffset
            (Integer) Enable scale/offset filter for (usually) lossy
            compression of integer or floating-point data. For integer
            data, the value of scaleoffset is the number of bits to
            retain (pass 0 to let HDF5 determine the minimum number of
            bits necessary for lossless compression). For floating point
            data, scaleoffset is the number of digits after the decimal
            place to retain; stored values thus have absolute error
            less than 0.5*10**(-scaleoffset).
        shuffle
            (T/F) Enable shuffle filter. Only effective in combination with chunks.
        fletcher32
            (T/F) Enable fletcher32 error detection. Not permitted in
            conjunction with the scale/offset filter.
        fillvalue
            (Scalar) Use this value for uninitialized parts of the dataset.
        track_times
            (T/F) Enable dataset creation timestamps.
        """
        if h5_opt is None:
            h5_opt = {}
        import h5py
        from pyemma._base.progress import ProgressReporter
        pg = ProgressReporter()
        it = self.iterator(stride=stride,
                           chunk=chunksize,
                           return_trajindex=True)
        pg.register(it.n_chunks, 'writing output')
        with h5py.File(filename, mode='a') as f, it, pg.context():
            if group not in f:
                g = f.create_group(group)
            elif group == '/':  # root always exists.
                g = f[group]
            elif group in f and overwrite:
                self.logger.info('overwriting group "{}"'.format(group))
                del f[group]
                g = f.create_group(group)
            else:
                raise ValueError(
                    'Given group "{}" already exists. Choose another one.'.
                    format(group))

            # check output data sets
            data_sets = {}
            for itraj in np.arange(self.ntraj):
                template = '{prefix}_{index}' if data_set_prefix else '{index}'
                ds_name = template.format(prefix=data_set_prefix,
                                          index='{:04d}'.format(itraj))
                # the group can be reused (e.g. it was empty before); check whether
                # we would overwrite an existing data set
                if ds_name in g and not overwrite:
                    raise ValueError(
                        'Refusing to overwrite data in group "{}".'.format(group))
                # require_dataset returns an existing data set of matching
                # shape/dtype or creates a new one
                data_sets[itraj] = g.require_dataset(
                    ds_name,
                    shape=(self.trajectory_length(itraj=itraj, stride=stride),
                           self.ndim),
                    dtype=self.output_type(),
                    **h5_opt)
            for itraj, X in it:
                ds = data_sets[itraj]
                ds[it.pos:it.pos + len(X)] = X
                pg.update(1)
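A hedged usage sketch for write_to_hdf5: writing a small in-memory source with gzip compression passed through h5_opt (the keys are taken from the notes above). It assumes pyemma and h5py are installed; the group and prefix names are made up.

import os
import numpy as np
import pyemma
from pyemma.util.files import TemporaryDirectory

data = [np.random.random((10, 3))] * 3
reader = pyemma.coordinates.source(data)
h5_opt = dict(compression='gzip', compression_opts=4, shuffle=True, chunks=True)
with TemporaryDirectory() as td:
    out = os.path.join(td, 'output.h5')
    reader.write_to_hdf5(out, group='/features', data_set_prefix='traj',
                         h5_opt=h5_opt)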
Example #7
    def get_output(self,
                   dimensions=slice(0, None),
                   stride=1,
                   skip=0,
                   chunk=None):
        """Maps all input data of this transformer and returns it as an array or list of arrays

        Parameters
        ----------
        dimensions : list-like of indexes or slice, default=all
           indices of the dimensions you would like to keep.
        stride : int, default=1
           only take every n'th frame.
        skip : int, default=0
            initially skip n frames of each file.
        chunk: int, default=None
            How many frames to process at once. If not given, the chunk size
            is obtained from the source.

        Returns
        -------
        output : list of ndarray(T_i, d)
           the mapped data, where T_i is the number of time steps of input trajectory i,
           or floor(T_i / stride) if stride > 1. d is the output dimension of this
           transformer. If the input consists of a list of trajectories, the output
           will be a corresponding list of trajectories.

        """
        if isinstance(dimensions, int):
            ndim = 1
            dimensions = slice(dimensions, dimensions + 1)
        elif isinstance(dimensions, (list, np.ndarray, tuple, slice)):
            if hasattr(dimensions, 'ndim') and dimensions.ndim > 1:
                raise ValueError(
                    'dimension indices can\'t have more than one dimension')
            ndim = len(np.zeros(self.ndim)[dimensions])
        else:
            raise ValueError('unsupported type (%s) of "dimensions"' %
                             type(dimensions))

        assert ndim > 0, "ndim was zero in %s" % self.__class__.__name__

        if chunk is None:
            chunk = self.chunksize

        # create iterator
        if self.in_memory and not self._mapping_to_mem_active:
            from pyemma.coordinates.data.data_in_memory import DataInMemory
            assert self._Y is not None
            it = DataInMemory(self._Y)._create_iterator(skip=skip,
                                                        chunk=chunk,
                                                        stride=stride,
                                                        return_trajindex=True)
        else:
            it = self._create_iterator(skip=skip,
                                       chunk=chunk,
                                       stride=stride,
                                       return_trajindex=True)

        with it:
            # allocate memory
            try:
                from pyemma import config
                if config.coordinates_check_output:
                    trajs = [
                        np.full((l, ndim), np.nan, dtype=self.output_type())
                        for l in it.trajectory_lengths()
                    ]
                else:
                    # TODO: avoid having a copy here, if Y is already filled
                    trajs = [
                        np.empty((l, ndim), dtype=self.output_type())
                        for l in it.trajectory_lengths()
                    ]
            except MemoryError:
                self.logger.exception(
                    "Could not allocate enough memory to map all data."
                    " Consider using a larger stride.")
                return

            if self._logger_is_active(self._loglevel_DEBUG):
                self.logger.debug("get_output(): dimensions=%s" %
                                  str(dimensions))
                self.logger.debug(
                    "get_output(): created output trajs with shapes: %s" %
                    [x.shape for x in trajs])
                self.logger.debug("nchunks :%s, chunksize=%s" %
                                  (it.n_chunks, it.chunksize))
            # fetch data
            from pyemma._base.progress import ProgressReporter
            pg = ProgressReporter()
            pg.register(it.n_chunks,
                        description='getting output of %s' %
                        self.__class__.__name__)
            with pg.context(), it:
                for itraj, chunk in it:
                    i = slice(it.pos, it.pos + len(chunk))
                    assert i.stop - i.start > 0
                    trajs[itraj][i, :] = chunk[:, dimensions]
                    pg.update(1)

        if config.coordinates_check_output:
            for i, t in enumerate(trajs):
                finite = self._chunk_finite(t)
                if not np.all(finite):
                    # determine position
                    frames = np.where(np.logical_not(finite))
                    if not len(frames):
                        raise RuntimeError(
                            'nothing got assigned for traj {}'.format(i))
                    raise RuntimeError(
                        'unassigned sections in traj {i} in range [{frames}]'.
                        format(frames=frames, i=i))

        return trajs
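A hedged usage sketch for get_output: keep only the first two dimensions and take every second frame of random in-memory data (assuming pyemma is installed).

import numpy as np
import pyemma

data = [np.random.random((100, 5)) for _ in range(3)]
reader = pyemma.coordinates.source(data)
out = reader.get_output(dimensions=[0, 1], stride=2)
print(len(out), out[0].shape)   # expected: 3 trajectories, each of shape (50, 2)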
Example #8
    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True):
        r""" Counts transitions at given lag time

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2\tau), ..., (((T/\tau)-1)\tau \rightarrow T)

        show_progress: bool, default=True
            show the progress for the expensive effective count mode computation.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if count_mode == 'sliding':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
        elif count_mode == 'sample':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
        elif count_mode == 'effective':
            from pyemma.util.reflection import getargspec_no_self
            argspec = getargspec_no_self(msmest.effective_count_matrix)
            kw = {}
            if show_progress and 'callback' in argspec.args:
                from pyemma._base.progress import ProgressReporter
                from pyemma._base.parallel import get_n_jobs

                pg = ProgressReporter()
                # this is a fast operation
                C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                pg.register(C_temp.nnz, 'compute statistical inefficiencies')
                del C_temp
                callback = lambda: pg.update(1)
                kw['callback'] = callback
                kw['n_jobs'] = get_n_jobs()

            self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
        else:
            raise ValueError('Count mode ' + count_mode + ' is unknown.')

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        # Compute reversibly connected sets
        if self._mincount_connectivity > 0:
            self._connected_sets = \
                self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
        else:
            self._connected_sets = msmest.connected_sets(self._C)

        # set sizes and count matrices on reversibly connected sets
        self._connected_set_sizes = np.zeros((len(self._connected_sets)))
        self._C_sub = np.empty((len(self._connected_sets)), dtype=object)
        for i in range(len(self._connected_sets)):
            # set size
            self._connected_set_sizes[i] = len(self._connected_sets[i])
            # submatrix
            # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True
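The 'effective' branch reports progress through a callback that is handed to the counting routine. The sketch below isolates that callback pattern with a stand-in worker (expensive_work is hypothetical, not the msmtools routine); pyemma is assumed to be installed.

from pyemma._base.progress import ProgressReporter

def expensive_work(n_items, callback=None):
    results = []
    for i in range(n_items):
        results.append(i * i)      # stand-in for the per-item computation
        if callback is not None:
            callback()             # one tick per finished item
    return results

n_items = 250
pg = ProgressReporter()
pg.register(n_items, 'compute statistical inefficiencies')
with pg.context():
    expensive_work(n_items, callback=lambda: pg.update(1))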
Example #9
    def _estimate(self, iterable, partial_fit=False):
        indim = iterable.dimension()
        if not indim:
            raise ValueError("zero dimension from data source!")

        if not any(
                iterable.trajectory_lengths(stride=self.stride,
                                            skip=self.lag + self.skip) > 0):
            if partial_fit:
                self.logger.warning(
                    "Could not use data passed to partial_fit(), "
                    "because no single data set [longest=%i] is longer than lag+skip [%i]",
                    max(
                        iterable.trajectory_lengths(self.stride,
                                                    skip=self.skip)),
                    self.lag + self.skip)
                return self
            else:
                raise ValueError(
                    "None single dataset [longest=%i] is longer than"
                    " lag+skip [%i]." % (max(
                        iterable.trajectory_lengths(
                            self.stride,
                            skip=self.skip)), self.lag + self.skip))

        self.logger.debug(
            "will use %s total frames for %s",
            iterable.trajectory_lengths(self.stride, skip=self.skip),
            self.name)

        chunksize = 0 if partial_fit else iterable.chunksize
        it = iterable.iterator(lag=self.lag,
                               return_trajindex=False,
                               stride=self.stride,
                               skip=self.skip,
                               chunk=chunksize)
        # iterator over input weights
        if hasattr(self.weights, 'iterator'):
            if hasattr(self.weights, '_transform_array'):
                self.weights.data_producer = iterable
            it_weights = self.weights.iterator(lag=0,
                                               return_trajindex=False,
                                               stride=self.stride,
                                               skip=self.skip,
                                               chunk=chunksize)
            if it_weights.number_of_trajectories(
            ) != iterable.number_of_trajectories():
                raise ValueError(
                    "number of weight arrays did not match number of input data sets. {} vs. {}"
                    .format(it_weights.number_of_trajectories(),
                            iterable.number_of_trajectories()))
        else:
            # if we only have a scalar, repeat it.
            import itertools
            it_weights = itertools.repeat(self.weights)

        # TODO: we could possibly optimize the case lag>0 and c0t=False using skip.
        # Assess how much iterator hassle this would be.
        # self.skipped = 0
        from pyemma._base.progress import ProgressReporter
        pg = ProgressReporter()
        pg.register(it.n_chunks, 'calculate covariances', stage=0)
        with it, pg.context(stage=0):
            self._init_covar(partial_fit, it.n_chunks)
            for data, weight in zip(it, it_weights):
                if self.lag != 0:
                    X, Y = data
                else:
                    X, Y = data, None

                if weight is not None:
                    if isinstance(weight, np.ndarray):
                        weight = weight.squeeze()[:len(X)]
                        # TODO: if the weight is exactly zero it makes not sense to add the chunk to running moments.
                        # however doing so, leads to wrong results...
                        # if np.all(np.abs(weight) < np.finfo(np.float).eps):
                        #     #print("skip")
                        #     self.skipped += len(X)
                        #     continue
                if self.remove_constant_mean is not None:
                    X = X - self.remove_constant_mean[np.newaxis, :]
                    if Y is not None:
                        Y = Y - self.remove_constant_mean[np.newaxis, :]

                try:
                    self._rc.add(X, Y, weights=weight)
                except MemoryError:
                    raise MemoryError(
                        'Covariance matrix does not fit into memory. '
                        'Input is too high-dimensional ({} dimensions). '.
                        format(X.shape[1]))
                pg.update(1, stage=0)

        if partial_fit:
            if '_rc' not in self.__serialize_fields:
                self.__serialize_fields.append('_rc')
        else:
            if '_rc' in self.__serialize_fields:
                self.__serialize_fields.remove('_rc')
        return self
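A hedged sketch of the weight handling above: a scalar weight is broadcast to every chunk via itertools.repeat, while an object exposing .iterator() would be iterated chunk by chunk. Plain numpy arrays stand in for the data-source chunks.

import itertools
import numpy as np

chunks = [np.random.random((50, 3)) for _ in range(4)]
weights = 0.5                               # scalar weight
if hasattr(weights, 'iterator'):
    it_weights = weights.iterator()         # would be a pyemma data source
else:
    it_weights = itertools.repeat(weights)  # repeat the scalar for every chunk

weighted_sum = np.zeros(3)
total_weight = 0.0
for X, w in zip(chunks, it_weights):
    weighted_sum += w * X.sum(axis=0)
    total_weight += w * len(X)
weighted_mean = weighted_sum / total_weight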
Example #10
 def test_below_threshold(self):
     # should not raise
     pg = ProgressReporter()
     pg.register(2)
     pg.update(1)
     pg.set_description('dummy')