Example #1
def load(filename, chunks=10, **kwargs):
    """ A loader that will mimic mdtraj.Trajectory.load, but construct a
    dasktraj.Trajectory with a dask.array as xyz
    """

    top = kwargs.pop('top', None)
    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)

    length = len(open(filename))
    n_chunks = int(length / chunks)
    frames_left = length % chunks
    if frames_left != 0:
        n_chunks += 1
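    # e.g. a 105-frame file with chunks=10 gives 10 full chunks plus one
    # 5-frame remainder, so n_chunks ends up as 11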
    # TODO this needs to be closed at some point
    data = load_chunks(filename, extension, chunks, range(n_chunks), **kwargs)

    # TODO: use this to construct unitcells
    # Pop out irrelevant info
    uv = data.pop('unitcell_vectors')
    traj = Trajectory(topology=topology, delayed_objects=data, **data)
    if uv is not None:
        traj.unitcell_vectors = uv
    return traj
Example #2
def load(filename, chunks=10, **kwargs):
    """
    A loader that will mimic :py:func:`mdtraj.Trajectory.load()`, but
    construct a :py:class:`dasktraj.Trajectory` with a :py:class:`dask.array`
    as xyz, time, and unitcell properties.

    Parameters
    ----------
    filename : string
        Filename of the file to load.
    chunks : int
        Number of frames per chunk.

    Returns
    -------
    trajectory
        A :py:class:`dasktraj.Trajectory`
    """

    top = kwargs.pop("top", None)
    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)

    filename = os.path.abspath(filename)
    length = len(open(filename))
    n_chunks = int(length / chunks)
    frames_left = length % chunks
    if frames_left != 0:
        n_chunks += 1
    # TODO this needs to be closed at some point
    data = load_chunks(filename, extension, chunks, range(n_chunks), **kwargs)

    # TODO: use this to construct unitcells
    # Pop out irrelevant info
    uv = data.pop("unitcell_vectors")
    traj = Trajectory(topology=topology, delayed_objects=data, **data)
    if uv is not None:
        traj.unitcell_vectors = uv
    return traj
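A hypothetical usage sketch for the loader above. The dasktraj module path, the file names, and the lazy behaviour of the xyz attribute are assumptions drawn from the docstring, not from the example itself:

# Usage sketch, assuming the function above is exposed as dasktraj.load and
# that Trajectory.xyz is a lazy dask array. File names are placeholders.
import dasktraj

traj = dasktraj.load('output.xtc', chunks=100, top='topology.pdb')
print(traj.xyz)                      # a dask.array; no frames read yet
first_xyz = traj.xyz[0].compute()    # materializes only the chunk holding frame 0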
Example #3
    def __init__(self, filename, chunk=1000, **kwargs):
        """An iterator over a trajectory from one or more files on disk, in fragments

        This may be more memory efficient than loading an entire trajectory at
        once.

        Parameters
        ----------
        filename : str
            Path to the trajectory file on disk
        chunk : int
            Number of frames to load at once from disk per iteration.  If 0, load all.

        Other Parameters
        ----------------
        top : {str, Trajectory, Topology}
            Most trajectory formats do not contain topology information. Pass in
            either the path to a RCSB PDB file, a trajectory, or a topology to
            supply this information. This option is not required for the .h5, .lh5,
            and .pdb formats, which already contain topology information.
        stride : int, default=None
            Only read every stride-th frame.
        atom_indices : array_like, optional
            If not None, then read only a subset of the atom coordinates from the
            file. This may be slightly slower than the standard read because it
            requires an extra copy, but will save memory.

        See Also
        --------
        load, load_frame

        Examples
        --------

        >>> import mdtraj as md
        >>> for chunk in md.iterload('output.xtc', top='topology.pdb'):  # doctest: +SKIP
        ...     print(chunk)  # doctest: +SKIP

        <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
        <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
        <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
        <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
        <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>

        """
        self._filename = filename
        self._stride = kwargs.pop('stride', 1)
        self._atom_indices = cast_indices(kwargs.pop('atom_indices', None))
        self._top = kwargs.pop('top', None)
        self._skip = kwargs.pop('skip', 0)
        self._kwargs = kwargs
        self._chunksize = chunk
        self._extension = _get_extension(self._filename)
        self._closed = False
        self._seeked = False
        if self._extension not in _TOPOLOGY_EXTS:
            self._topology = load_topology_cached(self._top)
        else:
            self._topology = self._top

        if self._extension in ('.pdb', '.pdb.gz'):
            raise Exception("{ext} is not supported as a trajectory "
                            "format".format(ext=self._extension))

        self._mode = None
        if isinstance(self._stride, np.ndarray):
            self._mode = 'random_access'
            # CRD/MDCRD files do not store the atom count, so the opener
            # needs n_atoms; other formats are opened from the path alone.
            if self._extension in ('.crd', '.mdcrd'):
                self._f = md_open(self._filename,
                                  n_atoms=self._topology.n_atoms)
            else:
                self._f = md_open(self._filename)
            self._ra_it = self._random_access_generator(self._f)
        else:
            self._mode = 'traj'
            if self._extension in ('.crd', '.mdcrd'):
                self._f = md_open(self._filename,
                                  n_atoms=self._topology.n_atoms)
            else:
                self._f = md_open(self._filename)

            # offset array handling
            offsets = kwargs.pop('offsets', None)
            if hasattr(self._f, 'offsets') and offsets is not None:
                self._f.offsets = offsets
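Only the constructor is shown in this example. A minimal consumption sketch follows; the class name TrajectoryIterator is a placeholder, and the assumption that the class implements the iterator protocol and a close() method is not taken from the excerpt:

# Hypothetical usage: the class name and its __iter__/close() methods are
# assumptions, since only __init__ appears in the example above.
import numpy as np

it = TrajectoryIterator('output.xtc', chunk=500, top='topology.pdb')
for chunk in it:
    print(chunk)
it.close()

# Passing a NumPy index array as `stride` selects the 'random_access' mode
# set up at the end of __init__.
frames = np.array([0, 1, 2, 50, 51, 99])
ra = TrajectoryIterator('output.xtc', top='topology.pdb', stride=frames)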
Example #4
def iterload(filename, chunk=100, **kwargs):
    """An iterator over a trajectory from one or more files on disk, in fragments

    This may be more memory efficient than loading an entire trajectory at
    once.

    Parameters
    ----------
    filename : str
        Path to the trajectory file on disk
    chunk : int
        Number of frames to load at once from disk per iteration.  If 0, load all.

    Other Parameters
    ----------------
    top : {str, Trajectory, Topology}
        Most trajectory formats do not contain topology information. Pass in
        either the path to a RCSB PDB file, a trajectory, or a topology to
        supply this information. This option is not required for the .h5, .lh5,
        and .pdb formats, which already contain topology information.
    stride : int, default=None
        Only read every stride-th frame.
    atom_indices : array_like, optional
        If not None, then read only a subset of the atom coordinates from the
        file. This may be slightly slower than the standard read because it
        requires an extra copy, but will save memory.

    See Also
    --------
    load, load_frame

    Examples
    --------

    >>> import mdtraj as md
    >>> for chunk in md.iterload('output.xtc', top='topology.pdb'):  # doctest: +SKIP
    ...     print(chunk)  # doctest: +SKIP

    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>
    <mdtraj.Trajectory with 100 frames, 423 atoms at 0x110740a90>

    """
    stride = kwargs.pop('stride', 1)
    atom_indices = cast_indices(kwargs.pop('atom_indices', None))
    top = kwargs.pop('top', None)
    skip = kwargs.pop('skip', 0)

    extension = _get_extension(filename)
    if extension not in _TOPOLOGY_EXTS:
        topology = _parse_topology(top)
    else:
        topology = top

    if chunk == 0:
        # If chunk was 0 then we want to avoid filetype-specific code
        # in case of undefined behavior in various file parsers.
        # TODO: this will first apply stride, then skip!
        if extension not in _TOPOLOGY_EXTS:
            kwargs['top'] = top
        yield load(filename, stride=stride, atom_indices=atom_indices,
                   **kwargs)[skip:]
    elif extension in ('.pdb', '.pdb.gz'):
        # the PDBTrajectoryFile class doesn't follow the standard API. Fixing it
        # to support iterload could be worthwhile, but requires a deep refactor.
        t = load(filename, stride=stride, atom_indices=atom_indices)
        for i in range(0, len(t), chunk):
            yield t[i:i + chunk]

    elif isinstance(stride, np.ndarray):
        # CRD/MDCRD files need n_atoms to be opened; other formats do not.
        if extension in ('.crd', '.mdcrd'):
            f_handle = open(filename, n_atoms=topology.n_atoms)
        else:
            f_handle = open(filename)
        with f_handle as f:
            x_prev = 0
            curr_size = 0
            traj = []
            leftovers = []
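            # Group consecutive frame indices: across a run of consecutive
            # integers, enumerate index minus value is constant, so grouping
            # on that difference yields one group per contiguous run.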
            for k, g in groupby(enumerate(stride), lambda a: a[0] - a[1]):
                grouped_stride = list(map(itemgetter(1), g))
                seek_offset = (1 if x_prev != 0 else 0)
                seek_to = grouped_stride[0] - x_prev - seek_offset
                f.seek(seek_to, whence=1)
                x_prev = grouped_stride[-1]
                group_size = len(grouped_stride)
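                # If this run would overflow the current chunk, park it in
                # `leftovers` and emit it below in chunk-sized pieces;
                # otherwise read it directly into the pending trajectory list.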
                if curr_size + group_size > chunk:
                    leftovers = grouped_stride
                else:
                    local_traj = _get_local_traj_object(
                        atom_indices, extension, f, group_size, topology,
                        **kwargs)
                    traj.append(local_traj)
                    curr_size += len(grouped_stride)
                if curr_size == chunk:
                    yield _efficient_traj_join(traj)
                    curr_size = 0
                    traj = []
                while leftovers:
                    local_chunk = leftovers[:min(chunk, len(leftovers))]
                    local_traj = _get_local_traj_object(
                        atom_indices, extension, f, len(local_chunk), topology,
                        **kwargs)
                    traj.append(local_traj)
                    leftovers = leftovers[min(chunk, len(leftovers)):]
                    curr_size += len(local_chunk)
                    if curr_size == chunk:
                        yield _efficient_traj_join(traj)
                        curr_size = 0
                        traj = []
            if traj:
                yield _efficient_traj_join(traj)
            return

    else:
        if extension in ('.crd', '.mdcrd'):
            f_handle = open(filename, n_atoms=topology.n_atoms)
        else:
            f_handle = open(filename)
        with f_handle as f:
            if skip > 0:
                f.seek(skip)
            while True:
                if extension not in _TOPOLOGY_EXTS:
                    traj = f.read_as_traj(topology,
                                          n_frames=chunk * stride,
                                          stride=stride,
                                          atom_indices=atom_indices,
                                          **kwargs)
                else:
                    traj = f.read_as_traj(n_frames=chunk * stride,
                                          stride=stride,
                                          atom_indices=atom_indices,
                                          **kwargs)

                if len(traj) == 0:
                    return

                yield traj
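A short usage sketch for the generator above: stream a large trajectory chunk by chunk and accumulate a per-frame quantity without holding the whole file in memory (file names are placeholders):

# Collect the RMSD of every frame to the first frame, 500 frames at a time.
import numpy as np
import mdtraj as md

reference = md.load_frame('output.xtc', 0, top='topology.pdb')
rmsd_parts = []
for chunk in md.iterload('output.xtc', chunk=500, top='topology.pdb'):
    rmsd_parts.append(md.rmsd(chunk, reference))  # one value per frame
rmsd = np.concatenate(rmsd_parts)
print(rmsd.shape, rmsd.max())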