Example #1
    def __init__(self, filename, convert_units=True, **kwargs):
        """
        Parameters
        ----------
        filename : str or :class:`h5py.File`
            trajectory filename or open h5py file
        convert_units : bool (optional)
            convert units to MDAnalysis units
        **kwargs : dict
            General reader arguments.

        Raises
        ------
        RuntimeError
            when `H5PY`_ is not installed
        RuntimeError
            when a unit is not recognized by MDAnalysis
        ValueError
            when ``n_atoms`` changes values between timesteps
        ValueError
            when ``convert_units=True`` but the H5MD file contains no units
        NoDataError
            when the H5MD file has no 'position', 'velocity', or
            'force' group

        """
        if not HAS_H5PY:
            raise RuntimeError("Please install h5py")
        super(H5MDReader, self).__init__(filename, **kwargs)
        self.filename = filename
        ## add parallel MPI kwargs
        ## kwargs needs to get pulled out
        self.open_trajectory()

        # _has dictionary used for checking whether h5md file has
        # 'position', 'velocity', or 'force' groups in the file
        self._has = {name: name in self._particle_group for
                     name in ('position', 'velocity', 'force')}

        # Get n_atoms from the first available group
        for name, value in self._has.items():
            if value:
                self.n_atoms = self._particle_group[name]['value'].shape[1]
                break
        else:
            raise NoDataError("Provide at least a position, velocity"
                              " or force group in the h5md file.")
        self.ts = self._Timestep(self.n_atoms,
                                 velocities=self.has_velocities,
                                 forces=self.has_forces,
                                 **self._ts_kwargs)
        self.units = {'time': None,
                      'length': None,
                      'velocity': None,
                      'force': None}
        self._set_translated_units()  # fills units dictionary
        self._read_next_timestep()
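
The constructor above relies on Python's for/else construct: the else branch runs only if the loop finishes without hitting break, which is how a missing 'position'/'velocity'/'force' group turns into NoDataError. A minimal, standalone sketch of that idiom, with a made-up `_has` dictionary standing in for the reader's attribute:

# Minimal sketch of the for/else idiom; the dictionary contents are illustrative only.
_has = {'position': True, 'velocity': False, 'force': False}

for name, present in _has.items():
    if present:
        first_group = name  # first group that is actually present
        break
else:
    # reached only if no break happened, i.e. nothing was present
    raise ValueError("Provide at least a position, velocity or force group.")

print(first_group)  # -> 'position'
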
Example #2
    def save(self, filename=None):
        """Save RMSD from :attr:`RMSD.rmsd` to text file *filename*.

        If *filename* is not supplied then the default provided to the
        constructor is used.

        The data are saved with :func:`np.savetxt`.
        """
        filename = filename or self.filename
        if filename is not None:
            if self.rmsd is None:
                raise NoDataError("rmsd has not been calculated yet")
            np.savetxt(filename, self.rmsd)
            logger.info("Wrote RMSD timeseries  to file %r", filename)
        return filename
Example #3
    def save(self, filename=None):
        """Save RMSD from :attr:`RMSD.rmsd` to text file *filename*.

        Parameters
        ----------
        filename : str (optional)
            If no filename is given, the default provided to the constructor
            is used.
        """
        filename = filename or self.filename
        if filename is not None:
            if self.rmsd is None:
                raise NoDataError("rmsd has not been calculated yet")
            np.savetxt(filename, self.rmsd)
            logger.info("Wrote RMSD timeseries  to file %r", filename)
        return filename
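
For context, a hedged sketch of the `np.savetxt` round trip this method depends on; the file name and the 3x3 array (frame, time, RMSD columns) are illustrative, not taken from a real analysis:

import numpy as np

# Invented RMSD-style data: one row per frame with [frame, time, rmsd].
rmsd = np.array([[0, 0.0, 0.00],
                 [1, 1.0, 0.52],
                 [2, 2.0, 0.61]])

np.savetxt("rmsd.dat", rmsd)        # write as whitespace-separated text
recovered = np.loadtxt("rmsd.dat")  # read it back
assert np.allclose(rmsd, recovered)
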
Example #4
    def _read_frame(self, frame):
        """reads data from h5md file and copies to current timestep"""
        try:
            for name, value in self._has.items():
                if value:
                    myframe = self._particle_group[name]['step'][frame]
                    break
            else:
                raise NoDataError("Provide at least a position, velocity"
                                  " or force group in the h5md file.")
        except ValueError:
            raise IOError from None

        self._frame = frame
        ts = self.ts
        particle_group = self._particle_group
        ts.frame = frame

        # this block populates the data dictionary
        # (dt is not read here)
        if 'observables' in self._file:
            data = self._file['observables']
            for name in self._data_keywords:
                self._copy_data(name, data)
        for name, value in self._has.items():
            if value:
                self.ts.data['time'] = particle_group[name]['time'][frame]
                break

        # set frame box dimensions
        # set triclinic box vectors
        ts._unitcell[:] = particle_group['box/edges/value'][frame, :]

        # set the timestep positions, velocities, and forces with the
        # _set_ts_attribute() method
        ts.positions = self._set_ts_attribute('position')
        ts.velocities = self._set_ts_attribute('velocity')
        ts.forces = self._set_ts_attribute('force')

        # unit conversion
        if self.convert_units:
            # ensures h5md file has units if convert_units=True
            self._check_and_convert_units()

        return ts
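
The frame access in `_read_frame` maps onto plain h5py indexing of the H5MD layout ('.../position/value', '.../position/step', '.../position/time', 'box/edges/value'). A hedged sketch of reading one frame directly; the file name 'traj.h5md' and the particles group name 'trajectory' are assumptions for illustration:

import h5py

frame = 0
with h5py.File("traj.h5md", "r") as f:                 # hypothetical file name
    particles = f["particles/trajectory"]              # group name is an assumption
    positions = particles["position/value"][frame]     # (n_atoms, 3) coordinates at this frame
    step = particles["position/step"][frame]           # integration step of this frame
    time = particles["position/time"][frame]           # simulation time of this frame
    box = particles["box/edges/value"][frame]          # box vectors at this frame
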
Example #5
    def _get_dh_pairs(self):
        """Finds donor-hydrogen pairs.

        Returns
        -------
        donors, hydrogens : AtomGroup, AtomGroup
            AtomGroups corresponding to all donors and all hydrogens. The AtomGroups are ordered such that,
            if zipped, they will produce a list of donor-hydrogen pairs.
        """

        # If donors_sel is not provided, use topology to find d-h pairs
        if not self.donors_sel:

            # We're using u._topology.bonds rather than u.bonds as it is a million times faster to access.
            # This is because u.bonds also calculates properties of each bond (e.g. bond length).
            # See https://github.com/MDAnalysis/mdanalysis/issues/2396#issuecomment-596251787
            if not (hasattr(self.u._topology, 'bonds')
                    and len(self.u._topology.bonds.values) != 0):
                raise NoDataError(
                    'Cannot assign donor-hydrogen pairs via topology as no bond information is present. '
                    'Please either: load a topology file with bond information; use the guess_bonds() '
                    'topology guesser; or set HydrogenBondAnalysis.donors_sel so that a distance cutoff '
                    'can be used.')

            hydrogens = self.u.select_atoms(self.hydrogens_sel)
            donors = sum(h.bonded_atoms[0] for h in hydrogens) if hydrogens \
                else AtomGroup([], self.u)

        # Otherwise, use d_h_cutoff as a cutoff distance
        else:

            hydrogens = self.u.select_atoms(self.hydrogens_sel)
            donors = self.u.select_atoms(self.donors_sel)
            donors_indices, hydrogen_indices = capped_distance(
                donors.positions,
                hydrogens.positions,
                max_cutoff=self.d_h_cutoff,
                box=self.u.dimensions,
                return_distances=False).T

            donors = donors[donors_indices]
            hydrogens = hydrogens[hydrogen_indices]

        return donors, hydrogens
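
The distance-cutoff branch above uses `MDAnalysis.lib.distances.capped_distance`, which returns index pairs of positions within `max_cutoff` of each other. A minimal sketch in isolation; the random coordinates and the 1.2 Å cutoff are illustrative only:

import numpy as np
from MDAnalysis.lib.distances import capped_distance

rng = np.random.default_rng(42)
donor_xyz = rng.uniform(0.0, 10.0, size=(50, 3)).astype(np.float32)
hydrogen_xyz = rng.uniform(0.0, 10.0, size=(50, 3)).astype(np.float32)

# With return_distances=False only the pairs are returned, shape (n_pairs, 2):
# column 0 indexes donor_xyz, column 1 indexes hydrogen_xyz.
pairs = capped_distance(donor_xyz, hydrogen_xyz,
                        max_cutoff=1.2,
                        return_distances=False)
donor_indices, hydrogen_indices = pairs.T
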
Example #6
    def largest_cluster(self, cluster_sel=None, filter_by=None, return_indices=False):
        """Find the largest cluster of lipids at each frame.
        
        Parameters
        ----------
        cluster_sel : str, optional
            Selection string for lipids to include in the cluster analysis. The default is `None`, in
            which case all lipids used in identifying neighbouring lipids will be used for finding
            the largest cluster.
        filter_by : numpy.ndarray, optional
            A boolean array indicating whether or not to include each lipid in the cluster analysis. If
            the array is 1D and of shape (n_lipids), the same lipids will be used in the cluster
            analysis at every frame. If the array is 2D and of shape (n_lipids, n_frames), the boolean
            value of each lipid at each frame will be taken into account. The default is `None`, in which
            case all lipids used in identifying neighbours will be used for finding
            the largest cluster.
        return_indices : bool, optional
            If `True`, a list of NumPy arrays will also be returned, one for each frame. Each NumPy array
            will contain the residue indices of the lipids in the largest cluster at that frame. Note, if
            there are two largest clusters of equal size, only the residue indices of lipids in one
            cluster will be returned (the cluster that has the lipid with the smallest residue index). The
            default is `False`, in which case no residue indices are returned.
        
        Returns
        -------
        
        largest_cluster : numpy.ndarray
            An array containing the number of lipids in the largest cluster at each frame.
        indices : list
            A list of 1D NumPy arrays, where each array corresponds to a single frame and contains the
            residue indices of lipids in the largest cluster at that frame.
            
        Note
        ----
        
        Neighbours must be found by using `Neighbours.run()` before calling either
        `Neighbours.count_neighbours()` or `Neighbours.largest_cluster()`.
        
        """

        if self.neighbours is None:
            raise NoDataError(".neighbours attribute is None: use .run() before calling .largest_cluster()")
        
        if filter_by is not None and np.array(filter_by).ndim not in [1, 2]:
            raise ValueError("'filter_by' must either be a 1D array containing non-changing boolean"
                             "values for each lipid, or a 2D array of shape (n_residues, n_frames)"
                             " containing a boolean value for each lipid at each frame."
                             )

        elif filter_by is not None and len(filter_by) != self.membrane.n_residues:
            raise ValueError("The shape of 'filter_by' must be (n_residues,)")
        
        # determine which lipids to use in the analysis at each frame
        if filter_by is None:
            
            filter_by = np.full(
                (self.membrane.n_residues, self.n_frames),
                fill_value=True,
                dtype=bool
            )
        elif filter_by.ndim == 1:
            
            filter_by = np.full(
                (self.membrane.n_residues, self.n_frames),
                fill_value=filter_by[:, np.newaxis],
                dtype=bool
            )
            
        # also create mask based on `cluster_sel`
        if cluster_sel is None:
            
            filter_lipids = np.full(
                self.membrane.n_residues,
                fill_value=True,
                dtype=bool
            )
        else:
            
            lipids = self.u.select_atoms(cluster_sel).residues
            
            if lipids.n_residues == 0:
                raise ValueError(
                    "'cluster_sel' produces atom empty AtomGroup. Please check the selection string."
                )
            
            filter_lipids = np.in1d(
                self.membrane.residues.resindices,
                lipids.resindices
            )
            
        # combine the masks
        filter_by[~filter_lipids] = False
                
        # output arrays
        largest_cluster = np.zeros(self.n_frames, dtype=int)
        largest_cluster_resindices = np.full(self.n_frames, fill_value=0, dtype=object)
        
        for frame_index, neighbours in tqdm(enumerate(self.neighbours), total=self.n_frames):
            
            frame_filter = filter_by[:, frame_index]
            frame_neighbours = neighbours[frame_filter][:, frame_filter]
            
            # find all connected components
            _, com_labels = scipy.sparse.csgraph.connected_components(frame_neighbours)
            
            unique_com_labels, counts = np.unique(com_labels, return_counts=True)
            largest_label = unique_com_labels[np.argmax(counts)]
            
            # largest cluster and resindices of lipids in the cluster
            largest_cluster[frame_index] = max(counts)
            
            frame_resindices = self.membrane.residues.resindices[frame_filter]
            largest_cluster_resindices[frame_index] = frame_resindices[com_labels == largest_label]
            
        if return_indices is True:
            return largest_cluster, largest_cluster_resindices
        else:
            return largest_cluster
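
`largest_cluster` delegates the clustering itself to `scipy.sparse.csgraph.connected_components`, which labels every node of an adjacency matrix with the component it belongs to. A small self-contained sketch of picking the biggest component; the toy adjacency matrix is made up:

import numpy as np
from scipy.sparse.csgraph import connected_components

# Toy adjacency matrix: nodes 0-1-2 form one cluster, 3-4 another, 5 is isolated.
adjacency = np.array([
    [0, 1, 1, 0, 0, 0],
    [1, 0, 1, 0, 0, 0],
    [1, 1, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 0],
    [0, 0, 0, 1, 0, 0],
    [0, 0, 0, 0, 0, 0],
])

n_components, labels = connected_components(adjacency)
unique_labels, counts = np.unique(labels, return_counts=True)
largest_label = unique_labels[np.argmax(counts)]

print(counts.max())                           # size of the largest cluster -> 3
print(np.where(labels == largest_label)[0])   # node indices in that cluster -> [0 1 2]
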
Example #7
    def count_neighbours(self, count_by=None, count_by_labels=None, return_enrichment=False):
        """Count the number of each neighbour type at each frame.

        Parameters
        ----------
        count_by : numpy.ndarray, optional
            An array containing ordinal data describing each lipid at each frame. For example,
            it may be an array containing information on the ordered state of each lipid.
            Defaults to None, in which case the lipid species (resnames) are used for counting neighbours.
        count_by_labels : dict, optional
            A dictionary of labels describing what each unique value in `count_by` refers to, e.g.
            if `count_by` contains information on the ordered state of each lipid at each frame, whereby
            0 corresponds to disordered and 1 corresponds to ordered, then
            `count_by_labels = {'Ld': 0, 'Lo': 1}`. There **must** be precisely one label for each unique
            value in 'count_by'. If `count_by` is given but `count_by_labels` is left as `None`, the values
            in `count_by` will be used as the labels.
        return_enrichment : bool, optional
            If `True`, a second DataFrame containing the fractional enrichment of each lipid species at each
            frame is also returned. The default is `False`, in which case the fractional enrichment
            is not returned.
        
        Returns
        -------
        
        counts : pandas.DataFrame
            A DataFrame containing the following data for each lipid at each frame: lipid identifier
            (default is resname), lipid residue index, frame number, number of neighbours of each species
            (or of each type in 'count_by' if this is provided), as well as the total number of neighbours.
        
        enrichment : pandas.DataFrame
            A DataFrame containing the enrichment/depletion data for each lipid species at
            each frame.
        
        """
        
        if self.neighbours is None:
            raise NoDataError(".neighbours attribute is None: use .run() before calling .count_neighbours()")
        
        # create output array
        if count_by is None:
            
            # Use lipid resnames to distinguish lipids
            count_by = np.full(
                (self.membrane.n_residues, self.n_frames),
                fill_value=self.membrane.residues.resnames[:, np.newaxis],
            )
            count_by_labels = {label: index for index, label in enumerate(np.unique(self.membrane.resnames))}
        
        elif count_by_labels is None:
            
            # Use values in 'count_by' as the labels
            count_by_labels = {label: index for index, label in enumerate(np.unique(count_by))}
            
        else:
            
            # the ordinal values in 'count_by' now take on the string labels supplied
            max_label_size = max([len(label) for label in count_by_labels])
            new_count_by = np.full_like(count_by, dtype=f'<U{max_label_size}', fill_value="")
            for label in count_by_labels:
                new_count_by[count_by == count_by_labels[label]] = label
            count_by = new_count_by
            del new_count_by
        
        # create output array
        all_counts = np.full(
            (self.membrane.n_residues, self.n_frames, len(count_by_labels)),
            fill_value=0,
            dtype=np.uint8  # count can't be negative, and no lipid will have more than 255 neighbours
        )
        
        # For counts we need to know which column of the output array to add counts to for each lipid type
        type_index = {value: index for index, value in enumerate(count_by_labels)}
        
        # Get counts at each frame
        n_residues = self.membrane.n_residues
        for frame_index, neighbours in tqdm(enumerate(self.neighbours), total=self.n_frames):
        
            ref, neigh = neighbours.nonzero()
            unique, counts = np.unique([ref, [type_index[t] for t in count_by[neigh, frame_index]]], axis=1, return_counts=True)
            
            r, t = unique  # reference index (r) and type index (t)
            all_counts[r, frame_index, t] = counts

        # Assemble data for the DataFrame
        labels = np.array([list(count_by_labels)[type_index[label]] for lipid in count_by for label in lipid])
        
        resindices = np.full((n_residues, self.n_frames), fill_value=self.membrane.residues.resindices[:, np.newaxis])
        resindices = resindices.reshape(n_residues * self.n_frames)
        
        frames = np.full((n_residues, self.n_frames), fill_value=self.frames)
        frames = frames.reshape(n_residues * self.n_frames)

        all_counts = all_counts.reshape(n_residues * self.n_frames, len(count_by_labels))
        total_counts = np.sum(all_counts, axis=1)
        
        # Create the dataframe
        counts = pd.DataFrame(
            data=labels,
            columns=["Label"]
        )

        counts["Resindex"] = resindices
        counts["Frame"] = frames

        for count_by_label in count_by_labels:
            counts[f"n{count_by_label}"] = all_counts.T[type_index[count_by_label]]

        counts["Total"] = total_counts
        
        # make every column except the label take on integer values
        for column in counts.columns[1:]:
            counts[column] = pd.to_numeric(counts[column])
        
        if return_enrichment is False:
            return counts
        
        # Otherwise create a second DataFrame containing the fractional enrichment
        unique_labels = [label for label in type_index]

        # We need to normalize the count by the mean number of neighbours of each species
        mean_neighbours_counts = np.asarray(
            [counts.groupby("Frame")[neigh].mean().values for neigh in [f"n{label}" for label in unique_labels]]
        )
        n_unique_labels, n_frames = mean_neighbours_counts.shape
        
        # create new output arrays
        labels = np.full((n_frames, n_unique_labels), fill_value=unique_labels).T.flatten()
        neighbour_enrichment = np.full((n_frames * n_unique_labels, n_unique_labels), fill_value=np.NaN)
        
        # and the new DataFrame
        enrichment = pd.DataFrame(
            data=labels,
            columns=["Label"]
        )
        enrichment["Frame"] = np.full((n_unique_labels, n_frames), fill_value=counts["Frame"].unique()).flatten()
        
        # Calculate the enrichment of each species at each frame
        for species_index, ref in enumerate(unique_labels):
        
            ref_mask = (counts.Label == ref).values
            
            species_neighbour_counts = counts.loc[ref_mask]
            species_neighbour_enrichment = species_neighbour_counts.groupby("Frame")[[f"n{label}" for label in unique_labels]].mean() / mean_neighbours_counts.T
            neighbour_enrichment[n_frames * species_index:n_frames * (species_index + 1)] = species_neighbour_enrichment
            
        # Finally add the enrichment values to the DataFrame
        for species_index, ref in enumerate([f"fe{label}" for label in unique_labels]):
            enrichment[ref] = neighbour_enrichment[:, species_index]
        
        return counts, enrichment
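
The per-frame counting step hinges on `np.unique(..., axis=1, return_counts=True)` over stacked (reference index, neighbour type) pairs. A tiny sketch of that trick in isolation, with invented contact data:

import numpy as np

# Each column is one (reference lipid, neighbour type) contact; the data are invented.
ref = np.array([0, 0, 0, 1, 1, 2])
types = np.array([1, 1, 0, 0, 0, 1])

unique_pairs, counts = np.unique([ref, types], axis=1, return_counts=True)
r, t = unique_pairs

all_counts = np.zeros((3, 2), dtype=np.uint8)  # (n_reference_lipids, n_types)
all_counts[r, t] = counts
print(all_counts)  # lipid 0: one type-0 and two type-1 neighbours, and so on
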
Example #8
    def count_neighbours(self, count_by=None, count_by_labels=None):
        """Count the number of each neighbour type at each frame.

        Parameters
        ----------
        count_by : numpy.ndarray, optional
            An array containing ordinal data describing each lipid at each frame. For example,
            it may be an array containing information on the ordered state of each lipid.
            Defaults to None, in which case the lipid species (resnames) are used for counting neighbours.
        count_by_labels : dict, optional
            A dictionary of labels describing what each unique value in `count_by` refers to, e.g.
            if `count_by` contains information on the ordered state of each lipid at each frame, whereby
            0 corresponds to disordered and 1 corresponds to ordered, then
            `count_by_labels = {'Ld': 0, 'Lo': 1}`. There **must** be precisely one label for each unique
            value in 'count_by'. If `count_by` is given but `count_by_labels` is left as `None`, the values
            in `count_by` will be used as the labels.
        
        Returns
        -------
        
        counts : pandas.DataFrame
            A DataFrame containing the following data for each lipid at each frame: lipid identifier
            (default is resname), lipid residue index, frame number, number of neighbours of each species
            (or of each type in 'count_by' if this is provided), as well as the total number of neighbours.
        
        """

        if self.neighbours is None:
            raise NoDataError(
                ".neighbours attribute is None: use .run() before calling .count_neighbours()"
            )

        # create output array
        if count_by is None:

            # Use lipid resnames to distinguish lipids
            count_by = np.full(
                (self.membrane.n_residues, self.n_frames),
                fill_value=self.membrane.residues.resnames[:, np.newaxis],
            )
            count_by_labels = {
                label: index
                for index, label in enumerate(np.unique(
                    self.membrane.resnames))
            }

        elif count_by_labels is None:

            # Use values in 'count_by' as the labels
            count_by_labels = {
                label: index
                for index, label in enumerate(np.unique(count_by))
            }

        else:

            # the ordinal values in 'count_by' now take on the string labels supplied
            max_label_size = max([len(label) for label in count_by_labels])
            new_count_by = np.full_like(count_by,
                                        dtype=f'<U{max_label_size}',
                                        fill_value="")
            for label in count_by_labels:
                new_count_by[count_by == count_by_labels[label]] = label
            count_by = new_count_by
            del new_count_by

        # create output array
        all_counts = np.full(
            (self.membrane.n_residues, self.n_frames, len(count_by_labels)),
            fill_value=0,
            dtype=np.uint8  # count can't be negative, and no lipid will have more than 255 neighbours
        )

        # For counts we need to know which column of the output array to add counts to for each lipid type
        type_index = {
            value: index
            for index, value in enumerate(count_by_labels)
        }

        # Get counts at each frame
        n_residues = self.membrane.n_residues
        for frame_index in tqdm(np.arange(self.n_frames)):

            ref, neigh = self.neighbours[:, frame_index *
                                         n_residues:(frame_index + 1) *
                                         n_residues].nonzero()
            unique, counts = np.unique(
                [ref, [type_index[t] for t in count_by[neigh, frame_index]]],
                axis=1,
                return_counts=True)

            r, t = unique  # reference index (r) and type index (t)
            all_counts[r, frame_index, t] = counts

        # Assemble data for the DataFrame
        labels = np.array([
            list(count_by_labels)[type_index[label]]
            for lipid in count_by for label in lipid
        ])

        resindices = np.full(
            (n_residues, self.n_frames),
            fill_value=self.membrane.residues.resindices[:, np.newaxis])
        resindices = resindices.reshape(n_residues * self.n_frames)

        frames = np.full((n_residues, self.n_frames), fill_value=self.frames)
        frames = frames.reshape(n_residues * self.n_frames)

        all_counts = all_counts.reshape(n_residues * self.n_frames,
                                        len(count_by_labels))
        total_counts = np.sum(all_counts, axis=1)

        data = np.concatenate(
            (labels[:, np.newaxis], resindices[:, np.newaxis],
             frames[:, np.newaxis], all_counts, total_counts[:, np.newaxis]),
            axis=1)

        # Create DataFrame
        columns = ["Label", "Resindex", "Frame"
                   ] + [f"n{label}" for label in count_by_labels] + ["Total"]
        df = pd.DataFrame(data=data, columns=columns)

        # make every column except the label take on integer values
        for column in df.columns[1:]:
            df[column] = pd.to_numeric(df[column])

        return df
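
Because `np.concatenate` above mixes string labels with numeric counts, every DataFrame column initially holds strings, and the final loop restores numeric dtypes with `pandas.to_numeric`. A compact illustration of that effect with made-up rows:

import numpy as np
import pandas as pd

# Concatenating labels with stringified counts yields an all-string array.
labels = np.array(["POPC", "DOPC"])
totals = np.array([3, 5])
data = np.concatenate((labels[:, np.newaxis], totals[:, np.newaxis].astype(str)), axis=1)

df = pd.DataFrame(data=data, columns=["Label", "Total"])
print(df["Total"].dtype)   # object (strings) before conversion

df["Total"] = pd.to_numeric(df["Total"])
print(df["Total"].dtype)   # int64 after conversion
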