def __init__(self, filename, convert_units=True, **kwargs):
    """Open an H5MD trajectory and load its first frame.

    Parameters
    ----------
    filename : str or :class:`h5py.File`
        trajectory filename or open h5py file
    convert_units : bool (optional)
        convert units to MDAnalysis units
    **kwargs : dict
        General reader arguments.

    Raises
    ------
    RuntimeError
        when `H5PY`_ is not installed
    RuntimeError
        when a unit is not recognized by MDAnalysis
    ValueError
        when ``n_atoms`` changes values between timesteps
    ValueError
        when ``convert_units=True`` but the H5MD file contains no units
    NoDataError
        when the H5MD file has no 'position', 'velocity', or
        'force' group

    """
    if not HAS_H5PY:
        raise RuntimeError("Please install h5py")
    super(H5MDReader, self).__init__(filename, **kwargs)
    self.filename = filename
    # ``convert_units`` is a named parameter, so it is not part of
    # ``**kwargs`` and never reaches the parent initialiser. Store it
    # explicitly, otherwise the caller's choice is silently dropped.
    self.convert_units = convert_units
    # TODO: add parallel MPI kwargs; they need to be pulled out of **kwargs
    self.open_trajectory()

    # _has dictionary used for checking whether h5md file has
    # 'position', 'velocity', or 'force' groups in the file
    self._has = {name: name in self._particle_group
                 for name in ('position', 'velocity', 'force')}

    # Gets n_atoms from first available group
    for name, value in self._has.items():
        if value:
            self.n_atoms = self._particle_group[name]['value'].shape[1]
            break
    else:
        raise NoDataError("Provide at least a position, velocity"
                          " or force group in the h5md file.")

    self.ts = self._Timestep(self.n_atoms,
                             velocities=self.has_velocities,
                             forces=self.has_forces,
                             **self._ts_kwargs)
    self.units = {'time': None,
                  'length': None,
                  'velocity': None,
                  'force': None}
    self._set_translated_units()  # fills units dictionary
    self._read_next_timestep()
def save(self, filename=None):
    """Write the RMSD time series in :attr:`RMSD.rmsd` to a text file.

    The target is *filename* when it is given (and truthy), otherwise
    ``self.filename``. The array is written with :func:`np.savetxt`.
    When neither name is set nothing is written and ``None`` is returned.

    Raises
    ------
    NoDataError
        if a target file is known but ``self.rmsd`` is still ``None``

    Returns
    -------
    The name of the file that was written, or ``None``.
    """
    target = filename or self.filename
    if target is None:
        # No explicit name and no stored one: nothing to do.
        return target
    if self.rmsd is None:
        raise NoDataError("rmsd has not been calculated yet")
    np.savetxt(target, self.rmsd)
    logger.info("Wrote RMSD timeseries to file %r", target)
    return target
def save(self, filename=None):
    """Save RMSD from :attr:`RMSD.rmsd` to text file *filename*.

    Parameters
    ----------
    filename : str (optional)
        output file; when omitted (or empty) ``self.filename`` is used
        instead. If that is ``None`` as well, nothing is written.

    Returns
    -------
    filename : str or None
        the file that was written, or ``None`` if no file name was available

    Raises
    ------
    NoDataError
        if a file name is available but ``self.rmsd`` is ``None``
    """
    out_path = filename if filename else self.filename
    if out_path is not None:
        no_data = self.rmsd is None
        if no_data:
            raise NoDataError("rmsd has not been calculated yet")
        np.savetxt(out_path, self.rmsd)
        logger.info("Wrote RMSD timeseries to file %r", out_path)
    return out_path
def _read_frame(self, frame):
    """Read frame ``frame`` from the h5md file into the current timestep.

    The first group flagged as present in ``self._has`` is used both to
    validate the frame index and to look up the frame's time.

    Raises
    ------
    NoDataError
        if no group in ``self._has`` is flagged as present
    IOError
        if indexing the 'step' dataset with ``frame`` raises ValueError

    Returns
    -------
    The updated ``self.ts``.
    """
    try:
        first_group = next(
            (key for key, present in self._has.items() if present), None)
        if first_group is None:
            raise NoDataError("Provide at least a position, velocity"
                              " or force group in the h5md file.")
        # The value is discarded: the lookup only serves to trigger a
        # ValueError for an invalid frame index.
        _ = self._particle_group[first_group]['step'][frame]
    except ValueError:
        raise IOError from None

    self._frame = frame
    timestep = self.ts
    groups = self._particle_group
    timestep.frame = frame

    # populate the data dictionary (DT is not read)
    if 'observables' in self._file:
        observables = self._file['observables']
        for keyword in self._data_keywords:
            self._copy_data(keyword, observables)
    timestep.data['time'] = groups[first_group]['time'][frame]

    # frame box dimensions, stored as triclinic box vectors
    timestep._unitcell[:] = groups['box/edges/value'][frame, :]

    # positions, velocities and forces all go through _set_ts_attribute()
    timestep.positions = self._set_ts_attribute('position')
    timestep.velocities = self._set_ts_attribute('velocity')
    timestep.forces = self._set_ts_attribute('force')

    # unit conversion; the helper also ensures that the file has units
    if self.convert_units:
        self._check_and_convert_units()

    return timestep
def _get_dh_pairs(self):
    """Find donor-hydrogen pairs.

    With ``donors_sel`` set, pairs are every donor/hydrogen combination
    within ``d_h_cutoff``. Without it, the donor of each hydrogen is the
    first atom in that hydrogen's ``bonded_atoms``, which requires bond
    information in the topology.

    Returns
    -------
    donors, hydrogens: AtomGroup, AtomGroup
        AtomGroups corresponding to all donors and all hydrogens. AtomGroups
        are ordered such that, if zipped, will produce a list of
        donor-hydrogen pairs.

    Raises
    ------
    NoDataError
        if ``donors_sel`` is not set and the topology has no bonds
    """
    universe = self.u

    # A donor selection was supplied: pair by distance cutoff.
    if self.donors_sel:
        hydrogens = universe.select_atoms(self.hydrogens_sel)
        donors = universe.select_atoms(self.donors_sel)
        pair_indices = capped_distance(
            donors.positions,
            hydrogens.positions,
            max_cutoff=self.d_h_cutoff,
            box=universe.dimensions,
            return_distances=False,
        )
        donor_idx, hydrogen_idx = pair_indices.T
        return donors[donor_idx], hydrogens[hydrogen_idx]

    # No donor selection: fall back to the bonds stored in the topology.
    # u._topology.bonds is used rather than u.bonds because it is far
    # faster to access (u.bonds also calculates per-bond properties such
    # as bond length).
    # See https://github.com/MDAnalysis/mdanalysis/issues/2396#issuecomment-596251787
    topology = universe._topology
    bonds_available = (hasattr(topology, 'bonds')
                       and len(topology.bonds.values) != 0)
    if not bonds_available:
        raise NoDataError(
            'Cannot assign donor-hydrogen pairs via topology as no bond information is present. '
            'Please either: load a topology file with bond information; use the guess_bonds() '
            'topology guesser; or set HydrogenBondAnalysis.donors_sel so that a distance cutoff '
            'can be used.')

    hydrogens = universe.select_atoms(self.hydrogens_sel)
    if hydrogens:
        donors = sum(h.bonded_atoms[0] for h in hydrogens)
    else:
        donors = AtomGroup([], universe)
    return donors, hydrogens
def largest_cluster(self, cluster_sel=None, filter_by=None, return_indices=False):
    """Find the largest cluster of lipids at each frame.

    Parameters
    ----------
    cluster_sel : str, optional
        Selection string for lipids to include in the cluster analysis.
        The default is `None`, in which case all lipids used in identifying
        neighbouring lipids will be used for finding the largest cluster.
    filter_by : array-like of bool, optional
        A boolean array indicating whether or not to include each lipid in
        the cluster analysis. If the array is 1D and of shape (n_lipids),
        the same lipids will be used in the cluster analysis at every
        frame. If the array is 2D and of shape (n_lipids, n_frames), the
        boolean value of each lipid at each frame will be taken into
        account. The default is `None`, in which case all lipids used in
        identifying neighbours will be used for finding the largest cluster.
        The input is copied; the caller's array is never modified.
    return_indices : bool, optional
        If `True`, the residue indices of the lipids in the largest cluster
        at each frame are also returned. Note, if there are two largest
        clusters of equal size, only the residue indices of lipids in one
        cluster will be returned (the one with the smallest component
        label). The default is `False`, in which case no residue indices
        are returned.

    Returns
    -------
    largest_cluster : numpy.ndarray
        An array containing the number of lipids in the largest cluster
        at each frame. A frame in which no lipid passes the filters has a
        size of 0.
    indices : numpy.ndarray
        Only returned if `return_indices` is `True`. An object array with
        one entry per frame; each entry is a 1D NumPy array with the
        residue indices of lipids in the largest cluster at that frame.

    Raises
    ------
    NoDataError
        If `.neighbours` is `None`.
    ValueError
        If `filter_by` is neither 1D nor 2D, if its first dimension does
        not match the number of residues, or if `cluster_sel` selects
        nothing.

    Note
    ----
    Neighbours must be found by using `Neighbours.run()` before calling
    either `Neighbours.count_neighbours()` or `Neighbours.largest_cluster()`.

    """
    if self.neighbours is None:
        raise NoDataError(".neighbours attribute is None: use .run() before calling .largest_cluster()")

    n_residues = self.membrane.n_residues
    n_frames = self.n_frames

    # determine which lipids to use in the analysis at each frame
    if filter_by is None:
        frame_mask = np.full((n_residues, n_frames), fill_value=True, dtype=bool)
    else:
        # np.array copies: lists are accepted, and the in-place masking
        # further down cannot leak into the caller's array.
        filter_by = np.array(filter_by, dtype=bool)
        if filter_by.ndim not in (1, 2):
            raise ValueError(
                "'filter_by' must either be a 1D array containing non-changing boolean"
                " values for each lipid, or a 2D array of shape (n_residues, n_frames)"
                " containing a boolean value for each lipid at each frame."
            )
        if len(filter_by) != n_residues:
            raise ValueError(
                "The shape of 'filter_by' must be (n_residues,) or (n_residues, n_frames)"
            )
        if filter_by.ndim == 1:
            # same per-lipid flag at every frame
            frame_mask = np.full(
                (n_residues, n_frames),
                fill_value=filter_by[:, np.newaxis],
                dtype=bool,
            )
        else:
            frame_mask = filter_by

    # also mask out lipids that are not part of `cluster_sel`
    if cluster_sel is not None:
        lipids = self.u.select_atoms(cluster_sel).residues
        if lipids.n_residues == 0:
            raise ValueError(
                "'cluster_sel' produces an empty AtomGroup. Please check the selection string."
            )
        in_selection = np.isin(self.membrane.residues.resindices, lipids.resindices)
        frame_mask[~in_selection] = False

    # output arrays
    membrane_resindices = self.membrane.residues.resindices
    largest_cluster = np.zeros(n_frames, dtype=int)
    largest_cluster_resindices = np.full(n_frames, fill_value=0, dtype=object)

    for frame_index, neighbours in tqdm(enumerate(self.neighbours), total=n_frames):

        frame_filter = frame_mask[:, frame_index]
        frame_resindices = membrane_resindices[frame_filter]

        if frame_resindices.size == 0:
            # Nothing selected in this frame: the size stays 0 and the
            # index entry is an empty array (argmax would fail on no data).
            largest_cluster_resindices[frame_index] = frame_resindices
            continue

        frame_neighbours = neighbours[frame_filter][:, frame_filter]

        # find all connected components
        _, com_labels = scipy.sparse.csgraph.connected_components(frame_neighbours)

        unique_com_labels, counts = np.unique(com_labels, return_counts=True)
        biggest = np.argmax(counts)

        # largest cluster and resindices of lipids in the cluster
        largest_cluster[frame_index] = counts[biggest]
        largest_cluster_resindices[frame_index] = frame_resindices[
            com_labels == unique_com_labels[biggest]
        ]

    if return_indices is True:
        return largest_cluster, largest_cluster_resindices

    return largest_cluster
def count_neighbours(self, count_by=None, count_by_labels=None, return_enrichment=False):
    """Count the number of each neighbour type at each frame.

    Parameters
    ----------
    count_by : numpy.ndarray, optional
        An array containing ordinal data describing each lipid at each
        frame. For example, it may be an array containing information on
        the ordered state of each lipid. Defaults to None, in which case
        the lipid species (resnames) are used for counting neighbours.
    count_by_labels : dict, optional
        A dictionary of labels describing what each unique value in
        `count_by` refers to, e.g if `count_by` contains information on the
        ordered state of each lipid at each frame, whereby 0 corresponds to
        disordered and 1 corresponds to ordered, then
        `count_by_labels = {'Ld': 0, 'Lo': 1}`. There **must** be precisely
        one label for each unique value in 'count_by'. If `count_by` is
        given but `count_by_labels` is left as `None`, the values in
        `count_by` will be used as the labels.
    return_enrichment : bool, optional
        If `True`, a second DataFrame containing the fractional enrichment
        of each lipid species at each frame is also returned. The default
        is `False`, in which case the fractional enrichment is not returned.

    Returns
    -------
    counts : pandas.DataFrame
        A DataFrame containing the following data for each lipid at each
        frame: lipid identifier (default is resname), lipid residue index,
        frame number, number of neighbours of each species (or of each
        type in 'count_by' if this is provided), as well as the total
        number of neighbours.
    enrichment : pandas.DataFrame
        Only returned if `return_enrichment` is not `False`. A DataFrame
        containing the enrichment/depletion data for each lipid species
        at each frame.

    Raises
    ------
    NoDataError
        If `.neighbours` is `None`.

    """
    if self.neighbours is None:
        raise NoDataError(".neighbours attribute is None: use .run() before calling .count_neighbours()")

    n_residues = self.membrane.n_residues

    # work out what identifies a lipid's "type" and how it is labelled
    if count_by is None:
        # Use lipid resnames to distinguish lipids
        count_by = np.full(
            (n_residues, self.n_frames),
            fill_value=self.membrane.residues.resnames[:, np.newaxis],
        )
        count_by_labels = {
            label: index
            for index, label in enumerate(np.unique(self.membrane.resnames))
        }
    elif count_by_labels is None:
        # Use values in 'count_by' as the labels
        count_by_labels = {
            label: index for index, label in enumerate(np.unique(count_by))
        }
    else:
        # the ordinal values in 'count_by' now take on the string labels supplied
        max_label_size = max([len(label) for label in count_by_labels])
        new_count_by = np.full_like(count_by, dtype=f'<U{max_label_size}', fill_value="")
        for label in count_by_labels:
            new_count_by[count_by == count_by_labels[label]] = label
        count_by = new_count_by
        del new_count_by

    # create output array
    all_counts = np.full(
        (n_residues, self.n_frames, len(count_by_labels)),
        fill_value=0,
        dtype=np.uint8  # count can't be negative, and no lipid will have more than 255 neighbours
    )

    # For counts we need to know which column of the output array to add
    # counts to for each lipid type
    type_index = {value: index for index, value in enumerate(count_by_labels)}

    # Get counts at each frame
    for frame_index, neighbours in tqdm(enumerate(self.neighbours), total=self.n_frames):

        ref, neigh = neighbours.nonzero()
        # count identical (reference lipid, neighbour type) pairs
        unique, pair_counts = np.unique(
            [ref, [type_index[t] for t in count_by[neigh, frame_index]]],
            axis=1,
            return_counts=True,
        )

        r, t = unique  # reference index (r) and type index (t)
        all_counts[r, frame_index, t] = pair_counts

    # Assemble data for the DataFrame
    # (the key list is built once rather than once per lipid and frame)
    label_names = list(count_by_labels)
    labels = np.array(
        [label_names[type_index[value]] for lipid in count_by for value in lipid]
    )

    resindices = np.full(
        (n_residues, self.n_frames),
        fill_value=self.membrane.residues.resindices[:, np.newaxis],
    )
    resindices = resindices.reshape(n_residues * self.n_frames)

    frames = np.full((n_residues, self.n_frames), fill_value=self.frames)
    frames = frames.reshape(n_residues * self.n_frames)

    all_counts = all_counts.reshape(n_residues * self.n_frames, len(count_by_labels))
    total_counts = np.sum(all_counts, axis=1)

    # Create the dataframe
    counts = pd.DataFrame(
        data=labels,
        columns=["Label"]
    )

    counts["Resindex"] = resindices
    counts["Frame"] = frames

    for count_by_label in count_by_labels:
        counts[f"n{count_by_label}"] = all_counts.T[type_index[count_by_label]]

    counts["Total"] = total_counts

    # make every column except the label take on integer values
    for column in counts.columns[1:]:
        counts[column] = pd.to_numeric(counts[column])

    if return_enrichment is False:
        return counts

    # Otherwise create a second DataFrame containing the fractional enrichment
    unique_labels = list(type_index)
    neighbour_columns = [f"n{label}" for label in unique_labels]

    # We need to normalize the count by the mean number of neighbours of each species
    mean_neighbours_counts = np.asarray(
        [counts.groupby("Frame")[column].mean().values for column in neighbour_columns]
    )

    n_unique_labels, n_frames = mean_neighbours_counts.shape

    # create new output arrays
    labels = np.full((n_frames, n_unique_labels), fill_value=unique_labels).T.flatten()
    # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
    neighbour_enrichment = np.full(
        (n_frames * n_unique_labels, n_unique_labels), fill_value=np.nan
    )

    # and the new DataFrame
    enrichment = pd.DataFrame(
        data=labels,
        columns=["Label"]
    )

    enrichment["Frame"] = np.full(
        (n_unique_labels, n_frames), fill_value=counts["Frame"].unique()
    ).flatten()

    # Calculate the enrichment of each species at each frame
    for species_index, ref in enumerate(unique_labels):

        ref_mask = (counts.Label == ref).values
        species_neighbour_counts = counts.loc[ref_mask]
        species_neighbour_enrichment = (
            species_neighbour_counts.groupby("Frame")[neighbour_columns].mean()
            / mean_neighbours_counts.T
        )

        # rows of this species occupy one contiguous block of n_frames rows
        neighbour_enrichment[n_frames * species_index:n_frames * (species_index + 1)] = species_neighbour_enrichment

    # Finally add the enrichment values to the DataFrame
    for species_index, ref in enumerate([f"fe{label}" for label in unique_labels]):
        enrichment[ref] = neighbour_enrichment[:, species_index]

    return counts, enrichment
def count_neighbours(self, count_by=None, count_by_labels=None):
    """Count the number of each neighbour type at each frame.

    Parameters
    ----------
    count_by : numpy.ndarray, optional
        An array containing ordinal data describing each lipid at each
        frame. For example, it may be an array containing information on
        the ordered state of each lipid. Defaults to None, in which case
        the lipid species (resnames) are used for counting neighbours.
    count_by_labels : dict, optional
        A dictionary of labels describing what each unique value in
        `count_by` refers to, e.g if `count_by` contains information on the
        ordered state of each lipid at each frame, whereby 0 corresponds to
        disordered and 1 corresponds to ordered, then
        `count_by_labels = {'Ld': 0, 'Lo': 1}`. There **must** be precisely
        one label for each unique value in 'count_by'. If `count_by` is
        given but `count_by_labels` is left as `None`, the values in
        `count_by` will be used as the labels.

    Returns
    -------
    counts : pandas.DataFrame
        A DataFrame containing the following data for each lipid at each
        frame: lipid identifier (default is resname), lipid residue index,
        frame number, number of neighbours of each species (or of each
        type in 'count_by' if this is provided), as well as the total
        number of neighbours.

    Raises
    ------
    NoDataError
        If `.neighbours` is `None`.

    """
    if self.neighbours is None:
        raise NoDataError(
            ".neighbours attribute is None: use .run() before calling .count_neighbours()"
        )

    n_residues = self.membrane.n_residues

    # work out what identifies a lipid's "type" and how it is labelled
    if count_by is None:
        # Use lipid resnames to distinguish lipids
        count_by = np.full(
            (n_residues, self.n_frames),
            fill_value=self.membrane.residues.resnames[:, np.newaxis],
        )
        count_by_labels = {
            label: index
            for index, label in enumerate(np.unique(self.membrane.resnames))
        }
    elif count_by_labels is None:
        # Use values in 'count_by' as the labels
        count_by_labels = {
            label: index for index, label in enumerate(np.unique(count_by))
        }
    else:
        # the ordinal values in 'count_by' now take on the string labels supplied
        max_label_size = max([len(label) for label in count_by_labels])
        new_count_by = np.full_like(count_by,
                                    dtype=f'<U{max_label_size}',
                                    fill_value="")
        for label in count_by_labels:
            new_count_by[count_by == count_by_labels[label]] = label
        count_by = new_count_by
        del new_count_by

    # create output array
    all_counts = np.full(
        (n_residues, self.n_frames, len(count_by_labels)),
        fill_value=0,
        dtype=np.uint8  # count can't be negative, and no lipid will have more than 255 neighbours
    )

    # For counts we need to know which column of the output array to add
    # counts to for each lipid type
    type_index = {
        value: index
        for index, value in enumerate(count_by_labels)
    }

    # Get counts at each frame; each frame occupies a block of n_residues
    # columns of self.neighbours
    for frame_index in tqdm(range(self.n_frames)):

        frame_block = self.neighbours[
            :, frame_index * n_residues:(frame_index + 1) * n_residues]
        ref, neigh = frame_block.nonzero()
        # count identical (reference lipid, neighbour type) pairs
        unique, pair_counts = np.unique(
            [ref, [type_index[t] for t in count_by[neigh, frame_index]]],
            axis=1,
            return_counts=True)

        r, t = unique  # reference index (r) and type index (t)
        all_counts[r, frame_index, t] = pair_counts

    # Assemble data for the DataFrame
    # (the key list is built once rather than once per lipid and frame)
    label_names = list(count_by_labels)
    labels = np.array([
        label_names[type_index[value]] for lipid in count_by
        for value in lipid
    ])

    resindices = np.full(
        (n_residues, self.n_frames),
        fill_value=self.membrane.residues.resindices[:, np.newaxis])
    resindices = resindices.reshape(n_residues * self.n_frames)

    frames = np.full((n_residues, self.n_frames), fill_value=self.frames)
    frames = frames.reshape(n_residues * self.n_frames)

    # widen from uint8 so the DataFrame holds ordinary 64-bit integers
    all_counts = all_counts.reshape(n_residues * self.n_frames,
                                    len(count_by_labels)).astype(np.int64)
    total_counts = np.sum(all_counts, axis=1)

    # Create DataFrame column by column. Packing the string labels and the
    # integer columns into a single array with np.concatenate relies on
    # number-to-string promotion, which recent NumPy versions reject.
    df = pd.DataFrame(data=labels, columns=["Label"])
    df["Resindex"] = resindices
    df["Frame"] = frames
    for label, column_index in type_index.items():
        df[f"n{label}"] = all_counts[:, column_index]
    df["Total"] = total_counts

    # make every column except the label take on integer values
    for column in df.columns[1:]:
        df[column] = pd.to_numeric(df[column])

    return df