示例#1
0
 def check_dihedral_inputs(selections):
     """Check that every atom group given for a dihedral holds 4 atoms.

     Parameters
     ----------
     selections : iterable of dict
         Each element is a mapping whose values are sized collections of
         atoms (anything that supports ``len``).

     Raises
     ------
     SelectionError
         If any value in any mapping does not contain exactly 4 atoms. The
         message (also logged at error level) reports the size of the
         offending group.
     """
     for group in selections:
         for key in group.keys():
             n_selected = len(group[key])
             if n_selected != 4:
                 # Report the size of the offending atom group itself, not
                 # the number of keys in the enclosing mapping.
                 msg = ("Dihedral calculations require AtomGroups with "
                        "only 4 atoms, %s selected" % n_selected)
                 logger.error(msg)
                 raise SelectionError(msg)
示例#2
0
    def __init__(self,
                 atomgroup,
                 reference=None,
                 select='all',
                 groupselections=None,
                 filename="rmsd.dat",
                 weights=None,
                 tol_mass=0.1,
                 ref_frame=0,
                 **kwargs):
        # DEPRECATION: remove filename kwarg in 1.0
        r"""Set up the RMSD analysis and validate the selections.

        Parameters
        ----------
        atomgroup : AtomGroup or Universe
            Group of atoms for which the RMSD is calculated. If a trajectory is
            associated with the atoms then the computation iterates over the
            trajectory.
        reference : AtomGroup or Universe (optional)
            Group of reference atoms; if ``None`` then the current frame of
            `atomgroup` is used.
        select : str or dict or tuple (optional)
            The selection to operate on; can be one of:

            1. any valid selection string for
               :meth:`~MDAnalysis.core.groups.AtomGroup.select_atoms` that
               produces identical selections in `atomgroup` and `reference`; or

            2. a dictionary ``{'mobile': sel1, 'reference': sel2}`` where *sel1*
               and *sel2* are valid selection strings that are applied to
               `atomgroup` and `reference` respectively (the
               :func:`MDAnalysis.analysis.align.fasta2select` function returns such
               a dictionary based on a ClustalW_ or STAMP_ sequence alignment); or

            3. a tuple ``(sel1, sel2)``

            When using 2. or 3. with *sel1* and *sel2* then these selection strings
            are applied to `atomgroup` and `reference` respectively and should
            generate *groups of equivalent atoms*.  *sel1* and *sel2* can each also
            be a *list of selection strings* to generate a
            :class:`~MDAnalysis.core.groups.AtomGroup` with defined atom order as
            described under :ref:`ordered-selections-label`.

        groupselections : list (optional)
            A list of selections as described for `select`, with the difference
            that these selections are *always applied to the full universes*,
            i.e., ``atomgroup.universe.select_atoms(sel1)`` and
            ``reference.universe.select_atoms(sel2)``. Each selection describes
            additional RMSDs to be computed *after the structures have been
            superimposed* according to `select`. No additional fitting is
            performed. The output contains one additional column for each
            selection.

            .. Note:: Experimental feature. Only limited error checking
                      implemented.
        filename : str (optional)
            write RMSD into file with :meth:`RMSD.save`

            .. deprecated:: 0.19.0
               `filename` will be removed together with :meth:`save` in 1.0.

        weights : {"mass", ``None``} or array_like (optional)
             choose weights. With ``"mass"`` uses masses as weights; with ``None``
             weigh each atom equally. If a float array of the same length as
             `atomgroup` is provided, use each element of the `array_like` as a
             weight for the corresponding atom in `atomgroup`.
        tol_mass : float (optional)
             Reject match if the atomic masses for matched atoms differ by more
             than `tol_mass`.
        ref_frame : int (optional)
             frame index to select frame from `reference`
        verbose : bool (optional)
             Show detailed progress of the calculation if set to ``True``; the
             default is ``False``.

        Raises
        ------
        SelectionError
             If the selections from `atomgroup` and `reference` do not match.
        TypeError
             If `weights` is not of the appropriate type; see also
             :func:`MDAnalysis.lib.util.get_weights`
        ValueError
             If `weights` are not compatible with `atomgroup` (not the same
             length) or if it is not a 1D array (see
             :func:`MDAnalysis.lib.util.get_weights`).

             A :exc:`ValueError` is also raised if `weights` are not compatible
             with `groupselections`: only equal weights (``weights=None``) or
             mass-weighted (``weights="mass"``) are supported for additional
             `groupselections`.

        Notes
        -----
        The root mean square deviation :math:`\rho(t)` of a group of :math:`N`
        atoms relative to a reference structure as a function of time is
        calculated as

        .. math::

           \rho(t) = \sqrt{\frac{1}{N} \sum_{i=1}^N w_i \left(\mathbf{x}_i(t)
                                    - \mathbf{x}_i^{\text{ref}}\right)^2}

        The weights :math:`w_i` are calculated from the input weights `weights`
        :math:`w'_i` as relative to the mean of the input weights:

        .. math::

           w_i = \frac{w'_i}{\langle w' \rangle}

        The selected coordinates from `atomgroup` are optimally superimposed
        (translation and rotation) on the `reference` coordinates at each time step
        as to minimize the RMSD. Douglas Theobald's fast QCP algorithm
        [Theobald2005]_ is used for the rotational superposition and to calculate
        the RMSD (see :mod:`MDAnalysis.lib.qcprot` for implementation details).

        The class runs various checks on the input to ensure that the two atom
        groups can be compared. This includes a comparison of atom masses (i.e.,
        only the positions of atoms of the same mass will be considered to be
        correct for comparison). If masses should not be checked, just set
        `tol_mass` to a large value such as 1000.

        .. _ClustalW: http://www.clustal.org/
        .. _STAMP: http://www.compbio.dundee.ac.uk/manuals/stamp.4.2/


        See Also
        --------
        rmsd


        .. versionadded:: 0.7.7
        .. versionchanged:: 0.8
           `groupselections` added
        .. versionchanged:: 0.16.0
           Flexible weighting scheme with new `weights` keyword.
        .. deprecated:: 0.16.0
           Instead of ``mass_weighted=True`` (removal in 0.17.0) use new
           ``weights='mass'``; refactored to fit with AnalysisBase API
        .. versionchanged:: 0.17.0
           removed deprecated `mass_weighted` keyword; `groupselections`
           are *not* rotationally superimposed any more.
        .. deprecated:: 0.19.0
           `filename` will be removed in 1.0

        """
        super(RMSD, self).__init__(atomgroup.universe.trajectory, **kwargs)
        self.atomgroup = atomgroup
        self.reference = reference if reference is not None else self.atomgroup

        select = process_selection(select)
        self.groupselections = (
            [process_selection(s)
             for s in groupselections] if groupselections is not None else [])
        self.weights = weights
        self.tol_mass = tol_mass
        self.ref_frame = ref_frame
        self.filename = filename  # DEPRECATED in 0.19.0, remove in 1.0.0

        self.ref_atoms = self.reference.select_atoms(*select['reference'])
        self.mobile_atoms = self.atomgroup.select_atoms(*select['mobile'])

        if len(self.ref_atoms) != len(self.mobile_atoms):
            err = ("Reference and trajectory atom selections do "
                   "not contain the same number of atoms: "
                   "N_ref={0:d}, N_traj={1:d}".format(
                       self.ref_atoms.n_atoms, self.mobile_atoms.n_atoms))
            # No exception is being handled here, so use error() rather than
            # exception() (which would append a meaningless empty traceback).
            logger.error(err)
            raise SelectionError(err)
        logger.info("RMS calculation "
                    "for {0:d} atoms.".format(len(self.ref_atoms)))
        mass_mismatches = (np.absolute(
            (self.ref_atoms.masses - self.mobile_atoms.masses)) >
                           self.tol_mass)

        if np.any(mass_mismatches):
            # diagnostic output: show every atom pair whose masses differ by
            # more than tol_mass (the actual reason for the failure) as well
            # as pairs whose names differ.
            logger.error("Atoms: reference | mobile")
            for ar, at, mismatched in zip(self.ref_atoms, self.mobile_atoms,
                                          mass_mismatches):
                if mismatched or ar.name != at.name:
                    logger.error("{0!s:>4} {1:3d} {2!s:>3} {3!s:>3} {4:6.3f}"
                                 "  |  {5!s:>4} {6:3d} {7!s:>3} {8!s:>3}"
                                 " {9:6.3f}".format(ar.segid, ar.resid,
                                                    ar.resname, ar.name,
                                                    ar.mass, at.segid,
                                                    at.resid, at.resname,
                                                    at.name, at.mass))
            errmsg = ("Inconsistent selections, masses differ by more than "
                      "{0:f}; mis-matching atoms "
                      "are shown above.".format(self.tol_mass))
            logger.error(errmsg)
            raise SelectionError(errmsg)
        del mass_mismatches

        # TODO:
        # - make a group comparison a class that contains the checks above
        # - use this class for the *select* group and the additional
        #   *groupselections* groups each a dict with reference/mobile
        self._groupselections_atoms = [{
            'reference':
            self.reference.universe.select_atoms(*s['reference']),
            'mobile':
            self.atomgroup.universe.select_atoms(*s['mobile']),
        } for s in self.groupselections]
        # sanity check: each additional group must pair up atom by atom
        for igroup, (sel, atoms) in enumerate(
                zip(self.groupselections, self._groupselections_atoms)):
            if len(atoms['mobile']) != len(atoms['reference']):
                errmsg = (
                    "Group selection {0}: {1} | {2}: Reference and trajectory "
                    "atom selections do not contain the same number of atoms: "
                    "N_ref={3}, N_traj={4}".format(igroup, sel['reference'],
                                                   sel['mobile'],
                                                   len(atoms['reference']),
                                                   len(atoms['mobile'])))
                logger.error(errmsg)
                raise SelectionError(errmsg)

        # Explicitly check for "mass" because this option CAN
        # be used with groupselection. (get_weights() returns the mass array
        # for "mass"), so the string "mass" is kept as-is and everything else
        # is normalised through get_weights().
        if iterable(self.weights) or self.weights != "mass":
            self.weights = get_weights(self.mobile_atoms, self.weights)

        # cannot use arbitrary weight array (for superposition) with
        # groupselections because arrays will not match
        if (len(self.groupselections) > 0 and
            (iterable(self.weights) or self.weights not in ("mass", None))):
            raise ValueError("groupselections can only be combined with "
                             "weights=None or weights='mass', not a weight "
                             "array.")

        # initialized to None for testing the save function
        self.rmsd = None
示例#3
0
def get_matching_atoms(ag1, ag2, tol_mass=0.1, strict=False):
    """Return two atom groups with one-to-one matched atoms.

    The function takes two :class:`~MDAnalysis.core.AtomGroup.AtomGroup`
    instances *ag1* and *ag2* and returns two atom groups *g1* and *g2* that
    consist of atoms so that the mass of atom ``g1[0]`` is the same as the mass
    of atom ``g2[0]``, ``g1[1]`` and ``g2[1]`` etc.

    The current implementation is very simplistic and works on a per-residue basis:

    1. The two groups must contain the same number of residues.
    2. Any residues in each group that have differing number of atoms are discarded.
    3. The masses of corresponding atoms are compared, and if any masses differ
       by more than *tol_mass* the test is considered failed and a
       :exc:`SelectionError` is raised.

    The log file (see :func:`MDAnalysis.start_logging`) will contain detailed
    information about mismatches.

    :Arguments:
      *ag1*, *ag2*
         :class:`~MDAnalysis.core.AtomGroup.AtomGroup` instances that are compared
    :Keywords:
      *tol_mass*
         Reject if the atomic masses for matched atoms differ by more than
         *tol_mass* [0.1]
      *strict*
         ``True``
             Will raise :exc:`SelectionError` if a single atom does not
             match between the two selections.
         ``False`` [default]
             Will try to prepare a matching selection by dropping
             residues with non-matching atoms. See :func:`get_matching_atoms`
             for details.

    :Returns: Tuple ``(g1, g2)`` with :class:`~MDAnalysis.core.AtomGroup.AtomGroup` instances
              that match, atom by atom. The groups are either the original groups if all matches
              or slices of the original groups.

    :Raises: :exc:`SelectionError` if the number of residues does not match or if in the final
             matching masses differ by more than *tol_mass*.

    The algorithm could be improved by using e.g. the Needleman-Wunsch
    algorithm in :mod:`Bio.pairwise2` to align atoms in each residue (doing a
    global alignment is too expensive).

    .. versionadded:: 0.8

    .. versionchanged:: 0.10.0
       Renamed from :func:`check_same_atoms` to :func:`get_matching_atoms` and now returns
       matching atomgroups (possibly with residues removed)

    """

    if ag1.n_atoms != ag2.n_atoms:
        if ag1.n_residues != ag2.n_residues:
            # The whole message must be one parenthesised expression;
            # otherwise only the first literal is assigned and the formatted
            # remainder is silently thrown away.
            errmsg = (
                "Reference and trajectory atom selections do not contain "
                "the same number of atoms: \n"
                "atoms:    N_ref={0}, N_traj={1}\n"
                "and also not the same number of residues:\n"
                "residues: N_ref={2}, N_traj={3}\n"
                "\n"
                "(More details can be found in the log file "
                "which can be enabled with 'MDAnalysis.start_logging()')"
            ).format(ag1.n_atoms, ag2.n_atoms, ag1.n_residues, ag2.n_residues)
            # zip_longest is the Python 3 name, izip_longest the Python 2 one.
            zip_longest = (getattr(itertools, "zip_longest", None) or
                           itertools.izip_longest)
            dbgmsg = "mismatched residue numbers\n" + "\n".join(
                "{0} | {1}".format(r1, r2)
                for r1, r2 in zip_longest(ag1.resids, ag2.resids))
            logger.error(errmsg)
            logger.debug(dbgmsg)
            raise SelectionError(errmsg)
        else:
            msg = ("Reference and trajectory atom selections do not contain "
                   "the same number of atoms: \n"
                   "atoms:    N_ref={0}, N_traj={1}").format(
                       ag1.n_atoms, ag2.n_atoms)
            if strict:
                logger.error(msg)
                raise SelectionError(msg)

            # continue with trying to create a valid selection
            warnings.warn(msg +
                          "\nbut we attempt to create a valid selection.",
                          category=SelectionWarning)

        # continue with trying to salvage the selection:
        # - number of atoms is different
        # - number of residues is the same
        # We will remove residues with mismatching number of atoms (e.g. not resolved
        # in an X-ray structure)
        assert ag1.n_residues == ag2.n_residues

        # Alternatively, we could align all atoms but Needleman-Wunsch
        # pairwise2 consumes too much memory for thousands of characters in
        # each sequence. Perhaps a solution would be pairwise alignment per residue.
        #
        # aln_elem = Bio.pairwise2.align.globalms("".join([MDAnalysis.topology.core.guess_atom_element(n) for n in gref.atoms.names]),
        #    "".join([MDAnalysis.topology.core.guess_atom_element(n) for n in models[0].atoms.names]),
        #                               2, -1, -1, -0.1,
        #                               one_alignment_only=True)

        # For now, just remove the residues that don't have matching numbers
        rsize1 = np.array([r.n_atoms for r in ag1.residues])
        rsize2 = np.array([r.n_atoms for r in ag2.residues])
        rsize_mismatches = np.absolute(rsize1 - rsize2)
        mismatch_mask = (rsize_mismatches > 0)
        if np.any(mismatch_mask):
            # NOTE(review): with the control flow above, strict=True has
            # already raised by the time execution gets here, so this
            # diagnostics branch is not reached at present. It is kept so the
            # detailed report is available if the early raise is ever relaxed.
            if strict:
                # diagnostics
                mismatch_resindex = np.arange(ag1.n_residues)[mismatch_mask]

                def log_mismatch(number,
                                 ag,
                                 rsize,
                                 mismatch_resindex=mismatch_resindex):
                    logger.error("Offending residues: group {0}: {1}".format(
                        number, ", ".join([
                            "{0[0]}{0[1]} ({0[2]})".format(r)
                            for r in zip(
                                ag.resnames[mismatch_resindex],
                                ag.resids[mismatch_resindex],
                                rsize[mismatch_resindex])
                        ])))

                logger.error(
                    "Found {0} residues with non-matching numbers of atoms (#)"
                    .format(mismatch_mask.sum()))
                log_mismatch(1, ag1, rsize1)
                log_mismatch(2, ag2, rsize2)

                raise SelectionError(
                    "Different number of atoms in some residues. "
                    "(Use strict=False to attempt using matching atoms only.)")

            def get_atoms_byres(g, match_mask=np.logical_not(mismatch_mask)):
                # not pretty... but need to do things on a per-atom basis in order
                # to preserve original selection
                ag = g.atoms
                good = ag.resids[match_mask]
                resids = np.array([a.resid for a in ag])  # resid for each atom
                ix_good = np.in1d(resids,
                                  good)  # boolean array for all matching atoms
                return ag[np.arange(len(ag))
                          [ix_good]]  # workaround for missing boolean indexing

            _ag1 = get_atoms_byres(ag1)
            _ag2 = get_atoms_byres(ag2)

            # diagnostics
            # (ugly workaround for missing boolean indexing of AtomGroup)
            # note: ag[arange(len(ag))[boolean]] is ~2x faster than ag[where[boolean]]
            mismatch_resindex = np.arange(ag1.n_residues)[mismatch_mask]
            logger.warning(
                "Removed {0} residues with non-matching numbers of atoms".
                format(mismatch_mask.sum()))
            logger.debug("Removed residue ids: group 1: {0}".format(
                ag1.resids[mismatch_resindex]))
            logger.debug("Removed residue ids: group 2: {0}".format(
                ag2.resids[mismatch_resindex]))
            # replace after logging (still need old ag1 and ag2 for diagnostics)
            ag1 = _ag1
            ag2 = _ag2
            del _ag1, _ag2

    mass_mismatches = (np.absolute(ag1.masses - ag2.masses) > tol_mass)
    if np.any(mass_mismatches):
        # Test 2 failed.
        # diagnostic output:
        # (ugly workaround because boolean indexing is not yet working for atomgroups)
        assert ag1.n_atoms == ag2.n_atoms
        mismatch_atomindex = np.arange(ag1.n_atoms)[mass_mismatches]

        logger.error("Atoms: reference | trajectory")
        for ar, at in zip(ag1[mismatch_atomindex], ag2[mismatch_atomindex]):
            logger.error(
                "{0!s:>4} {1:3d} {2!s:>3} {3!s:>3} {4:6.3f}  |  {5!s:>4} {6:3d} {7!s:>3} {8!s:>3} {9:6.3f}"
                .format(ar.segid, ar.resid, ar.resname, ar.name, ar.mass,
                        at.segid, at.resid, at.resname, at.name, at.mass))
        errmsg = ("Inconsistent selections, masses differ by more than {0}; "
                  "mis-matching atoms are shown above.").format(tol_mass)
        logger.error(errmsg)
        raise SelectionError(errmsg)
    return ag1, ag2
示例#4
0
    def __init__(self,
                 atomgroup,
                 reference=None,
                 select='all',
                 groupselections=None,
                 filename="rmsd.dat",
                 mass_weighted=None,
                 weights=None,
                 tol_mass=0.1,
                 ref_frame=0,
                 **kwargs):
        """Set up the RMSD analysis and validate the selections.

        Parameters
        ----------
        atomgroup : AtomGroup or Universe
            Object whose ``universe`` supplies the trajectory and on which
            the mobile selection is made.
        reference : AtomGroup or Universe (optional)
            Object on which the reference selections are made; if ``None``
            the universe of `atomgroup` is used.
        select : str or dict or tuple (optional)
            Selection passed through ``process_selection``; the result is
            indexed with the keys ``'reference'`` and ``'mobile'``.
        groupselections : list (optional)
            Additional selections, each processed like `select`.
        filename : str (optional)
            Stored on the instance as ``self.filename``.
        mass_weighted : bool (optional)
            Deprecated; a true value is translated to ``weights='mass'`` and
            a :exc:`DeprecationWarning` is issued whenever it is not ``None``.
        weights : (optional)
            Stored on the instance as ``self.weights``.
        tol_mass : float (optional)
            Reject the match if the masses of paired atoms differ by more
            than `tol_mass`.
        ref_frame : int (optional)
            Stored on the instance as ``self.ref_frame``.
        **kwargs
            Forwarded to the base class initializer.

        Raises
        ------
        SelectionError
            If the reference and mobile selections (or any pair of group
            selections) differ in number of atoms, or if the masses of paired
            atoms differ by more than `tol_mass`.
        """
        super(RMSD, self).__init__(atomgroup.universe.trajectory, **kwargs)
        self.universe = atomgroup.universe
        self.reference = reference if reference is not None else self.universe

        select = process_selection(select)
        self.groupselections = (
            [process_selection(s)
             for s in groupselections] if groupselections is not None else [])
        if mass_weighted is not None:
            warnings.warn(
                "mass_weighted is a deprecated argument. Please use "
                "'weights=\"mass\"' instead. Will be removed in 0.17.0",
                category=DeprecationWarning)
            if mass_weighted:
                weights = 'mass'
        self.weights = weights
        self.tol_mass = tol_mass
        self.ref_frame = ref_frame
        self.filename = filename

        self.ref_atoms = self.reference.select_atoms(*select['reference'])
        self.mobile_atoms = self.universe.select_atoms(*select['mobile'])

        if len(self.ref_atoms) != len(self.mobile_atoms):
            err = ("Reference and trajectory atom selections do "
                   "not contain the same number of atoms: "
                   "N_ref={0:d}, N_traj={1:d}".format(
                       self.ref_atoms.n_atoms, self.mobile_atoms.n_atoms))
            # No exception is being handled here, so use error() rather than
            # exception() (which would append a meaningless empty traceback).
            logger.error(err)
            raise SelectionError(err)
        logger.info("RMS calculation "
                    "for {0:d} atoms.".format(len(self.ref_atoms)))
        mass_mismatches = (np.absolute(
            (self.ref_atoms.masses - self.mobile_atoms.masses)) >
                           self.tol_mass)

        if np.any(mass_mismatches):
            # diagnostic output: show every atom pair whose masses differ by
            # more than tol_mass (the actual reason for the failure) as well
            # as pairs whose names differ.
            logger.error("Atoms: reference | mobile")
            for ar, at, mismatched in zip(self.ref_atoms, self.mobile_atoms,
                                          mass_mismatches):
                if mismatched or ar.name != at.name:
                    logger.error("{0!s:>4} {1:3d} {2!s:>3} {3!s:>3} {4:6.3f}"
                                 "  |  {5!s:>4} {6:3d} {7!s:>3} {8!s:>3}"
                                 " {9:6.3f}".format(ar.segid, ar.resid,
                                                    ar.resname, ar.name,
                                                    ar.mass, at.segid,
                                                    at.resid, at.resname,
                                                    at.name, at.mass))
            errmsg = ("Inconsistent selections, masses differ by more than "
                      "{0:f}; mis-matching atoms "
                      "are shown above.".format(self.tol_mass))
            logger.error(errmsg)
            raise SelectionError(errmsg)
        del mass_mismatches

        # TODO:
        # - make a group comparison a class that contains the checks above
        # - use this class for the *select* group and the additional
        #   *groupselections* groups each a dict with reference/mobile
        self._groupselections_atoms = [{
            'reference':
            self.reference.select_atoms(*s['reference']),
            'mobile':
            self.universe.select_atoms(*s['mobile']),
        } for s in self.groupselections]
        # sanity check: each additional group must pair up atom by atom
        for igroup, (sel, atoms) in enumerate(
                zip(self.groupselections, self._groupselections_atoms)):
            if len(atoms['mobile']) != len(atoms['reference']):
                errmsg = (
                    "Group selection {0}: {1} | {2}: Reference and trajectory "
                    "atom selections do not contain the same number of atoms: "
                    "N_ref={3}, N_traj={4}".format(igroup, sel['reference'],
                                                   sel['mobile'],
                                                   len(atoms['reference']),
                                                   len(atoms['mobile'])))
                logger.error(errmsg)
                raise SelectionError(errmsg)
        # initialized to None for testing the save function
        self.rmsd = None
示例#5
0
    def __init__(self, traj, reference=None, select='all', groupselections=None, filename="rmsd.dat",
                 mass_weighted=False, tol_mass=0.1, ref_frame=0):
        """Setting up the RMSD analysis.

        The RMSD will be computed between *select* and *reference* for
        all frames in the trajectory in *traj*.

        :Arguments:
          *traj*
             universe (:class:`MDAnalysis.Universe` object) that contains a
             trajectory
          *reference*
             reference coordinates; :class:`MDAnalysis.Universe` object; if ``None``
             the *traj* is used (uses the current time step of the object) [``None``]
          *select*
             The selection to operate on; can be one of:

             1. any valid selection string for
                :meth:`~MDAnalysis.core.AtomGroup.AtomGroup.select_atoms` that produces identical
                selections in *mobile* and *reference*; or
             2. a dictionary ``{'mobile':sel1, 'reference':sel2}`` (the
                :func:`MDAnalysis.analysis.align.fasta2select` function returns such a
                dictionary based on a ClustalW_ or STAMP_ sequence alignment); or
             3. a tuple ``(sel1, sel2)``

             When using 2. or 3. with *sel1* and *sel2* then these selections can also each be
             a list of selection strings (to generate an AtomGroup with defined atom order as
             described under :ref:`ordered-selections-label`).
          *groupselections*
             A list of selections as described for *select*. Each selection describes additional
             RMSDs to be computed *after the structures have been superimposed* according to *select*.
             The output contains one additional column for each selection. [``None``]

             .. Note:: Experimental feature. Only limited error checking implemented.
          *filename*
             If set, *filename* can be used to write the results with :meth:`RMSD.save`
             [``"rmsd.dat"``]
          *mass_weighted*
             do a mass-weighted RMSD fit
          *tol_mass*
             Reject match if the atomic masses for matched atoms differ by more than
             *tol_mass* [0.1]
          *ref_frame*
             frame index to select frame from *reference* [0]

        :Raises: :exc:`SelectionError` if the reference and trajectory selections (or any
                 pair of group selections) differ in number of atoms, or if masses of
                 paired atoms differ by more than *tol_mass*.

        .. _ClustalW: http://www.clustal.org/
        .. _STAMP: http://www.compbio.dundee.ac.uk/manuals/stamp.4.2/

        .. versionadded:: 0.7.7
        .. versionchanged:: 0.8
           *groupselections* added
        """
        self.universe = traj
        if reference is None:
            self.reference = self.universe
        else:
            self.reference = reference
        self.select = _process_selection(select)
        if groupselections is not None:
            self.groupselections = [_process_selection(s) for s in groupselections]
        else:
            self.groupselections = []
        self.mass_weighted = mass_weighted
        self.tol_mass = tol_mass
        self.ref_frame = ref_frame
        self.filename = filename

        self.ref_atoms = self.reference.select_atoms(*self.select['reference'])
        self.traj_atoms = self.universe.select_atoms(*self.select['mobile'])
        if len(self.ref_atoms) != len(self.traj_atoms):
            errmsg = ("Reference and trajectory atom selections do not contain "
                      "the same number of atoms: N_ref={0:d}, N_traj={1:d}".format(
                          len(self.ref_atoms), len(self.traj_atoms)))
            # logger.exception() requires a message and is only meaningful
            # inside an exception handler; calling it bare raised TypeError
            # and hid the SelectionError below.
            logger.error(errmsg)
            raise SelectionError(errmsg)
        logger.info("RMS calculation for {0:d} atoms.".format(len(self.ref_atoms)))
        mass_mismatches = (np.absolute(self.ref_atoms.masses - self.traj_atoms.masses) > self.tol_mass)
        if np.any(mass_mismatches):
            # diagnostic output: show every atom pair whose masses differ by
            # more than tol_mass (the actual reason for the failure) as well
            # as pairs whose names differ.
            logger.error("Atoms: reference | trajectory")
            for ar, at, mismatched in zip(self.ref_atoms, self.traj_atoms, mass_mismatches):
                if mismatched or ar.name != at.name:
                    logger.error(
                        "{0!s:>4} {1:3d} {2!s:>3} {3!s:>3} {4:6.3f}  |  {5!s:>4} {6:3d} {7!s:>3} {8!s:>3} {9:6.3f}".format(
                            ar.segid, ar.resid, ar.resname, ar.name, ar.mass,
                            at.segid, at.resid, at.resname, at.name, at.mass))
            errmsg = "Inconsistent selections, masses differ by more than {0:f}; mis-matching atoms are shown above.".format(
                self.tol_mass)
            logger.error(errmsg)
            raise SelectionError(errmsg)
        del mass_mismatches

        # TODO:
        # - make a group comparison a class that contains the checks above
        # - use this class for the *select* group and the additional *groupselections* groups
        # each a dict with reference/mobile
        self.groupselections_atoms = [
            {
                'reference': self.reference.select_atoms(*s['reference']),
                'mobile': self.universe.select_atoms(*s['mobile']),
            }
            for s in self.groupselections]
        # sanity check: each additional group must pair up atom by atom
        for igroup, (sel, atoms) in enumerate(zip(self.groupselections, self.groupselections_atoms)):
            if len(atoms['mobile']) != len(atoms['reference']):
                # Format the whole message at once; applying .format() to only
                # the second literal left "{0}: {1} | {2}" unsubstituted.
                errmsg = (
                    "Group selection {0}: {1} | {2}: Reference and trajectory atom selections do not contain "
                    "the same number of atoms: N_ref={3}, N_traj={4}").format(
                        igroup, sel['reference'], sel['mobile'], len(atoms['reference']), len(atoms['mobile']))
                logger.error(errmsg)
                raise SelectionError(errmsg)

        self.rmsd = None
def get_matching_atoms(ag1, ag2, tol_mass=0.1, strict=False):
    """Return two atom groups with one-to-one matched atoms.

    The function takes two :class:`~MDAnalysis.core.groups.AtomGroup`
    instances `ag1` and `ag2` and returns two atom groups `g1` and `g2` that
    consist of atoms so that the mass of atom ``g1[0]`` is the same as the mass
    of atom ``g2[0]``, ``g1[1]`` and ``g2[1]`` etc.

    The current implementation is very simplistic and works on a per-residue
    basis:

    1. The two groups must contain the same number of residues.
    2. Any residues in each group that have differing number of atoms are
       discarded.
    3. The masses of corresponding atoms are compared, and if any masses
       differ by more than `tol_mass` the test is considered failed and a
       :exc:`SelectionError` is raised.

    Steps 1 and 2 are only carried out when `ag1` and `ag2` contain a
    different number of atoms; step 3 is always carried out.

    The log file (see :func:`MDAnalysis.start_logging`) will contain detailed
    information about mismatches.

    Parameters
    ----------
    ag1 : AtomGroup
        First :class:`~MDAnalysis.core.groups.AtomGroup` instance that is
        compared
    ag2 : AtomGroup
        Second :class:`~MDAnalysis.core.groups.AtomGroup` instance that is
        compared
    tol_mass : float (optional)
         Reject if the atomic masses for matched atoms differ by more than
         `tol_mass` [0.1]
    strict : bool (optional)
        ``True``
            Will raise :exc:`SelectionError` if a single atom does not
            match between the two selections.
        ``False`` [default]
            Will try to prepare a matching selection by dropping
            residues with non-matching atoms, as described in the list of
            steps above.

    Returns
    -------
    (g1, g2) : tuple
        Tuple with :class:`~MDAnalysis.core.groups.AtomGroup`
        instances that match, atom by atom. The groups are either the
        original groups if all matched or slices of the original
        groups.

    Raises
    ------
    :exc:`SelectionError`
        Error raised if the number of residues does not match, if the number
        of atoms differs and `strict` is ``True``, if dropping mismatching
        residues leaves an empty selection or groups of different size, or
        if in the final matching masses differ by more than `tol_mass`.

    Warns
    -----
    :exc:`SelectionWarning`
        Issued when the number of atoms differs and `strict` is ``False``,
        i.e., when the residue-dropping heuristic is attempted.

    Notes
    -----
    The algorithm could be improved by using e.g. the Needleman-Wunsch
    algorithm in :mod:`Bio.pairwise2` to align atoms in each residue (doing a
    global alignment is too expensive).

    .. versionadded:: 0.8

    .. versionchanged:: 0.10.0
       Renamed from :func:`check_same_atoms` to
       :func:`get_matching_atoms` and now returns matching atomgroups
       (possibly with residues removed)

    """

    if ag1.n_atoms != ag2.n_atoms:
        if ag1.n_residues != ag2.n_residues:
            errmsg = ("Reference and trajectory atom selections do not contain "
                      "the same number of atoms: \n"
                      "atoms:    N_ref={0}, N_traj={1}\n"
                      "and also not the same number of residues:\n"
                      "residues: N_ref={2}, N_traj={3}").format(
                          ag1.n_atoms, ag2.n_atoms,
                          ag1.n_residues, ag2.n_residues)
            logger.error(errmsg)
            raise SelectionError(errmsg)

        # From here on: the number of atoms differs but the number of
        # residues is the same.
        msg = ("Reference and trajectory atom selections do not contain "
               "the same number of atoms: \n"
               "atoms:    N_ref={0}, N_traj={1}").format(
                   ag1.n_atoms, ag2.n_atoms)

        # Compare the two groups residue by residue. Aligning all atoms with
        # Needleman-Wunsch (Bio.pairwise2) would be an alternative but
        # consumes too much memory for thousands of characters per sequence;
        # perhaps a per-residue pairwise alignment would be a solution.
        #
        # For now, just find the residues that don't have matching numbers.
        # NOTE: This can create empty selections, e.g., when comparing a
        #       structure with hydrogens to a PDB structure without hydrogens.
        rsize1 = np.array([r.atoms.n_atoms for r in ag1.residues])
        rsize2 = np.array([r.atoms.n_atoms for r in ag2.residues])
        mismatch_mask = (rsize1 != rsize2)
        has_mismatches = np.any(mismatch_mask)
        # (workaround for missing boolean indexing of AtomGroup)
        # note: ag[arange(len(ag))[boolean]] is ~2x faster than
        # ag[where[boolean]]
        mismatch_resindex = np.arange(ag1.n_residues)[mismatch_mask]

        if strict:
            # The strict exit happens only after the residue comparison so
            # that the offending residues can be reported in the log; the
            # raised exception and its message are the same as before.
            if has_mismatches:
                def log_mismatch(number, ag, rsize):
                    # mismatch_resindex indexes residues, therefore the
                    # per-residue arrays of ag.residues must be used
                    residues = ag.residues
                    offenders = zip(residues.resnames[mismatch_resindex],
                                    residues.resids[mismatch_resindex],
                                    rsize[mismatch_resindex])
                    logger.error("Offending residues: group {0}: {1}".format(
                        number,
                        ", ".join(["{0[0]}{0[1]} ({0[2]})".format(r)
                                   for r in offenders])))

                logger.error("Found {0} residues with non-matching numbers of atoms (#)".format(
                    mismatch_mask.sum()))
                log_mismatch(1, ag1, rsize1)
                log_mismatch(2, ag2, rsize2)
                logger.error("Different number of atoms in some residues. "
                             "(Use strict=False to attempt using matching atoms only.)")
            logger.error(msg)
            raise SelectionError(msg)

        # continue with trying to create a valid selection
        msg += ("\nbut we attempt to create a valid selection " +
                "(use strict=True to disable this heuristic).")
        logger.info(msg)
        warnings.warn(msg, category=SelectionWarning)

        # We will remove residues with mismatching number of atoms (e.g. not
        # resolved in an X-ray structure)
        if has_mismatches:
            match_mask = np.logical_not(mismatch_mask)

            def get_atoms_byres(g):
                # not pretty... but need to do things on a per-atom basis in
                # order to preserve original selection
                ag = g.atoms
                good = ag.residues.resids[match_mask]  # resid for each residue
                resids = ag.resids                     # resid for each atom
                # NOTE(review): residues are identified by resid here; if a
                # group contains several residues sharing one resid the
                # lookup is ambiguous -- confirm whether residue indices
                # should be used instead. A resulting size mismatch is
                # caught by the explicit check before the mass comparison.
                # boolean array for all matching atoms
                ix_good = np.in1d(resids, good)
                return ag[ix_good]

            matched1 = get_atoms_byres(ag1)
            matched2 = get_atoms_byres(ag2)

            # diagnostics (still need the old ag1 and ag2 here)
            logger.warning("Removed {0} residues with non-matching numbers of atoms"
                           .format(mismatch_mask.sum()))
            logger.debug("Removed residue ids: group 1: {0}"
                         .format(ag1.residues.resids[mismatch_resindex]))
            logger.debug("Removed residue ids: group 2: {0}"
                         .format(ag2.residues.resids[mismatch_resindex]))
            # replace after logging
            ag1, ag2 = matched1, matched2

            # stop if we created empty selections (by removing ALL residues...)
            if ag1.n_atoms == 0 or ag2.n_atoms == 0:
                errmsg = ("Failed to automatically find matching atoms: created empty selections. " +
                          "Try to improve your selections for mobile and reference.")
                logger.error(errmsg)
                raise SelectionError(errmsg)

    # check again because the residue matching heuristic is not very
    # good and can easily be misled (e.g., when one of the selections
    # had fewer atoms but the residues in mobile and reference have
    # each the same number).
    # The sizes are compared explicitly instead of waiting for a ValueError
    # from the subtraction below: NumPy broadcasts a length-1 array against
    # any other length without raising, which would let groups of unequal
    # size slip through.
    if ag1.n_atoms != ag2.n_atoms:
        errmsg = ("Failed to find matching atoms: len(reference) = {}, len(mobile) = {} " +
                  "Try to improve your selections for mobile and reference.").format(
                      ag1.n_atoms, ag2.n_atoms)
        logger.error(errmsg)
        raise SelectionError(errmsg)

    mass_mismatches = (np.absolute(ag1.masses - ag2.masses) > tol_mass)

    if np.any(mass_mismatches):
        # Test 2 failed.
        # diagnostic output: one line per pair of atoms whose masses differ
        logger.error("Atoms: reference | trajectory")
        for ar, at in zip(ag1[mass_mismatches], ag2[mass_mismatches]):
            logger.error(
                "{0!s:>4} {1:3d} {2!s:>3} {3!s:>3} {4:6.3f}  |  {5!s:>4} {6:3d} {7!s:>3} {8!s:>3} {9:6.3f}".format(
                    ar.segid,
                    ar.resid,
                    ar.resname,
                    ar.name,
                    ar.mass,
                    at.segid,
                    at.resid,
                    at.resname,
                    at.name,
                    at.mass))
        errmsg = ("Inconsistent selections, masses differ by more than {0}; "
                  "mis-matching atoms are shown above.").format(tol_mass)
        logger.error(errmsg)
        raise SelectionError(errmsg)

    return ag1, ag2