示例#1
0
class RotorFilter(BasicSettings, CustomWorkflowComponent):
    """
    Remove molecules that contain more rotatable bonds than a configurable limit.

    Note:
        The rotatable bonds are counted by calling the `find_rotatable_bonds` method on each molecule
        handed to the component.
    """

    component_name = "RotorFilter"
    component_description = "Filter the molecules based on the maximum number of allowed rotatable bonds."
    component_fail_message = "The molecule has too many rotatable bonds."

    maximum_rotors: int = Field(
        4,
        description="The maximum number of rotatable bonds allowed in the molecule.",
    )
    _properties = ComponentProperties(
        process_parallel=True,
        produces_duplicates=False,
    )

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Sort the molecules into passed and filtered groups by their rotatable bond count.

        A molecule passes when its number of rotatable bonds is less than or equal to `maximum_rotors`.

        Parameters:
            molecules: The list of molecules the component should be applied on.

        Returns:
            The result object created by `_create_result`, holding the molecules that were added and
            the molecules that were filtered.
        """

        outcome = self._create_result()
        rotor_limit = self.maximum_rotors

        for candidate in molecules:
            rotor_count = len(candidate.find_rotatable_bonds())
            if rotor_count <= rotor_limit:
                outcome.add_molecule(candidate)
            else:
                outcome.filter_molecule(candidate)

        return outcome
 # NOTE(review): this definition takes `cls` but no decorator is visible, and its indentation does
 # not line up with the class body above it -- confirm which class it belongs to before relying on it.
 def properties(cls) -> ComponentProperties:
     # Builds a ComponentProperties with process_parallel=True and produces_duplicates=True.
     return ComponentProperties(process_parallel=True,
                                produces_duplicates=True)
示例#3
0
class ElementFilter(BasicSettings, CustomWorkflowComponent):
    """
    Filter the molecules based on a list of allowed elements.

    Note:
        The `allowed_elements` attribute can take a list of either symbols or atomic numbers. Symbols are
        resolved to atomic numbers in `_apply_init` before the filter is applied.

    Example:
        Using atomic symbols or atomic numbers in components.

        ```python
        >>> from openff.qcsubmit.workflow_components import ElementFilter
        >>> efil = ElementFilter()
        # set the allowed elements to H,C,N,O
        >>> efil.allowed_elements = ['H', 'C', 'N', 'O']
        >>> efil.allowed_elements = [1, 6, 7, 8]
        ```
    """

    component_name = "ElementFilter"
    component_description = (
        "Filter out molecules who contain elements not in the allowed element list"
    )
    component_fail_message = (
        "Molecule contained elements not in the allowed elements list")

    allowed_elements: List[Union[int, str]] = Field(
        [
            "H",
            "C",
            "N",
            "O",
            "F",
            "P",
            "S",
            "Cl",
            "Br",
            "I",
        ],
        description=
        "The list of allowed elements as symbols or atomic number ints.",
    )
    _properties = ComponentProperties(process_parallel=True,
                                      produces_duplicates=False)

    @validator("allowed_elements", each_item=True)
    def check_allowed_elements(cls, element: Union[str,
                                                   int]) -> Union[str, int]:
        """
        Check that each item can be cast to a valid element.

        Integers are accepted without any lookup; anything else is looked up as an element symbol.

        Parameters:
            element: The element that should be checked.

        Returns:
            The element exactly as it was passed in.

        Raises:
            KeyError: If the symbol passed could not be converted into a valid element.
        """
        from simtk.openmm.app import Element

        if isinstance(element, int):
            return element

        try:
            # the lookup is only done for validation, the result itself is not needed
            Element.getBySymbol(element)
        except KeyError:
            raise KeyError(
                f"An element could not be determined from symbol {element}, please enter symbols only."
            )
        return element

    def _apply_init(self, result: ComponentResult) -> None:
        """
        Cache the allowed elements as atomic numbers so symbols are only resolved once.

        Parameters:
            result: Not used by this component.
        """

        from simtk.openmm.app import Element

        self._cache["elements"] = [
            Element.getBySymbol(ele).atomic_number
            if isinstance(ele, str) else ele for ele in self.allowed_elements
        ]

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Pass every molecule whose atoms all have an allowed atomic number and filter the rest.

        Parameters:
            molecules: The list of molecules the component should be applied on.

        Returns:
            The result object created by `_create_result`, holding the molecules that were added and
            the molecules that were filtered.
        """

        result = self._create_result()

        # Build the lookup once; a set gives constant time membership checks for every atom.
        allowed_numbers = set(self._cache["elements"])

        for molecule in molecules:
            # a molecule without atoms passes, as there is nothing that could be disallowed
            if all(atom.atomic_number in allowed_numbers
                   for atom in molecule.atoms):
                result.add_molecule(molecule)
            else:
                result.filter_molecule(molecule)

        return result

    def provenance(self) -> Dict:
        """
        Generate version information for all of the software used during the running of this component.

        Returns:
            The provenance dictionary of the parent class with the OpenMM version added under the
            `openmm_elements` key.

        Note:
            The element class in OpenMM is used to match the elements so the OpenMM version is given.
        """

        from simtk import openmm

        provenance = super().provenance()
        provenance["openmm_elements"] = openmm.__version__

        return provenance
示例#4
0
class RMSDCutoffConformerFilter(BasicSettings, CustomWorkflowComponent):
    """
    Prunes conformers from a molecule that are less than a specified RMSD from
    a conformer which has already been kept.

    Note:
        The default `cutoff` of -1.0 disables the pruning, as conformers are only compared when the
        cutoff is greater than or equal to zero.
    """

    # standard components which must be defined
    component_name = "RMSDCutoffConformerFilter"
    component_description = (
        "Filter conformations for the given molecules using a RMSD cutoff")
    component_fail_message = "Could not filter the conformers using RMSD"

    # custom components for this class
    cutoff: float = Field(-1.0, description="The RMSD cut off in angstroms.")
    _properties = ComponentProperties(process_parallel=True,
                                      produces_duplicates=False)

    def _prune_conformers(self, molecule: Molecule) -> None:
        """
        Remove conformers which are closer than `cutoff` to an earlier conformer that was kept.

        The molecule is modified in place by replacing its `_conformers` list. Nothing is changed when
        the molecule has fewer than two conformers or the cutoff is negative.

        Parameters:
            molecule: The molecule whose conformers should be pruned.
        """

        no_conformers: int = molecule.n_conformers

        # Written as a negated positive check so that a NaN cutoff also disables the pruning.
        if not (no_conformers > 1 and self.cutoff >= 0.0):
            return

        # Tracks which conformers are kept. Once a conformer is flagged for removal it is never
        # used as a reference and never needs to be compared again.
        keep: List[bool] = [True] * no_conformers

        # Needed to get the aligned best-fit RMSD
        rdmol = molecule.to_rdkit()

        # The reference conformer for the RMSD calculation
        for j in range(no_conformers - 1):

            # A previous pass found this conformer too close to another one, so skip it entirely
            if not keep[j]:
                continue

            # since k starts from j+1, only the upper triangle of the comparisons is visited (j < k)
            for k in range(j + 1, no_conformers):

                # already flagged for removal, another alignment can not change the outcome
                if not keep[k]:
                    continue

                if AlignMol(rdmol, rdmol, k, j) < self.cutoff:
                    keep[k] = False

        molecule._conformers = [
            conformer
            for conformer, keep_conformer in zip(molecule.conformers, keep)
            if keep_conformer
        ]

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Prune the conformers of each molecule, filtering out molecules which have no conformers at all.

        Parameters:
            molecules: The list of molecules the component should be applied on.

        Returns:
            The result object created by `_create_result`, holding the molecules that were added and
            the molecules that were filtered.
        """

        result = self._create_result()

        for molecule in molecules:

            if molecule.n_conformers == 0:
                result.filter_molecule(molecule)
            else:
                self._prune_conformers(molecule)
                result.add_molecule(molecule)

        return result
示例#5
0
class SmartsFilter(BasicSettings, CustomWorkflowComponent):
    """
    Filters molecules based on if they contain certain smarts substructures.

    Note:
        * The smarts patterns used for filtering must contain at least one numerically tagged atom.
        * If None is passed to the allowed list all molecules that dont match a filter pattern will be passed.
        * If tag_dihedrals is set to true every match of an allowed pattern is added to a `TorsionIndexer`
          which is stored on the molecule under the `dihedrals` property.
    """

    component_name = "SmartsFilter"
    component_description = "Filter molecules based on the given smarts patterns."
    component_fail_message = (
        "The molecule did/didn't contain the given smarts patterns.")
    _properties = ComponentProperties(process_parallel=True,
                                      produces_duplicates=False)

    allowed_substructures: Optional[List[str]] = Field(
        None,
        description=
        "The list of allowed substructures which should be tagged with indicies.",
    )
    filtered_substructures: Optional[List[str]] = Field(
        None,
        description="The list of substructures which should be filtered.")
    tag_dihedrals: bool = Field(
        False,
        description=
        "If any dihedrals included in the allowed smarts should also be tagged for torsion driving.",
    )

    @validator("allowed_substructures",
               "filtered_substructures",
               each_item=True)
    def _check_environments(cls, environment):
        """
        Check the string passed is valid by trying to create a ChemicalEnvironment in the toolkit.

        Parameters:
            environment: A single smarts pattern from one of the substructure lists.

        Returns:
            The pattern unchanged when it could be parsed and contains a tagged atom.

        Raises:
            SMIRKSParsingError: If the pattern does not contain any numerically tagged atoms. Errors
                raised while building the `ChemicalEnvironment` are not caught here.
        """

        # try and make a new chemical environment checking for parse errors
        _ = ChemicalEnvironment(smirks=environment)

        # A tag is a colon followed by one or more digits and the closing bracket of the atom, e.g.
        # `[C:1]` or `[#6:12]`. The digits are repeated, not the bracket, so multi-digit tags are
        # recognised as well.
        if re.search(r":[0-9]+\]", environment) is not None:
            return environment

        raise SMIRKSParsingError(
            "The smarts pattern passed had no tagged atoms please tag the atoms in the "
            "substructure you wish to include/exclude.")

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Apply the filter to the input list of molecules removing those that match the filtered set or do not contain an
        allowed substructure.

        Parameters:
            molecules: The list of molecules the component should be applied on.

        Returns:
            The result object created by `_create_result`, holding the molecules that were added and
            the molecules that were filtered.
        """

        result = self._create_result()

        if self.allowed_substructures is None:
            # no allowed list means every molecule passes the first stage
            for molecule in molecules:
                result.add_molecule(molecule=molecule)

        else:
            for molecule in molecules:
                # keep all dihedral matches here
                dihedrals = TorsionIndexer()
                for substructure in self.allowed_substructures:
                    matches = molecule.chemical_environment_matches(
                        query=substructure)
                    if not matches:
                        continue

                    if not self.tag_dihedrals:
                        # a single allowed match is enough, no need to test the other patterns
                        result.add_molecule(molecule=molecule)
                        break

                    # when tagging, every pattern is checked so all matches are collected
                    for match in matches:
                        # this will handle deduplication
                        dihedrals.add_torsion(torsion=match, scan_range=None)
                else:
                    # only reached when the loop did not break: either dihedrals are being tagged
                    # or no allowed pattern matched at all
                    if dihedrals.n_torsions >= 1:
                        molecule.properties["dihedrals"] = dihedrals
                        result.add_molecule(molecule)
                    else:
                        result.filter_molecule(molecule=molecule)

        if self.filtered_substructures is not None:
            # now we only want to check the molecules in the pass list
            # collect first so the pass list is not modified while it is being iterated
            molecules_to_remove = []
            for molecule in result.molecules:
                for substructure in self.filtered_substructures:
                    if molecule.chemical_environment_matches(
                            query=substructure):
                        molecules_to_remove.append(molecule)
                        break

            # Failing a molecule automatically removes it from the successes
            for molecule in molecules_to_remove:
                result.filter_molecule(molecule)

        return result
示例#6
0
class MolecularWeightFilter(BasicSettings, CustomWorkflowComponent):
    """
    Keep only the molecules whose weight lies strictly between a minimum and a maximum value.
    """

    component_name = "MolecularWeightFilter"
    component_description = "Molecules are filtered based on the allowed molecular weights."
    component_fail_message = "Molecule weight was not in the specified region."

    minimum_weight: int = Field(
        130,
        description="The minimum allowed molecule weight  default value taken from the openeye blockbuster filter",
    )
    maximum_weight: int = Field(
        781,
        description="The maximum allow molecule weight, default taken from the openeye blockbuster filter.",
    )
    _properties: ComponentProperties = ComponentProperties(
        process_parallel=True,
        produces_duplicates=False,
    )

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Sort the molecules into passed and filtered groups using their weight.

        The weight is computed with `Descriptors.ExactMolWt` from RDKit and both bounds are exclusive,
        so a weight equal to `minimum_weight` or `maximum_weight` is filtered.

        Parameters:
            molecules: The list of molecules the component should be applied on.

        Returns:
            The result object created by `_create_result`, holding the molecules that were added and
            the molecules that were filtered.
        """

        from rdkit.Chem import Descriptors

        outcome = self._create_result()

        for candidate in molecules:
            weight = Descriptors.ExactMolWt(candidate.to_rdkit())
            in_window = self.minimum_weight < weight < self.maximum_weight

            if not in_window:
                outcome.filter_molecule(candidate)
                continue

            outcome.add_molecule(candidate)

        return outcome

    def provenance(self) -> Dict:
        """
        Extend the provenance of the parent class with the OpenMM version.

        Returns:
            The provenance dictionary of the parent class with the OpenMM version added under the
            `openmm_units` key.

        Note:
            NOTE(review): `_apply` computes the weights with RDKit, but the RDKit version is not recorded
            here -- confirm whether that is intended.
        """

        from simtk import openmm

        versions = super().provenance()
        versions["openmm_units"] = openmm.__version__
        return versions
示例#7
0
class CoverageFilter(BasicSettings, CustomWorkflowComponent):
    """
    Filters molecules based on the requested forcefield coverage.

    Important:
        The ids supplied to the respective group are the ids that are allowed, if `None` is passed all ids are allowed.

    Note:
        * A molecule is failed as soon as it exercises any id in `filtered_ids`, even if it also exercises
          an allowed id.
        * A molecule is passed when it exercises at least one id in `allowed_ids` and none in `filtered_ids`.
        * When `allowed_ids` is None (or empty) every id exercised by the molecule counts as allowed.
    """

    component_name = "CoverageFilter"
    component_description = (
        "Filter the molecules based on the requested FF allowed parameters.")
    component_fail_message = "The molecule was typed with disallowed parameters."

    allowed_ids: Optional[Set[str]] = Field(
        None,
        description=
        "The SMIRKS parameter ids of the parameters which are allowed to be exercised by the molecules. Molecules should use atleast one of these ids to be passed by the component.",
    )
    filtered_ids: Optional[Set[str]] = Field(
        None,
        description=
        "The SMIRKS parameter ids of the parameters which are not allowed to be exercised by the molecules.",
    )
    forcefield: str = Field(
        "openff_unconstrained-1.0.0.offxml",
        description=
        "The name of the forcefield which we want to filter against.",
    )
    tag_dihedrals: bool = Field(
        False,
        description=
        "If we should tag any dihedral ids exercised for torsion driving.",
    )
    _properties = ComponentProperties(process_parallel=True,
                                      produces_duplicates=False)

    def _apply_init(self, result: ComponentResult) -> None:
        """
        Load the force field once and cache it so it is not rebuilt for every batch of molecules.

        Parameters:
            result: Not used by this component.
        """

        self._cache["forcefield"] = ForceField(self.forcefield)

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Apply the filter to the list of molecules to remove any molecules typed by an id that is not allowed, i.e. not
        included in the allowed list.

        When `tag_dihedrals` is set, the proper and improper torsions of a passing molecule whose parameter
        id is one of the allowed ids it exercises are stored on the molecule under the `dihedrals` property.

        Parameters:
            molecules: The list of molecules the component should be applied on.

        Returns:
            The result object created by `_create_result`, holding the molecules that were added and
            the molecules that were filtered.
        """

        result = self._create_result()

        forcefield: ForceField = self._cache["forcefield"]

        # type the molecules
        for molecule in molecules:
            labels = forcefield.label_molecules(molecule.to_topology())[0]
            # collect every parameter id exercised by the molecule
            covered_types = {
                label.id
                for types in labels.values()
                for label in types.values()
            }
            # use set intersection to check coverage for unwanted types
            # if filtered is None change to an empty set.
            unwanted_types = covered_types.intersection(self.filtered_ids
                                                        or set())
            if unwanted_types:
                # fail the molecule for any unwanted matches
                result.filter_molecule(molecule)
                continue

            # now check for wanted common types
            # if the allowed option is None change to have overlap
            common_types = covered_types.intersection(self.allowed_ids
                                                      or covered_types)
            if not common_types:
                result.filter_molecule(molecule)
                continue

            if self.tag_dihedrals:
                torsion_indexer = TorsionIndexer()
                # only ids containing "t" or "i" are considered for tagging
                torsion_ids = {
                    type_label
                    for type_label in common_types
                    if "t" in type_label or "i" in type_label
                }

                # The proper and improper collections are deliberately iterated separately rather
                # than merged: merging would modify the labels in place and could re-key the
                # improper tuples using the proper torsion container's rules, which would make
                # `torsion[1]` unreliable as the central atom.
                for torsion, parameter in labels["ProperTorsions"].items():
                    if parameter.id in torsion_ids:
                        torsion_indexer.add_torsion(torsion=torsion,
                                                    scan_range=None)

                for torsion, parameter in labels["ImproperTorsions"].items():
                    if parameter.id in torsion_ids:
                        torsion_indexer.add_improper(
                            central_atom=torsion[1],
                            improper=torsion,
                            scan_range=None,
                        )

                molecule.properties["dihedrals"] = torsion_indexer

            result.add_molecule(molecule)

        return result

    def provenance(self) -> Dict:
        """
        Generate version information for all of the software used during the running of this component.

        Returns:
            The provenance dictionary of the parent class with the `openforcefields` package version added
            under the `openforcefields` key.
        """
        import openforcefields

        provenance = super().provenance()
        provenance["openforcefields"] = openforcefields.__version__

        return provenance
class WBOFragmenter(ToolkitValidator, CustomWorkflowComponent):
    """
    Fragment molecules using the WBO fragmenter class of the fragmenter module.
    For more information see <https://github.com/openforcefield/fragmenter>.
    """

    component_name = "WBOFragmenter"
    component_description = (
        "Fragment a molecule across all rotatble bonds using the WBO fragmenter."
    )
    component_fail_message = "The molecule could not be fragmented correctly."
    _properties = ComponentProperties(process_parallel=True,
                                      produces_duplicates=True)
    threshold: float = Field(
        0.03,
        description=
        "The WBO error threshold between the parent and the fragment value, the fragmentation will stop when the difference between the fragment and parent is less than this value.",
    )
    keep_non_rotor_ring_substituents: bool = Field(
        False,
        description=
        "If any non rotor ring substituents should be kept during the fragmentation resulting in smaller fragments.",
    )
    functional_groups: Optional[Union[bool, str]] = Field(
        None,
        description=
        "The path to the yaml/json file containing a list of functional group types to be considered during fragmentation. Supplying None will cause fragmenter to use "
        "its own predefined list.",
    )
    include_parent: bool = Field(
        False,
        description=
        "If the parent molecule should also be included in the output.",
    )

    @validator("functional_groups")
    def check_functional_groups(cls, functional_group):
        """
        Check the functional groups, which can be passed as a file name, are valid.

        `None` and `False` are returned unchanged. A string is passed to `deserialize` and every value
        of the loaded mapping must contain a `[` to be accepted.

        Parameters:
            functional_group: The value supplied for the `functional_groups` field.

        Returns:
            The value unchanged for `None`, `False` or a string which passed the check, and `None` for
            any other value.

        Raises:
            ValueError: If one of the loaded smarts patterns does not contain a `[`.

        Note:
            This check could be quite fragile.
        """
        if functional_group is None or functional_group is False:
            return functional_group

        if isinstance(functional_group, str):
            fgroups = deserialize(functional_group)
            # simple check on the smarts
            for smarts in fgroups.values():
                if "[" not in smarts:
                    raise ValueError(
                        f"Some functional group smarts were not valid {smarts}."
                    )

            return functional_group

        # Any other value (in practice `True`) is replaced by None. This keeps the long standing
        # behaviour of the validator and makes it explicit.
        # NOTE(review): confirm whether `True` should be rejected instead.
        return None

    @classmethod
    def is_available(cls) -> bool:
        """
        Check if fragmenter and its openeye dependency can be imported.

        Returns:
            The combined result of the two `which_import` checks. Both checks are made with
            `raise_error=True`.
        """
        openeye = which_import(
            ".oechem",
            raise_error=True,
            return_bool=True,
            package="openeye",
            raise_msg=
            "Please install via `conda install openeye-toolkits -c openeye`.",
        )
        fragmenter = which_import(
            "fragmenter",
            raise_error=True,
            return_bool=True,
            raise_msg="Please install via `conda install fragmenter -c omnia`.",
        )

        return openeye and fragmenter

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Fragment the molecules using the WBOFragmenter.

        Parameters:
            molecules: The list of molecules which should be processed by this component.

        Returns:
            The result object created by `_create_result`, holding the fragments (and optionally the
            parents) that were added and the molecules that were filtered.

        Note:
            * If the input molecule fails fragmentation it will fail this component and be filtered even when
              `include_parent` is set to true.
            * A molecule which produces no fragments is filtered unless `include_parent` is set.
            * Each fragment carries the torsion it was built around under the `dihedrals` property.
        """
        from fragmenter import fragment

        result = self._create_result()

        for molecule in molecules:
            # not having a conformer can cause issues
            if molecule.n_conformers == 0:
                molecule.generate_conformers(n_conformers=1)

            # the parent is added up front and filtered again below if the fragmentation fails
            if self.include_parent:
                result.add_molecule(molecule)

            fragment_factory = fragment.WBOFragmenter(
                molecule=molecule.to_openeye(),
                functional_groups=self.functional_groups,
                verbose=False,
            )

            try:
                fragment_factory.fragment(
                    threshold=self.threshold,
                    keep_non_rotor_ring_substituents=self.
                    keep_non_rotor_ring_substituents,
                )

                # we need to store the central bond which was fragmented around
                # to make sure this is the bond we torsiondrive around
                fragments_dict = fragment_factory.to_torsiondrive_json()

                if fragments_dict:

                    for fragment_data in fragments_dict.values():
                        frag_mol = Molecule.from_mapped_smiles(
                            mapped_smiles=fragment_data["identifiers"]
                            ["canonical_isomeric_explicit_hydrogen_mapped_smiles"]
                        )
                        torsion_index = tuple(fragment_data["dihedral"][0])
                        # this is stored back into the molecule and will be used when generating the cmiles tags later
                        torsion_tag = TorsionIndexer()
                        torsion_tag.add_torsion(torsion=torsion_index)
                        frag_mol.properties["dihedrals"] = torsion_tag
                        result.add_molecule(frag_mol)

                # if we have no fragments and we dont want the parent then we failed to fragment
                elif not self.include_parent:
                    result.filter_molecule(molecule)

            except (RuntimeError, ValueError):
                # this will catch cmiles errors for molecules with undefined stereo
                result.filter_molecule(molecule)

        return result

    def provenance(self) -> Dict:
        """
        Collect the toolkit information and add the fragmenter version information.

        Returns:
            The provenance dictionary of the parent class with the fragmenter version added under the
            `fragmenter` key.
        """

        import fragmenter

        provenance = super().provenance()

        provenance["fragmenter"] = fragmenter.__version__

        return provenance
示例#9
0
class StandardConformerGenerator(ToolkitValidator, CustomWorkflowComponent):
    """
    Standard conformer generator using the OFFTK and the back end toolkits.

    Note:
        The provenance information and toolkit settings are handled by the
        [ToolkitValidator][qcsubmit.workflow_components.base_component.ToolkitValidator] mixin.
    """

    # standard components which must be defined
    component_name = "StandardConformerGenerator"
    component_description = "Generate conformations for the given molecules"
    component_fail_message = "Conformers could not be generated"

    # custom components for this class
    _properties = ComponentProperties(process_parallel=True,
                                      produces_duplicates=False)

    rms_cutoff: Optional[float] = Field(
        None,
        description=
        "The rms cut off in angstroms to be used when generating the conformers. Passing None will use the default in toolkit of 1.",
    )
    max_conformers: int = Field(
        10,
        description=
        "The maximum number of conformers to be generated per molecule.")
    clear_existing: bool = Field(
        True,
        description=
        "If any pre-existing conformers should be removed before generating new ones.",
    )

    def _apply_init(self, result: ComponentResult) -> None:
        """
        Cache the rms cutoff with angstrom units attached, or None when no cutoff was set.

        Parameters:
            result: Not used by this component.
        """
        if self.rms_cutoff is not None:
            self._cache["cutoff"] = self.rms_cutoff * unit.angstrom
        else:
            self._cache["cutoff"] = None

    def _apply(self, molecules: List[Molecule]) -> ComponentResult:
        """
        Generate conformers for the molecules using the selected toolkit backend.

        A molecule is filtered when the conformer generation raises or when it ends up without any
        conformers, every other molecule is added to the result. Each molecule is recorded exactly once.

        Parameters:
            molecules: The list of molecules the component should be applied on.

        Returns:
            The result object created by `_create_result`, holding the molecules that were added and
            the molecules that were filtered.
        """

        # create the toolkit
        toolkit = self._toolkits[self.toolkit]()

        result = self._create_result()

        rms_cutoff = self._cache["cutoff"]

        for molecule in molecules:
            try:
                # assume input is angstrom until Quantity can be serialized
                molecule.generate_conformers(
                    n_conformers=self.max_conformers,
                    clear_existing=self.clear_existing,
                    rms_cutoff=rms_cutoff,
                    toolkit_registry=toolkit,
                )

            # need to catch more specific exceptions here.
            except Exception:
                # A failed generation fails the molecule. Moving on here stops it from being
                # recorded a second time by the conformer count check below.
                result.filter_molecule(molecule)
                continue

            # if we could not produce a conformer then fail the molecule
            if molecule.n_conformers == 0:
                result.filter_molecule(molecule)
            else:
                result.add_molecule(molecule)

        return result