Exemplo n.º 1
0
    def __init__(self,
                 mol,
                 conformer=None,
                 second_embed=False,
                 use_random_coordinates=False):
        """Create a MyConformer object.

        If 3D coordinates cannot be generated, self.mol is set to False and
        self.energy, self.minimized and self.ids_hvy_atms are left unset, so
        callers must check self.mol before using them.

        :param mol: The MyMol.MyMol associated with this conformer.
        :type mol: MyMol.MyMol
        :param conformer: An optional variable specifying the conformer to use.
           If not specified, it will create a new conformer. Note that the
           id of a provided conformer is reset to 0. Defaults to None.
        :type conformer: rdkit.Conformer, optional
        :param second_embed: Whether to try to generate 3D coordinates using an
            older algorithm if the better (default) algorithm fails. This can add
            run time, but sometimes converts certain molecules that would
            otherwise fail. Defaults to False.
        :type second_embed: bool, optional
        :param use_random_coordinates: The first conformer should not start
           from random coordinates, but rather the eigenvalues-based
           coordinates rdkit defaults to. But Gypsum-DL generates subsequent
           conformers to try to consider alternate geometries. So they should
           start from random coordinates. Defaults to False.
        :type use_random_coordinates: bool, optional
        """

        # Work on a private copy of the rdkit molecule so the MyMol object
        # passed in is never modified.
        self.mol = copy.deepcopy(mol.rdkit_mol)
        self.smiles = mol.smiles()

        # Remove any previous conformers.
        self.mol.RemoveAllConformers()

        if conformer is None:
            # The user is providing no conformer. So we must generate it.

            # Note that I have confirmed that the below respects chirality.
            # Description of the embedding parameters can be found at
            # help(AllChem.EmbedMolecule)

            try:
                # Try to use ETKDGv2, but it is only present in the python 3.6
                # version of RDKit.
                params = AllChem.ETKDGv2()
            except AttributeError:
                # Older RDKit builds do not expose ETKDGv2 at all, so fall
                # back on the original version of ETKDG.
                params = AllChem.ETKDG()

            # The default, but just a sanity check. The attribute RDKit
            # documents is "enforceChirality"; the earlier spelling
            # ("enforcechiral") does not appear to be an EmbedParameters
            # field, so assigning to it had no effect on the embedding.
            params.enforceChirality = True

            # Set a max number of times it will try to calculate the 3D
            # coordinates. Will save a little time. This should be the
            # default but lets set it anyway.
            params.maxIterations = 0

            # Also set whether to start from random coordinates.
            params.useRandomCoords = use_random_coordinates

            # AllChem.EmbedMolecule uses geometry to create inital molecule
            # coordinates. This sometimes takes a very long time
            AllChem.EmbedMolecule(self.mol, params)

            # On rare occasions, the new conformer generating algorithm fails
            # because params.useRandomCoords = False. So if it fails, try
            # again with True.
            if self.mol.GetNumConformers() == 0 and not use_random_coordinates:
                params.useRandomCoords = True
                AllChem.EmbedMolecule(self.mol, params)

            # On very rare occasions, the new conformer generating algorithm
            # fails. For example, COC(=O)c1cc(C)nc2c(C)cc3[nH]c4ccccc4c3c12 .
            # In this case, the old one still works. So if no coordinates are
            # assigned, try that one. Parameters must have second_embed set to
            # True for this to happen.
            if second_embed and self.mol.GetNumConformers() == 0:
                AllChem.EmbedMolecule(self.mol,
                                      useRandomCoords=use_random_coordinates)

            # On rare occasions, both methods fail. For example,
            # O=c1cccc2[C@H]3C[NH2+]C[C@@H](C3)Cn21 Another example:
            # COc1cccc2c1[C@H](CO)[N@H+]1[C@@H](C#N)[C@@H]3C[C@@H](C(=O)[O-])[C@H]([C@H]1C2)[N@H+]3C
            # False is the marker for "no coordinates available".
            if self.mol.GetNumConformers() == 0:
                self.mol = False
        else:
            # The user has provided a conformer. Just add it.
            conformer.SetId(0)
            self.mol.AddConformer(conformer, assignId=True)

        # Calculate some energies, other housekeeping.
        if self.mol is not False:
            try:
                ff = AllChem.UFFGetMoleculeForceField(self.mol)
                self.energy = ff.CalcEnergy()
            except Exception:
                # Deliberately broad (the force field can fail in several
                # ways), but no longer swallows KeyboardInterrupt/SystemExit.
                Utils.log("Warning: Could not calculate energy for molecule " +
                          Chem.MolToSmiles(self.mol))
                # Example of smiles that cause problem here without try...catch:
                # NC1=NC2=C(N[C@@H]3[C@H](N2)O[C@@H](COP(O)(O)=O)C2=C3S[Mo](S)(=O)(=O)S2)C(=O)N1
                # 9999 is the placeholder energy used when none is available.
                self.energy = 9999
            self.minimized = False

            # Indexes of all non-hydrogen atoms.
            self.ids_hvy_atms = [
                a.GetIdx() for a in self.mol.GetAtoms()
                if a.GetAtomicNum() != 1
            ]
Exemplo n.º 2
0
    def remove_bizarre_substruc(self):
        """Checks whether this molecule contains an improbable substructure,
           likely generated by the tautomerization process. Used to find
           artifacts.

        The answer is cached in self.bizarre_substruct, except when there is
        no rdkit molecule at all (that case always returns True without
        caching).

        :return: Boolean, whether or not there are impossible substructures.
           Also saves to self.bizarre_substruct.
        :rtype: bool
        """

        if self.bizarre_substruct != "":
            # Already been determined.
            return self.bizarre_substruct

        if self.rdkit_mol is None:
            # It is bizarre to have a molecule with no atoms in it.
            return True

        # These are substructures that can't be easily corrected using
        # fix_common_errors().
        #
        # Note that in C(O)=N, C and N mean they are aliphatic. Does not match
        # c(O)n, when aromatic. That pattern is currently not part of the
        # list.
        prohibited_substructures = [
            "O(=*)-*",
            # Enol forms with terminal alkenes are unlikely.
            "C(=[CH2])[OH]",
            "C(=[CH2])[O-]",
            # A geminal vinyl diol is not a tautomer of a carboxylate group.
            "C=C([OH])[OH]",
            "C=C([O-])[OH]",
            "C=C([O-])[O-]",
            # No carbanions.
            "[C-]",
            "[c-]",
        ]

        # First just match strings... could be faster, but not 100% accurate.
        # Empty or non-text values are skipped: they can never contain a
        # pattern, and a False value (used as a failure marker for can_smi)
        # would make the "in" test raise a TypeError.
        smiles_to_scan = [
            smi
            for smi in (self.orig_smi, self.orig_smi_deslt, self.can_smi)
            if smi
        ]
        for s in prohibited_substructures:
            if any(s in smi for smi in smiles_to_scan):
                Utils.log("\tDetected unusual substructure: " + s)
                self.bizarre_substruct = True
                return True

        # Now do actual substructure matching
        for s in prohibited_substructures:
            pattrn = Chem.MolFromSmarts(s)
            if self.rdkit_mol.HasSubstructMatch(pattrn):
                Utils.log("\tDetected unusual substructure: " + s)
                self.bizarre_substruct = True
                return True

        # Now certain patterns that are more complex.
        # TODO in the future?

        self.bizarre_substruct = False
        return False
Exemplo n.º 3
0
    def __init__(self, starter, name=""):
        """Initialize the MyMol object.

        :param starter: The object (smiles or rdkit.Mol) on which to build this
           class.
        :type starter: str or rdkit.Mol
        :param name: An optional string, the name of this molecule. Defaults to "".
        :type name: str, optional
        """

        if isinstance(starter, str):
            # It's a SMILES string.
            self.rdkit_mol = ""
            self.can_smi = ""
            smiles = starter
        else:
            # So it's an rdkit mol object.
            self.rdkit_mol = starter  # No need to regenerate this, since already provided.

            # Get the smiles too from the rdkit mol object.
            try:
                smiles = Chem.MolToSmiles(self.rdkit_mol,
                                          isomericSmiles=True,
                                          canonical=True)

                # In this case you know it's cannonical.
                self.can_smi = smiles
            except Exception:
                # Sometimes this conversion just can't happen. Happened once
                # with this beast, for example:
                # CC(=O)NC1=CC(=C=[N+]([O-])O)C=C1O
                self.can_smi = False

                # Without a value here the assignments to self.orig_smi and
                # self.orig_smi_deslt below would raise UnboundLocalError.
                # An empty string keeps those attributes usable in string
                # concatenation and substring tests.
                smiles = ""

                id_to_print = name if name != "" else str(starter)
                Utils.log(
                    "\tERROR: Could not generate one of the structures " +
                    "for (" + id_to_print + ").")

        self.can_smi_noh = ""
        self.orig_smi = smiles

        # Default assumption is that they are the same.
        self.orig_smi_deslt = smiles
        self.name = name
        self.conformers = []

        # An empty string marks values that have not been determined yet.
        self.nonaro_ring_atom_idx = ""
        self.chiral_cntrs_only_assigned = ""
        self.chiral_cntrs_include_unasignd = ""
        self.bizarre_substruct = ""
        self.enrgy = {}  # different energies for different conformers.
        self.minimized_enrgy = {}
        self.contnr_idx = ""
        self.frgs = ""
        self.stdrd_smiles = ""
        self.mol_props = {}
        self.idxs_low_energy_confs_no_opt = {}
        self.idxs_of_confs_to_min = set([])
        self.genealogy = []  # Keep track of how the molecule came to be.

        # Makes the molecule if a smiles was provided. Sanitizes the molecule
        # regardless.
        self.make_mol_frm_smiles_sanitze()
Exemplo n.º 4
0
def parallel_get_chiral(mol, max_variants_per_compound, thoroughness):
    """Build stereoisomer variants of one molecule; safe to run in parallel.

    Only chiral centers whose handedness is unassigned ("?") are varied;
    assigned centers keep the chirality they already have.

    :param mol: The input molecule.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. The number of chirality combinations that
       are actually built is limited to
       thoroughness * max_variants_per_compound.
    :type thoroughness: int
    :return: A list of MyMol.MyMol objects. If the molecule has no unassigned
       chiral centers, the list holds just the input molecule.
    :rtype: list
    """

    # Atom indexes of the chiral centers still marked "?".
    open_centers = [
        entry[0] for entry in mol.chiral_cntrs_w_unasignd() if entry[1] == "?"
    ]
    n_open = len(open_centers)

    # Nothing to vary: hand the molecule back untouched.
    if n_open == 0:
        return [mol]

    if n_open == 1:
        # A single center has exactly two labelings.
        labelings = ["R", "S"]
    else:
        # Grow the labelings one center at a time, stopping early once
        # there are more than enough. Labelings cut short this way cover
        # only the first few centers; the rest stay unassigned.
        per_center = [["R"], ["S"]]
        labelings = [["R"], ["S"]]
        for _ in range(n_open - 1):
            if len(labelings) > thoroughness * max_variants_per_compound:
                break
            labelings = [
                head + tail for head in labelings for tail in per_center
            ]

    # Report how many stereoisomers a full enumeration would give.
    Utils.log(
        "".join(
            [
                "\t",
                mol.smiles(True),
                " (",
                mol.name,
                ") has ",
                str(2 ** n_open),
                " enantiomers when chiral centers with ",
                "no specified chirality are systematically varied.",
            ]
        )
    )

    # Keep only a random subset of the labelings.
    sample_size = thoroughness * max_variants_per_compound
    labelings = Utils.random_sample(labelings, sample_size, "")

    # Map each label onto the rdkit chiral tag it stands for.
    tag_for_label = {
        "R": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
        "S": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    }

    variants = []
    for labeling in labelings:
        # Each variant is built on its own copy of the rdkit molecule.
        rd_copy = copy.copy(mol.rdkit_mol)
        for atom_idx, label in zip(open_centers, labeling):
            if label in tag_for_label:
                rd_copy.GetAtomWithIdx(atom_idx).SetChiralTag(
                    tag_for_label[label]
                )

        candidate = MyMol.MyMol(rd_copy)

        # Drop candidates flagged as having a bizarre substructure.
        if candidate.remove_bizarre_substruc():
            continue

        # Carry the bookkeeping over from the parent molecule.
        candidate.contnr_idx = mol.contnr_idx
        candidate.name = mol.name
        candidate.genealogy = mol.genealogy[:]
        candidate.genealogy.append(candidate.smiles(True) + " (chirality)")
        variants.append(candidate)

    return variants
Exemplo n.º 5
0
def enumerate_chiral_molecules(
    contnrs,
    max_variants_per_compound,
    thoroughness,
    num_procs,
    job_manager,
    parallelizer_obj,
):
    """Enumerates all possible enantiomers of a molecule. If the chirality of
       an atom is given, that chiral center is not varied. Only the chirality
       of unspecified chiral centers is varied.

    Nothing is returned. The generated molecules are handed to
    ChemUtils.bst_for_each_contnr_no_opt together with the containers.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step. If 0, the function returns immediately.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multiprocess mode.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run serially
       in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # No point in continuing none requested.
    if max_variants_per_compound == 0:
        return

    Utils.log("Enumerating all possible enantiomers for all molecules...")

    # Group the molecules so you can feed them to parallelizer. The tuple
    # order must match the signature of parallel_get_chiral:
    # (mol, max_variants_per_compound, thoroughness).
    params = tuple(
        (mol, max_variants_per_compound, thoroughness)
        for contnr in contnrs
        for mol in contnr.mols
    )

    # Run it through the parallelizer, or serially if there is none.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(params, parallel_get_chiral, num_procs, job_manager)
    else:
        tmp = [parallel_get_chiral(*args) for args in params]

    # Remove Nones (failed molecules)
    clean = Parallelizer.strip_none(tmp)

    # Flatten the data into a single list.
    flat = Parallelizer.flatten_list(clean)

    # Get the indexes of the ones that failed to generate.
    contnr_idxs_of_failed = Utils.fnd_contnrs_not_represntd(contnrs, flat)

    # Go through the missing ones and throw a message.
    for miss_indx in contnr_idxs_of_failed:
        Utils.log(
            "\tCould not generate valid enantiomers for "
            + contnrs[miss_indx].orig_smi
            + " ("
            + contnrs[miss_indx].name
            + "), so using existing "
            + "(unprocessed) structures."
        )
        for mol in contnrs[miss_indx].mols:
            mol.genealogy.append("(WARNING: Unable to generate enantiomers)")

            # The fallback molecules must go into the flattened list, since
            # that is the list passed on below. Appending them to the
            # unflattened list would silently drop them.
            flat.append(mol)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, flat, max_variants_per_compound, thoroughness
    )
Exemplo n.º 6
0
def run_test():
    """Run Gypsum-DL on the bundled sample molecules and check the output.

    The sample file (sample_molecules.smi, next to this script) is processed
    into a scratch folder next to this script. Two things are then checked:
    the number of output files, and the set of SMILES strings recorded in
    them. Results are reported through Utils.log (pass) and Utils.exception
    (failure). The scratch folder is deleted before the run and again after
    the checks complete.
    """

    script_dir = os.path.dirname(os.path.realpath(__file__))
    output_folder = script_dir + os.sep + "gypsum_dl_test_output" + os.sep

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    # Make the directory
    os.mkdir(output_folder)

    # Make the Gypsum-DL parameters.
    params = {
        "source": script_dir + os.sep + "sample_molecules.smi",
        "separate_output_files": True,
        "job_manager": "serial",  # multiprocessing
        "output_folder": output_folder,
        "add_pdb_output": False,
        "max_variants_per_compound": 8,
        "thoroughness": 1,
        "min_ph": 4,
        "max_ph": 10,
        "pka_precision": 1,
        "use_durrant_lab_filters": True,
    }

    # Prepare the molecules.
    prepare_molecules(params)
    Utils.log("")
    Utils.log("TEST RESULTS")
    Utils.log("============")

    # Get the output sdf files.
    sdf_files = glob.glob(output_folder + "*")

    # There should be 15 output files.
    msg = "Expected 15 output files, got " + str(len(sdf_files)) + "."
    if len(sdf_files) != 15:
        Utils.exception("FAILED. " + msg)
    else:
        Utils.log("PASSED. " + msg)

    # Get all the smiles from the files. The SMILES string is on the line
    # that follows each "<SMILES>" tag.
    all_smiles = set([])
    for sdf_file in sdf_files:
        # Use a context manager so the file handle is always closed.
        with open(sdf_file) as sdf_handle:
            lines = sdf_handle.readlines()
        for i, line in enumerate(lines):
            if "<SMILES>" in line:
                all_smiles.add(lines[i + 1].strip())

    # List what the smiles should be.
    target_smiles = set([])

    # salt_and_ionization should produce two models (ionized and
    # deionized).
    target_smiles |= set(["[O-]c1ccccc1", "Oc1ccccc1"])

    # tautomer_and_cis_trans should produce three models (two tautomers, one
    # of them with alternate cis/trans).
    target_smiles |= set([r"C/C=C\O", "C/C=C/O", "CCC=O"])

    # two_chiral_one_unspecified_and_tautomer should produce four models.
    target_smiles |= set([
        "CC(C)C(=O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)=C(O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)C(=O)[C@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)=C(O)[C@](F)(Cl)C[C@@](C)(F)Cl",
    ])

    # two_double_bonds_one_chiral_center should produce eight models.
    target_smiles |= set([
        r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
        "CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
        r"CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
        r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
        "CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
    ])

    # two_double_bonds_one_unspecified should produce two models.
    target_smiles |= set(
        [r"CC/C(C)=C(\Cl)C/C(I)=C(\C)F", r"CC/C(C)=C(/Cl)C/C(I)=C(\C)F"])

    # non_aromatic_ring should produce one model. It will list it several
    # times, because different ring conformations of the same model.
    target_smiles |= set(["CC(C)(C)[C@H]1CC[C@@H](C(C)(C)C)CC1"])

    # There should be no =[N-] if Durrant lab filters are turned on. Note:
    # Removed "CC(=N)O" from below list because durrant lab filters now remove
    # iminols.
    target_smiles |= set(["CC([NH-])=O", "CC(N)=O"])

    # There should be no [N-]C=[N+] (CC(=O)[N-]C=[N+](C)C).
    target_smiles |= set([
        r"C/C(O)=N\C=[N+](C)C",
        r"CC(=O)/N=C\[NH+](C)C",
        "CC(=O)/N=C/[NH+](C)C",
        "CC(=O)NC=[N+](C)C",
        "C/C(O)=N/C=[N+](C)C",
    ])

    # There should be no [nH+]c[n-] (c1c[nH+]c[n-]1)
    target_smiles |= set(["c1c[n-]cn1", "c1c[nH+]c[nH]1", "c1c[nH]cn1"])

    # There should be no [#7+]~[#7+] (c1cc[nH+][nH+]c1)
    target_smiles |= set(["c1ccnnc1", "c1cc[nH+]nc1"])

    # There should be no [#7-]~[#7-] (CC(=O)[N-][N-]C(C)=O). Python2 and
    # Python3 give different SMILES strings here that are all valid, so only
    # two representative forms are listed; the alternatives are folded onto
    # them further below. (Really this was probably a bad example to pick
    # because there are so many forms...)
    target_smiles |= set([
        "CC(=O)NNC(C)=O",
        r"C/C(O)=N\N=C(/C)O",
    ])

    # There should be no [!#7]~[#7+]~[#7-]~[!#7] (c1c[n-][nH+]c1)
    target_smiles |= set(["c1cn[n-]c1", "c1cn[nH]c1", "c1c[nH][nH+]c1"])

    # Azides can have adjacent +/- nitrogens.
    target_smiles |= set(["CN=[N+]=[N-]", "CN=[N+]=N"])

    # Python3 gives some smiles that are different than those obtained with
    # Python2. But they are just different representations of the same thing.
    # Let's make the switch to the Python2 form for this test.
    all_smiles = set(
        ["CN=[N+]=N" if s == "[H]N=[N+]=NC" else s for s in all_smiles])

    # Note: iminol forms are not normalized here because durrant lab filters
    # now remove iminols.

    all_smiles = set([
        r"C/C(O)=N\N=C(/C)O" if s == r"C/C(O)=N/N=C(/C)O" else
        s  # Different one that turns up sometimes
        for s in all_smiles
    ])
    all_smiles = set([
        r"CC(=O)NNC(C)=O" if s in [
            r"CC(=O)[N-]/N=C(\C)O",
            r"C/C(O)=N/N=C(\C)O",
            r"CC(=O)N/N=C(\C)O",
            r"CC(=O)[N-]/N=C(/C)O",
            r"CC(=O)[N-]NC(C)=O",
            r"CC(=O)N/N=C(/C)O",
        ] else s  # Different one that turns up sometimes
        for s in all_smiles
    ])

    # Any SMILES present in only one of the two sets is a failure.
    unexpected = all_smiles ^ target_smiles
    if len(unexpected) > 0:
        # Show both sets to help diagnose the mismatch. No debugger is
        # started here: a breakpoint would hang any non-interactive run.
        print(all_smiles)
        print(target_smiles)

        Utils.exception(
            "FAILED. " +
            "Got some SMILES I didn't expect (either in output or target list): "
            + " ".join(list(unexpected)))
    else:
        Utils.log(
            "PASSED. Gypsum-DL output the very SMILES strings I was expecting."
        )

    Utils.log("")

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
Exemplo n.º 7
0
def add_hydrogens(contnrs, min_pH, max_pH, st_dev, max_variants_per_compound,
                  thoroughness, num_procs, job_manager,
                  parallelizer_obj):
    """Adds hydrogen atoms to molecule containers, as appropriate for a given
       pH.

    Nothing is returned. The resulting molecules are handed to
    ChemUtils.bst_for_each_contnr_no_opt together with the containers.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: A list.
    :param min_pH: The minimum pH to consider.
    :type min_pH: float
    :param max_pH: The maximum pH to consider.
    :type max_pH: float
    :param st_dev: The standard deviation. See Dimorphite-DL paper.
    :type st_dev: float
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithread mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run serially
       in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Ionizing all molecules...")

    # Make a simple dictionary with the ionization parameters.
    protonation_settings = {
        "min_ph": min_pH,
        "max_ph": max_pH,
        "pka_precision": st_dev,
        "max_variants": thoroughness * max_variants_per_compound,
    }

    # Format the inputs for use in the parallelizer. Containers whose
    # canonical SMILES is not a string are left out.
    inputs = tuple(
        (cont, protonation_settings)
        for cont in contnrs
        if isinstance(cont.orig_smi_canonical, str)
    )

    # Run the parallelizer (or a plain serial loop) and collect the results.
    if parallelizer_obj is not None:
        results = parallelizer_obj.run(inputs, parallel_add_H, num_procs, job_manager)
    else:
        results = [parallel_add_H(cont, settings) for cont, settings in inputs]

    results = Parallelizer.flatten_list(results)

    # Dimorphite-DL might not have generated ionization states for some
    # molecules. Identify those that are missing.
    contnr_idxs_of_failed = Utils.fnd_contnrs_not_represntd(contnrs, results)

    # For those molecules, just use the original SMILES string, with hydrogen
    # atoms added using RDKit.
    for miss_indx in contnr_idxs_of_failed:
        Utils.log(
            "\tWARNING: Gypsum-DL produced no valid ionization states for " +
            contnrs[miss_indx].orig_smi + " (" +
            contnrs[miss_indx].name + "), so using the original " +
            "smiles."
        )

        amol = contnrs[miss_indx].mol_orig_frm_inp_smi
        amol.contnr_idx = miss_indx

        # Save this failure to the genealogy record.
        amol.genealogy = [
            amol.orig_smi + " (source)",
            amol.orig_smi_deslt + " (desalted)",
            "(WARNING: Gypsum-DL could not assign ionization states)"
        ]

        # Save this one to the results too, even though not processed
        # properly.
        results.append(amol)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, results, max_variants_per_compound, thoroughness
    )
Exemplo n.º 8
0
def prepare_smiles(contnrs, params):
    """Drive the SMILES-level processing steps in their fixed order.

    The steps are: desalting (always), an optional substring-based
    Durrant-lab pre-filter, ionization, tautomer generation, the Durrant-lab
    filters, chirality enumeration and double-bond enumeration. Each optional
    step is switched by an entry in params. The current SMILES are printed
    after every stage.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param params: The user parameters.
    :type params: dict
    """

    # Settings shared by several of the steps below.
    ph_low = params["min_ph"]
    ph_high = params["max_ph"]
    pka_std_dev = params["pka_precision"]
    variants_cap = params["max_variants_per_compound"]
    effort = params["thoroughness"]
    n_procs = params["num_processors"]
    manager = params["job_manager"]
    tauts_may_flip_chirality = params["let_tautomers_change_chirality"]
    parallelizer = params["Parallelizer"]

    show_progress = True

    def _show(current):
        # Print the current state of the containers after a stage.
        if show_progress:
            Utils.print_current_smiles(current)

    # Desalting always runs; it cannot be turned off.
    desalt_orig_smi(contnrs, n_procs, manager, parallelizer)

    # With the Durrant-lab filters on, drop containers whose desalted SMILES
    # contains a bad substring (metal, etc.). Some compounds are only
    # filtered later. Note this rebinds the local name only; the list object
    # passed in by the caller is not modified.
    if params["use_durrant_lab_filters"] == True:
        kept = []
        for candidate in contnrs:
            if durrant_lab_contains_bad_substr(candidate.orig_smi_deslt):
                continue
            kept.append(candidate)
        contnrs = kept

    _show(contnrs)

    # Ionization for the requested pH range, unless skipped.
    if params["skip_adding_hydrogen"]:
        Utils.log("Skipping ionization")
        wrap_molecules(contnrs)
    else:
        add_hydrogens(
            contnrs,
            ph_low,
            ph_high,
            pka_std_dev,
            variants_cap,
            effort,
            n_procs,
            manager,
            parallelizer,
        )

    _show(contnrs)

    # Alternate tautomeric forms, unless skipped.
    if params["skip_making_tautomers"]:
        Utils.log("Skipping tautomerization")
    else:
        make_tauts(
            contnrs,
            variants_cap,
            effort,
            n_procs,
            manager,
            tauts_may_flip_chirality,
            parallelizer,
        )

    _show(contnrs)

    # Durrant-lab filters, if requested.
    if params["use_durrant_lab_filters"]:
        durrant_lab_filters(contnrs, n_procs, manager, parallelizer)
    else:
        Utils.log("Not applying Durrant-lab filters")

    _show(contnrs)

    # Alternate chiral forms, unless skipped.
    if params["skip_enumerate_chiral_mol"]:
        Utils.log("Skipping chirality enumeration")
    else:
        enumerate_chiral_molecules(
            contnrs,
            variants_cap,
            effort,
            n_procs,
            manager,
            parallelizer,
        )

    _show(contnrs)

    # Alternate double-bond isomers, unless skipped.
    if params["skip_enumerate_double_bonds"]:
        Utils.log("Skipping double bond enumeration")
    else:
        enumerate_double_bonds(
            contnrs,
            variants_cap,
            effort,
            n_procs,
            manager,
            parallelizer,
        )

    _show(contnrs)
Exemplo n.º 9
0
def bst_for_each_contnr_no_opt(
    contnrs,
    mol_lst,
    max_variants_per_compound,
    thoroughness,
    crry_ovr_frm_lst_step_if_no_fnd=True,
):
    """Keep only the top few compound variants in each container, to prevent a
       combinatorial explosion. This is run periodically on the growing
       containers to keep them in check.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param mol_lst: The list of MyMol.MyMol objects.
    :type mol_lst: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param crry_ovr_frm_lst_step_if_no_fnd: If it can't find any low-energy
       conformers, determines whether to just keep the old ones. Defaults to
       True.
    :type crry_ovr_frm_lst_step_if_no_fnd: bool, optional
    """

    # Remove duplicate ligands from each container.
    for mol_cont in contnrs:
        mol_cont.remove_identical_mols_from_contnr()

    # Group the candidate molecules by contnr_idx.
    data = Utils.group_mols_by_container_index(mol_lst)

    # Go through each container.
    for contnr in contnrs:
        # The grouped candidates are looked up with the container's own
        # contnr_idx attribute, not with its position in contnrs.
        contnr_idx = contnr.contnr_idx

        # Pick just the lowest-energy conformers from the new candidates.
        # Possibly a compound was eliminated early on, so it has no entry.
        selected = []
        if contnr_idx in data:
            # Remove molecules with unusually high charges.
            candidates = remove_highly_charged_molecules(data[contnr_idx])

            # Pick the lowest-energy molecules. Note that this creates a
            # conformation if necessary, but it is not minimized and so is not
            # computationally expensive.
            selected = pick_lowest_enrgy_mols(candidates,
                                              max_variants_per_compound,
                                              thoroughness)

        if len(selected) > 0:
            # Replace all previously determined mols for this container with
            # the lowest-energy ones.
            contnr.mols = []
            for mol in selected:
                contnr.add_mol(mol)
            continue

        # No low-energy conformers were generated for this container.
        if crry_ovr_frm_lst_step_if_no_fnd:
            # Just use previous ones.
            Utils.log(
                "\tWARNING: Unable to find low-energy conformations: " +
                contnr.orig_smi_deslt + " (" + contnr.name +
                "). Keeping original conformers.")
        else:
            # Discard the conformation.
            Utils.log(
                "\tWARNING: Unable to find low-energy conformations: " +
                contnr.orig_smi_deslt + " (" + contnr.name +
                "). Discarding conformer.")
            contnr.mols = []
Exemplo n.º 10
0
def make_tauts(contnrs, max_variants_per_compound, thoroughness, num_procs,
               job_manager, let_tautomers_change_chirality, parallelizer_obj):
    """Generates tautomers of the molecules. Note that some of the generated
    tautomers are not realistic. If you find a certain improbable
    substructure keeps popping up, add it to the list in the
    `prohibited_substructures` definition found with MyMol.py, in the function
    remove_bizarre_substruc().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step. If 0, the function returns immediately without doing anything.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithread mode to use.
    :type job_manager: string
    :param let_tautomers_change_chirality: Whether to allow tautomers that
      change the total number of chiral centers.
    :type let_tautomers_change_chirality: bool
    :param parallelizer_obj: The Parallelizer object, or None to run the
       tautomer generation serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # No need to proceed if there are no max variants.
    if max_variants_per_compound == 0:
        return

    Utils.log("Generating tautomers for all molecules...")

    # Create the parameters to feed into the parallelizer object: one
    # (container, molecule index, max variants) tuple per molecule.
    params = tuple(
        (contnr, mol_index, max_variants_per_compound)
        for contnr in contnrs
        for mol_index, _ in enumerate(contnr.mols)
    )

    # Run the tautomizer, through the parallel object if one was given.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(params, parallel_make_taut, num_procs,
                                   job_manager)
    else:
        tmp = [parallel_make_taut(*job) for job in params]

    # Flatten the resulting list of lists.
    taut_data = Parallelizer.flatten_list(tmp)

    # Remove bad tautomers.
    taut_data = tauts_no_break_arom_rngs(contnrs, taut_data, num_procs,
                                         job_manager, parallelizer_obj)

    if not let_tautomers_change_chirality:
        taut_data = tauts_no_elim_chiral(contnrs, taut_data, num_procs,
                                         job_manager, parallelizer_obj)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(contnrs, taut_data,
                                         max_variants_per_compound,
                                         thoroughness)
Exemplo n.º 11
0
def _check_mpi_requirements():
    """Verify that the interpreter is set up to run Gypsum-DL in mpi mode.

    Reports a problem (via print and Utils.exception) when the program was
    not launched through runpy (i.e., "python -m mpi4py"), when mpi4py cannot
    be imported, or when the installed mpi4py is older than version 2.1.
    """

    # Before executing Parallelizer with mpi4py (which overrides how python
    # raises exceptions), check that it is being run with the "-m mpi4py"
    # runpy flag.
    if "runpy" not in sys.modules:
        printout = "\nTo run in mpi mode you must run with -m flag. ie) mpirun -n $NTASKS python -m mpi4py run_gypsum_dl.py\n"
        print(printout)
        Utils.exception(printout)

    # Check mpi4py import.
    try:
        import mpi4py
    except ImportError:
        printout = "\nmpi4py not installed but --job_manager is set to mpi. \n Either install mpi4py or switch job_manager to multiprocessing or serial.\n"
        print(printout)
        Utils.exception(printout)

    # Check the mpi4py version. This must be at version 2.1.0 or higher. Only
    # the major and minor components are compared, so only those are parsed;
    # this keeps suffixes in later components (e.g., "4.0.0.dev0") from
    # breaking the integer conversion.
    major_minor = tuple(int(x) for x in mpi4py.__version__.split(".")[:2])
    if major_minor < (2, 1):
        printout = "\nmpi4py version 2.1.0 or higher is required. Use the 'python -m mpi4py' flag to run in mpi mode.\nPlease update mpi4py to a newer version, or switch job_manager to multiprocessing or serial.\n"
        print(printout)
        Utils.exception(printout)


def _load_smiles_data(source):
    """Turn the "source" parameter into the list of molecule entries.

    :param source: Either a string (a .smi/.can/.sdf filename, or anything
       else, which is wrapped in a one-item list) or an object that is
       already in the required format.
    :type source: str or list
    :return: The molecule entries. prepare_molecules unpacks each one as
       (smiles, name, props).
    :rtype: list
    """

    if not isinstance(source, str):
        # It's already in the required format.
        return source

    Utils.log("Loading molecules from " + os.path.basename(source) + "...")

    lowered = source.lower()
    if lowered.endswith((".smi", ".can")):
        # It's an smi file.
        return load_smiles_file(source)
    if lowered.endswith(".sdf"):
        # It's an sdf file. Convert it to a smiles.
        return load_sdf_file(source)
    return [source]


def prepare_molecules(args):
    """A function for preparing small-molecule models for docking. To work, it
    requires that the python module rdkit be installed on the system.

    :param args: The arguments, from the commandline. If it contains a "json"
       key, the parameters are read from that json file instead.
    :type args: dict
    """

    # Keep track of the time the program starts.
    start_time = datetime.now()

    # A list of command-line parameters that will be ignored if using a json
    # file.
    json_warning_list = [
        "source",
        "output_folder",
        "num_processors",
        "min_ph",
        "max_ph",
        "delta_ph_increment",
        "thoroughness",
        "max_variants_per_compound",
        "pka_precision",
    ]

    # Whether to warn the user that the above parameters, if specified, will
    # be ignored.
    need_to_print_override_warning = False

    if "json" in args:
        # "json" is one of the parameters, so we'll be ignoring the rest.
        try:
            # The with-block makes sure the file handle is closed again.
            with open(args["json"]) as json_file:
                params = json.load(json_file)
        except (OSError, TypeError, ValueError):
            # Unreadable file or malformed json (json.JSONDecodeError is a
            # ValueError).
            Utils.exception("Is your input json file properly formed?")

        params = set_parameters(params)
        need_to_print_override_warning = any(
            key in args for key in json_warning_list
        )
    else:
        # We're actually going to use all the command-line parameters. No
        # warning necessary.
        params = set_parameters(args)

    # If running in serial mode, make sure only one processor is used.
    if params["job_manager"] == "serial":
        if params["num_processors"] != 1:
            Utils.log(
                "Because --job_manager was set to serial, this will be run on a single processor."
            )
        params["num_processors"] = 1

    # Handle mpi errors if mpi4py isn't installed or usable.
    if params["job_manager"] == "mpi":
        _check_mpi_requirements()

    # Throw a message if running on windows. Windows doesn't deal with
    # multiple processors, so use only 1.
    if sys.platform == "win32":
        Utils.log(
            "WARNING: Multiprocessing is not supported on Windows. Tasks will be run in Serial mode."
        )
        params["num_processors"] = 1
        params["job_manager"] = "serial"

    # Launch mpi workers if that's what's specified.
    if params["job_manager"] == "mpi":
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"]
        )
    else:
        # Lower-level mpi (i.e. making a new Parallelizer within an mpi) has
        # problems with importing the MPI environment and mpi4py. So we will
        # flag it to skip the MPI mode and just go to multiprocess/serial.
        # This is a safety precaution.
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"], True
        )

    # Let the user know that their command-line parameters will be ignored, if
    # they have specified a json file.
    if need_to_print_override_warning:
        Utils.log("WARNING: Using the --json flag overrides all other flags.")

    # If running in mpi mode, separate_output_files must be set to true.
    if params["job_manager"] == "mpi" and not params["separate_output_files"]:
        Utils.log(
            "WARNING: Running in mpi mode, but separate_output_files is not set to True. Setting separate_output_files to True anyway."
        )
        params["separate_output_files"] = True

    # Outputting HTML files is not supported in mpi mode.
    if params["job_manager"] == "mpi" and params["add_html_output"]:
        Utils.log(
            "WARNING: Running in mpi mode, but add_html_output is set to True. HTML output is not supported in mpi mode."
        )
        params["add_html_output"] = False

    # Warn the user if he or she is not using the Durrant lab filters.
    if not params["use_durrant_lab_filters"]:
        Utils.log(
            "WARNING: Running Gypsum-DL without the Durrant-lab filters. In looking over many Gypsum-DL-generated " +
            "variants, we have identified a number of substructures that, though technically possible, strike us " +
            "as improbable or otherwise poorly suited for virtual screening. We strongly recommend removing these " +
            "by running Gypsum-DL with the --use_durrant_lab_filters option.",
            trailing_whitespace="\n"
        )

    # Load SMILES data. A non-string source is used as is.
    smiles_data = _load_smiles_data(params["source"])

    # Make the output directory if necessary.
    if not os.path.exists(params["output_folder"]):
        os.mkdir(params["output_folder"])
        if not os.path.exists(params["output_folder"]):
            Utils.exception("Output folder directory couldn't be found or created.")

    # Make the molecule containers.
    contnrs = []
    idx_counter = 0
    for entry in smiles_data:
        try:
            smiles, name, props = entry
        except (TypeError, ValueError):
            # The entry is not a (smiles, name, props) triple. This happens,
            # for example, when the source string was not a recognized file.
            msg = 'Unexpected error. Does your "source" parameter specify a '
            msg = msg + "filename that ends in a .can, .smi, or .sdf extension?"
            Utils.exception(msg)

        if detect_unassigned_bonds(smiles) is None:
            Utils.log(
                "WARNING: Throwing out SMILES because of unassigned bonds: " + smiles
            )
            continue

        new_contnr = MolContainer(smiles, name, idx_counter, props)
        if not isinstance(new_contnr.orig_smi_canonical, str):
            # Covers None as well as any other non-string value.
            Utils.log(
                "WARNING: Throwing out SMILES because of it couldn't convert to mol: "
                + smiles
            )
            continue

        contnrs.append(new_contnr)
        idx_counter += 1

    # Remove None types from failed conversion
    contnrs = [x for x in contnrs if x.orig_smi_canonical is not None]
    if len(contnrs) != idx_counter:
        Utils.exception("There is a corrupted container")

    # In multiprocessing mode, Gypsum-DL parallelizes each small-molecule
    # preparation step separately. But this scheme is inefficient in MPI mode
    # because it increases the amount of communication required between nodes.
    # So for MPI mode, we will run all the preparation steps for a given
    # molecule container on a single thread.
    if params["Parallelizer"].return_mode() != "mpi":
        # Non-MPI (e.g., multiprocessing)
        execute_gypsum_dl(contnrs, params)
    else:
        # MPI mode. Group the molecule containers so they can be passed to the
        # parallelizer. Each job gets a copy of the parameters in which the
        # Parallelizer entry is replaced by None.
        temp_param = {
            key: (None if key == "Parallelizer" else value)
            for key, value in params.items()
        }

        job_input = []
        for contnr in contnrs:
            contnr.contnr_idx = 0  # Because each container being run in isolation.
            job_input.append(([contnr], temp_param))
        job_input = tuple(job_input)

        params["Parallelizer"].run(job_input, execute_gypsum_dl)

    # Calculate the total run time.
    end_time = datetime.now()
    run_time = end_time - start_time
    params["start_time"] = str(start_time)
    params["end_time"] = str(end_time)
    params["run_time"] = str(run_time)

    Utils.log("\nStart time at: " + str(start_time))
    Utils.log("End time at:   " + str(end_time))
    Utils.log("Total time at: " + str(run_time))

    # Kill mpi workers if necessary.
    params["Parallelizer"].end(params["job_manager"])
Exemplo n.º 12
0
def parallel_make_taut(contnr, mol_index, max_variants_per_compound):
    """Makes alternate tautomers for a given molecule container. This is the
       function that gets fed into the parallelizer.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param mol_index: The molecule index.
    :type mol_index: int
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :return: A list of MyMol.MyMol objects, containing the alternate
        tautomeric forms, or None if the molecule could not be converted or
        sanitized.
    :rtype: list or None
    """

    # Get the MyMol.MyMol within the molecule container corresponding to the
    # given molecule index.
    mol = contnr.mols[mol_index]
    mol_smiles = mol.smiles()

    # Create a temporary RDKit mol object, since that's what MolVS works with.
    # TODO: There should be a copy function
    rdkit_mol = MyMol.MyMol(mol_smiles).rdkit_mol

    # For tautomers to work, you need to not have any explicit hydrogens. Only
    # strip them if the conversion above produced a molecule, so that a failed
    # conversion reaches the check below instead of being passed to RemoveHs.
    if rdkit_mol is not None:
        rdkit_mol = Chem.RemoveHs(rdkit_mol)

    # Make sure it's not None.
    if rdkit_mol is None:
        Utils.log("\tCould not generate tautomers for " + contnr.orig_smi +
                  ". I'm deleting it.")
        return None

    # Molecules should be kekulized already, but let's double check that.
    # Because MolVS requires kekulized input.
    Chem.Kekulize(rdkit_mol)
    rdkit_mol = MOH.check_sanitization(rdkit_mol)
    if rdkit_mol is None:
        return None

    # Limit to max_variants_per_compound tauts. Note that another batch could
    # add more, so you'll need to once again trim to this number later. But
    # this could at least help prevent the combinatorial explosion at this
    # stage.
    enum = tautomer.TautomerEnumerator(max_tautomers=max_variants_per_compound)
    tauts_rdkit_mols = enum.enumerate(rdkit_mol)

    # Make all those tautomers into MyMol objects.
    tauts_mols = [MyMol.MyMol(taut) for taut in tauts_rdkit_mols]

    # Keep only those that have reasonable substructures. The explicit
    # comparison with False is kept on purpose: the return type of
    # remove_bizarre_substruc() is not visible from here.
    tauts_mols = [
        t for t in tauts_mols if t.remove_bizarre_substruc() == False
    ]

    # If there's more than one, let the user know that.
    if len(tauts_mols) > 1:
        Utils.log("\t" + mol.smiles(True) + " has tautomers.")

    # Now collect the final results. Each tautomer inherits the container
    # properties, name, and (a copy of the) genealogy of the source molecule.
    results = []
    for tm in tauts_mols:
        tm.inherit_contnr_props(contnr)
        tm.genealogy = mol.genealogy[:]
        tm.name = mol.name

        # Only record a genealogy step if the tautomer differs from the
        # source molecule.
        if tm.smiles() != mol_smiles:
            tm.genealogy.append(tm.smiles(True) + " (tautomer)")

        results.append(tm)

    return results
Exemplo n.º 13
0
def save_to_sdf(contnrs, params, separate_output_files, output_folder):
    """Saves the 3D models to the disk as an SDF file.

    An empty molecule carrying the parameters as properties is always written
    first. If separate_output_files is false, it and all the molecules go
    into a single "gypsum_dl_success.sdf". Otherwise the parameters go into
    "gypsum_dl_params.sdf" and each container gets its own file.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param params: The parameters.
    :type params: dict
    :param separate_output_files: Whether save each molecule to a different
       file.
    :type separate_output_files: bool
    :param output_folder: The output folder.
    :type output_folder: str
    """

    # Normalize once so that every branch below agrees, even if the caller
    # passes a truthy/falsy value that is not exactly True/False.
    separate = bool(separate_output_files)

    first_file = "gypsum_dl_params.sdf" if separate else "gypsum_dl_success.sdf"

    # w always refers to the writer that is currently open, or None. The
    # finally clause below closes it if anything goes wrong mid-write.
    w = Chem.SDWriter(output_folder + os.sep + first_file)
    try:
        # Save an empty molecule with the parameters.
        m = Chem.Mol()
        m.SetProp("_Name", "EMPTY MOLECULE DESCRIBING GYPSUM-DL PARAMETERS")
        for param, value in params.items():
            m.SetProp(param, str(value))
        w.write(m)

        if separate:
            # The parameters file is complete.
            w.flush()
            w.close()
            w = None

        # Also save the file or files containing the output molecules.
        Utils.log("Saving molecules associated with...")
        for contnr in contnrs:
            # Add the container properties to the rdkit_mol object so they get
            # written to the SDF file.
            contnr.add_container_properties()

            # Let the user know which molecule you're on.
            Utils.log("\t" + contnr.orig_smi)

            # Open a dedicated file for this container, if requested.
            if separate:
                sdf_file = "{}{}__input{}.sdf".format(
                    output_folder + os.sep,
                    Utils.slug(contnr.name),
                    contnr.contnr_idx_orig + 1,
                )
                w = Chem.SDWriter(sdf_file)

            for mol in contnr.mols:
                mol.load_conformers_into_rdkit_mol()
                w.write(mol.rdkit_mol)

            if separate:
                w.flush()
                w.close()
                w = None
    finally:
        # Closes the shared file in single-file mode, or whichever writer was
        # still open when an exception interrupted the loop.
        if w is not None:
            w.flush()
            w.close()
Exemplo n.º 14
0
def generate_alternate_3d_nonaromatic_ring_confs(contnrs,
                                                 max_variants_per_compound,
                                                 thoroughness, num_procs,
                                                 second_embed, job_manager,
                                                 parallelizer_obj):
    """Docking programs like Vina rotate chemical moieties around their
       rotatable bonds, so it's not necessary to generate a larger rotomer
       library for each molecule. The one exception to this rule is
       non-aromatic rings, which can assume multiple conformations (boat vs.
       chair, etc.). This function generates a few low-energy ring structures
       for each molecule with a non-aromatic ring(s).

       The containers are modified in place. Note that the generated
       molecules are assigned back using their contnr_idx as a position in
       contnrs (assumes the two coincide -- TODO confirm against callers).

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param second_embed: Whether to try to generate 3D coordinates using an
        older algorithm if the better (default) algorithm fails. This can add
        run time, but sometimes converts certain molecules that would
        otherwise fail.
    :type second_embed: bool
    :param job_manager: The multiprocess mode.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run
       serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    :return: Always None; the results are stored in the containers.
    :rtype: None
    """

    # Let the user know you've started this step.
    Utils.log("Generating several conformers of molecules with non-aromatic " +
              "rings (boat vs. chair, etc.)...")

    # Create parameters (inputs) to feed to the parallelizer, and keep track
    # of which containers (by position) have non-aromatic rings.
    params = []
    ones_with_nonaro_rngs = set()
    for contnr_idx, contnr in enumerate(contnrs):
        if contnr.num_nonaro_rngs > 0:
            ones_with_nonaro_rngs.add(contnr_idx)
            for mol in contnr.mols:
                params.append((mol, max_variants_per_compound, thoroughness,
                               second_embed))
    params = tuple(params)

    # If there are no compounds with non-aromatic rings, no need to continue.
    if not ones_with_nonaro_rngs:
        return  # There are no such ligands to process.

    # Run it through the parallelizer, if there is one.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(params, parallel_get_ring_confs, num_procs,
                                   job_manager)
    else:
        tmp = [parallel_get_ring_confs(*job) for job in params]

    # Flatten the results.
    results = Parallelizer.flatten_list(tmp)

    # Group by mol. You can't use existing functions because they would
    # require you to recalculate already calculated energies. Keys are
    # container indexes; values are lists of (energy, mol) pairs.
    grouped = {}
    for mol in results:
        # Save the energy as a prop while you're here.
        energy = mol.conformers[0].energy
        mol.mol_props["Energy"] = energy

        # Add the mol with its energy to the appropriate entry in grouped,
        # making that entry if needed.
        grouped.setdefault(mol.contnr_idx, []).append((energy, mol))

    # Now, for each container that got new conformers, keep only the best
    # ones. Note that this only affects ones that had non-aromatic rings.
    for contnr_idx, lst_enrgy_mol_pairs in grouped.items():
        # Sort by energy only. Sorting the bare tuples would fall back to
        # comparing the molecule objects themselves whenever two energies
        # tie.
        lst_enrgy_mol_pairs.sort(key=lambda pair: pair[0])

        # Replace the container's mols with the top ones.
        contnrs[contnr_idx].mols = []
        for _, mol in lst_enrgy_mol_pairs[:max_variants_per_compound]:
            contnrs[contnr_idx].add_mol(mol)

    # Containers with non-aromatic rings that ended up with no entry in
    # grouped got no alternate conformers at all. Their molecules are left
    # untouched, but let the user know through the genealogy. (Every entry
    # of grouped holds at least one pair, so this cannot be detected by
    # looking for empty lists in grouped.)
    for contnr_idx in sorted(ones_with_nonaro_rngs):
        if contnr_idx in grouped:
            continue
        for mol in contnrs[contnr_idx].mols:
            mol.genealogy.append(
                "(WARNING: Could not generate alternate conformations " +
                "of nonaromatic ring)")
Exemplo n.º 15
0
def load_smiles_file(filename):
    """Loads a smiles file.

    A smiles file contains one molecule on each line: the SMILES string,
    white space, and then the molecule name (which may itself contain
    spaces). Blank lines are skipped. Unnamed molecules are given a name of
    the form "untitled_line_N", and repeated names are made unique by
    appending "_copy_N".

    :param filename: The filename.
    :type filename: str
    :return: A list of tuples, (SMILES, Name, {}). The third item is a new,
       empty dictionary for each molecule.
    :rtype: list
    """

    data = []
    duplicate_names = {}  # Name -> number of the most recent copy.
    seen_names = set()  # Every name handed out so far (after renaming).

    # Counts the non-blank lines (1-based). This is the number used in the
    # generated names and in the log messages, so blank lines in the file do
    # not shift it.
    entry_number = 0

    # The with-block makes sure the file handle is closed again.
    with open(filename) as smiles_file:
        for line in smiles_file:
            chunks = line.split()
            if not chunks:
                # Blank (or whitespace-only) line.
                continue
            entry_number += 1

            # From that line, get the smiles string and name.
            smiles = chunks[0]
            name = " ".join(chunks[1:])

            # Handle unnamed ligands.
            if name == "":
                name = "untitled_line_{}".format(entry_number)
                Utils.log(
                    ("\tUntitled ligand on line {}. Naming that ligand " +
                     "{}. All associated files will be referred to with " +
                     "this name.").format(entry_number, name))

            # Handle duplicate ligands in same list. The first repeat of a
            # name becomes copy 2, the next one copy 3, and so on.
            if name in seen_names:
                duplicate_names[name] = duplicate_names.get(name, 1) + 1
                new_name = "{}_copy_{}".format(name, duplicate_names[name])
                Utils.log(
                    "\nMultiple entries with the ligand name: {}".format(
                        name))
                Utils.log(
                    "\tThe version of the ligand on line {} will be retitled {}"
                    .format(entry_number, new_name))
                Utils.log(
                    "\tAll associated files will be referred to with this name"
                )
                name = new_name

            # Save the data for this line and advance.
            seen_names.add(name)
            data.append((smiles, name, {}))

    # Return the data.
    return data
Exemplo n.º 16
0
def parallel_get_double_bonded(mol, max_variants_per_compound, thoroughness):
    """A parallelizable function for enumerating double bonds.

    Double bonds without defined stereochemistry (and outside 3- to
    7-membered rings) are sampled, every up/down combination of the single
    bonds attached to them is tried, and the resulting isomeric SMILES are
    converted back into molecules.

    :param mol: The molecule with a potentially unspecified double bond. Its
       rdkit_mol attribute is replaced by a copy with explicit hydrogens.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :return: [mol] (the input molecule) if there is nothing to enumerate.
       Otherwise, a list of new MyMol.MyMol objects, one per enumerated
       isomer that could be built and has no bizarre substructure. That list
       can be empty if every candidate was rejected.
    :rtype: list
    """

    # For this to work, you need to have explicit hydrogens in place.
    mol.rdkit_mol = Chem.AddHs(mol.rdkit_mol)

    # Get all double bonds that don't have defined stereochemistry. Note that
    # these are the bond indexes, not the atom indexes.
    unasignd_dbl_bnd_idxs = mol.get_double_bonds_without_stereochemistry()

    if len(unasignd_dbl_bnd_idxs) == 0:
        # There are no unassigned double bonds, so move on.
        return [mol]

    # Throw out any bond that is in a small ring (3 to 7 members).
    small_ring_sizes = (3, 4, 5, 6, 7)
    unasignd_dbl_bnd_idxs = [
        i
        for i in unasignd_dbl_bnd_idxs
        if not any(
            mol.rdkit_mol.GetBondWithIdx(i).IsInRingSize(size)
            for size in small_ring_sizes
        )
    ]

    # Previously, I fully enumerated all double bonds. When there are many
    # such bonds, that leads to a combinatorial explosion that causes problems
    # in terms of speed and memory. Now, enumerate only enough bonds to make
    # sure you generate at least thoroughness * max_variants_per_compound.
    unasignd_dbl_bnd_idxs_orig_count = len(unasignd_dbl_bnd_idxs)
    num_bonds_to_keep = int(
        math.ceil(math.log(thoroughness * max_variants_per_compound, 2))
    )
    random.shuffle(unasignd_dbl_bnd_idxs)
    unasignd_dbl_bnd_idxs = sorted(unasignd_dbl_bnd_idxs[:num_bonds_to_keep])

    # Get a list of all the single bonds that come off each double-bond atom.
    all_sngl_bnd_idxs = set()
    dbl_bnd_count = 0
    for dbl_bnd_idx in unasignd_dbl_bnd_idxs:
        bond = mol.rdkit_mol.GetBondWithIdx(dbl_bnd_idx)

        atom1 = bond.GetBeginAtom()
        atom1_bonds = atom1.GetBonds()
        if len(atom1_bonds) == 1:
            # The only bond is the one you already know about. So don't save.
            continue

        atom2 = bond.GetEndAtom()
        atom2_bonds = atom2.GetBonds()
        if len(atom2_bonds) == 1:
            # The only bond is the one you already know about. So don't save.
            continue

        dbl_bnd_count = dbl_bnd_count + 1

        # Suffice it to say, RDKit does not deal with cis-trans isomerization
        # in an intuitive way... Collect every bond on either atom except the
        # double bond itself.
        for neighbor_bond in tuple(atom1_bonds) + tuple(atom2_bonds):
            neighbor_idx = neighbor_bond.GetIdx()
            if neighbor_idx != dbl_bnd_idx:
                all_sngl_bnd_idxs.add(neighbor_idx)

    all_sngl_bnd_idxs = list(all_sngl_bnd_idxs)

    # Let the user know.
    if dbl_bnd_count > 0:
        Utils.log(
            "\t"
            + mol.smiles(True)
            + " has "
            # Not exactly right, I think, because should be dbl_bnd_count, but
            # ok.
            + str(unasignd_dbl_bnd_idxs_orig_count)
            + " double bond(s) with unspecified stereochemistry."
        )

    # Go through all possible up/down combinations for those bonds. The
    # product is consumed lazily so the 2^n combinations are never all held
    # in memory at once.
    smiles_to_consider = set()
    for atom_config_options in itertools.product(
        [True, False], repeat=len(all_sngl_bnd_idxs)
    ):
        # Make a copy of the original RDKit molecule.
        a_rd_mol = copy.copy(mol.rdkit_mol)

        for bond_idx, direc in zip(all_sngl_bnd_idxs, atom_config_options):
            # Always done with reference to the atom in the double bond.
            bond_dir = (
                Chem.BondDir.ENDUPRIGHT if direc else Chem.BondDir.ENDDOWNRIGHT
            )
            a_rd_mol.GetBondWithIdx(bond_idx).SetBondDir(bond_dir)

        # Assign the StereoChemistry. Required to actually set it.
        a_rd_mol.ClearComputedProps()
        Chem.AssignStereochemistry(a_rd_mol, force=True)

        # Add to list of ones to consider
        try:
            smiles_to_consider.add(
                Chem.MolToSmiles(a_rd_mol, isomericSmiles=True, canonical=True)
            )
        except Exception:
            # Some molecules still give troubles. Unfortunate, but these are
            # rare cases. Let's just skip these. Example:
            # CN1C2=C(C=CC=C2)C(C)(C)[C]1=[C]=[CH]C3=CC(=C(O)C(=C3)I)I
            # (Exception, not a bare except, so Ctrl-C still interrupts.)
            continue

    # Count the "/" and "\" markers in each SMILES. Ones without any are not
    # real enumerated ones, so drop them.
    smiles_with_cnts = [
        (s, s.count("/") + s.count("\\")) for s in smiles_to_consider
    ]
    smiles_with_cnts = [(s, cnt) for s, cnt in smiles_with_cnts if cnt > 0]

    if len(smiles_with_cnts) == 0:
        # There are no appropriate double bonds. Move on...
        return [mol]

    # Only keep those with the maximum count. The others have double bonds
    # that remain unspecified.
    max_cnts = max(cnt for _, cnt in smiles_with_cnts)
    smiles_to_consider = [s for s, cnt in smiles_with_cnts if cnt == max_cnts]

    results = []
    for smile_to_consider in smiles_to_consider:
        # Make a new MyMol.MyMol object with the specified smiles.
        new_mol = MyMol.MyMol(smile_to_consider)

        # Sometimes you get an error if there's a bad structure.
        if new_mol.can_smi in (False, None):
            continue

        # Add the new molecule to the list of results, if it does not have
        # a bizarre substructure.
        if new_mol.remove_bizarre_substruc():
            continue

        new_mol.contnr_idx = mol.contnr_idx
        new_mol.name = mol.name
        new_mol.genealogy = mol.genealogy[:]
        new_mol.genealogy.append(
            new_mol.smiles(True) + " (cis-trans isomerization)"
        )
        results.append(new_mol)

    # Return the results.
    return results
Exemplo n.º 17
0
def load_sdf_file(filename):
    """Loads an sdf file.

    Each record is converted to a canonical isomeric SMILES string. Records
    that RDKit cannot parse are skipped (with a log message). Unnamed records
    are given a generated name, and a record whose name was already used
    earlier in the file is retitled "<name>_copy_<n>".

    :param filename: The filename.
    :type filename: str
    :return: A list of tuples, (SMILES, Name, Properties), where Properties
       is the dictionary of SDF properties of that record (empty if they
       could not be read). Records with an empty SMILES are omitted.
    :rtype: list
    """

    suppl = Chem.SDMolSupplier(filename)
    data = []
    duplicate_names = {}
    missing_name_counter = 0
    seen_names = set()

    # mol_obj_counter is the (0-based) position of the record in the file.
    for mol_obj_counter, mol in enumerate(suppl):
        if mol is None:
            # The supplier yields None for records it cannot parse. Skip
            # them rather than crash in MolToSmiles.
            Utils.log(
                "\tCould not read the {} molecule in the input SDF. Skipping it."
                .format(mol_obj_counter))
            continue

        # Convert mols to smiles. That's what the rest of the program is
        # designed to deal with.
        smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)

        try:
            name = mol.GetProp("_Name")
        except Exception:
            # No usable title. Treat as unnamed.
            name = ""

        # Handle unnamed ligands
        if name == "":
            Utils.log("\tUntitled ligand for the {} molecule in the input SDF".
                      format(mol_obj_counter))
            name = "untitled_{}_molnum_{}".format(missing_name_counter,
                                                  mol_obj_counter)
            Utils.log("\tNaming that ligand {}".format(name))
            Utils.log(
                "\tAll associated files will be referred to with this name")
            missing_name_counter += 1

        # Handle duplicate ligands in same list. This must run for every
        # ligand, not only the unnamed ones. The first repeat of a name gets
        # the suffix _copy_2, the next _copy_3, and so on.
        if name in seen_names:
            duplicate_names[name] = duplicate_names.get(name, 1) + 1
            new_name = "{}_copy_{}".format(name, duplicate_names[name])
            Utils.log(
                "\nMultiple entries with the ligand name: {}".format(name))
            Utils.log(
                "\tThe version of the ligand for the {} molecule in the SDF file will be retitled {}"
                .format(mol_obj_counter, new_name))
            Utils.log(
                "\tAll associated files will be referred to with this name"
            )
            name = new_name

        seen_names.add(name)

        # SDF files may also contain properties. Get those as well.
        try:
            properties = mol.GetPropsAsDict()
        except Exception:
            properties = {}

        if smiles != "":
            data.append((smiles, name, properties))

    return data
Exemplo n.º 18
0
                    Durrant lab. See README.md for more details.",
)

# Flags that take no value (each is False unless given on the command line).
PARSER.add_argument("--2d_output_only",
                    action="store_true",
                    help="Skips the generate-3D-models step.")
PARSER.add_argument(
    "--cache_prerun",
    "-c",
    action="store_true",
    help="Run this before running Gypsum-DL in mpi mode.",
)
PARSER.add_argument("--test",
                    action="store_true",
                    help="Tests Gypsum-DL to check for programming bugs.")

# Parse the command line and turn the resulting namespace into a plain dict.
ARGS_DICT = vars(PARSER.parse_args())
if ARGS_DICT["test"] == True:
    # --test takes priority over everything else: only run the tests.
    run_test()
elif ARGS_DICT["cache_prerun"] == False:
    # Normal run. Work on a copy so ARGS_DICT itself is left untouched.
    INPUTS = copy.deepcopy(ARGS_DICT)

    # Drop the arguments whose value is None. Iterating over ARGS_DICT while
    # deleting from the INPUTS copy avoids changing a dictionary while it is
    # being iterated.
    for k, v in ARGS_DICT.items():
        if v is None:
            del INPUTS[k]
    prepare_molecules(INPUTS)
    Utils.log("Finished Gypsum-DL")
else:
    # --cache_prerun was given (without --test): nothing more is done here.
    pass
Exemplo n.º 19
0
"""
This module removes molecules with prohibited substructures, per Durrant-lab
filters.
"""

import __future__

import gypsum_dl.Parallelizer as Parallelizer
import gypsum_dl.Utils as Utils
import gypsum_dl.ChemUtils as ChemUtils

try:
    from rdkit import Chem
except:
    Utils.exception("You need to install rdkit and its dependencies.")

# Get the substructures you won't permit (per substructure matching, not
# substring matching)
prohibited_smi_substrs_for_substruc = [
    "[NX3;!R]=[C;!R](-[OH1,OH2])[#6]",  # Imidic acid not in rings
    "[C;!R]=[C;!R](-[OH])[#6]",  # Enol not in rings
    "C=[N-]",
    "[N-]C=[N+]",
    "[nH+]c[n-]",
    "[#7+]~[#7+]",
    "[#7-]~[#7-]",
    "[!#7]~[#7+]~[#7-]~[!#7]",  # Doesn't hit azide.
    # Vina can't process boron anyway...
    "[#5]",  # B
    "O=[PH](=O)([#8])([#8])",  # molvs does odd tautomer: OP(O)(O)=O => O=[PH](=O)(O)O
Exemplo n.º 20
0
def minimize_3d(contnrs, max_variants_per_compound, thoroughness, num_procs, second_embed, job_manager, parallelizer_obj):
    """This function minimizes a 3D molecular conformation. In an attempt to
       not get trapped in a local minimum, it actually generates a number of
       conformers, minimizes the best ones, and then saves the best of the
       best.

       Only molecules in containers with no non-aromatic rings
       (num_nonaro_rngs == 0) are processed. The containers in contnrs are
       updated in place: every container that received at least one result
       has its molecule list replaced by the returned molecules.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param second_embed: Whether to try to generate 3D coordinates using an
        older algorithm if the better (default) algorithm fails. This can add
        run time, but sometimes converts certain molecules that would
        otherwise fail.
    :type second_embed: bool
    :param job_manager: The multithread mode to use. Passed through to
        parallelizer_obj.run.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object. If None, the molecules
        are processed one at a time in this process instead.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # Let the user know you're on this step.
    Utils.log("Minimizing all 3D molecular structures...")

    # Create the parameters (inputs) for the parallelizer. Each entry is one
    # argument tuple for parallel_minit.
    params = []
    # NOTE(review): this set is filled below but not read anywhere in the
    # code shown here -- confirm whether it is still needed.
    ones_without_nonaro_rngs = set([])
    for contnr in contnrs:
        if contnr.num_nonaro_rngs == 0:
            # Because ones with nonaromatic rings have already been minimized,
            # so they can be skipped here.
            for mol in contnr.mols:
                ones_without_nonaro_rngs.add(mol.contnr_idx)
                params.append(tuple([mol, max_variants_per_compound, thoroughness, second_embed]))
    params = tuple(params)

    # Run the inputs through the parallelizer, or serially if there is none.
    tmp = []
    if parallelizer_obj !=  None:
        tmp = parallelizer_obj.run(params, parallel_minit, num_procs, job_manager)
    else:
        for i in params:
            tmp.append(parallel_minit(i[0],i[1],i[2],i[3]))


    # Save energy into MyMol object, and get a list of just those objects.

    # To keep track of which container lists are not empty. These are the
    # ones you'll be repopulating with better optimized structures.
    contnr_list_not_empty = set([])
    # Will contain MyMol.MyMol objects, with the saved energies inside.
    results = []
    for mol in tmp:
        # The energy of the first conformer is the one recorded.
        mol.mol_props["Energy"] = mol.conformers[0].energy
        results.append(mol)
        contnr_list_not_empty.add(mol.contnr_idx)

    # Go through each of the containers that are not empty and remove current
    # ones. Because you'll be replacing them with optimized versions.
    for i in contnr_list_not_empty:
        contnrs[i].mols = []

    # Go through each of the minimized mols, and populate containers they
    # belong to.
    for mol in results:
        contnrs[mol.contnr_idx].add_mol(mol)

    # Alert the user to any errors.
    for contnr in contnrs:
        for mol in contnr.mols:
            # NOTE(review): an empty-string rdkit_mol presumably marks a
            # molecule whose 3D generation failed -- confirm against MyMol.
            if mol.rdkit_mol == "":
                mol.genealogy.append(
                    "(WARNING: Could not optimize 3D geometry)"
                )
                mol.conformers = []