示例#1
0
    def MolToMolBlock(self):
        """Log the first 500 characters of this conformer's mol block.

        A debugging aid. The stored template molecule (self.mol_copy) is
        never modified; a deep copy is stripped of its conformers and given
        self.conformer before being serialized.
        """

        # Work on a private copy so the template keeps its own conformers.
        template = copy.deepcopy(self.mol_copy)
        template.RemoveAllConformers()
        template.AddConformer(self.conformer)

        # Only the head of the block is logged, to keep the output short.
        mol_block = Chem.MolToMolBlock(template)
        Utils.log(mol_block[:500])
示例#2
0
def durrant_lab_filters(contnrs, num_procs, job_manager, parallelizer_obj):
    """Removes any molecules that contain prohibited substructures, per the
    durrant-lab filters.

    Each container is filtered by parallel_durrant_lab_filter (either through
    the parallelizer or serially), and the surviving molecules are then
    handed to ChemUtils.bst_for_each_contnr_no_opt so the containers are
    updated.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: A list.
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run
       serially in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Applying Durrant-lab filters to all molecules...")

    # Compile the SMARTS patterns once, up front.
    prohibited_substructs = [
        Chem.MolFromSmarts(s) for s in prohibited_smi_substrs_for_substruc
    ]

    # Get the parameters to pass to the parallelizer object. Each entry is
    # the argument list for one call to parallel_durrant_lab_filter.
    params = [[c, prohibited_substructs] for c in contnrs]

    # Run the filter, through the parallel object if one was given.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(params, parallel_durrant_lab_filter,
                                   num_procs, job_manager)
    else:
        # Unpack each parameter pair. Passing the pair itself as the
        # container would hand the helper a list instead of a container.
        tmp = [
            parallel_durrant_lab_filter(contnr, substructs)
            for contnr, substructs in params
        ]

    # Note that results is a list of containers. Containers for which every
    # molecule was rejected come back as None, so strip those out.
    results = Parallelizer.strip_none(tmp)

    # You need to get the molecules as a flat array so you can run it through
    # bst_for_each_contnr_no_opt
    mols = []
    for contnr in results:
        mols.extend(contnr.mols)

    # Using this function just to make the changes. Doesn't do energy
    # minimization or anything (as it does later) because max variants
    # and thoroughness maxed out.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs,
        mols,
        1000,
        1000  # max_variants_per_compound, thoroughness
    )
示例#3
0
def convert_2d_to_3d(
    contnrs,
    max_variants_per_compound,
    thoroughness,
    num_procs,
    job_manager,
    parallelizer_obj,
):
    """Builds 3D small-molecule models for every molecule in the containers.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: Cap on the number of variants
       (molecules) per compound that are advanced to the next step, to keep
       the combinatorial explosion in check.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per retained variant
       for evaluation. E.g., to advance five molecules
       (max_variants_per_compound = 5) you could generate ten and keep the
       best five (thoroughness = 2). Values > 1 cost more compute but raise
       the chance of finding good molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object, or None to run
       serially.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    Utils.log("Converting all molecules to 3D structures.")

    # One single-element argument tuple per molecule, across all containers.
    params = tuple(
        (mol,)
        for contnr in contnrs
        for mol in contnr.mols
    )

    # Do the conversion, serially when no parallelizer is available.
    if parallelizer_obj is None:
        tmp = [parallel_make_3d(args[0]) for args in params]
    else:
        tmp = parallelizer_obj.run(params, parallel_make_3d, num_procs,
                                   job_manager)

    # Failed molecules come back as None; discard them.
    converted = Parallelizer.strip_none(tmp)

    # Retain only the top few variants per container so the number of
    # molecules does not explode.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs,
        converted,
        max_variants_per_compound,
        thoroughness,
        False,
    )
示例#4
0
def parallel_add_H(contnr, protonation_settings):
    """Creates alternate ionization variants for a given molecule container.
       This is the function that gets fed into the parallelizer.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param protonation_settings: Protonation settings to pass to Dimorphite-DL.
       Note that the "smiles" key of this dict is overwritten in place.
    :type protonation_settings: dict
    :return: The protonated variants that could be parsed and that do not
       contain bizarre substructures. May be empty.
    :rtype: list of MyMol.MyMol
    """

    # Make sure the canonical SMILES is actually a string.
    canonical_smi = contnr.orig_smi_canonical
    if not isinstance(canonical_smi, str):
        # The value is by definition not a string here, so it must be
        # converted before concatenation. Otherwise a TypeError would mask
        # the intended error report.
        Utils.log("container.orig_smi_canonical: " + str(canonical_smi))
        Utils.log("type container.orig_smi_canonical: " +
                  str(type(canonical_smi)))
        Utils.exception("container.orig_smi_canonical: " +
                        str(canonical_smi))

    # Add the SMILES string to the protonation parameters.
    protonation_settings["smiles"] = canonical_smi

    # Protonate the SMILESstring. This is Dimorphite-DL.
    smis = Protonate(protonation_settings)

    # Convert the protonated SMILES strings into a list of rdkit molecule
    # objects.
    rdkit_mols = [Chem.MolFromSmiles(smi.strip()) for smi in smis]

    # Convert from rdkit mols to MyMol.MyMol, skipping unparsable SMILES.
    addH_mols = [MyMol.MyMol(mol) for mol in rdkit_mols if mol is not None]

    # Remove MyMols with odd substructures.
    addH_mols = [
        mol for mol in addH_mols if mol.remove_bizarre_substruc() is False
    ]

    # I once saw it add a "C+"" here. So do a secondary check at this point to
    # make sure it's valid. Recreate the list, moving new MyMol.MyMol objects
    # into the return_values list.

    return_values = []

    orig_mol = contnr.mol_orig_frm_inp_smi
    for Hm in addH_mols:
        Hm.inherit_contnr_props(contnr)

        # Copy the genealogy so each variant gets its own record.
        Hm.genealogy = orig_mol.genealogy[:]
        Hm.name = orig_mol.name

        # Only note the protonation step if it actually changed the molecule.
        if Hm.smiles() != orig_mol.smiles():
            Hm.genealogy.append(Hm.smiles(True) + " (protonated)")

        return_values.append(Hm)

    return return_values
示例#5
0
    def smiles(self, noh=False):
        """Get the desalted, canonical smiles string associated with this
           object. (Not the input smiles!)

        Both forms are cached on the object (self.can_smi and
        self.can_smi_noh) after the first successful computation.

        :param noh: If False (the default), return the canonical SMILES of
           self.rdkit_mol as is. If True, return the canonical SMILES of a
           copy that has been passed through MOH.try_deprotanation.
        :type noh: bool, optional
        :return: The canonical smiles string, or None if it cannot be
           determined.
        :rtype: str or None
        """

        if not noh:
            # They want the hydrogen atoms.
            if self.can_smi != "":
                # Return previously determined canonical SMILES. Note that
                # a previous failure is cached too (as None).
                return self.can_smi

            # Need to determine canonical SMILES.
            try:
                can_smi = Chem.MolToSmiles(self.rdkit_mol,
                                           isomericSmiles=True,
                                           canonical=True)
            except Exception as err:
                # Sometimes this conversion just can't happen. Happened
                # once with this beast, for example:
                # CC(=O)NC1=CC(=C=[N+]([O-])O)C=C1O
                # Catch Exception rather than everything, so that
                # KeyboardInterrupt / SystemExit still propagate.
                Utils.log("Warning: Couldn't put " + self.orig_smi + " (" +
                          self.name +
                          ") in canonical form. Got this error: " +
                          str(type(err)) +
                          ". This molecule will be " + "discarded.")
                self.can_smi = None
                return None

            self.can_smi = can_smi
            return can_smi

        # They don't want the hydrogen atoms.
        if self.can_smi_noh != "":
            # Return previously determined string.
            return self.can_smi_noh

        # So remove hydrogens. Note that this assumes you will have called
        # this function previously with noh = False
        amol = copy.copy(self.rdkit_mol)
        amol = MOH.try_deprotanation(amol)
        self.can_smi_noh = Chem.MolToSmiles(amol,
                                            isomericSmiles=True,
                                            canonical=True)
        return self.can_smi_noh
示例#6
0
    def standardize_smiles(self):
        """Standardize the smiles string if you can.

        The result is cached in self.stdrd_smiles. If standardization fails,
        the plain canonical SMILES from self.smiles() is stored instead.

        :return: The standardized SMILES, or the canonical SMILES as a
           fallback.
        :rtype: str
        """

        if self.stdrd_smiles != "":
            # Already determined.
            return self.stdrd_smiles

        try:
            self.stdrd_smiles = ssmiles(self.smiles())
        except Exception:
            # str() guards the log message: smiles() can return a
            # non-string when the canonical form could not be determined.
            Utils.log("\tCould not standardize " + str(self.smiles(True)) +
                      ". Skipping.")
            self.stdrd_smiles = self.smiles()

        return self.stdrd_smiles
示例#7
0
def parallel_durrant_lab_filter(contnr, prohibited_substructs):
    """A parallelizable helper function that removes from a container any
       molecule that fails the durrant-lab filters.

    A molecule is rejected if durrant_lab_contains_bad_substr flags its
    desalted SMILES, or if its rdkit molecule matches any of the prohibited
    substructure patterns.

    :param contnr: The molecule container. Its mols list is modified in
       place.
    :type contnr: MolContainer.MolContainer
    :param prohibited_substructs: A list of the prohibited substructures.
    :type prohibited_substructs: list
    :return: Either the container with bad molecules removed, or a None
      object.
    :rtype: MolContainer.MolContainer | None
    """

    # Replace any molecules that have prohibited substructure with None.
    for mi, m in enumerate(contnr.mols):
        # The string check does not depend on the pattern, so run it once
        # per molecule rather than once per pattern. This also means it
        # still runs if the pattern list happens to be empty.
        is_prohibited = durrant_lab_contains_bad_substr(
            m.orig_smi_deslt
        ) or any(
            m.rdkit_mol.HasSubstructMatch(pattrn)
            for pattrn in prohibited_substructs
        )

        if is_prohibited:
            Utils.log(
                "\t"
                + m.smiles(True)
                + ", a variant generated "
                + "from "
                + contnr.orig_smi
                + " ("
                + m.name
                + "), contains a prohibited substructure, so I'm "
                + "discarding it."
            )

            contnr.mols[mi] = None

    # Now go back and remove those Nones
    contnr.mols = Parallelizer.strip_none(contnr.mols)

    # If there are no molecules, mark this container for deletion.
    if len(contnr.mols) == 0:
        return None

    # Return the container
    return contnr
示例#8
0
    def minimize(self):
        """Minimize (optimize) the geometry of the current conformer if it
           hasn't already been optimized.

        On success self.energy holds the force-field energy after
        minimization. On failure a warning is logged and self.energy is set
        to the sentinel value 9999. Either way the conformer is marked as
        minimized so the work is not attempted again.
        """

        if self.minimized:
            # Already minimized. Don't do it again.
            return

        # Perform the minimization, and save the energy.
        try:
            ff = AllChem.UFFGetMoleculeForceField(self.mol)
            ff.Minimize()
            self.energy = ff.CalcEnergy()
        except Exception:
            # Catch Exception rather than everything, so that
            # KeyboardInterrupt / SystemExit still propagate.
            Utils.log("Warning: Could not calculate energy for molecule " +
                      Chem.MolToSmiles(self.mol))
            self.energy = 9999
        self.minimized = True
示例#9
0
def desalter(contnr):
    """Desalts the molecule of a container by keeping its largest fragment.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :return: The container's original molecule if it has exactly one
       fragment; otherwise a new molecule built from the fragment with the
       most heavy atoms.
    :rtype: MyMol.MyMol
    """

    # Break the input SMILES into its unconnected pieces.
    frags = contnr.get_frags_of_orig_smi()

    # A single fragment means there is nothing to strip, so the default
    # assumption (orig_smi = orig_smi_deslt) already holds.
    if len(frags) == 1:
        return contnr.mol_orig_frm_inp_smi

    Utils.log(
        "\tMultiple fragments found in "
        + contnr.orig_smi
        + " ("
        + contnr.name
        + ")"
    )

    # Pair every fragment with its heavy-atom count.
    sized_frags = [(frag.GetNumHeavyAtoms(), frag) for frag in frags]
    largest_size = max(size for size, _ in sized_frags)

    # When several fragments tie for the largest size, the last one wins.
    biggest_frag = next(
        frag for size, frag in reversed(sized_frags) if size == largest_size
    )

    # Wrap the biggest fragment and carry over the container's bookkeeping.
    new_mol = MyMol.MyMol(biggest_frag)
    new_mol.contnr_idx = contnr.contnr_idx
    new_mol.name = contnr.name
    new_mol.genealogy = contnr.mol_orig_frm_inp_smi.genealogy
    new_mol.make_mol_frm_smiles_sanitze()  # Need to update the mol.
    return new_mol
示例#10
0
def proccess_output(contnrs, params):
    """Write the molecular models to disk in the requested formats.

    An SDF file is always written. HTML and PDB output are produced only
    when the corresponding flags are set.

    :param contnrs: The molecule containers, passed through to the writers.
    :type contnrs: list
    :param params: The user parameters. The keys "separate_output_files",
       "output_folder", "add_html_output" and "add_pdb_output" are read
       here.
    :type params: dict
    """

    # Settings shared by the writers below.
    split_into_files = params["separate_output_files"]
    destination = params["output_folder"]

    # Optional HTML overview.
    html_requested = params["add_html_output"] == True
    if html_requested:
        web_2d_output(contnrs, destination)

    # The SDF output is unconditional.
    save_to_sdf(contnrs, params, split_into_files, destination)

    # Optional PDB files.
    pdb_requested = params["add_pdb_output"] == True
    if pdb_requested:
        Utils.log("\nMaking PDB output files\n")
        convert_sdfs_to_PDBs(contnrs, destination)
示例#11
0
def web_2d_output(contnrs, output_folder):
    """Saves pictures of the models to an HTML file on disk. It can be viewed in
    a browser. This is mostly for debugging.

    The file is written to "gypsum_dl_success.html" inside output_folder and
    contains one 200x200 SVG depiction per molecule, labelled with the
    molecule's SMILES.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param output_folder: The folder in which to write the HTML file.
    :type output_folder: str
    """

    Utils.log("Saving html image of molecules associated with...")

    # Let's not parallelize it for now. This will rarely be used.
    html_file = output_folder + os.sep + "gypsum_dl_success.html"

    # The with-block guarantees the file is closed even if drawing one of
    # the molecules raises.
    with open(html_file, 'w') as f:
        for contnr in contnrs:
            Utils.log("\t" + contnr.orig_smi)
            for mol in contnr.mols:
                # See
                # http://rdkit.org/docs/source/rdkit.Chem.rdmolops.html#rdkit.Chem.rdmolops.RemoveHs
                # I think in older versions of rdkit (e.g., 2016.09.2),
                # RemoveHs would remove hydrogens, even if that make double
                # bonds ambiguous. Not so in newer versions (e.g.,
                # 2018.03.4). So if your double-bonded nitrogen doesn't have
                # its hydrogen attached, and you're using an older version
                # of rdkit, don't worry about it. The cis/trans info is
                # still there.
                mol2 = Chem.RemoveHs(mol.rdkit_mol)

                mol2 = PrepareMolForDrawing(mol2,
                                            addChiralHs=True,
                                            wedgeBonds=True)
                rdDepictor.Compute2DCoords(mol2)
                drawer = rdMolDraw2D.MolDraw2DSVG(200, 200)
                drawer.DrawMolecule(mol2)
                drawer.FinishDrawing()
                svg = drawer.GetDrawingText()

                # NOTE(review): mol.name and the SMILES are inserted into
                # the markup unescaped. Fine for a local debugging aid, but
                # confirm before exposing this output more widely.
                f.write(
                    '<div style="float: left; width:200px; height: 220px;" title="'
                    + mol.name + '">' +
                    '<div style="width: 200px; height: 200px;">' +
                    svg.replace("svg:", "") + '</div>' +
                    '<div style="width: 200px; height: 20px;">' +
                    '<small><center>' + mol.smiles(True) + '</center></small>' +
                    '</div>' + '</div>')
示例#12
0
def parallel_make_3d(mol):
    """Performs the 2D to 3D conversion of a single molecule. Designed to be
    run through the parallelizer.

    :param mol: The molecule to be converted.
    :type mol: MyMol.MyMol
    :return: The same MyMol.MyMol object, now holding 3D coordinates, or
       None if the conversion fails or the molecule is rejected.
    :rtype: MyMol.MyMol | None
    """

    # A missing rdkit mol is an error that must be reported.
    conversion_failed = mol.rdkit_mol is None

    # Molecules with strange substructures are skipped without a message.
    if not conversion_failed and mol.remove_bizarre_substruc() == False:
        mol.make_first_3d_conf_no_min()

        if len(mol.conformers) > 0:
            # Success: record the step in the genealogy and hand it back.
            mol.genealogy.append(
                mol.smiles(True) + " (3D coordinates assigned)")
            return mol

        # The embedding ran but produced no conformers.
        conversion_failed = True

    if conversion_failed:
        Utils.log("\tWARNING: Could not generate 3D geometry for " +
                  str(mol.smiles()) + " (" + mol.name + "). Molecule " +
                  "discarded.")

    # Reaching this point means the molecule is not usable.
    return None
示例#13
0
def deal_with_failed_molecules(contnrs, params):
    """Logs failed molecules and writes them to a file.

    A container counts as failed when it holds no molecules. Nothing is
    logged or written when there are no failures.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param params: The parameters, used to determine the filename that will
       contain the failed molecules. Only "output_folder" is read.
    :type params: dict
    """

    # One "<smiles>\t<name>" line per container that ended up empty.
    failed_ones = [
        contnr.orig_smi + "\t" + contnr.name
        for contnr in contnrs
        if len(contnr.mols) == 0
    ]

    # Nothing to report.
    if not failed_ones:
        return

    # Let the user know about every failed molecule.
    Utils.log("\n3D models could not be generated for the following entries:")
    Utils.log("\n".join(failed_ones))
    Utils.log("\n")

    # Write the failures to an smi file. The with-block makes sure the file
    # is closed even if the write fails.
    failed_path = params["output_folder"] + os.sep + "gypsum_dl_failed.smi"
    with open(failed_path, "w") as outfile:
        outfile.write("\n".join(failed_ones))
示例#14
0
def desalt_orig_smi(contnrs,
                    num_procs,
                    job_manager,
                    parallelizer_obj,
                    durrant_lab_filters=False):
    """If an input molecule has multiple unconnected fragments, this removes
       all but the largest fragment.

    The desalted molecule is added to its container. If desalting changed
    the SMILES, the container's original SMILES is updated and a note is
    added to the molecule's genealogy.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param num_procs: The number of processors to use. Not used here; the
       step always runs on a single processor.
    :type num_procs: int
    :param job_manager: The multiprocess mode. Not used here.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object. Not used here.
    :type parallelizer_obj: Parallelizer.Parallelizer
    :param durrant_lab_filters: Not used in this function. Defaults to
       False.
    :type durrant_lab_filters: bool, optional
    """

    Utils.log("Desalting all molecules (i.e., keeping only largest fragment).")

    # Desalt each of the molecule containers. This step is very fast, so let's
    # just run it on a single processor always.
    desalted = [desalter(x) for x in contnrs]

    # Go through each contnr and update the orig_smi_deslt. If we update it,
    # also add a note in the genealogy record. Each result is paired with
    # the container it came from. Filtering out failures first and then
    # indexing by position would attach molecules to the wrong containers.
    for cont, desalt_mol in zip(contnrs, desalted):
        if desalt_mol is None:
            # Nothing usable came back for this container.
            continue

        if cont.orig_smi != desalt_mol.orig_smi:
            desalt_mol.genealogy.append(desalt_mol.orig_smi_deslt +
                                        " (desalted)")
            cont.update_orig_smi(desalt_mol.orig_smi_deslt)

        cont.add_mol(desalt_mol)
示例#15
0
def merge_parameters(default, params):
    """Copy user-specified values into the dictionary of defaults.

    Every key of params must already exist in default, and its value must
    have the same type as the default value. The one exception is an int
    given where a float is expected, which is converted to float (in params
    too). Problems are reported through Utils.exception.

    :param default: The default values. Updated in place.
    :type default: dict
    :param params: The user-specified values.
    :type params: dict
    """

    # Same keys as default, but holding the type of each default value.
    expected_types = make_type_dict(default)

    for key in params:
        # Reject keys that do not correspond to any known parameter.
        if key not in default:
            Utils.log('Parameter "' + str(key) + '" not recognized!')
            Utils.log("Here are the options:")
            Utils.log(" ".join(sorted(default.keys())))
            Utils.exception("Unrecognized parameter: " + str(key))

        expected = expected_types[key]

        # The value must match the type of the default one.
        if not isinstance(params[key], expected):
            int_for_float = type(params[key]) is int and expected is float
            if int_for_float:
                # An int is acceptable where a float is wanted.
                params[key] = float(params[key])
            else:
                # Genuine type mismatch.
                Utils.exception(
                    'The parameter "'
                    + key
                    + '" must be of '
                    + "type "
                    + str(expected)
                    + ", but it is of type "
                    + str(type(params[key]))
                    + "."
                )

        # The user-defined value replaces the default.
        default[key] = params[key]
示例#16
0
def make_tauts(contnrs, max_variants_per_compound, thoroughness, num_procs,
               job_manager, let_tautomers_change_chirality, parallelizer_obj):
    """Generates tautomers of the molecules. Some generated tautomers are
    not realistic. If a certain improbable substructure keeps popping up,
    add it to the list in the `prohibited_substructures` definition found
    with MyMol.py, in the function remove_bizarre_substruc().

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: A list.
    :param max_variants_per_compound: Cap on the number of variants
       (molecules) per compound that are advanced to the next step, to keep
       the combinatorial explosion in check. If 0, nothing is done.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per retained variant
       for evaluation. E.g., to advance five molecules
       (max_variants_per_compound = 5) you could generate ten and keep the
       best five (thoroughness = 2). Values > 1 cost more compute but raise
       the chance of finding good molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multithred mode to use.
    :type job_manager: string
    :param let_tautomers_change_chirality: Whether to allow tautomers that
      change the total number of chiral centers.
    :type let_tautomers_change_chirality: bool
    :param parallelizer_obj: The Parallelizer object, or None to run
       serially.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # With no variants allowed there is nothing to generate.
    if max_variants_per_compound == 0:
        return

    Utils.log("Generating tautomers for all molecules...")

    # One argument tuple per molecule: its container, its index within that
    # container, and the variant cap.
    params = tuple(
        (contnr, mol_index, max_variants_per_compound)
        for contnr in contnrs
        for mol_index, _ in enumerate(contnr.mols)
    )

    # Generate the tautomers, serially when no parallelizer is available.
    if parallelizer_obj is None:
        tmp = [
            parallel_make_taut(args[0], args[1], args[2]) for args in params
        ]
    else:
        tmp = parallelizer_obj.run(params, parallel_make_taut, num_procs,
                                   job_manager)

    # The per-molecule results are lists; merge them into one flat list.
    taut_data = Parallelizer.flatten_list(tmp)

    # Drop tautomers that break aromatic rings.
    taut_data = tauts_no_break_arom_rngs(contnrs, taut_data, num_procs,
                                         job_manager, parallelizer_obj)

    # Optionally drop tautomers that eliminate chiral centers.
    if not let_tautomers_change_chirality:
        taut_data = tauts_no_elim_chiral(contnrs, taut_data, num_procs,
                                         job_manager, parallelizer_obj)

    # Retain only the top few variants per container so the number of
    # molecules does not explode.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs,
        taut_data,
        max_variants_per_compound,
        thoroughness,
    )
示例#17
0
    def __init__(self,
                 mol,
                 conformer=None,
                 second_embed=False,
                 use_random_coordinates=False):
        """Create a MyConformer objects.

        On success, self.mol holds a copy of the rdkit molecule with exactly
        one conformer, and self.energy, self.minimized and self.ids_hvy_atms
        are set. If no 3D coordinates could be generated, self.mol is set to
        False and those three attributes are not set.

        :param mol: The MyMol.MyMol associated with this conformer.
        :type mol: MyMol.MyMol
        :param conformer: An optional variable specifying the conformer to use.
           If not specified, it will create a new conformer. Defaults to None.
           Note that the id of a provided conformer is reset to 0.
        :type conformer: rdkit.Conformer, optional
        :param second_embed: Whether to try to generate 3D coordinates using an
            older algorithm if the better (default) algorithm fails. This can add
            run time, but sometimes converts certain molecules that would
            otherwise fail. Defaults to False.
        :type second_embed: bool, optional
        :param use_random_coordinates: The first conformer should not start
           from random coordinates, but rather the eigenvalues-based
           coordinates rdkit defaults to. But Gypsum-DL generates subsequent
           conformers to try to consider alternate geometries. So they should
           start from random coordinates. Defaults to False.
        :type use_random_coordinates: bool, optional
        """

        # Save some values to the object.
        self.mol = copy.deepcopy(mol.rdkit_mol)
        self.smiles = mol.smiles()

        # Remove any previous conformers.
        self.mol.RemoveAllConformers()

        if conformer is None:
            # The user is providing no conformer. So we must generate it.

            # Note that I have confirmed that the below respects chirality.
            # params is a list of ETKDGv2 parameters generated by this command
            # Description of these parameters can be found at
            # help(AllChem.EmbedMolecule)

            try:
                # Try to use ETKDGv2, but it is only present in the python 3.6
                # version of RDKit.
                params = AllChem.ETKDGv2()
            except Exception:
                # Use the original version of ETKDG if python 2.7 RDKit. This
                # may be resolved in next RDKit update so we encased this in a
                # try statement.
                params = AllChem.ETKDG()

            # The default, but just a sanity check. The RDKit field is
            # spelled enforceChirality; a misspelled name would only create
            # an unused Python attribute and leave the setting untouched.
            params.enforceChirality = True

            # Set a max number of times it will try to calculate the 3D
            # coordinates. Will save a little time.
            params.maxIterations = 0  # This should be the default but lets
            # set it anyway

            # Also set whether to start from random coordinates.
            params.useRandomCoords = use_random_coordinates

            # AllChem.EmbedMolecule uses geometry to create inital molecule
            # coordinates. This sometimes takes a very long time
            AllChem.EmbedMolecule(self.mol, params)

            # On rare occasions, the new conformer generating algorithm fails
            # because params.useRandomCoords = False. So if it fails, try
            # again with True.
            if self.mol.GetNumConformers() == 0 and not use_random_coordinates:
                params.useRandomCoords = True
                AllChem.EmbedMolecule(self.mol, params)

            # On very rare occasions, the new conformer generating algorithm
            # fails. For example, COC(=O)c1cc(C)nc2c(C)cc3[nH]c4ccccc4c3c12 .
            # In this case, the old one still works. So if no coordinates are
            # assigned, try that one. Parameters must have second_embed set to
            # True for this to happen.
            if second_embed and self.mol.GetNumConformers() == 0:
                AllChem.EmbedMolecule(self.mol,
                                      useRandomCoords=use_random_coordinates)

            # On rare occasions, both methods fail. For example,
            # O=c1cccc2[C@H]3C[NH2+]C[C@@H](C3)Cn21 Another example:
            # COc1cccc2c1[C@H](CO)[N@H+]1[C@@H](C#N)[C@@H]3C[C@@H](C(=O)[O-])[C@H]([C@H]1C2)[N@H+]3C
            if self.mol.GetNumConformers() == 0:
                # False is the marker for "no 3D coordinates available".
                self.mol = False
        else:
            # The user has provided a conformer. Just add it.
            conformer.SetId(0)
            self.mol.AddConformer(conformer, assignId=True)

        # Calculate some energies, other housekeeping.
        if self.mol is not False:
            try:
                ff = AllChem.UFFGetMoleculeForceField(self.mol)
                self.energy = ff.CalcEnergy()
            except Exception:
                # Catch Exception rather than everything, so that
                # KeyboardInterrupt / SystemExit still propagate.
                Utils.log("Warning: Could not calculate energy for molecule " +
                          Chem.MolToSmiles(self.mol))
                # Example of smiles that cause problem here without try...catch:
                # NC1=NC2=C(N[C@@H]3[C@H](N2)O[C@@H](COP(O)(O)=O)C2=C3S[Mo](S)(=O)(=O)S2)C(=O)N1
                self.energy = 9999
            self.minimized = False

            # Indices of all non-hydrogen atoms.
            self.ids_hvy_atms = [
                a.GetIdx() for a in self.mol.GetAtoms()
                if a.GetAtomicNum() != 1
            ]
示例#18
0
    def __init__(self, starter, name=""):
        """Initialize the MyMol object.

        :param starter: The object (smiles or rdkit.Mol) on which to build this
           class.
        :type starter: str or rdkit.Mol
        :param name: An optional string, the name of this molecule. Defaults to "".
        :type name: str, optional
        """

        if isinstance(starter, str):
            # It's a SMILES string. The rdkit mol and canonical SMILES are
            # left empty here, to be filled in later.
            self.rdkit_mol = ""
            self.can_smi = ""
            smiles = starter
        else:
            # So it's an rdkit mol object.
            self.rdkit_mol = starter  # No need to regenerate this, since already provided.

            # Get the smiles too from the rdkit mol object.
            try:
                smiles = Chem.MolToSmiles(self.rdkit_mol,
                                          isomericSmiles=True,
                                          canonical=True)

                # In this case you know it's cannonical.
                self.can_smi = smiles
            except Exception:
                # Sometimes this conversion just can't happen. Happened once
                # with this beast, for example:
                # CC(=O)NC1=CC(=C=[N+]([O-])O)C=C1O
                self.can_smi = False

                # No SMILES could be derived. Bind the name anyway, so the
                # assignments below do not fail with UnboundLocalError.
                smiles = ""

                id_to_print = name if name != "" else str(starter)
                Utils.log(
                    "\tERROR: Could not generate one of the structures " +
                    "for (" + id_to_print + ").")

        self.can_smi_noh = ""
        self.orig_smi = smiles

        # Default assumption is that they are the same.
        self.orig_smi_deslt = smiles
        self.name = name
        self.conformers = []

        # The empty string is used throughout as the "not yet determined"
        # marker for lazily computed values.
        self.nonaro_ring_atom_idx = ""
        self.chiral_cntrs_only_assigned = ""
        self.chiral_cntrs_include_unasignd = ""
        self.bizarre_substruct = ""
        self.enrgy = {}  # different energies for different conformers.
        self.minimized_enrgy = {}
        self.contnr_idx = ""
        self.frgs = ""
        self.stdrd_smiles = ""
        self.mol_props = {}
        self.idxs_low_energy_confs_no_opt = {}
        self.idxs_of_confs_to_min = set([])
        self.genealogy = []  # Keep track of how the molecule came to be.

        # Makes the molecule if a smiles was provided. Sanitizes the molecule
        # regardless.
        self.make_mol_frm_smiles_sanitze()
示例#19
0
    def remove_bizarre_substruc(self):
        """Checks whether this molecule has improbable substuctures, likely
           generated from the tautomerization process. Used to find
           artifacts. Despite the name, nothing is removed here; the caller
           decides what to do with the answer.

        :return: Boolean, whether or not there are impossible substructures.
           Also saves to self.bizarre_substruct (except when the rdkit mol
           is None, in which case True is returned without caching).
        :rtype: bool
        """

        if self.bizarre_substruct != "":
            # Already been determined.
            return self.bizarre_substruct

        if self.rdkit_mol is None:
            # It is bizarre to have a molecule with no atoms in it.
            return True

        # These are substrutures that can't be easily corrected using
        # fix_common_errors() below.

        # Note that C(O)=N, C and N mean they are aliphatic. Does not match
        # c(O)n, when aromatic. So this form is acceptable if in aromatic
        # structure.
        prohibited_substructures = [
            "O(=*)-*",
            # Enol forms with terminal alkenes are unlikely.
            "C(=[CH2])[OH]",
            "C(=[CH2])[O-]",
            # A geminal vinyl diol is not a tautomer of a carboxylate group.
            "C=C([OH])[OH]",
            "C=C([O-])[OH]",
            "C=C([O-])[O-]",
            # No carbanions.
            "[C-]",
            "[c-]",
        ]

        # The SMILES strings to scan. A value of None or False marks a
        # failed conversion and is skipped; a substring test on it would
        # raise a TypeError.
        smiles_to_scan = [
            smi
            for smi in (self.orig_smi, self.orig_smi_deslt, self.can_smi)
            if smi is not None and smi is not False
        ]

        for s in prohibited_substructures:
            # First just match strings... could be faster, but not 100%
            # accurate.
            for smi in smiles_to_scan:
                if s in smi:
                    Utils.log("\tDetected unusual substructure: " + s)
                    self.bizarre_substruct = True
                    return True

        # Now do actual substructure matching
        for s in prohibited_substructures:
            pattrn = Chem.MolFromSmarts(s)
            if self.rdkit_mol.HasSubstructMatch(pattrn):
                Utils.log("\tDetected unusual substructure: " + s)
                self.bizarre_substruct = True
                return True

        # Now certin patterns that are more complex.
        # TODO in the future?

        self.bizarre_substruct = False
        return False
示例#20
0
def prepare_smiles(contnrs, params):
    """Runs the appropriate steps for processing the SMILES strings.

    Desalting always runs. The remaining stages (ionization, tautomer
    generation, the Durrant-lab filters, chirality enumeration, and
    double-bond enumeration) each run only when the corresponding entry in
    params allows it. The current SMILES are reported through
    Utils.print_current_smiles after every stage that follows desalting.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param params: The user parameters.
    :type params: dict
    """

    # Read all the shared settings up front.
    ph_lower = params["min_ph"]
    ph_upper = params["max_ph"]
    pka_std_dev = params["pka_precision"]
    variant_cap = params["max_variants_per_compound"]
    thorough = params["thoroughness"]
    procs = params["num_processors"]
    manager = params["job_manager"]
    tauts_may_flip_chirality = params["let_tautomers_change_chirality"]
    parallelizer = params["Parallelizer"]

    show_progress = True

    # Desalting is not optional: the program always desalts.
    desalt_orig_smi(contnrs, procs, manager, parallelizer)

    # When the Durrant-lab filters are on, discard containers whose desalted
    # SMILES contains a prohibited substring (metals, etc.). Other compounds
    # are only filtered at a later stage.
    if params["use_durrant_lab_filters"] == True:
        survivors = []
        for candidate in contnrs:
            if durrant_lab_contains_bad_substr(candidate.orig_smi_deslt):
                continue
            survivors.append(candidate)
        contnrs = survivors

    if show_progress:
        Utils.print_current_smiles(contnrs)

    # Ionize for the user-specified pH range, unless told to skip.
    if params["skip_adding_hydrogen"]:
        Utils.log("Skipping ionization")
        wrap_molecules(contnrs)
    else:
        add_hydrogens(
            contnrs,
            ph_lower,
            ph_upper,
            pka_std_dev,
            variant_cap,
            thorough,
            procs,
            manager,
            parallelizer,
        )

    if show_progress:
        Utils.print_current_smiles(contnrs)

    # Generate alternate tautomeric forms, unless told to skip.
    if params["skip_making_tautomers"]:
        Utils.log("Skipping tautomerization")
    else:
        make_tauts(
            contnrs,
            variant_cap,
            thorough,
            procs,
            manager,
            tauts_may_flip_chirality,
            parallelizer,
        )

    if show_progress:
        Utils.print_current_smiles(contnrs)

    # Apply the full Durrant-lab filters, if requested.
    if params["use_durrant_lab_filters"]:
        durrant_lab_filters(contnrs, procs, manager, parallelizer)
    else:
        Utils.log("Not applying Durrant-lab filters")

    if show_progress:
        Utils.print_current_smiles(contnrs)

    # Enumerate alternate chiral forms, unless told to skip.
    if params["skip_enumerate_chiral_mol"]:
        Utils.log("Skipping chirality enumeration")
    else:
        enumerate_chiral_molecules(
            contnrs, variant_cap, thorough, procs, manager, parallelizer
        )

    if show_progress:
        Utils.print_current_smiles(contnrs)

    # Enumerate alternate double-bond isomers, unless told to skip.
    if params["skip_enumerate_double_bonds"]:
        Utils.log("Skipping double bond enumeration")
    else:
        enumerate_double_bonds(
            contnrs, variant_cap, thorough, procs, manager, parallelizer
        )

    if show_progress:
        Utils.print_current_smiles(contnrs)
示例#21
0
def load_smiles_file(filename):
    """Loads a smiles file.

    A smiles file contains one molecule per line: the SMILES string, then
    white space, then (optionally) the molecule name. Blank lines are
    ignored. Runs of white space inside a name are collapsed to single
    spaces.

    Entries without a name are called "untitled_line_N", where N is the
    1-based position of the entry among the non-blank lines. An entry whose
    name was already used is renamed "NAME_copy_K" (K = 2, 3, ...), skipping
    any K whose resulting name is itself already taken, so the returned names
    are always unique.

    :param filename: The filename.
    :type filename: str
    :return: A list of tuples, (SMILES, Name, {}). The third item is a new,
        empty dictionary for each entry.
    :rtype: list
    """

    data = []
    duplicate_names = {}  # Original name -> highest copy number handed out.
    names_seen = set()  # Every name already assigned to an entry.

    # Use a context manager so the file handle is always closed.
    with open(filename) as smiles_file:
        for line in smiles_file:
            line = line.strip()
            if line == "":
                continue

            # From that line, get the smiles string and name.
            chunks = line.split()
            smiles = chunks[0]
            name = " ".join(chunks[1:])

            # 1-based position of this entry (blank lines are not counted).
            entry_number = len(data) + 1

            # Handle unnamed ligands.
            if name == "":
                name = "untitled_line_{}".format(entry_number)
                Utils.log(
                    ("\tUntitled ligand on line {}. Naming that ligand " +
                     "{}. All associated files will be refered to with " +
                     "this name.").format(entry_number, name))

            # Handle duplicate ligands in same list.
            if name in names_seen:
                copy_number = duplicate_names.get(name, 1) + 1
                new_name = "{}_copy_{}".format(name, copy_number)

                # The generated name could itself already be in use (e.g.,
                # the file really contains a ligand called "x_copy_2"), so
                # keep counting until the name is unique.
                while new_name in names_seen:
                    copy_number += 1
                    new_name = "{}_copy_{}".format(name, copy_number)
                duplicate_names[name] = copy_number

                Utils.log(
                    "\nMultiple entries with the ligand name: {}".format(
                        name))
                Utils.log(
                    "\tThe veresion of the ligand on line {} will be retitled {}"
                    .format(entry_number, new_name))
                Utils.log(
                    "\tAll associated files will be refered to with this name"
                )
                name = new_name

            # Save the data for this line.
            names_seen.add(name)
            data.append((smiles, name, {}))

    # Return the data.
    return data
示例#22
0
def parallel_get_chiral(mol, max_variants_per_compound, thoroughness):
    """Enumerates chirality variants of one molecule (parallelizable).

    Only chiral centers reported as unassigned ("?") by
    mol.chiral_cntrs_w_unasignd() are varied.

    :param mol: The input molecule.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :return: A list of MyMol.MyMol objects.
    :rtype: list
    """

    # Indexes of the chiral centers that the input leaves unassigned.
    open_center_idxs = [
        center[0] for center in mol.chiral_cntrs_w_unasignd() if center[1] == "?"
    ]
    center_count = len(open_center_idxs)

    # Nothing to vary: hand back the molecule as is.
    if center_count == 0:
        return [mol]

    if center_count == 1:
        # A single center only has the two assignments.
        options = ["R", "S"]
    else:
        # Grow the assignment lists one center at a time. Stop early once
        # there are already more than enough, because this doubles on every
        # pass (so some combinations will never be evaluated).
        seeds = [["R"], ["S"]]
        options = [["R"], ["S"]]
        for _ in range(center_count - 1):
            if len(options) > thoroughness * max_variants_per_compound:
                break
            options = [prefix + seed for prefix in options for seed in seeds]

    # Let the user know the number of chiral centers.
    Utils.log(
        "\t"
        + mol.smiles(True)
        + " ("
        + mol.name
        + ") has "
        + str(2 ** center_count)
        + " enantiomers when chiral centers with "
        + "no specified chirality are systematically varied."
    )

    # Keep only a random subset of the combinations, again to limit the
    # combinatorial explosion.
    sample_size = thoroughness * max_variants_per_compound
    options = Utils.random_sample(options, sample_size, "")

    # RDKit chiral tag that corresponds to each label.
    tag_for_label = {
        "R": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
        "S": Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
    }

    variants = []
    for assignment in options:
        # Work on a copy of the initial rdkit molecule.
        rd_copy = copy.copy(mol.rdkit_mol)

        # Apply this combination of labels to the unassigned centers.
        for atom_idx, label in zip(open_center_idxs, assignment):
            tag = tag_for_label.get(label)
            if tag is not None:
                rd_copy.GetAtomWithIdx(atom_idx).SetChiralTag(tag)

        # Wrap it in a new MyMol.MyMol object.
        candidate = MyMol.MyMol(rd_copy)

        # Drop it if it has a bizarre substructure.
        if candidate.remove_bizarre_substruc():
            continue

        candidate.contnr_idx = mol.contnr_idx
        candidate.name = mol.name
        candidate.genealogy = mol.genealogy[:]
        candidate.genealogy.append(candidate.smiles(True) + " (chirality)")
        variants.append(candidate)

    # Return the results.
    return variants
示例#23
0
                    Durrant lab. See README.md for more details.",
)

PARSER.add_argument(
    "--2d_output_only",
    action="store_true",
    help="Skips the generate-3D-models step.",
)
PARSER.add_argument(
    "--cache_prerun",
    "-c",
    action="store_true",
    help="Run this before running Gypsum-DL in mpi mode.",
)
PARSER.add_argument(
    "--test",
    action="store_true",
    help="Tests Gypsum-DL to check for programming bugs.",
)

# Parse the command line into a plain dictionary.
ARGS_DICT = vars(PARSER.parse_args())

if ARGS_DICT["test"]:
    # Run the built-in test instead of processing user input.
    run_test()
elif not ARGS_DICT["cache_prerun"]:
    # Normal run: pass along only the options that have a value (drop every
    # entry that is None).
    INPUTS = copy.deepcopy(ARGS_DICT)
    for k, v in ARGS_DICT.items():
        if v is not None:
            continue
        INPUTS.pop(k)

    prepare_molecules(INPUTS)
    Utils.log("Finished Gypsum-DL")

# With --cache_prerun (and without --test), nothing further is done here.
def parallel_get_double_bonded(mol, max_variants_per_compound, thoroughness):
    """A parallelizable function for enumerating double bonds.

    Note that mol.rdkit_mol is replaced by the result of Chem.AddHs, even
    when the molecule is returned unchanged otherwise.

    :param mol: The molecule with a potentially unspecified double bond.
    :type mol: MyMol.MyMol
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :return: A list containing only the input molecule if there was no double
       bond to enumerate. Otherwise, a list of new MyMol.MyMol objects, one
       per enumerated cis/trans variant that has a valid canonical SMILES and
       no bizarre substructure (this list can be empty).
    :rtype: list
    """

    # For this to work, you need to have explicit hydrogens in place.
    mol.rdkit_mol = Chem.AddHs(mol.rdkit_mol)

    # Get all double bonds that don't have defined stereochemistry. Note that
    # these are the bond indexes, not the atom indexes.
    unasignd_dbl_bnd_idxs = mol.get_double_bonds_without_stereochemistry()

    if len(unasignd_dbl_bnd_idxs) == 0:
        # There are no unassigned double bonds, so move on.
        return [mol]

    # Throw out any bond that is in a small ring (three to seven members).
    small_ring_sizes = (3, 4, 5, 6, 7)
    unasignd_dbl_bnd_idxs = [
        i
        for i in unasignd_dbl_bnd_idxs
        if not any(
            mol.rdkit_mol.GetBondWithIdx(i).IsInRingSize(size)
            for size in small_ring_sizes
        )
    ]

    # Previously, I fully enumerated all double bonds. When there are many
    # such bonds, that leads to a combinatorial explosion that causes problems
    # in terms of speed and memory. Now, enumerate only enough bonds to make
    # sure you generate at least thoroughness * max_variants_per_compound.
    unasignd_dbl_bnd_idxs_orig_count = len(unasignd_dbl_bnd_idxs)
    num_bonds_to_keep = int(
        math.ceil(math.log(thoroughness * max_variants_per_compound, 2))
    )
    random.shuffle(unasignd_dbl_bnd_idxs)
    unasignd_dbl_bnd_idxs = sorted(unasignd_dbl_bnd_idxs[:num_bonds_to_keep])

    # Get a list of all the single bonds that come off each double-bond atom.
    all_sngl_bnd_idxs = set()
    dbl_bnd_count = 0
    for dbl_bnd_idx in unasignd_dbl_bnd_idxs:
        bond = mol.rdkit_mol.GetBondWithIdx(dbl_bnd_idx)

        bnd_idxs_frm_atm1 = [b.GetIdx() for b in bond.GetBeginAtom().GetBonds()]
        bnd_idxs_frm_atm2 = [b.GetIdx() for b in bond.GetEndAtom().GetBonds()]

        if len(bnd_idxs_frm_atm1) == 1 or len(bnd_idxs_frm_atm2) == 1:
            # For one of the atoms, the only bond is the one you already
            # know about. So don't save.
            continue

        dbl_bnd_count = dbl_bnd_count + 1

        # Suffice it to say, RDKit does not deal with cis-trans isomerization
        # in an intuitive way... Keep every bond on either atom except the
        # double bond itself.
        bnd_idxs_frm_atm1.remove(dbl_bnd_idx)
        bnd_idxs_frm_atm2.remove(dbl_bnd_idx)

        all_sngl_bnd_idxs |= set(bnd_idxs_frm_atm1)
        all_sngl_bnd_idxs |= set(bnd_idxs_frm_atm2)

    # Now come up with all possible up/down combinations for those bonds.
    all_sngl_bnd_idxs = list(all_sngl_bnd_idxs)
    all_atom_config_options = list(
        itertools.product([True, False], repeat=len(all_sngl_bnd_idxs))
    )

    # Let the user know.
    if dbl_bnd_count > 0:
        Utils.log(
            "\t"
            + mol.smiles(True)
            + " has "
            + str(
                # Not exactly right, I think, because should be dbl_bnd_count, but ok.
                unasignd_dbl_bnd_idxs_orig_count
            )
            + " double bond(s) with unspecified stereochemistry."
        )

    # Go through and consider each of the retained combinations.
    smiles_to_consider = set()
    for atom_config_options in all_atom_config_options:
        # Make a copy of the original RDKit molecule.
        a_rd_mol = copy.copy(mol.rdkit_mol)

        for bond_idx, direc in zip(all_sngl_bnd_idxs, atom_config_options):
            # Always done with reference to the atom in the double bond.
            if direc:
                a_rd_mol.GetBondWithIdx(bond_idx).SetBondDir(Chem.BondDir.ENDUPRIGHT)
            else:
                a_rd_mol.GetBondWithIdx(bond_idx).SetBondDir(Chem.BondDir.ENDDOWNRIGHT)

        # Assign the StereoChemistry. Required to actually set it.
        a_rd_mol.ClearComputedProps()
        Chem.AssignStereochemistry(a_rd_mol, force=True)

        # Add to list of ones to consider
        try:
            smiles_to_consider.add(
                Chem.MolToSmiles(a_rd_mol, isomericSmiles=True, canonical=True)
            )
        except Exception:
            # Some molecules still give troubles. Unfortunate, but these are
            # rare cases. Let's just skip these. Example:
            # CN1C2=C(C=CC=C2)C(C)(C)[C]1=[C]=[CH]C3=CC(=C(O)C(=C3)I)I
            # (Catch Exception rather than everything, so KeyboardInterrupt
            # and SystemExit still propagate.)
            continue

    # Remove ones that don't have "/" or "\". These are not real enumerated ones.
    smiles_to_consider = [s for s in smiles_to_consider if "/" in s or "\\" in s]

    # Get the number of / + \ in each string.
    cnts = [s.count("/") + s.count("\\") for s in smiles_to_consider]

    if len(cnts) == 0:
        # There are no appropriate double bonds. Move on...
        return [mol]

    max_cnts = max(cnts)

    # Only keep those with that same max count. The others have double bonds
    # that remain unspecified.
    smiles_to_consider = [
        s[0] for s in zip(smiles_to_consider, cnts) if s[1] == max_cnts
    ]
    results = []
    for smile_to_consider in smiles_to_consider:
        # Make a new MyMol.MyMol object with the specified smiles.
        new_mol = MyMol.MyMol(smile_to_consider)

        if new_mol.can_smi != False and new_mol.can_smi != None:
            # Sometimes you get an error if there's a bad structure otherwise.

            # Add the new molecule to the list of results, if it does not have
            # a bizarre substructure.
            if not new_mol.remove_bizarre_substruc():
                new_mol.contnr_idx = mol.contnr_idx
                new_mol.name = mol.name
                new_mol.genealogy = mol.genealogy[:]
                new_mol.genealogy.append(
                    new_mol.smiles(True) + " (cis-trans isomerization)"
                )
                results.append(new_mol)

    # Return the results.
    return results
示例#25
0
def run_test():
    """Runs Gypsum-DL on the bundled sample molecules and checks the output.

    The sample file "sample_molecules.smi" (next to this script) is processed
    into a fresh "gypsum_dl_test_output" folder. The test then checks the
    number of output files and compares the SMILES strings recorded in those
    files against the expected set. Failures are reported through
    Utils.exception. The output folder is deleted again once both checks
    have been reported.
    """

    script_dir = os.path.dirname(os.path.realpath(__file__))
    output_folder = script_dir + os.sep + "gypsum_dl_test_output" + os.sep

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    # Make the directory
    os.mkdir(output_folder)

    # Make the Gypsum-DL parameters.
    params = {
        "source": script_dir + os.sep + "sample_molecules.smi",
        "separate_output_files": True,
        "job_manager": "serial",  # multiprocessing
        "output_folder": output_folder,
        "add_pdb_output": False,
        "max_variants_per_compound": 8,
        "thoroughness": 1,
        "min_ph": 4,
        "max_ph": 10,
        "pka_precision": 1,
        "use_durrant_lab_filters": True,
    }

    # Prepare the molecules.
    prepare_molecules(params)
    Utils.log("")
    Utils.log("TEST RESULTS")
    Utils.log("============")

    # Get the output sdf files.
    sdf_files = glob.glob(output_folder + "*")

    # There should be 15 output files.
    msg = "Expected 15 output files, got " + str(len(sdf_files)) + "."
    if len(sdf_files) != 15:
        Utils.exception("FAILED. " + msg)
    else:
        Utils.log("PASSED. " + msg)

    # Get all the smiles from the files. The SMILES string is on the line
    # that follows the "<SMILES>" tag.
    all_smiles = set([])
    for sdf_file in sdf_files:
        with open(sdf_file) as sdf_handle:
            lines = sdf_handle.readlines()
        for i, line in enumerate(lines):
            # Ignore a tag on the very last line (nothing follows it).
            if "<SMILES>" in line and i + 1 < len(lines):
                all_smiles.add(lines[i + 1].strip())

    # List what the smiles should be.
    target_smiles = set([])

    # salt_and_ionization should produce two models (ionized and
    # deionized).
    target_smiles |= set(["[O-]c1ccccc1", "Oc1ccccc1"])

    # tautomer_and_cis_trans should produce three models (two tautomers, one
    # of them with alternate cis/trans).
    target_smiles |= set([r"C/C=C\O", "C/C=C/O", "CCC=O"])

    # two_chiral_one_unspecified_and_tautomer should produce four models.
    target_smiles |= set([
        "CC(C)C(=O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)=C(O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)C(=O)[C@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)=C(O)[C@](F)(Cl)C[C@@](C)(F)Cl",
    ])

    # two_double_bonds_one_chiral_center should produce eight models.
    target_smiles |= set([
        r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
        "CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
        r"CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
        r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
        "CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
    ])

    # two_double_bonds_one_unspecified should produce two models.
    target_smiles |= set(
        [r"CC/C(C)=C(\Cl)C/C(I)=C(\C)F", r"CC/C(C)=C(/Cl)C/C(I)=C(\C)F"])

    # non_aromatic_ring should produce one model. It will list it several
    # times, because different ring conformations of the same model.
    target_smiles |= set(["CC(C)(C)[C@H]1CC[C@@H](C(C)(C)C)CC1"])

    # There should be no =[N-] if Durrant lab filters are turned on. Note:
    # Removed "CC(=N)O" from below list because durrant lab filters now remove
    # iminols.
    target_smiles |= set(["CC([NH-])=O", "CC(N)=O"])

    # There should be no [N-]C=[N+] (CC(=O)[N-]C=[N+](C)C).
    target_smiles |= set([
        r"C/C(O)=N\C=[N+](C)C",
        r"CC(=O)/N=C\[NH+](C)C",
        "CC(=O)/N=C/[NH+](C)C",
        "CC(=O)NC=[N+](C)C",
        "C/C(O)=N/C=[N+](C)C",
    ])

    # There should be no [nH+]c[n-] (c1c[nH+]c[n-]1)
    target_smiles |= set(["c1c[n-]cn1", "c1c[nH+]c[nH]1", "c1c[nH]cn1"])

    # There should be no [#7+]~[#7+] (c1cc[nH+][nH+]c1)
    target_smiles |= set(["c1ccnnc1", "c1cc[nH+]nc1"])

    # There should be no [#7-]~[#7-] (CC(=O)[N-][N-]C(C)=O). Note that some
    # are commented out because Python2 and Python3 give different SMILES
    # strings that are all valid. See below to see how things are
    # consolidated. (Really this was probably a bad example to pick because
    # there are so many forms...)
    target_smiles |= set([
        "CC(=O)NNC(C)=O",
        # r"CC(=O)N/N=C(\C)O",
        # r"CC(=O)[N-]/N=C(/C)O",
        # r"C/C(O)=N/N=C(\C)O",
        r"C/C(O)=N\N=C(/C)O",
        # r"CC(=O)[N-]/N=C(\C)O",
        # "CC(=O)[N-]NC(C)=O",
        # "CC(=O)N/N=C(/C)O"
    ])

    # There should be no [!#7]~[#7+]~[#7-]~[!#7] (c1c[n-][nH+]c1)
    target_smiles |= set(["c1cn[n-]c1", "c1cn[nH]c1", "c1c[nH][nH+]c1"])

    # Azides can have adjacent +/- nitrogens.
    target_smiles |= set(["CN=[N+]=[N-]", "CN=[N+]=N"])

    # Python3 gives some smiles that are different than those obtained with
    # Python2. But they are just different representations of the same thing.
    # Let's make the switch to the Python2 form for this test.
    all_smiles = set(
        ["CN=[N+]=N" if s == "[H]N=[N+]=NC" else s for s in all_smiles])

    # Note: The equivalent consolidation for "CC(=N)O" is no longer needed
    # because durrant lab filters now remove iminols.

    all_smiles = set([
        r"C/C(O)=N\N=C(/C)O" if s == r"C/C(O)=N/N=C(/C)O" else
        s  # Different one that turns up sometimes
        for s in all_smiles
    ])
    all_smiles = set([
        r"CC(=O)NNC(C)=O" if s in [
            r"CC(=O)[N-]/N=C(\C)O",
            r"C/C(O)=N/N=C(\C)O",
            r"CC(=O)N/N=C(\C)O",
            r"CC(=O)[N-]/N=C(/C)O",
            r"CC(=O)[N-]NC(C)=O",
            r"CC(=O)N/N=C(/C)O",
        ] else s  # Different one that turns up sometimes
        for s in all_smiles
    ])

    if len(all_smiles ^ target_smiles) > 0:
        # Show both sets to help diagnose the mismatch. (No interactive
        # debugger here: a breakpoint would hang non-interactive runs.)
        print(all_smiles)
        print(target_smiles)

        Utils.exception(
            "FAILED. " +
            "Got some SMILES I didn't expect (either in output or target list): "
            + " ".join(list(all_smiles ^ target_smiles)))
    else:
        Utils.log(
            "PASSED. Gypsum-DL output the very SMILES strings I was expecting."
        )

    Utils.log("")

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
示例#26
0
def enumerate_chiral_molecules(
    contnrs,
    max_variants_per_compound,
    thoroughness,
    num_procs,
    job_manager,
    parallelizer_obj,
):
    """Enumerates all possible enantiomers of a molecule. If the chirality of
       an atom is given, that chiral center is not varied. Only the chirality
       of unspecified chiral centers is varied.

    The selected variants are handed to
    ChemUtils.bst_for_each_contnr_no_opt together with contnrs; nothing is
    returned.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, perhaps you want to advance five
       molecules (max_variants_per_compound = 5). You could just generate five
       and advance them all. Or you could generate ten and advance the best
       five (so thoroughness = 2). Using thoroughness > 1 increases the
       computational expense, but it also increases the chances of finding good
       molecules.
    :type thoroughness: int
    :param num_procs: The number of processors to use.
    :type num_procs: int
    :param job_manager: The multiprocess mode.
    :type job_manager: string
    :param parallelizer_obj: The Parallelizer object. If None, the molecules
       are processed one after the other in this process.
    :type parallelizer_obj: Parallelizer.Parallelizer
    """

    # No point in continuing none requested.
    if max_variants_per_compound == 0:
        return

    Utils.log("Enumerating all possible enantiomers for all molecules...")

    # Group the molecules so you can feed them to parallelizer. The order of
    # each tuple matches the signature of parallel_get_chiral:
    # (mol, max_variants_per_compound, thoroughness).
    params = tuple(
        (mol, max_variants_per_compound, thoroughness)
        for contnr in contnrs
        for mol in contnr.mols
    )

    # Run it through the parallelizer.
    if parallelizer_obj is not None:
        tmp = parallelizer_obj.run(params, parallel_get_chiral, num_procs, job_manager)
    else:
        tmp = [parallel_get_chiral(i[0], i[1], i[2]) for i in params]

    # Remove Nones (failed molecules)
    clean = Parallelizer.strip_none(tmp)

    # Flatten the data into a single list.
    flat = Parallelizer.flatten_list(clean)

    # Get the indexes of the ones that failed to generate.
    contnr_idxs_of_failed = Utils.fnd_contnrs_not_represntd(contnrs, flat)

    # Go through the missing ones and throw a message.
    for miss_indx in contnr_idxs_of_failed:
        Utils.log(
            "\tCould not generate valid enantiomers for "
            + contnrs[miss_indx].orig_smi
            + " ("
            + contnrs[miss_indx].name
            + "), so using existing "
            + "(unprocessed) structures."
        )
        for mol in contnrs[miss_indx].mols:
            mol.genealogy.append("(WARNING: Unable to generate enantiomers)")
            # Add the existing molecule to the flat list, which is the one
            # passed on below. (Appending to clean here would have no effect,
            # because clean has already been flattened.)
            flat.append(mol)

    # Keep only the top few compound variants in each container, to prevent a
    # combinatorial explosion.
    ChemUtils.bst_for_each_contnr_no_opt(
        contnrs, flat, max_variants_per_compound, thoroughness
    )
示例#27
0
def parallel_make_taut(contnr, mol_index, max_variants_per_compound):
    """Makes alternate tautomers for a given molecule container. This is the
       function that gets fed into the parallelizer.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param mol_index: The molecule index.
    :type mol_index: int
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :return: A list of MyMol.MyMol objects, containing the alternate
        tautomeric forms. None if the molecule could not be converted to an
        RDKit molecule or failed the sanitization check.
    :rtype: list
    """

    # Get the MyMol.MyMol within the molecule container corresponding to the
    # given molecule index.
    mol = contnr.mols[mol_index]

    # Create a temporary RDKit mol object, since that's what MolVS works with.
    # TODO: There should be a copy function
    m = MyMol.MyMol(mol.smiles()).rdkit_mol

    # For tautomers to work, you need to not have any explicit hydrogens.
    # Only strip them if there is a molecule to begin with: the None check
    # has to come first, because RemoveHs cannot take None.
    if m is not None:
        m = Chem.RemoveHs(m)

    # Make sure it's not None.
    if m is None:
        Utils.log("\tCould not generate tautomers for " + contnr.orig_smi +
                  ". I'm deleting it.")
        return None

    # Molecules should be kekulized already, but let's double check that.
    # Because MolVS requires kekulized input.
    Chem.Kekulize(m)
    m = MOH.check_sanitization(m)
    if m is None:
        return None

    # Limit to max_variants_per_compound tauts. Note that another batch could
    # add more, so you'll need to once again trim to this number later. But
    # this could at least help prevent the combinatorial explosion at this
    # stage.
    enum = tautomer.TautomerEnumerator(max_tautomers=max_variants_per_compound)
    tauts_rdkit_mols = enum.enumerate(m)

    # Make all those tautomers into MyMol objects.
    tauts_mols = [MyMol.MyMol(taut) for taut in tauts_rdkit_mols]

    # Keep only those that have reasonable substructures.
    tauts_mols = [t for t in tauts_mols if not t.remove_bizarre_substruc()]

    # If there's more than one, let the user know that.
    if len(tauts_mols) > 1:
        Utils.log("\t" + mol.smiles(True) + " has tautomers.")

    # Now collect the final results.
    results = []

    # The SMILES of the input molecule, to tell real tautomers apart from the
    # input form itself.
    orig_smiles = mol.smiles()

    for tm in tauts_mols:
        tm.inherit_contnr_props(contnr)
        tm.genealogy = mol.genealogy[:]
        tm.name = mol.name

        # Only record a genealogy step if the form actually changed.
        if tm.smiles() != orig_smiles:
            tm.genealogy.append(tm.smiles(True) + " (tautomer)")

        results.append(tm)

    return results
示例#28
0
def prepare_molecules(args):
    """A function for preparing small-molecule models for docking. To work, it
    requires that the python module rdkit be installed on the system.

    The run parameters come from a json file if "json" is one of the
    arguments, and from the arguments themselves otherwise. The function then
    sets up the Parallelizer, loads the input molecules, wraps each usable
    SMILES string in a MolContainer, runs execute_gypsum_dl on the containers,
    and logs the start, end, and total run time.

    :param args: The arguments, from the commandline.
    :type args: dict
    """

    # Keep track of the time the program starts.
    start_time = datetime.now()

    # A list of command-line parameters that will be ignored if using a json
    # file.
    json_warning_list = [
        "source",
        "output_folder",
        "num_processors",
        "min_ph",
        "max_ph",
        "delta_ph_increment",
        "thoroughness",
        "max_variants_per_compound",
        "pka_precision",
    ]

    # Whether to warn the user that the above parameters, if specified, will
    # be ignored.
    need_to_print_override_warning = False

    if "json" in args:
        # "json" is one of the parameters, so we'll be ignoring the rest.
        try:
            # The context manager closes the file even if parsing fails.
            with open(args["json"]) as json_file:
                params = json.load(json_file)
        except Exception:
            Utils.exception("Is your input json file properly formed?")

        params = set_parameters(params)
        if any(key in args for key in json_warning_list):
            need_to_print_override_warning = True
    else:
        # We're actually going to use all the command-line parameters. No
        # warning necessary.
        params = set_parameters(args)

    # If running in serial mode, make sure only one processor is used.
    if params["job_manager"] == "serial":
        if params["num_processors"] != 1:
            Utils.log(
                "Because --job_manager was set to serial, this will be run on a single processor."
            )
        params["num_processors"] = 1

    # Handle mpi errors if mpi4py isn't installed
    if params["job_manager"] == "mpi":

        # Before executing Parallelizer with mpi4py (which override python raise Exceptions)
        # We must check that it is being run with the "-m mpi4py" runpy flag
        if "runpy" not in sys.modules:
            printout = "\nTo run in mpi mode you must run with -m flag. ie) mpirun -n $NTASKS python -m mpi4py run_gypsum_dl.py\n"
            print(printout)
            Utils.exception(printout)

        # Check mpi4py import
        try:
            import mpi4py
        except ImportError:
            printout = "\nmpi4py not installed but --job_manager is set to mpi. \n Either install mpi4py or switch job_manager to multiprocessing or serial.\n"
            print(printout)
            Utils.exception(printout)

        # Check mpi4py import version. This must be at version 2.1.0 and
        # higher. Only the major and minor components are parsed, so that
        # suffixes in later components (e.g. "4.0.0.dev0", "3.1.0rc1") do not
        # break the integer conversion.
        mpi4py_version = tuple(
            int(x) for x in mpi4py.__version__.split(".")[:2]
        )

        if mpi4py_version < (2, 1):
            printout = "\nmpi4py version 2.1.0 or higher is required. Use the 'python -m mpi4py' flag to run in mpi mode.\nPlease update mpi4py to a newer version, or switch job_manager to multiprocessing or serial.\n"
            print(printout)
            Utils.exception(printout)

    # Throw a message if running on windows. Windows doesn't deal with
    # multiple processors, so use only 1.
    if sys.platform == "win32":
        Utils.log(
            "WARNING: Multiprocessing is not supported on Windows. Tasks will be run in Serial mode."
        )
        params["num_processors"] = 1
        params["job_manager"] = "serial"

    # Launch mpi workers if that's what's specified.
    if params["job_manager"] == "mpi":
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"]
        )
    else:
        # Lower-level mpi (i.e. making a new Parallelizer within an mpi) has
        # problems with importing the MPI environment and mpi4py. So we will
        # flag it to skip the MPI mode and just go to multiprocess/serial.
        # This is a safety precaution
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"], True
        )

    # Let the user know that their command-line parameters will be ignored, if
    # they have specified a json file.
    if need_to_print_override_warning:
        Utils.log("WARNING: Using the --json flag overrides all other flags.")

    # If running in mpi mode, separate_output_files must be set to true.
    if params["job_manager"] == "mpi" and params["separate_output_files"] == False:
        Utils.log(
            "WARNING: Running in mpi mode, but separate_output_files is not set to True. Setting separate_output_files to True anyway."
        )
        params["separate_output_files"] = True

    # Outputing HTML files not supported in mpi mode.
    if params["job_manager"] == "mpi" and params["add_html_output"] == True:
        Utils.log(
            "WARNING: Running in mpi mode, but add_html_output is set to True. HTML output is not supported in mpi mode."
        )
        params["add_html_output"] = False

    # Warn the user if he or she is not using the Durrant lab filters.
    if params["use_durrant_lab_filters"] == False:
        Utils.log(
            "WARNING: Running Gypsum-DL without the Durrant-lab filters. In looking over many Gypsum-DL-generated " +
            "variants, we have identified a number of substructures that, though technically possible, strike us " +
            "as improbable or otherwise poorly suited for virtual screening. We strongly recommend removing these " +
            "by running Gypsum-DL with the --use_durrant_lab_filters option.",
            trailing_whitespace="\n"
        )

    # Load SMILES data
    source = params["source"]
    if isinstance(source, str):
        Utils.log("Loading molecules from " + os.path.basename(source) + "...")

        # Smiles must be array of strs.
        source_lower = source.lower()
        if source_lower.endswith((".smi", ".can")):
            # It's an smi file.
            smiles_data = load_smiles_file(source)
        elif source_lower.endswith(".sdf"):
            # It's an sdf file. Convert it to a smiles.
            smiles_data = load_sdf_file(source)
        else:
            smiles_data = [source]
    else:
        # It's already in the required format, so use it directly. (Without
        # this assignment smiles_data would be undefined below.)
        smiles_data = source

    # Make the output directory if necessary.
    if not os.path.exists(params["output_folder"]):
        try:
            os.mkdir(params["output_folder"])
        except OSError:
            # Deliberately not re-raised here: the existence check below
            # reports the failure. It also tolerates another process having
            # created the folder in the meantime.
            pass
        if not os.path.exists(params["output_folder"]):
            Utils.exception("Output folder directory couldn't be found or created.")

    # For Debugging
    # print("")
    # print("###########################")
    # print("num_procs  :  ", params["num_processors"])
    # print("chosen mode  :  ", params["job_manager"])
    # print("Parallel style:  ", params["Parallelizer"].return_mode())
    # print("Number Nodes:  ", params["Parallelizer"].return_node())
    # print("###########################")
    # print("")

    # Make the molecule containers.
    contnrs = []
    idx_counter = 0
    for entry in smiles_data:
        try:
            smiles, name, props = entry
        except (TypeError, ValueError):
            msg = 'Unexpected error. Does your "source" parameter specify a '
            msg = msg + "filename that ends in a .can, .smi, or .sdf extension?"
            Utils.exception(msg)

        if detect_unassigned_bonds(smiles) is None:
            Utils.log(
                "WARNING: Throwing out SMILES because of unassigned bonds: " + smiles
            )
            continue

        new_contnr = MolContainer(smiles, name, idx_counter, props)
        # Anything other than a string (including None) means the SMILES
        # could not be converted.
        if not isinstance(new_contnr.orig_smi_canonical, str):
            Utils.log(
                "WARNING: Throwing out SMILES because of it couldn't convert to mol: "
                + smiles
            )
            continue

        contnrs.append(new_contnr)
        idx_counter += 1

    # Remove None types from failed conversion
    contnrs = [x for x in contnrs if x.orig_smi_canonical is not None]
    if len(contnrs) != idx_counter:
        Utils.exception("There is a corrupted container")

    # In multiprocessing mode, Gypsum-DL parallelizes each small-molecule
    # preparation step separately. But this scheme is inefficient in MPI mode
    # because it increases the amount of communication required between nodes.
    # So for MPI mode, we will run all the preparation steps for a given
    # molecule container on a single thread.
    if params["Parallelizer"].return_mode() != "mpi":
        # Non-MPI (e.g., multiprocessing)
        execute_gypsum_dl(contnrs, params)
    else:
        # MPI mode. Group the molecule containers so they can be passed to the
        # parallelizer. Each job gets a copy of the parameters in which the
        # Parallelizer object itself is replaced by None.
        temp_param = {
            key: (None if key == "Parallelizer" else value)
            for key, value in params.items()
        }

        job_input = []
        for contnr in contnrs:
            contnr.contnr_idx = 0  # Because each container being run in isolation.
            job_input.append(([contnr], temp_param))
        job_input = tuple(job_input)

        params["Parallelizer"].run(job_input, execute_gypsum_dl)

    # Calculate the total run time.
    end_time = datetime.now()
    run_time = end_time - start_time
    params["start_time"] = str(start_time)
    params["end_time"] = str(end_time)
    params["run_time"] = str(run_time)

    Utils.log("\nStart time at: " + str(start_time))
    Utils.log("End time at:   " + str(end_time))
    Utils.log("Total time at: " + str(run_time))

    # Kill mpi workers if necessary.
    params["Parallelizer"].end(params["job_manager"])
示例#29
0
def load_sdf_file(filename):
    """Loads an sdf file.

    Each readable record is converted to a canonical, isomeric SMILES string.
    Records without a title are given a generated "untitled_..." name, and a
    record whose title has already been seen is renamed with a "_copy_N"
    suffix. Records that could not be parsed are skipped with a warning.

    :param filename: The filename.
    :type filename: str
    :return: A list of tuples, (SMILES, Name, Properties). Properties is the
        dictionary of SDF properties, or an empty dictionary if they could
        not be read. Records with an empty SMILES string are left out.
    :rtype: list
    """

    suppl = Chem.SDMolSupplier(filename)
    data = []
    duplicate_names = {}  # title -> number of times it has been seen so far
    missing_name_counter = 0
    mol_obj_counter = 0  # zero-based position of the record in the SDF file
    seen_names = set()
    for mol in suppl:
        # The supplier yields None for records it could not parse. Skip
        # those, but still count them so later messages refer to the right
        # position in the file.
        if mol is None:
            Utils.log(
                "\tWARNING: Could not read the {} molecule in the input SDF. Skipping it."
                .format(mol_obj_counter))
            mol_obj_counter += 1
            continue

        # Convert mols to smiles. That's what the rest of the program is
        # designed to deal with.
        smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)

        try:
            name = mol.GetProp("_Name")
        except KeyError:
            name = ""

        # Handle unnamed ligands
        if name == "":
            Utils.log("\tUntitled ligand for the {} molecule in the input SDF".
                      format(mol_obj_counter))
            name = "untitled_{}_molnum_{}".format(missing_name_counter,
                                                  mol_obj_counter)
            Utils.log("\tNaming that ligand {}".format(name))
            Utils.log(
                "\tAll associated files will be refered to with this name")
            missing_name_counter += 1

        # Handle duplicate ligands in same list. This must run for every
        # molecule, not only the untitled ones, or repeated titles are never
        # detected.
        if name in seen_names:
            # The first repeat becomes copy 2, the next copy 3, and so on.
            duplicate_names[name] = duplicate_names.get(name, 1) + 1
            new_name = "{}_copy_{}".format(name, duplicate_names[name])
            Utils.log(
                "\nMultiple entries with the ligand name: {}".format(name))
            Utils.log(
                "\tThe veresion of the ligand for the {} molecule in the SDF file will be retitled {}"
                .format(mol_obj_counter, new_name))
            Utils.log(
                "\tAll associated files will be refered to with this name"
            )
            name = new_name

        mol_obj_counter += 1
        seen_names.add(name)

        # SDF files may also contain properties. Get those as well. This is
        # best effort: a failure here should not discard the molecule.
        try:
            properties = mol.GetPropsAsDict()
        except Exception:
            properties = {}

        if smiles != "":
            data.append((smiles, name, properties))

    return data
示例#30
0
def bst_for_each_contnr_no_opt(
    contnrs,
    mol_lst,
    max_variants_per_compound,
    thoroughness,
    crry_ovr_frm_lst_step_if_no_fnd=True,
):
    """Trim every container down to its best few compound variants, so the
       number of variants cannot grow combinatorially. Meant to be called
       periodically as the containers grow.

    :param contnrs: A list of containers (MolContainer.MolContainer).
    :type contnrs: list
    :param mol_lst: The list of MyMol.MyMol objects.
    :type mol_lst: list
    :param max_variants_per_compound: To control the combinatorial explosion,
       only this number of variants (molecules) will be advanced to the next
       step.
    :type max_variants_per_compound: int
    :param thoroughness: How many molecules to generate per variant (molecule)
       retained, for evaluation. For example, to advance five molecules
       (max_variants_per_compound = 5) you could generate five and advance
       them all, or generate ten and advance the best five (thoroughness =
       2). Values above 1 cost more computation but improve the odds of
       finding good molecules.
    :type thoroughness: int
    :param crry_ovr_frm_lst_step_if_no_fnd: When no new molecules survive the
       selection for a container, whether to keep the molecules the container
       already holds (True) or to empty the container (False). Defaults to
       True.
    :type crry_ovr_frm_lst_step_if_no_fnd: bool, optional
    """

    # First drop duplicate ligands inside each container.
    for container in contnrs:
        container.remove_identical_mols_from_contnr()

    # Bucket the candidate molecules by the index of their container.
    candidates_by_idx = Utils.group_mols_by_container_index(mol_lst)

    for contnr in contnrs:
        idx = contnr.contnr_idx

        # A compound may have been eliminated earlier, in which case it has
        # no bucket and nothing survives for this container.
        survivors = []
        if idx in candidates_by_idx:
            # Discard molecules with unusually high charges, then keep the
            # lowest-energy ones. Per the original notes, this creates a
            # conformation if necessary, but it is not minimized and so is
            # not computationally expensive.
            acceptably_charged = remove_highly_charged_molecules(
                candidates_by_idx[idx]
            )
            survivors = pick_lowest_enrgy_mols(
                acceptably_charged, max_variants_per_compound, thoroughness
            )

        if len(survivors) > 0:
            # Replace whatever the container held with the new selection.
            contnr.mols = []
            for survivor in survivors:
                contnr.add_mol(survivor)
        elif crry_ovr_frm_lst_step_if_no_fnd:
            # Nothing new was found: leave the container's molecules as is.
            Utils.log(
                "\tWARNING: Unable to find low-energy conformations: "
                + contnr.orig_smi_deslt
                + " ("
                + contnr.name
                + "). Keeping original conformers."
            )
        else:
            # Nothing new was found and carrying over is disabled: empty it.
            Utils.log(
                "\tWARNING: Unable to find low-energy conformations: "
                + contnr.orig_smi_deslt
                + " ("
                + contnr.name
                + "). Discarding conformer."
            )
            contnr.mols = []