Exemplo n.º 1
0
def make_type_dict(dictionary):
    """Builds a dictionary that maps each key of an existing dictionary to
       the type of its value.

    Only int, float, bool, and str values are accepted. When a value matches
    more than one of these (a bool is also an int), the match that comes last
    in that order is the one recorded. A value that matches none of them is
    reported through Utils.exception.

    :param dictionary: The dictionary whose value types are wanted.
    :type dictionary: dict
    :return: A dictionary with the same keys, but with types as the values.
    :rtype: dict
    """

    accepted = (int, float, bool, str)
    key_types = {}

    for name in dictionary:
        value = dictionary[name]

        # Collect every accepted type that this value is an instance of.
        matches = [kind for kind in accepted if isinstance(value, kind)]

        if matches:
            # Later entries take precedence, so bool wins over int.
            key_types[name] = matches[-1]
            continue

        # Nothing matched, so the value has an unacceptable type.
        Utils.exception(
            "ERROR: There appears to be an error in your parameter "
            + "JSON file. No value can have type "
            + str(type(value))
            + "."
        )

    return key_types
Exemplo n.º 2
0
def parallel_add_H(contnr, protonation_settings):
    """Creates alternate ionization variants for a given molecule container.
       This is the function that gets fed into the parallelizer.

    Note that protonation_settings is modified in place: its "smiles" entry
    is set to the container's canonical SMILES before Dimorphite-DL is run.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param protonation_settings: Protonation settings to pass to Dimorphite-DL.
    :type protonation_settings: dict
    :return: The protonated variants that could be converted to molecules
       and that passed the substructure check.
    :rtype: list
    """

    # Make sure the canonical SMILES is actually a string.
    canonical_smi = contnr.orig_smi_canonical
    if not isinstance(canonical_smi, str):
        # The value is known not to be a string here, so it must be converted
        # explicitly. Concatenating it directly would raise a TypeError and
        # hide the message we actually want to show.
        Utils.log("container.orig_smi_canonical: " + str(canonical_smi))
        Utils.log("type container.orig_smi_canonical: " +
                  str(type(canonical_smi)))
        Utils.exception("container.orig_smi_canonical: " +
                        str(canonical_smi))

    # Add the SMILES string to the protonation parameters.
    protonation_settings["smiles"] = canonical_smi

    # Protonate the SMILES string. This is Dimorphite-DL.
    smis = Protonate(protonation_settings)

    # Convert the protonated SMILES strings into a list of rdkit molecule
    # objects.
    rdkit_mols = [Chem.MolFromSmiles(smi.strip()) for smi in smis]

    # Convert from rdkit mols to MyMol.MyMol, skipping failed conversions.
    addH_mols = [MyMol.MyMol(mol) for mol in rdkit_mols if mol is not None]

    # Remove MyMols with odd substructures.
    addH_mols = [
        mol for mol in addH_mols if mol.remove_bizarre_substruc() is False
    ]

    # I once saw it add a "C+" here. So do a secondary check at this point to
    # make sure it's valid. Recreate the list, moving new MyMol.MyMol objects
    # into the return_values list.
    return_values = []

    orig_mol = contnr.mol_orig_frm_inp_smi
    for Hm in addH_mols:
        Hm.inherit_contnr_props(contnr)
        # Copy the list so each variant gets its own genealogy.
        Hm.genealogy = orig_mol.genealogy[:]
        Hm.name = orig_mol.name

        # Only record a protonation step if the molecule actually changed.
        if Hm.smiles() != orig_mol.smiles():
            Hm.genealogy.append(Hm.smiles(True) + " (protonated)")

        return_values.append(Hm)

    return return_values
Exemplo n.º 3
0
    def update_idx(self, new_idx):
        """Sets a new index on this container and on its original molecule.

        A value whose type is not exactly int is reported through
        Utils.exception.

        :param new_idx: The new index.
        :type new_idx: int
        """

        if type(new_idx) is not int:
            Utils.exception("New idx value must be an int.")

        self.contnr_idx = new_idx

        # Keep the molecule built from the input SMILES in sync with the
        # container.
        origin_mol = self.mol_orig_frm_inp_smi
        origin_mol.contnr_idx = self.contnr_idx
Exemplo n.º 4
0
def merge_parameters(default, params):
    """Overlays the user-specified parameters onto the default ones.

    The merge happens in place: each entry of params is checked and then
    written into default. An int supplied where the default is a float is
    converted to a float (in params too) before it is stored. An unrecognized
    parameter, or a value whose type does not match the default's type, is
    reported through Utils.exception.

    :param default: The default values. Updated in place.
    :type default: dict
    :param params: The user-specified parameters.
    :type params: dict
    """

    # The type each parameter is expected to have, taken from the defaults.
    expected_types = make_type_dict(default)

    for name in params:
        # Reject anything that is not a known parameter.
        if name not in default:
            Utils.log('Parameter "' + str(name) + '" not recognized!')
            Utils.log("Here are the options:")
            Utils.log(" ".join(sorted(default.keys())))
            Utils.exception("Unrecognized parameter: " + str(name))

        value = params[name]
        wanted = expected_types[name]

        if not isinstance(value, wanted):
            if type(value) is int and wanted is float:
                # An int is acceptable where a float is expected. Promote it.
                value = float(value)
                params[name] = value
            else:
                # Seems to be a type mismatch.
                Utils.exception(
                    'The parameter "' + name + '" must be of ' + "type "
                    + str(wanted) + ", but it is of type "
                    + str(type(value)) + "."
                )

        # The user-defined value replaces the default one.
        default[name] = value
Exemplo n.º 5
0
def finalize_params(params):
    """Checks and updates parameters to their final values.

    The "source" entry is made absolute, "output_folder" defaults to an
    "output" folder next to the source, and "job_manager" is lower-cased. The
    dictionary is modified in place and also returned. Problems are reported
    through Utils.exception.

    :param params: The parameters.
    :type params: dict
    :return: The parameters, corrected/updated where needed.
    :rtype: dict
    """

    # Throw an error if there's a missing parameter.
    if params["source"] == "":
        Utils.exception(
            'Missing parameter "source". You need to specify ' +
            "the source of the input molecules (probably a SMI or SDF " +
            "file).")

    # Note on parameter "source", the data source. If it's a string that
    # ends in ".smi", it's treated as a smiles file. If it's a string that
    # ends in ".sdf", it's treated as an sdf file. If it's any other
    # string, it's assumed to be a smiles string itself and is assigned a
    # name of "". If it's a list, it's assumed to be a list of tuples,
    # [SMILES, Name].

    # Check some required variables.
    try:
        params["source"] = os.path.abspath(params["source"])
    except Exception:
        Utils.exception("Source file doesn't exist.")

    # The directory containing the source, with a trailing separator. Note
    # that str.strip(basename) must not be used for this: it strips a *set of
    # characters* from both ends of the path, not a suffix.
    source_dir = os.path.join(os.path.dirname(params["source"]), "")

    if params["output_folder"] == "" and params["source"] != "":
        params["output_folder"] = source_dir + "output" + str(os.sep)

    if params["add_pdb_output"] and params["output_folder"] == "":
        Utils.exception("To output files as .pdbs, specify the output_folder.")

    if params["separate_output_files"] and params["output_folder"] == "":
        Utils.exception(
            "For separate_output_files, specify the output_folder.")

    # The output folder is not required to exist at this point; no existence
    # check is made here.

    # Make sure job_manager is always lower case.
    params["job_manager"] = params["job_manager"].lower()

    return params
Exemplo n.º 6
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A module for loading in files.
"""

import __future__
from gypsum_dl import Utils

try:
    from rdkit import Chem
except:
    Utils.exception("You need to install rdkit and its dependencies.")


def load_smiles_file(filename):
    """Loads a smiles file.

    :param filename: The filename.
    :type filename: str
    :return: A list of tuples, (SMILES, Name).
    :rtype: list
    """

    # A smiles file contains one molecule on each line. Each line holds a
    # SMILES string, then white space, then the molecule name.

    # NOTE(review): only the initialisation of the two containers below is
    # visible here. Presumably "data" collects the parsed entries and
    # "duplicate_names" tracks names seen more than once -- confirm against
    # the rest of the function.
    data = []
    duplicate_names = {}
Exemplo n.º 7
0
def run_test():
    """Runs Gypsum-DL on the sample molecules and checks the output.

    The file "sample_molecules.smi" next to this script is processed into a
    fresh "gypsum_dl_test_output" folder. The test then checks that 15 output
    files were produced and that the set of SMILES strings found in them
    matches the expected set exactly. Failures are reported through
    Utils.exception and successes through Utils.log. The output folder is
    deleted again at the end of the function.
    """

    script_dir = os.path.dirname(os.path.realpath(__file__))
    output_folder = script_dir + os.sep + "gypsum_dl_test_output" + os.sep

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    # Make the directory
    os.mkdir(output_folder)

    # Make the Gypsum-DL parameters.
    params = {
        "source": script_dir + os.sep + "sample_molecules.smi",
        "separate_output_files": True,
        "job_manager": "serial",  # multiprocessing
        "output_folder": output_folder,
        "add_pdb_output": False,
        "max_variants_per_compound": 8,
        "thoroughness": 1,
        "min_ph": 4,
        "max_ph": 10,
        "pka_precision": 1,
        "use_durrant_lab_filters": True,
    }

    # Prepare the molecules.
    prepare_molecules(params)
    Utils.log("")
    Utils.log("TEST RESULTS")
    Utils.log("============")

    # Get the output files.
    sdf_files = glob.glob(output_folder + "*")

    # There should be 15 output files.
    msg = "Expected 15 output files, got " + str(len(sdf_files)) + "."
    if len(sdf_files) != 15:
        Utils.exception("FAILED. " + msg)
    else:
        Utils.log("PASSED. " + msg)

    # Get all the smiles from the files. The SMILES string is on the line
    # that follows the "<SMILES>" tag.
    all_smiles = set()
    for sdf_file in sdf_files:
        with open(sdf_file) as sdf_handle:
            lines = sdf_handle.readlines()
        for line, next_line in zip(lines, lines[1:]):
            if "<SMILES>" in line:
                all_smiles.add(next_line.strip())

    # List what the smiles should be.
    target_smiles = set()

    # salt_and_ionization should produce two models (ionized and
    # deionized).
    target_smiles.update(["[O-]c1ccccc1", "Oc1ccccc1"])

    # tautomer_and_cis_trans should produce three models (two tautomers, one
    # of them with alternate cis/trans).
    target_smiles.update([r"C/C=C\O", "C/C=C/O", "CCC=O"])

    # two_chiral_one_unspecified_and_tautomer should produce four models.
    target_smiles.update([
        "CC(C)C(=O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)=C(O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)C(=O)[C@](F)(Cl)C[C@@](C)(F)Cl",
        "CC(C)=C(O)[C@](F)(Cl)C[C@@](C)(F)Cl",
    ])

    # two_double_bonds_one_chiral_center should produce eight models.
    target_smiles.update([
        r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
        "CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
        r"CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
        r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
        "CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
        r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
    ])

    # two_double_bonds_one_unspecified should produce two models.
    target_smiles.update(
        [r"CC/C(C)=C(\Cl)C/C(I)=C(\C)F", r"CC/C(C)=C(/Cl)C/C(I)=C(\C)F"])

    # non_aromatic_ring should produce one model. It will list it several
    # times, because different ring conformations of the same model.
    target_smiles.update(["CC(C)(C)[C@H]1CC[C@@H](C(C)(C)C)CC1"])

    # There should be no =[N-] if Durrant lab filters are turned on. Note:
    # "CC(=N)O" is not in the list below because durrant lab filters now
    # remove iminols.
    target_smiles.update(["CC([NH-])=O", "CC(N)=O"])

    # There should be no [N-]C=[N+] (CC(=O)[N-]C=[N+](C)C).
    target_smiles.update([
        r"C/C(O)=N\C=[N+](C)C",
        r"CC(=O)/N=C\[NH+](C)C",
        "CC(=O)/N=C/[NH+](C)C",
        "CC(=O)NC=[N+](C)C",
        "C/C(O)=N/C=[N+](C)C",
    ])

    # There should be no [nH+]c[n-] (c1c[nH+]c[n-]1)
    target_smiles.update(["c1c[n-]cn1", "c1c[nH+]c[nH]1", "c1c[nH]cn1"])

    # There should be no [#7+]~[#7+] (c1cc[nH+][nH+]c1)
    target_smiles.update(["c1ccnnc1", "c1cc[nH+]nc1"])

    # There should be no [#7-]~[#7-] (CC(=O)[N-][N-]C(C)=O). Python2 and
    # Python3 give different SMILES strings here that are all valid, so only
    # two representative forms are listed. The other forms are consolidated
    # into these two further below. (Really this was probably a bad example
    # to pick because there are so many forms...)
    target_smiles.update([
        "CC(=O)NNC(C)=O",
        r"C/C(O)=N\N=C(/C)O",
    ])

    # There should be no [!#7]~[#7+]~[#7-]~[!#7] (c1c[n-][nH+]c1)
    target_smiles.update(["c1cn[n-]c1", "c1cn[nH]c1", "c1c[nH][nH+]c1"])

    # Azides can have adjacent +/- nitrogens.
    target_smiles.update(["CN=[N+]=[N-]", "CN=[N+]=N"])

    # Python3 gives some smiles that are different than those obtained with
    # Python2. But they are just different representations of the same thing.
    # Map each alternate form to the one used in the target list. (No form on
    # the right-hand side also appears on the left, so a single pass is
    # enough.)
    canonical_forms = {
        "[H]N=[N+]=NC": "CN=[N+]=N",
        # Different one that turns up sometimes
        r"C/C(O)=N/N=C(/C)O": r"C/C(O)=N\N=C(/C)O",
        # Different ones that turn up sometimes
        r"CC(=O)[N-]/N=C(\C)O": r"CC(=O)NNC(C)=O",
        r"C/C(O)=N/N=C(\C)O": r"CC(=O)NNC(C)=O",
        r"CC(=O)N/N=C(\C)O": r"CC(=O)NNC(C)=O",
        r"CC(=O)[N-]/N=C(/C)O": r"CC(=O)NNC(C)=O",
        r"CC(=O)[N-]NC(C)=O": r"CC(=O)NNC(C)=O",
        r"CC(=O)N/N=C(/C)O": r"CC(=O)NNC(C)=O",
    }
    all_smiles = set([canonical_forms.get(s, s) for s in all_smiles])

    # Anything that is in one set but not in the other is a failure.
    mismatched = all_smiles ^ target_smiles
    if len(mismatched) > 0:
        # Show both sets to make the mismatch easier to diagnose.
        print(all_smiles)
        print(target_smiles)

        Utils.exception(
            "FAILED. " +
            "Got some SMILES I didn't expect (either in output or target list): "
            + " ".join(sorted(mismatched)))
    else:
        Utils.log(
            "PASSED. Gypsum-DL output the very SMILES strings I was expecting."
        )

    Utils.log("")

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
Exemplo n.º 8
0
def prepare_molecules(args):
    """A function for preparing small-molecule models for docking. To work, it
    requires that the python module rdkit be installed on the system.

    The parameters come either from args directly or, when args contains a
    "json" entry, from that JSON file. The function then sets up the
    parallelizer, loads the input molecules, builds a molecule container for
    each usable one, runs execute_gypsum_dl on them, and logs the run time.

    :param args: The arguments, from the commandline.
    :type args: dict
    """

    # Keep track of the time the program starts.
    start_time = datetime.now()

    # A list of command-line parameters that will be ignored if using a json
    # file.
    json_warning_list = [
        "source",
        "output_folder",
        "num_processors",
        "min_ph",
        "max_ph",
        "delta_ph_increment",
        "thoroughness",
        "max_variants_per_compound",
        "pka_precision",
    ]

    # Whether to warn the user that the above parameters, if specified, will
    # be ignored.
    need_to_print_override_warning = False

    if "json" in args:
        # "json" is one of the parameters, so we'll be ignoring the rest.
        try:
            with open(args["json"]) as json_file:
                params = json.load(json_file)
        except Exception:
            Utils.exception("Is your input json file properly formed?")

        params = set_parameters(params)
        if any(key in args for key in json_warning_list):
            need_to_print_override_warning = True
    else:
        # We're actually going to use all the command-line parameters. No
        # warning necessary.
        params = set_parameters(args)

    # If running in serial mode, make sure only one processor is used.
    if params["job_manager"] == "serial":
        if params["num_processors"] != 1:
            Utils.log(
                "Because --job_manager was set to serial, this will be run on a single processor."
            )
        params["num_processors"] = 1

    # Handle mpi errors if mpi4py isn't installed
    if params["job_manager"] == "mpi":

        # Before executing Parallelizer with mpi4py (which overrides python
        # raise Exceptions), we must check that it is being run with the
        # "-m mpi4py" runpy flag.
        if "runpy" not in sys.modules:
            printout = "\nTo run in mpi mode you must run with -m flag. ie) mpirun -n $NTASKS python -m mpi4py run_gypsum_dl.py\n"
            print(printout)
            Utils.exception(printout)

        # Check mpi4py import
        try:
            import mpi4py
        except Exception:
            printout = "\nmpi4py not installed but --job_manager is set to mpi. \n Either install mpi4py or switch job_manager to multiprocessing or serial.\n"
            print(printout)
            Utils.exception(printout)

        # Check mpi4py import version. This must be at version 2.1.0 and
        # higher. Only the major and minor components are parsed, so that
        # suffixes on later components (e.g., "3.1.0rc1") do not break the
        # integer conversion.
        mpi4py_version = tuple(
            int(part) for part in mpi4py.__version__.split(".")[:2]
        )

        if mpi4py_version < (2, 1):
            printout = "\nmpi4py version 2.1.0 or higher is required. Use the 'python -m mpi4py' flag to run in mpi mode.\nPlease update mpi4py to a newer version, or switch job_manager to multiprocessing or serial.\n"
            print(printout)
            Utils.exception(printout)

    # Throw a message if running on windows. Windows doesn't deal with
    # multiple processors, so use only 1.
    if sys.platform == "win32":
        Utils.log(
            "WARNING: Multiprocessing is not supported on Windows. Tasks will be run in Serial mode."
        )
        params["num_processors"] = 1
        params["job_manager"] = "serial"

    # Launch mpi workers if that's what's specified.
    if params["job_manager"] == "mpi":
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"]
        )
    else:
        # Lower-level mpi (i.e. making a new Parallelizer within an mpi) has
        # problems with importing the MPI environment and mpi4py. So we will
        # flag it to skip the MPI mode and just go to multiprocess/serial.
        # This is a safety precaution.
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"], True
        )

    # Let the user know that their command-line parameters will be ignored, if
    # they have specified a json file.
    if need_to_print_override_warning:
        Utils.log("WARNING: Using the --json flag overrides all other flags.")

    # If running in mpi mode, separate_output_files must be set to true.
    if params["job_manager"] == "mpi" and not params["separate_output_files"]:
        Utils.log(
            "WARNING: Running in mpi mode, but separate_output_files is not set to True. Setting separate_output_files to True anyway."
        )
        params["separate_output_files"] = True

    # Outputting HTML files is not supported in mpi mode.
    if params["job_manager"] == "mpi" and params["add_html_output"]:
        Utils.log(
            "WARNING: Running in mpi mode, but add_html_output is set to True. HTML output is not supported in mpi mode."
        )
        params["add_html_output"] = False

    # Warn the user if he or she is not using the Durrant lab filters.
    if not params["use_durrant_lab_filters"]:
        Utils.log(
            "WARNING: Running Gypsum-DL without the Durrant-lab filters. In looking over many Gypsum-DL-generated " +
            "variants, we have identified a number of substructures that, though technically possible, strike us " +
            "as improbable or otherwise poorly suited for virtual screening. We strongly recommend removing these " +
            "by running Gypsum-DL with the --use_durrant_lab_filters option.",
            trailing_whitespace="\n"
        )

    # Load SMILES data
    source = params["source"]
    if isinstance(source, str):
        Utils.log("Loading molecules from " + os.path.basename(source) + "...")

        # Pick the loader from the file extension.
        lowered_source = source.lower()
        if lowered_source.endswith((".smi", ".can")):
            # It's an smi file.
            smiles_data = load_smiles_file(source)
        elif lowered_source.endswith(".sdf"):
            # It's an sdf file. Convert it to a smiles.
            smiles_data = load_sdf_file(source)
        else:
            smiles_data = [source]
    else:
        # It's already in the required format. It must still be assigned, or
        # the loop below fails with a NameError.
        smiles_data = source

    # Make the output directory if necessary.
    if not os.path.exists(params["output_folder"]):
        os.mkdir(params["output_folder"])
        if not os.path.exists(params["output_folder"]):
            Utils.exception("Output folder directory couldn't be found or created.")

    # Make the molecule containers.
    contnrs = []
    idx_counter = 0
    for entry in smiles_data:
        try:
            smiles, name, props = entry
        except (TypeError, ValueError):
            msg = 'Unexpected error. Does your "source" parameter specify a '
            msg = msg + "filename that ends in a .can, .smi, or .sdf extension?"
            Utils.exception(msg)

        if detect_unassigned_bonds(smiles) is None:
            Utils.log(
                "WARNING: Throwing out SMILES because of unassigned bonds: " + smiles
            )
            continue

        new_contnr = MolContainer(smiles, name, idx_counter, props)
        if not isinstance(new_contnr.orig_smi_canonical, str):
            Utils.log(
                "WARNING: Throwing out SMILES because of it couldn't convert to mol: "
                + smiles
            )
            continue

        # Only containers that are kept consume an index.
        contnrs.append(new_contnr)
        idx_counter += 1

    # Remove None types from failed conversion
    contnrs = [x for x in contnrs if x.orig_smi_canonical is not None]
    if len(contnrs) != idx_counter:
        Utils.exception("There is a corrupted container")

    # In multiprocessing mode, Gypsum-DL parallelizes each small-molecule
    # preparation step separately. But this scheme is inefficient in MPI mode
    # because it increases the amount of communication required between nodes.
    # So for MPI mode, we will run all the preparation steps for a given
    # molecule container on a single thread.
    if params["Parallelizer"].return_mode() != "mpi":
        # Non-MPI (e.g., multiprocessing)
        execute_gypsum_dl(contnrs, params)
    else:
        # MPI mode. Group the molecule containers so they can be passed to the
        # parallelizer. The Parallelizer object itself is replaced by None in
        # the parameters handed to each job.
        temp_param = {}
        for key in params:
            temp_param[key] = None if key == "Parallelizer" else params[key]

        job_input = []
        for contnr in contnrs:
            contnr.contnr_idx = 0  # Because each container being run in isolation.
            job_input.append(([contnr], temp_param))
        job_input = tuple(job_input)

        params["Parallelizer"].run(job_input, execute_gypsum_dl)

    # Calculate the total run time.
    end_time = datetime.now()
    run_time = end_time - start_time
    params["start_time"] = str(start_time)
    params["end_time"] = str(end_time)
    params["run_time"] = str(run_time)

    Utils.log("\nStart time at: " + str(start_time))
    Utils.log("End time at:   " + str(end_time))
    Utils.log("Total time at: " + str(run_time))

    # Kill mpi workers if necessary.
    params["Parallelizer"].end(params["job_manager"])