def make_type_dict(dictionary):
    """Build a dictionary that maps each key to the type of its value.

    Only int, float, bool and str values are recognized. Any other value is
    reported through Utils.exception.

    :param dictionary: The dictionary whose value types should be collected.
    :type dictionary: dict
    :return: A dictionary with the same keys, but with types as the values.
    :rtype: dict
    """

    # Searched in this order so that the most specific type is found first:
    # a bool is also an int, but it must be recorded as bool.
    recognized_types = (str, bool, float, int)

    types_by_key = {}
    for name, value in dictionary.items():
        matched_type = next(
            (
                candidate
                for candidate in recognized_types
                if isinstance(value, candidate)
            ),
            None,
        )

        if matched_type is not None:
            types_by_key[name] = matched_type
            continue

        # The value has an unacceptable type. Throw an error.
        Utils.exception(
            "ERROR: There appears to be an error in your parameter "
            + "JSON file. No value can have type "
            + str(type(value))
            + "."
        )

    return types_by_key
def parallel_add_H(contnr, protonation_settings):
    """Creates alternate ionization variants for a given molecule container.

    This is the function that gets fed into the parallelizer.

    :param contnr: The molecule container.
    :type contnr: MolContainer.MolContainer
    :param protonation_settings: Protonation settings to pass to
       Dimorphite-DL. The container's canonical SMILES is stored in this
       dictionary under the "smiles" key.
    :type protonation_settings: dict
    :return: The protonated variants, one MyMol.MyMol object per variant that
       could be parsed and that was not rejected by remove_bizarre_substruc().
    :rtype: list
    """

    # Make sure the canonical SMILES is actually a string. The value is
    # wrapped in str() when reporting, because this branch is only reached
    # when it is NOT a string and plain concatenation would raise TypeError
    # before the intended message is shown.
    canonical_smi = contnr.orig_smi_canonical
    if not isinstance(canonical_smi, str):
        Utils.log("container.orig_smi_canonical: " + str(canonical_smi))
        Utils.log(
            "type container.orig_smi_canonical: " + str(type(canonical_smi))
        )
        Utils.exception("container.orig_smi_canonical: " + str(canonical_smi))

    # Add the SMILES string to the protonation parameters.
    protonation_settings["smiles"] = canonical_smi

    # Protonate the SMILES string. This is Dimorphite-DL.
    smis = Protonate(protonation_settings)

    # Convert the protonated SMILES strings into a list of rdkit molecule
    # objects.
    rdkit_mols = [Chem.MolFromSmiles(smi.strip()) for smi in smis]

    # Convert from rdkit mols to MyMol.MyMol, skipping SMILES that rdkit
    # could not parse.
    addH_mols = [MyMol.MyMol(mol) for mol in rdkit_mols if mol is not None]

    # Remove MyMols with odd substructures.
    addH_mols = [
        mol for mol in addH_mols if mol.remove_bizarre_substruc() is False
    ]

    # I once saw it add a "C+" here. So do a secondary check at this point to
    # make sure it's valid. Recreate the list, moving new MyMol.MyMol objects
    # into the return_values list.
    return_values = []
    orig_mol = contnr.mol_orig_frm_inp_smi
    for Hm in addH_mols:
        Hm.inherit_contnr_props(contnr)
        Hm.genealogy = orig_mol.genealogy[:]
        Hm.name = orig_mol.name

        # Only record a genealogy entry when protonation changed something.
        if Hm.smiles() != orig_mol.smiles():
            Hm.genealogy.append(Hm.smiles(True) + " (protonated)")

        return_values.append(Hm)

    return return_values
def update_idx(self, new_idx):
    """Sets a new index on this container and on its original molecule.

    A value whose type is not exactly int is reported through
    Utils.exception.

    :param new_idx: The new index.
    :type new_idx: int
    """

    idx_is_plain_int = type(new_idx) is int
    if not idx_is_plain_int:
        Utils.exception("New idx value must be an int.")

    # Keep the molecule built from the input SMILES in sync with the
    # container.
    self.contnr_idx = self.mol_orig_frm_inp_smi.contnr_idx = new_idx
def merge_parameters(default, params):
    """Copies user-specified parameter values over the default values.

    The default dictionary is updated in place. An int supplied where the
    default is a float is converted to float (in params as well). Problems
    (an unknown parameter name, or a value whose type does not match the
    type of the default) are reported through Utils.exception.

    :param default: The default values. Updated in place with the
       user-specified ones.
    :type default: dict
    :param params: The user-specified parameters.
    :type params: dict
    """

    # Same keys as the defaults, but with the expected types as values.
    expected_types = make_type_dict(default)

    for name in params:
        # Throw an error if there's an unrecognized parameter.
        if name not in default:
            Utils.log('Parameter "' + str(name) + '" not recognized!')
            Utils.log("Here are the options:")
            Utils.log(" ".join(sorted(list(default.keys()))))
            Utils.exception("Unrecognized parameter: " + str(name))

        wanted_type = expected_types[name]

        if isinstance(params[name], wanted_type):
            # The type already matches the default. Nothing to fix.
            pass
        elif type(params[name]) is int and wanted_type is float:
            # Cast int to float if necessary.
            params[name] = float(params[name])
        else:
            # Seems to be a type mismatch.
            Utils.exception(
                'The parameter "'
                + name
                + '" must be of '
                + "type "
                + str(wanted_type)
                + ", but it is of type "
                + str(type(params[name]))
                + "."
            )

        # Update the parameter value with the user-defined one.
        default[name] = params[name]
def finalize_params(params):
    """Checks and updates parameters to their final values.

    The "source" entry is replaced by its absolute path, a missing
    "output_folder" defaults to an "output" folder next to the source, and
    "job_manager" is lower-cased. Problems are reported through
    Utils.exception.

    :param params: The parameters. Modified in place.
    :type params: dict
    :return: The parameters, corrected/updated where needed.
    :rtype: dict
    """

    # Throw an error if there's a missing parameter.
    if params["source"] == "":
        Utils.exception(
            'Missing parameter "source". You need to specify '
            + "the source of the input molecules (probably a SMI or SDF "
            + "file)."
        )

    # Note on parameter "source", the data source. If it's a string that
    # ends in ".smi", it's treated as a smiles file. If it's a string that
    # ends in ".sdf", it's treated as an sdf file. If it's any other
    # string, it's assumed to be a smiles string itself and is assigned a
    # name of "". If it's a list, it's assumed to be a list of tuples,
    # [SMILES, Name].

    # Check some required variables.
    try:
        params["source"] = os.path.abspath(params["source"])
    except Exception:
        Utils.exception("Source file doesn't exist.")

    # Directory containing the source, with a trailing separator. Do not use
    # str.strip(basename) for this: strip() removes a *set of characters*
    # from both ends, so on Windows a drive letter that also occurs in the
    # file name would be removed from the front of the path.
    source_dir = os.path.join(os.path.dirname(params["source"]), "")

    if params["output_folder"] == "" and params["source"] != "":
        params["output_folder"] = source_dir + "output" + str(os.sep)

    if params["add_pdb_output"] == True and params["output_folder"] == "":
        Utils.exception("To output files as .pdbs, specify the output_folder.")

    if (
        params["separate_output_files"] == True
        and params["output_folder"] == ""
    ):
        Utils.exception(
            "For separate_output_files, specify the output_folder."
        )

    # Note: the output folder is not required to exist at this point; no
    # existence check is performed here.

    # Make sure job_manager is always lower case.
    params["job_manager"] = params["job_manager"].lower()

    return params
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ A module for loading in files. """ import __future__ from gypsum_dl import Utils try: from rdkit import Chem except: Utils.exception("You need to install rdkit and its dependencies.") def load_smiles_file(filename): """Loads a smiles file. :param filename: The filename. :type filename: str :return: A list of tuples, (SMILES, Name). :rtype: list """ # A smiles file contains one molecule on each line. Each line is a string, # separated by white space, followed by the molecule name. data = [] duplicate_names = {}
def run_test():
    """Runs Gypsum-DL on the bundled sample molecules and checks the output.

    The molecules in sample_molecules.smi (next to this script) are prepared
    into a temporary "gypsum_dl_test_output" folder. The number of output
    files and the set of SMILES strings found in them are then compared
    against the expected values. Results are reported through Utils.log, and
    failures through Utils.exception. The output folder is removed after a
    successful run.
    """

    script_dir = os.path.dirname(os.path.realpath(__file__))
    output_folder = script_dir + os.sep + "gypsum_dl_test_output" + os.sep

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)

    # Make the directory
    os.mkdir(output_folder)

    # Make the Gypsum-DL parameters.
    params = {
        "source": script_dir + os.sep + "sample_molecules.smi",
        "separate_output_files": True,
        "job_manager": "serial",  # multiprocessing
        "output_folder": output_folder,
        "add_pdb_output": False,
        "max_variants_per_compound": 8,
        "thoroughness": 1,
        "min_ph": 4,
        "max_ph": 10,
        "pka_precision": 1,
        "use_durrant_lab_filters": True,
    }

    # Prepare the molecules.
    prepare_molecules(params)
    Utils.log("")
    Utils.log("TEST RESULTS")
    Utils.log("============")

    # Get the output sdf files.
    sdf_files = glob.glob(output_folder + "*")

    # There should be 15 output files.
    msg = "Expected 15 output files, got " + str(len(sdf_files)) + "."
    if len(sdf_files) != 15:
        Utils.exception("FAILED. " + msg)
    else:
        Utils.log("PASSED. " + msg)

    # Get all the smiles from the files. The SMILES string is on the line
    # that follows the "<SMILES>" tag, so the last line of a file can never
    # be a tag with a value after it.
    all_smiles = set()
    for sdf_file in sdf_files:
        with open(sdf_file) as sdf_handle:
            lines = sdf_handle.readlines()
        for i, line in enumerate(lines[:-1]):
            if "<SMILES>" in line:
                all_smiles.add(lines[i + 1].strip())

    # List what the smiles should be.
    target_smiles = set()

    # salt_and_ionization should produce two models (ionized and
    # deionized).
    target_smiles |= set(["[O-]c1ccccc1", "Oc1ccccc1"])

    # tautomer_and_cis_trans should produce three models (two tautomers, one
    # of them with alternate cis/trans).
    target_smiles |= set([r"C/C=C\O", "C/C=C/O", "CCC=O"])

    # two_chiral_one_unspecified_and_tautomer should produce four models.
    target_smiles |= set(
        [
            "CC(C)C(=O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
            "CC(C)=C(O)[C@@](F)(Cl)C[C@@](C)(F)Cl",
            "CC(C)C(=O)[C@](F)(Cl)C[C@@](C)(F)Cl",
            "CC(C)=C(O)[C@](F)(Cl)C[C@@](C)(F)Cl",
        ]
    )

    # two_double_bonds_one_chiral_center should produce eight models.
    target_smiles |= set(
        [
            r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
            "CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
            r"CC/C(C[C@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
            r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
            r"CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(\C)Cl",
            r"CC/C(C[C@@](C)(Cl)I)=C(I)\C(F)=C(\C)Cl",
            "CC/C(C[C@@](C)(Cl)I)=C(I)/C(F)=C(/C)Cl",
            r"CC/C(C[C@](C)(Cl)I)=C(I)\C(F)=C(/C)Cl",
        ]
    )

    # two_double_bonds_one_unspecified should produce two models.
    target_smiles |= set(
        [r"CC/C(C)=C(\Cl)C/C(I)=C(\C)F", r"CC/C(C)=C(/Cl)C/C(I)=C(\C)F"]
    )

    # non_aromatic_ring should produce one model. It will list it several
    # times, because different ring conformations of the same model.
    target_smiles |= set(["CC(C)(C)[C@H]1CC[C@@H](C(C)(C)C)CC1"])

    # There should be no =[N-] if Durrant lab filters are turned on. Note:
    # Removed "CC(=N)O" from below list because durrant lab filters now remove
    # iminols.
    target_smiles |= set(["CC([NH-])=O", "CC(N)=O"])

    # There should be no [N-]C=[N+] (CC(=O)[N-]C=[N+](C)C).
    target_smiles |= set(
        [
            r"C/C(O)=N\C=[N+](C)C",
            r"CC(=O)/N=C\[NH+](C)C",
            "CC(=O)/N=C/[NH+](C)C",
            "CC(=O)NC=[N+](C)C",
            "C/C(O)=N/C=[N+](C)C",
        ]
    )

    # There should be no [nH+]c[n-] (c1c[nH+]c[n-]1)
    target_smiles |= set(["c1c[n-]cn1", "c1c[nH+]c[nH]1", "c1c[nH]cn1"])

    # There should be no [#7+]~[#7+] (c1cc[nH+][nH+]c1)
    target_smiles |= set(["c1ccnnc1", "c1cc[nH+]nc1"])

    # There should be no [#7-]~[#7-] (CC(=O)[N-][N-]C(C)=O). Only two forms
    # are listed here, because Python2 and Python3 give different SMILES
    # strings that are all valid. The other forms are consolidated onto
    # these two further below. (Really this was probably a bad example to
    # pick because there are so many forms...)
    target_smiles |= set(["CC(=O)NNC(C)=O", r"C/C(O)=N\N=C(/C)O"])

    # There should be no [!#7]~[#7+]~[#7-]~[!#7] (c1c[n-][nH+]c1)
    target_smiles |= set(["c1cn[n-]c1", "c1cn[nH]c1", "c1c[nH][nH+]c1"])

    # Azides can have adjacent +/- nitrogens.
    target_smiles |= set(["CN=[N+]=[N-]", "CN=[N+]=N"])

    # Python3 gives some smiles that are different than those obtained with
    # Python2. But they are just different representations of the same thing.
    # Let's make the switch to the Python2 form for this test. None of the
    # replacement values is itself a key, so a single pass is enough.
    # (No entry for iminols: the durrant lab filters now remove them.)
    equivalent_forms = {
        "[H]N=[N+]=NC": "CN=[N+]=N",
        # Different one that turns up sometimes.
        r"C/C(O)=N/N=C(/C)O": r"C/C(O)=N\N=C(/C)O",
        # Different ones that turn up sometimes.
        r"CC(=O)[N-]/N=C(\C)O": "CC(=O)NNC(C)=O",
        r"C/C(O)=N/N=C(\C)O": "CC(=O)NNC(C)=O",
        r"CC(=O)N/N=C(\C)O": "CC(=O)NNC(C)=O",
        r"CC(=O)[N-]/N=C(/C)O": "CC(=O)NNC(C)=O",
        "CC(=O)[N-]NC(C)=O": "CC(=O)NNC(C)=O",
        "CC(=O)N/N=C(/C)O": "CC(=O)NNC(C)=O",
    }
    all_smiles = set([equivalent_forms.get(s, s) for s in all_smiles])

    mismatched = all_smiles ^ target_smiles
    if len(mismatched) > 0:
        # Report both directions of the mismatch. No interactive debugger is
        # started here, because that would hang unattended test runs.
        Utils.log(
            "Unexpected SMILES in output: "
            + " ".join(sorted(all_smiles - target_smiles))
        )
        Utils.log(
            "Expected SMILES missing from output: "
            + " ".join(sorted(target_smiles - all_smiles))
        )
        Utils.exception(
            "FAILED. "
            + "Got some SMILES I didn't expect (either in output or target list): "
            + " ".join(sorted(mismatched))
        )
    else:
        Utils.log(
            "PASSED. Gypsum-DL output the very SMILES strings I was expecting."
        )

    Utils.log("")

    # Delete test output directory if it exists.
    if os.path.exists(output_folder):
        shutil.rmtree(output_folder)
def _check_mpi_requirements():
    """Stops with an explanatory message unless mpi mode can be used.

    Checks that runpy has been loaded (the original author's test for the
    program having been started with "python -m mpi4py") and that mpi4py can
    be imported at version 2.1.0 or higher. Every message is printed as well
    as passed to Utils.exception.
    """

    # Before executing Parallelizer with mpi4py (which, according to the
    # original author, overrides Python's raising of exceptions), check that
    # the program is being run with the "-m mpi4py" runpy flag.
    if "runpy" not in sys.modules:
        printout = "\nTo run in mpi mode you must run with -m flag. ie) mpirun -n $NTASKS python -m mpi4py run_gypsum_dl.py\n"
        print(printout)
        Utils.exception(printout)

    # Check mpi4py import
    try:
        import mpi4py
    except Exception:
        printout = "\nmpi4py not installed but --job_manager is set to mpi. \n Either install mpi4py or switch job_manager to multiprocessing or serial.\n"
        print(printout)
        Utils.exception(printout)

    # Check mpi4py import version. This must be at version 2.1.0 and higher.
    # Only the major and minor components are parsed, so a suffix on a later
    # component (e.g. "3.1.0rc1") cannot break the integer conversion.
    major_minor = tuple(
        int(part) for part in mpi4py.__version__.split(".")[:2]
    )
    if major_minor < (2, 1):
        printout = "\nmpi4py version 2.1.0 or higher is required. Use the 'python -m mpi4py' flag to run in mpi mode.\nPlease update mpi4py to a newer version, or switch job_manager to multiprocessing or serial.\n"
        print(printout)
        Utils.exception(printout)


def prepare_molecules(args):
    """A function for preparing small-molecule models for docking. To work, it
    requires that the python module rdkit be installed on the system.

    The parameters are read (from the json file named by args["json"], if
    given, otherwise from args itself), the input molecules are loaded into
    molecule containers, and the containers are processed by
    execute_gypsum_dl, either directly or through the mpi Parallelizer.

    :param args: The arguments, from the commandline.
    :type args: dict
    """

    # Keep track of the time the program starts.
    start_time = datetime.now()

    # A list of command-line parameters that will be ignored if using a json
    # file.
    json_warning_list = [
        "source",
        "output_folder",
        "num_processors",
        "min_ph",
        "max_ph",
        "delta_ph_increment",
        "thoroughness",
        "max_variants_per_compound",
        "pka_precision",
    ]

    # Whether to warn the user that the above parameters, if specified, will
    # be ignored.
    need_to_print_override_warning = False

    if "json" in args:
        # "json" is one of the parameters, so we'll be ignoring the rest.
        try:
            # The with-block makes sure the file handle is closed again.
            with open(args["json"]) as json_file:
                params = json.load(json_file)
        except Exception:
            Utils.exception("Is your input json file properly formed?")

        params = set_parameters(params)
        if any(name in args for name in json_warning_list):
            need_to_print_override_warning = True
    else:
        # We're actually going to use all the command-line parameters. No
        # warning necessary.
        params = set_parameters(args)

    # If running in serial mode, make sure only one processor is used.
    if params["job_manager"] == "serial":
        if params["num_processors"] != 1:
            Utils.log(
                "Because --job_manager was set to serial, this will be run on a single processor."
            )
        params["num_processors"] = 1

    # Handle mpi errors if mpi4py isn't installed
    if params["job_manager"] == "mpi":
        _check_mpi_requirements()

    # Throw a message if running on windows. Windows doesn't deal with
    # multiple processors, so use only 1.
    if sys.platform == "win32":
        Utils.log(
            "WARNING: Multiprocessing is not supported on Windows. Tasks will be run in Serial mode."
        )
        params["num_processors"] = 1
        params["job_manager"] = "serial"

    # Launch mpi workers if that's what's specified.
    if params["job_manager"] == "mpi":
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"]
        )
    else:
        # Lower-level mpi (i.e. making a new Parallelizer within an mpi) has
        # problems with importing the MPI environment and mpi4py. So we will
        # flag it to skip the MPI mode and just go to multiprocess/serial.
        # This is a safety precaution.
        params["Parallelizer"] = Parallelizer(
            params["job_manager"], params["num_processors"], True
        )

    # Let the user know that their command-line parameters will be ignored, if
    # they have specified a json file.
    if need_to_print_override_warning:
        Utils.log("WARNING: Using the --json flag overrides all other flags.")

    # If running in mpi mode, separate_output_files must be set to true.
    if (
        params["job_manager"] == "mpi"
        and params["separate_output_files"] == False
    ):
        Utils.log(
            "WARNING: Running in mpi mode, but separate_output_files is not set to True. Setting separate_output_files to True anyway."
        )
        params["separate_output_files"] = True

    # Outputting HTML files not supported in mpi mode.
    if params["job_manager"] == "mpi" and params["add_html_output"] == True:
        Utils.log(
            "WARNING: Running in mpi mode, but add_html_output is set to True. HTML output is not supported in mpi mode."
        )
        params["add_html_output"] = False

    # Warn the user if he or she is not using the Durrant lab filters.
    if params["use_durrant_lab_filters"] == False:
        Utils.log(
            "WARNING: Running Gypsum-DL without the Durrant-lab filters. In looking over many Gypsum-DL-generated "
            + "variants, we have identified a number of substructures that, though technically possible, strike us "
            + "as improbable or otherwise poorly suited for virtual screening. We strongly recommend removing these "
            + "by running Gypsum-DL with the --use_durrant_lab_filters option.",
            trailing_whitespace="\n",
        )

    # Load SMILES data
    source = params["source"]
    if isinstance(source, str):
        Utils.log(
            "Loading molecules from " + os.path.basename(source) + "..."
        )

        # Smiles must be array of strs.
        source_lower = source.lower()
        if source_lower.endswith((".smi", ".can")):
            # It's an smi file.
            smiles_data = load_smiles_file(source)
        elif source_lower.endswith(".sdf"):
            # It's an sdf file. Convert it to a smiles.
            smiles_data = load_sdf_file(source)
        else:
            smiles_data = [source]
    else:
        # It's already in the required format, so use it directly. (Without
        # this assignment smiles_data would be undefined below.)
        smiles_data = source

    # Make the output directory if necessary.
    if not os.path.exists(params["output_folder"]):
        os.mkdir(params["output_folder"])
        if not os.path.exists(params["output_folder"]):
            Utils.exception(
                "Output folder directory couldn't be found or created."
            )

    # Make the molecule containers.
    contnrs = []
    idx_counter = 0
    for entry in smiles_data:
        try:
            smiles, name, props = entry
        except (TypeError, ValueError):
            msg = 'Unexpected error. Does your "source" parameter specify a '
            msg = msg + "filename that ends in a .can, .smi, or .sdf extension?"
            Utils.exception(msg)

        if detect_unassigned_bonds(smiles) is None:
            Utils.log(
                "WARNING: Throwing out SMILES because of unassigned bonds: "
                + smiles
            )
            continue

        new_contnr = MolContainer(smiles, name, idx_counter, props)
        # The exact-type check also rejects None.
        if type(new_contnr.orig_smi_canonical) is not str:
            Utils.log(
                "WARNING: Throwing out SMILES because of it couldn't convert to mol: "
                + smiles
            )
            continue

        contnrs.append(new_contnr)
        idx_counter += 1

    # Remove None types from failed conversion
    contnrs = [x for x in contnrs if x.orig_smi_canonical is not None]
    if len(contnrs) != idx_counter:
        Utils.exception("There is a corrupted container")

    # In multiprocessing mode, Gypsum-DL parallelizes each small-molecule
    # preparation step separately. But this scheme is inefficient in MPI mode
    # because it increases the amount of communication required between nodes.
    # So for MPI mode, we will run all the preparation steps for a given
    # molecule container on a single thread.
    if params["Parallelizer"].return_mode() != "mpi":
        # Non-MPI (e.g., multiprocessing)
        execute_gypsum_dl(contnrs, params)
    else:
        # MPI mode. Group the molecule containers so they can be passed to the
        # parallelizer. The jobs get a copy of the parameters in which the
        # Parallelizer entry is replaced by None.
        temp_param = {}
        for key in params:
            temp_param[key] = None if key == "Parallelizer" else params[key]

        job_input = []
        for contnr in contnrs:
            contnr.contnr_idx = 0  # Because each container being run in isolation.
            job_input.append(([contnr], temp_param))
        job_input = tuple(job_input)

        params["Parallelizer"].run(job_input, execute_gypsum_dl)

    # Calculate the total run time.
    end_time = datetime.now()
    run_time = end_time - start_time
    params["start_time"] = str(start_time)
    params["end_time"] = str(end_time)
    params["run_time"] = str(run_time)

    Utils.log("\nStart time at: " + str(start_time))
    Utils.log("End time at: " + str(end_time))
    Utils.log("Total time at: " + str(run_time))

    # Kill mpi workers if necessary.
    params["Parallelizer"].end(params["job_manager"])