def test_seed_produces_same_conformers(self):
    """Check that repeated generation with a fixed seed is reproducible.

    Conformers are generated `ntrials` times for the same molecule with
    identical parameters (one conformer, seed 42). The first element of
    each result is kept, and every pair of kept results must align with
    an RMS of at most 1e-2.
    """
    import itertools

    from rdkit.Chem import AllChem
    from e3fp.conformer.util import mol_from_smiles
    from e3fp.conformer.generate import generate_conformers

    ntrials = 10
    confgen_params = {"num_conf": 1, "seed": 42}
    smiles = "C" * 20  # long flexible molecule
    mol = mol_from_smiles(smiles, "tmp")
    mols = [
        generate_conformers(mol, **confgen_params)[0]
        for _ in range(ntrials)
    ]

    # Assert on each pair directly: the test stops at the first mismatch
    # and the failure message names the offending trials.
    indexed_mols = enumerate(mols)
    for (i, mol_i), (j, mol_j) in itertools.combinations(indexed_mols, 2):
        rms = AllChem.GetBestRMS(mol_i, mol_j)
        self.assertLessEqual(
            rms,
            1e-2,
            "Conformers from trials {:d} and {:d} differ (RMS {:.4g})".format(
                i, j, rms
            ),
        )
def run(smiles_file, bits=1024, radius=2, use_chiral=False,
        out_file="molecules.csv.bz2", log=None):
    """Fingerprint every molecule in a SMILES file and save the results.

    Each (name, SMILES) entry returned by `smiles_to_dict` is converted to
    a molecule and fingerprinted with `fprint2d_from_mol`. Molecules for
    which any step raises are logged and skipped, so they have no
    fingerprint entry in the data passed to `lists_dicts_to_molecules`.

    Parameters
    ----------
    smiles_file : str
        Passed to `smiles_to_dict` to load the name -> SMILES mapping.
    bits, radius, use_chiral
        Forwarded unchanged to `fprint2d_from_mol` and
        `get_fprint2d_fptype`.
    out_file : str
        Destination passed to `lists_dicts_to_molecules`.
    log
        Passed to `setup_logging`.
    """
    setup_logging(log)
    smiles_dict = smiles_to_dict(smiles_file)
    mol_list_dict = {}
    # `.items()` works on both Python 2 and 3; `.iteritems()` is Python 2 only.
    for name, smiles in smiles_dict.items():
        try:
            mol = mol_from_smiles(smiles, name)
            logging.info("Generating fingerprint for {}".format(name))
            fp = fprint2d_from_mol(mol, bits=bits, radius=radius,
                                   use_chiral=use_chiral)
            logging.info("Generated fingerprint for {}".format(name))
            mol_list_dict.setdefault(name, []).append(
                fprint_to_native_tuple(fp))
        except Exception:
            # Best effort: one bad molecule must not abort the whole run,
            # but keep the traceback so the cause can be diagnosed.
            logging.warning("Fingerprinting {} failed.".format(name),
                            exc_info=True)
    fp_type = get_fprint2d_fptype(bits=bits, radius=radius,
                                  use_chiral=use_chiral)
    lists_dicts_to_molecules(out_file, smiles_dict, mol_list_dict, fp_type)
def get_canonical_smiles(smiles_dict):
    """Return a dict mapping each molecule name to a canonical SMILES.

    Each SMILES is parsed with `mol_from_smiles(..., standardise=True)` and
    written back out with `MolToSmiles(..., isomericSmiles=True)`. If either
    step raises, the original SMILES string is kept for that molecule.

    Parameters
    ----------
    smiles_dict : dict
        Mapping of molecule name to SMILES string.

    Returns
    -------
    dict
        Mapping with the same keys as `smiles_dict`.
    """
    canonical_smiles_dict = {}
    for mol_name, smiles in smiles_dict.items():
        try:
            mol = mol_from_smiles(smiles, mol_name, standardise=True)
            canonical_smiles_dict[mol_name] = MolToSmiles(
                mol, isomericSmiles=True)
        except Exception:
            # Fall back to the input SMILES on failure. `Exception` (not a
            # bare `except`) lets KeyboardInterrupt/SystemExit propagate.
            canonical_smiles_dict[mol_name] = smiles
    return canonical_smiles_dict
def main(sdf_dir, smiles_file, num_mols=10000, first=3,
         out_props_file="random_mols_props.txt",
         out_smiles_file="random_mols.csv.bz2"):
    """Select random molecules and write a table of their properties.

    If `out_smiles_file` already exists, the molecules it contains are
    reused. Otherwise molecules from `smiles_file` are visited in random
    order, and those whose SDF file (path given by `get_sdf_file`) exists
    are kept until `num_mols` have been collected; the selection is then
    written to `out_smiles_file`. If fewer than `num_mols` molecules
    qualify, all qualifying molecules are kept and a warning is logged.

    For every selected molecule, the heavy atom count and rotatable bond
    count are written to `out_props_file` as tab-separated text, sorted by
    molecule name.

    Parameters
    ----------
    sdf_dir : str
        Passed to `get_sdf_file` together with each molecule name.
    smiles_file : str
        SMILES file to sample from when `out_smiles_file` does not exist.
    num_mols : int
        Maximum number of molecules to select.
    first : int
        Not used by this function; kept for interface compatibility.
    out_props_file : str
        Path of the properties table to write.
    out_smiles_file : str
        Path of the selected-molecules SMILES file to load or create.
    """
    if os.path.isfile(out_smiles_file):
        logging.info("Loading existing random molecules.")
        smiles_dict = smiles_to_dict(out_smiles_file)
        mol_names = set(smiles_dict)
    else:
        logging.info("Loading SMILES file.")
        smiles_dict = smiles_to_dict(smiles_file)
        mol_names = set()
        out_smiles_dict = {}
        logging.info("Picking random molecules.")
        # One pass over a shuffled list samples without replacement, always
        # terminates, and needs no indexing into a dict view (which fails
        # on Python 3).
        candidates = list(smiles_dict)
        random.shuffle(candidates)
        for proto_name in candidates:
            if len(mol_names) >= num_mols:
                break
            sdf_file = get_sdf_file(sdf_dir, proto_name)
            if not os.path.isfile(sdf_file):
                continue
            mol_names.add(proto_name)
            out_smiles_dict[proto_name] = smiles_dict[proto_name]
            if len(mol_names) % 100 == 0:
                logging.info(len(mol_names))
        if len(mol_names) < num_mols:
            logging.warning(
                "Only {} of {} requested molecules have an SDF file.".format(
                    len(mol_names), num_mols))
        dict_to_smiles(out_smiles_file, out_smiles_dict)

    mol_names = sorted(mol_names)

    logging.info("Computing mol properties.")
    # Only the selected molecules are written, so only they are parsed.
    mol_props = {}
    for name in mol_names:
        mol = mol_from_smiles(smiles_dict[name], name)
        nheavy = mol.GetNumHeavyAtoms()
        nrot = AllChem.CalcNumRotatableBonds(mol)
        mol_props[name] = (nheavy, nrot)

    with open(out_props_file, "w") as f:
        f.write("mol_name\tnheavy\tnrot\n")
        for mol_name in mol_names:
            nheavy, nrot = mol_props[mol_name]
            f.write("{}\t{:d}\t{:d}\n".format(mol_name, nheavy, nrot))
def test_standardisation(self):
    """Check SMILES round-tripping and the effect of standardisation.

    The charge-separated input must be written back unchanged before
    standardisation, and as "CN=c1ccccn1C" after
    `mol_to_standardised_mol` has been applied.
    """
    import rdkit.Chem
    from e3fp.conformer.util import (
        mol_from_smiles,
        mol_to_standardised_mol,
    )

    charged_smiles = "C[N-]c1cccc[n+]1C"
    expected_smiles = "CN=c1ccccn1C"

    # Parsing alone must leave the SMILES untouched.
    raw_mol = mol_from_smiles(charged_smiles, "tmp")
    raw_smiles = rdkit.Chem.MolToSmiles(raw_mol)
    self.assertEqual(raw_smiles, charged_smiles)

    # Standardisation must produce the expected SMILES.
    standardised_mol = mol_to_standardised_mol(raw_mol)
    standardised_smiles = rdkit.Chem.MolToSmiles(standardised_mol)
    self.assertEqual(standardised_smiles, expected_smiles)
def run(
    mol2=None,
    smiles=None,
    standardise=STANDARDISE_DEF,
    num_conf=NUM_CONF_DEF,
    first=FIRST_DEF,
    pool_multiplier=POOL_MULTIPLIER_DEF,
    rmsd_cutoff=RMSD_CUTOFF_DEF,
    max_energy_diff=MAX_ENERGY_DIFF_DEF,
    forcefield=FORCEFIELD_DEF,
    seed=SEED_DEF,
    params=None,
    prioritize=False,
    out_dir=OUTDIR_DEF,
    compress=COMPRESS_DEF,
    overwrite=False,
    values_file=None,
    log=None,
    num_proc=None,
    parallel_mode=None,
    verbose=False,
):
    """Run conformer generation.

    On the master process, molecules are read from mol2 or SMILES input,
    optionally sorted, and wrapped with `make_data_iterator`; other
    processes get an empty iterator. `generate_conformers` is then run over
    the data through a `Parallelizer`. If `values_file` is given, the
    master stores every result that is not `False` through an `HDF5Buffer`.

    Parameters
    ----------
    mol2, smiles : sequence or None
        Input specifications, unpacked into `mol2_generator` or
        `smiles_generator`. Exactly one of the two must be provided.
    standardise
        Forwarded to `mol_from_mol2` / `mol_from_smiles`.
    num_conf, first, pool_multiplier, rmsd_cutoff, max_energy_diff, \
forcefield, seed
        Forwarded to `generate_conformers`. On the master process a falsy
        `num_conf` is replaced by -1.
    params
        If not None, read with `read_params`; `standardise` and the seven
        conformer generation options above are then replaced by the values
        read from it.
    prioritize : bool
        If True, process molecules in ascending order of rotatable bond
        count, then exact molecular weight.
    out_dir, compress, overwrite
        Forwarded to `generate_conformers`; `out_dir` is also passed to
        `touch_dir`.
    values_file : str or None
        File used for the HDF5 buffer. It is opened in append mode when it
        already exists and `overwrite` is not True, else in write mode.
    log, verbose
        Passed to `setup_logging`.
    num_proc, parallel_mode
        Passed to `Parallelizer`.

    Raises
    ------
    ValueError
        If `forcefield` is not in `FORCEFIELD_CHOICES`.
    SystemExit
        With status 1 if the input or `num_proc` arguments are invalid.
    """
    setup_logging(log, verbose=verbose)

    if params is not None:
        params = read_params(params)
        standardise = get_value(params, "preprocessing", "standardise", bool)
        num_conf = get_value(params, "conformer_generation", "num_conf", int)
        first = get_value(params, "conformer_generation", "first", int)
        pool_multiplier = get_value(
            params, "conformer_generation", "pool_multiplier", int
        )
        rmsd_cutoff = get_value(
            params, "conformer_generation", "rmsd_cutoff", float
        )
        max_energy_diff = get_value(
            params, "conformer_generation", "max_energy_diff", float
        )
        forcefield = get_value(params, "conformer_generation", "forcefield")
        seed = get_value(params, "conformer_generation", "seed", int)

    # check args
    if forcefield not in FORCEFIELD_CHOICES:
        raise ValueError(
            "Specified forcefield {} is not in valid options {!r}".format(
                forcefield, FORCEFIELD_CHOICES
            )
        )

    para = Parallelizer(num_proc=num_proc, parallel_mode=parallel_mode)

    # Check to make sure args make sense. Every process exits, but only the
    # master reports the problem.
    # NOTE(review): `parser` is not defined in this function; presumably it
    # is a module-level argument parser. Confirm it exists when `run` is
    # called as a library function and not from the command line.
    if mol2 is None and smiles is None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide mol2 file or a SMILES file.")
        sys.exit(1)

    if mol2 is not None and smiles is not None:
        if para.is_master():
            parser.print_usage()
            logging.error("Please provide only a mol2 file OR a SMILES file.")
        sys.exit(1)

    if num_proc and num_proc < 1:
        if para.is_master():
            parser.print_usage()
            logging.error(
                "Please provide at least one processor with `--num_proc`."
            )
        sys.exit(1)

    # Set up input type
    if mol2 is not None:
        in_type = "mol2"
    elif smiles is not None:
        in_type = "smiles"

    if para.is_master():
        if in_type == "mol2":
            logging.info("Input type: mol2 file(s)")
            logging.info("Input file number: {:d}".format(len(mol2)))
            mol_iter = (
                mol_from_mol2(_mol2_file, _name, standardise=standardise)
                for _mol2_file, _name in mol2_generator(*mol2)
            )
        else:
            logging.info("Input type: Detected SMILES file(s)")
            logging.info("Input file number: {:d}".format(len(smiles)))
            mol_iter = (
                mol_from_smiles(_smiles, _name, standardise=standardise)
                for _smiles, _name in smiles_generator(*smiles)
            )

        if prioritize:
            logging.info(
                (
                    "Prioritizing mols with low rotatable bond number"
                    " and molecular weight first."
                )
            )
            mols_with_properties = [
                (
                    AllChem.CalcNumRotatableBonds(mol),
                    AllChem.CalcExactMolWt(mol),
                    mol,
                )
                for mol in mol_iter
                if mol is not None
            ]
            # Sort on the numeric properties only. Comparing whole tuples
            # would fall back to comparing the mol objects on ties, which
            # raises TypeError on Python 3. The sort is stable, so tied
            # molecules keep their input order.
            mols_with_properties.sort(key=lambda item: item[:2])
            data_iterator = make_data_iterator(
                (x[-1] for x in mols_with_properties)
            )
        else:
            data_iterator = make_data_iterator(
                (x for x in mol_iter if x is not None)
            )

        # Set up parallel-specific options
        logging.info("Parallel Type: {}".format(para.parallel_mode))

        # Set other options
        touch_dir(out_dir)

        if not num_conf:
            num_conf = -1

        logging.info("Out Directory: {}".format(out_dir))
        logging.info("Overwrite Existing Files: {}".format(overwrite))
        if values_file is not None:
            if os.path.exists(values_file) and overwrite is not True:
                value_args = (values_file, "a")
                logging.info("Values file: {} (append)".format((values_file)))
            else:
                value_args = (values_file, "w")
                logging.info(
                    "Values file: {} (new file)".format((values_file))
                )
        if num_conf is None or num_conf == -1:
            logging.info("Target Conformer Number: auto")
        else:
            logging.info("Target Conformer Number: {:d}".format(num_conf))
        if first is None or first == -1:
            logging.info("First Conformers Number: all")
        else:
            logging.info("First Conformers Number: {:d}".format(first))
        logging.info("Pool Multiplier: {:d}".format(pool_multiplier))
        logging.info("RMSD Cutoff: {:.4g}".format(rmsd_cutoff))
        if max_energy_diff is None:
            logging.info("Maximum Energy Difference: None")
        else:
            logging.info(
                "Maximum Energy Difference: {:.4g} kcal".format(
                    max_energy_diff
                )
            )
        logging.info("Forcefield: {}".format(forcefield.upper()))
        if seed != -1:
            logging.info("Seed: {:d}".format(seed))

        logging.info("Starting.")
    else:
        # Only the master supplies input data.
        data_iterator = iter([])

    gen_conf_kwargs = {
        "out_dir": out_dir,
        "num_conf": num_conf,
        "rmsd_cutoff": rmsd_cutoff,
        "max_energy_diff": max_energy_diff,
        "forcefield": forcefield,
        "pool_multiplier": pool_multiplier,
        "first": first,
        "seed": seed,
        "save": True,
        "overwrite": overwrite,
        "compress": compress,
    }

    run_kwargs = {"kwargs": gen_conf_kwargs}

    results_iterator = para.run_gen(
        generate_conformers, data_iterator, **run_kwargs
    )

    # Only the master writes values, and only when a values file was given.
    write_values = para.is_master() and values_file is not None
    hdf5_buffer = HDF5Buffer(*value_args) if write_values else None

    try:
        for result, _data in results_iterator:
            if write_values and result is not False:
                values_to_hdf5(hdf5_buffer, result)
    finally:
        # Flush and close the buffer even if iterating over the results
        # raises, so the file handle is not leaked.
        if hdf5_buffer is not None:
            try:
                hdf5_buffer.flush()
            finally:
                hdf5_buffer.close()